mirror of
https://github.com/Eaglercraft-Archive/Eaglercraftx-1.8.8-src.git
synced 2025-06-28 02:48:14 -05:00
Update #48 - Added some features from OptiFine
This commit is contained in:
4729
sources/main/java/jdk_internal/icu/text/BidiBase.java
Normal file
4729
sources/main/java/jdk_internal/icu/text/BidiBase.java
Normal file
File diff suppressed because it is too large
Load Diff
821
sources/main/java/jdk_internal/icu/text/BidiLine.java
Normal file
821
sources/main/java/jdk_internal/icu/text/BidiLine.java
Normal file
@ -0,0 +1,821 @@
|
||||
/*
|
||||
* Copyright (c) 2009, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2001-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
/* Written by Simon Montagu, Matitiahu Allouche
|
||||
* (ported from C code written by Markus W. Scherer)
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import jdk_internal.bidi.Bidi;
|
||||
|
||||
final class BidiLine {
|
||||
|
||||
/*
|
||||
* General remarks about the functions in this file:
|
||||
*
|
||||
* These functions deal with the aspects of potentially mixed-directional text
|
||||
* in a single paragraph or in a line of a single paragraph which has already
|
||||
* been processed according to the Unicode 3.0 Bidi algorithm as defined in <a
|
||||
* href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9: Unicode
|
||||
* Bidirectional Algorithm</a>, version 13, also described in The Unicode
|
||||
* Standard, Version 4.0.1 .
|
||||
*
|
||||
* This means that there is a Bidi object with a levels and a dirProps array.
|
||||
* paraLevel and direction are also set. Only if the length of the text is zero,
|
||||
* then levels==dirProps==NULL.
|
||||
*
|
||||
* The overall directionality of the paragraph or line is used to bypass the
|
||||
* reordering steps if possible. Even purely RTL text does not need reordering
|
||||
* there because the getLogical/VisualIndex() methods can compute the index on
|
||||
* the fly in such a case.
|
||||
*
|
||||
* The implementation of the access to same-level-runs and of the reordering do
|
||||
* attempt to provide better performance and less memory usage compared to a
|
||||
* direct implementation of especially rule (L2) with an array of one (32-bit)
|
||||
* integer per text character.
|
||||
*
|
||||
* Here, the levels array is scanned as soon as necessary, and a vector of
|
||||
* same-level-runs is created. Reordering then is done on this vector. For each
|
||||
* run of text positions that were resolved to the same level, only 8 bytes are
|
||||
* stored: the first text position of the run and the visual position behind the
|
||||
* run after reordering. One sign bit is used to hold the directionality of the
|
||||
* run. This is inefficient if there are many very short runs. If the average
|
||||
* run length is <2, then this uses more memory.
|
||||
*
|
||||
* In a further attempt to save memory, the levels array is never changed after
|
||||
* all the resolution rules (Xn, Wn, Nn, In). Many methods have to consider the
|
||||
* field trailingWSStart: if it is less than length, then there is an implicit
|
||||
* trailing run at the paraLevel, which is not reflected in the levels array.
|
||||
* This allows a line Bidi object to use the same levels array as its paragraph
|
||||
* parent object.
|
||||
*
|
||||
* When a Bidi object is created for a line of a paragraph, then the paragraph's
|
||||
* levels and dirProps arrays are reused by way of setting a pointer into them,
|
||||
* not by copying. This again saves memory and prohibits changing the now shared
|
||||
* levels for (L1).
|
||||
*/
|
||||
|
||||
/* handle trailing WS (L1) -------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* setTrailingWSStart() sets the start index for a trailing run of WS in the
|
||||
* line. This is necessary because we do not modify the paragraph's levels array
|
||||
* that we just point into. Using trailingWSStart is another form of performing
|
||||
* (L1).
|
||||
*
|
||||
* To make subsequent operations easier, we also include the run before the WS
|
||||
* if it is at the paraLevel - we merge the two here.
|
||||
*
|
||||
* This method is called only from setLine(), so paraLevel is set correctly for
|
||||
* the line even when there are contextual multiple paragraphs.
|
||||
*/
|
||||
|
||||
static void setTrailingWSStart(BidiBase bidiBase) {
|
||||
byte[] dirProps = bidiBase.dirProps;
|
||||
byte[] levels = bidiBase.levels;
|
||||
int start = bidiBase.length;
|
||||
byte paraLevel = bidiBase.paraLevel;
|
||||
|
||||
/*
|
||||
* If the line is terminated by a block separator, all preceding WS etc... are
|
||||
* already set to paragraph level. Setting trailingWSStart to pBidi->length will
|
||||
* avoid changing the level of B chars from 0 to paraLevel in getLevels when
|
||||
* orderParagraphsLTR==TRUE
|
||||
*/
|
||||
if (dirProps[start - 1] == BidiBase.B) {
|
||||
bidiBase.trailingWSStart = start; /* currently == bidiBase.length */
|
||||
return;
|
||||
}
|
||||
/* go backwards across all WS, BN, explicit codes */
|
||||
while (start > 0 && (BidiBase.DirPropFlag(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) {
|
||||
--start;
|
||||
}
|
||||
|
||||
/* if the WS run can be merged with the previous run then do so here */
|
||||
while (start > 0 && levels[start - 1] == paraLevel) {
|
||||
--start;
|
||||
}
|
||||
|
||||
bidiBase.trailingWSStart = start;
|
||||
}
|
||||
|
||||
static Bidi setLine(BidiBase paraBidi, Bidi newBidi, BidiBase lineBidi, int start, int limit) {
|
||||
int length;
|
||||
|
||||
/* set the values in lineBidi from its paraBidi parent */
|
||||
/* class members are already initialized to 0 */
|
||||
// lineBidi.paraBidi = null; /* mark unfinished setLine */
|
||||
// lineBidi.flags = 0;
|
||||
// lineBidi.controlCount = 0;
|
||||
|
||||
length = lineBidi.length = lineBidi.originalLength = lineBidi.resultLength = limit - start;
|
||||
|
||||
lineBidi.text = new char[length];
|
||||
System.arraycopy(paraBidi.text, start, lineBidi.text, 0, length);
|
||||
lineBidi.paraLevel = paraBidi.GetParaLevelAt(start);
|
||||
lineBidi.paraCount = paraBidi.paraCount;
|
||||
lineBidi.runs = new BidiRun[0];
|
||||
lineBidi.reorderingMode = paraBidi.reorderingMode;
|
||||
lineBidi.reorderingOptions = paraBidi.reorderingOptions;
|
||||
if (paraBidi.controlCount > 0) {
|
||||
int j;
|
||||
for (j = start; j < limit; j++) {
|
||||
if (BidiBase.IsBidiControlChar(paraBidi.text[j])) {
|
||||
lineBidi.controlCount++;
|
||||
}
|
||||
}
|
||||
lineBidi.resultLength -= lineBidi.controlCount;
|
||||
}
|
||||
/* copy proper subset of DirProps */
|
||||
lineBidi.getDirPropsMemory(length);
|
||||
lineBidi.dirProps = lineBidi.dirPropsMemory;
|
||||
System.arraycopy(paraBidi.dirProps, start, lineBidi.dirProps, 0, length);
|
||||
/* copy proper subset of Levels */
|
||||
lineBidi.getLevelsMemory(length);
|
||||
lineBidi.levels = lineBidi.levelsMemory;
|
||||
System.arraycopy(paraBidi.levels, start, lineBidi.levels, 0, length);
|
||||
lineBidi.runCount = -1;
|
||||
|
||||
if (paraBidi.direction != BidiBase.MIXED) {
|
||||
/* the parent is already trivial */
|
||||
lineBidi.direction = paraBidi.direction;
|
||||
|
||||
/*
|
||||
* The parent's levels are all either implicitly or explicitly ==paraLevel; do
|
||||
* the same here.
|
||||
*/
|
||||
if (paraBidi.trailingWSStart <= start) {
|
||||
lineBidi.trailingWSStart = 0;
|
||||
} else if (paraBidi.trailingWSStart < limit) {
|
||||
lineBidi.trailingWSStart = paraBidi.trailingWSStart - start;
|
||||
} else {
|
||||
lineBidi.trailingWSStart = length;
|
||||
}
|
||||
} else {
|
||||
byte[] levels = lineBidi.levels;
|
||||
int i, trailingWSStart;
|
||||
byte level;
|
||||
|
||||
setTrailingWSStart(lineBidi);
|
||||
trailingWSStart = lineBidi.trailingWSStart;
|
||||
|
||||
/* recalculate lineBidiBase.direction */
|
||||
if (trailingWSStart == 0) {
|
||||
/* all levels are at paraLevel */
|
||||
lineBidi.direction = (byte) (lineBidi.paraLevel & 1);
|
||||
} else {
|
||||
/* get the level of the first character */
|
||||
level = (byte) (levels[0] & 1);
|
||||
|
||||
/*
|
||||
* if there is anything of a different level, then the line is mixed
|
||||
*/
|
||||
if (trailingWSStart < length && (lineBidi.paraLevel & 1) != level) {
|
||||
/*
|
||||
* the trailing WS is at paraLevel, which differs from levels[0]
|
||||
*/
|
||||
lineBidi.direction = BidiBase.MIXED;
|
||||
} else {
|
||||
/*
|
||||
* see if levels[1..trailingWSStart-1] have the same direction as levels[0] and
|
||||
* paraLevel
|
||||
*/
|
||||
for (i = 1;; i++) {
|
||||
if (i == trailingWSStart) {
|
||||
/* the direction values match those in level */
|
||||
lineBidi.direction = level;
|
||||
break;
|
||||
} else if ((levels[i] & 1) != level) {
|
||||
lineBidi.direction = BidiBase.MIXED;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (lineBidi.direction) {
|
||||
case Bidi.DIRECTION_LEFT_TO_RIGHT:
|
||||
/* make sure paraLevel is even */
|
||||
lineBidi.paraLevel = (byte) ((lineBidi.paraLevel + 1) & ~1);
|
||||
|
||||
/*
|
||||
* all levels are implicitly at paraLevel (important for getLevels())
|
||||
*/
|
||||
lineBidi.trailingWSStart = 0;
|
||||
break;
|
||||
case Bidi.DIRECTION_RIGHT_TO_LEFT:
|
||||
/* make sure paraLevel is odd */
|
||||
lineBidi.paraLevel |= 1;
|
||||
|
||||
/*
|
||||
* all levels are implicitly at paraLevel (important for getLevels())
|
||||
*/
|
||||
lineBidi.trailingWSStart = 0;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
lineBidi.paraBidi = paraBidi; /* mark successful setLine */
|
||||
|
||||
return newBidi;
|
||||
}
|
||||
|
||||
static byte getLevelAt(BidiBase bidiBase, int charIndex) {
|
||||
/* return paraLevel if in the trailing WS run, otherwise the real level */
|
||||
if (bidiBase.direction != BidiBase.MIXED || charIndex >= bidiBase.trailingWSStart) {
|
||||
return bidiBase.GetParaLevelAt(charIndex);
|
||||
} else {
|
||||
return bidiBase.levels[charIndex];
|
||||
}
|
||||
}
|
||||
|
||||
static byte[] getLevels(BidiBase bidiBase) {
|
||||
int start = bidiBase.trailingWSStart;
|
||||
int length = bidiBase.length;
|
||||
|
||||
if (start != length) {
|
||||
/* the current levels array does not reflect the WS run */
|
||||
/*
|
||||
* After the previous if(), we know that the levels array has an implicit
|
||||
* trailing WS run and therefore does not fully reflect itself all the levels.
|
||||
* This must be a Bidi object for a line, and we need to create a new levels
|
||||
* array.
|
||||
*/
|
||||
/*
|
||||
* bidiBase.paraLevel is ok even if contextual multiple paragraphs, since
|
||||
* bidiBase is a line object
|
||||
*/
|
||||
Arrays.fill(bidiBase.levels, start, length, bidiBase.paraLevel);
|
||||
|
||||
/* this new levels array is set for the line and reflects the WS run */
|
||||
bidiBase.trailingWSStart = length;
|
||||
}
|
||||
if (length < bidiBase.levels.length) {
|
||||
byte[] levels = new byte[length];
|
||||
System.arraycopy(bidiBase.levels, 0, levels, 0, length);
|
||||
return levels;
|
||||
}
|
||||
return bidiBase.levels;
|
||||
}
|
||||
|
||||
static BidiRun getVisualRun(BidiBase bidiBase, int runIndex) {
|
||||
int start = bidiBase.runs[runIndex].start;
|
||||
int limit;
|
||||
byte level = bidiBase.runs[runIndex].level;
|
||||
|
||||
if (runIndex > 0) {
|
||||
limit = start + bidiBase.runs[runIndex].limit - bidiBase.runs[runIndex - 1].limit;
|
||||
} else {
|
||||
limit = start + bidiBase.runs[0].limit;
|
||||
}
|
||||
return new BidiRun(start, limit, level);
|
||||
}
|
||||
|
||||
/* in trivial cases there is only one trivial run; called by getRuns() */
|
||||
private static void getSingleRun(BidiBase bidiBase, byte level) {
|
||||
/* simple, single-run case */
|
||||
bidiBase.runs = bidiBase.simpleRuns;
|
||||
bidiBase.runCount = 1;
|
||||
|
||||
/* fill and reorder the single run */
|
||||
bidiBase.runs[0] = new BidiRun(0, bidiBase.length, level);
|
||||
}
|
||||
|
||||
/* reorder the runs array (L2) ---------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Reorder the same-level runs in the runs array. Here, runCount>1 and
|
||||
* maxLevel>=minLevel>=paraLevel. All the visualStart fields=logical start
|
||||
* before reordering. The "odd" bits are not set yet.
|
||||
*
|
||||
* Reordering with this data structure lends itself to some handy shortcuts:
|
||||
*
|
||||
* Since each run is moved but not modified, and since at the initial maxLevel
|
||||
* each sequence of same-level runs consists of only one run each, we don't need
|
||||
* to do anything there and can predecrement maxLevel. In many simple cases, the
|
||||
* reordering is thus done entirely in the index mapping. Also, reordering
|
||||
* occurs only down to the lowest odd level that occurs, which is minLevel|1.
|
||||
* However, if the lowest level itself is odd, then in the last reordering the
|
||||
* sequence of the runs at this level or higher will be all runs, and we don't
|
||||
* need the elaborate loop to search for them. This is covered by ++minLevel
|
||||
* instead of minLevel|=1 followed by an extra reorder-all after the
|
||||
* reorder-some loop. About a trailing WS run: Such a run would need special
|
||||
* treatment because its level is not reflected in levels[] if this is not a
|
||||
* paragraph object. Instead, all characters from trailingWSStart on are
|
||||
* implicitly at paraLevel. However, for all maxLevel>paraLevel, this run will
|
||||
* never be reordered and does not need to be taken into account.
|
||||
* maxLevel==paraLevel is only reordered if minLevel==paraLevel is odd, which is
|
||||
* done in the extra segment. This means that for the main reordering loop we
|
||||
* don't need to consider this run and can --runCount. If it is later part of
|
||||
* the all-runs reordering, then runCount is adjusted accordingly.
|
||||
*/
|
||||
private static void reorderLine(BidiBase bidiBase, byte minLevel, byte maxLevel) {
|
||||
|
||||
/* nothing to do? */
|
||||
if (maxLevel <= (minLevel | 1)) {
|
||||
return;
|
||||
}
|
||||
|
||||
BidiRun[] runs;
|
||||
BidiRun tempRun;
|
||||
byte[] levels;
|
||||
int firstRun, endRun, limitRun, runCount;
|
||||
|
||||
/*
|
||||
* Reorder only down to the lowest odd level and reorder at an odd minLevel in a
|
||||
* separate, simpler loop. See comments above for why minLevel is always
|
||||
* incremented.
|
||||
*/
|
||||
++minLevel;
|
||||
|
||||
runs = bidiBase.runs;
|
||||
levels = bidiBase.levels;
|
||||
runCount = bidiBase.runCount;
|
||||
|
||||
/*
|
||||
* do not include the WS run at paraLevel<=old minLevel except in the simple
|
||||
* loop
|
||||
*/
|
||||
if (bidiBase.trailingWSStart < bidiBase.length) {
|
||||
--runCount;
|
||||
}
|
||||
|
||||
while (--maxLevel >= minLevel) {
|
||||
firstRun = 0;
|
||||
|
||||
/* loop for all sequences of runs */
|
||||
for (;;) {
|
||||
/* look for a sequence of runs that are all at >=maxLevel */
|
||||
/* look for the first run of such a sequence */
|
||||
while (firstRun < runCount && levels[runs[firstRun].start] < maxLevel) {
|
||||
++firstRun;
|
||||
}
|
||||
if (firstRun >= runCount) {
|
||||
break; /* no more such runs */
|
||||
}
|
||||
|
||||
/* look for the limit run of such a sequence (the run behind it) */
|
||||
for (limitRun = firstRun; ++limitRun < runCount && levels[runs[limitRun].start] >= maxLevel;) {
|
||||
}
|
||||
|
||||
/* Swap the entire sequence of runs from firstRun to limitRun-1. */
|
||||
endRun = limitRun - 1;
|
||||
while (firstRun < endRun) {
|
||||
tempRun = runs[firstRun];
|
||||
runs[firstRun] = runs[endRun];
|
||||
runs[endRun] = tempRun;
|
||||
++firstRun;
|
||||
--endRun;
|
||||
}
|
||||
|
||||
if (limitRun == runCount) {
|
||||
break; /* no more such runs */
|
||||
} else {
|
||||
firstRun = limitRun + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* now do maxLevel==old minLevel (==odd!), see above */
|
||||
if ((minLevel & 1) == 0) {
|
||||
firstRun = 0;
|
||||
|
||||
/* include the trailing WS run in this complete reordering */
|
||||
if (bidiBase.trailingWSStart == bidiBase.length) {
|
||||
--runCount;
|
||||
}
|
||||
|
||||
/* Swap the entire sequence of all runs. (endRun==runCount) */
|
||||
while (firstRun < runCount) {
|
||||
tempRun = runs[firstRun];
|
||||
runs[firstRun] = runs[runCount];
|
||||
runs[runCount] = tempRun;
|
||||
++firstRun;
|
||||
--runCount;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* compute the runs array --------------------------------------------------- */
|
||||
|
||||
static int getRunFromLogicalIndex(BidiBase bidiBase, int logicalIndex) {
|
||||
BidiRun[] runs = bidiBase.runs;
|
||||
int runCount = bidiBase.runCount, visualStart = 0, i, length, logicalStart;
|
||||
|
||||
for (i = 0; i < runCount; i++) {
|
||||
length = runs[i].limit - visualStart;
|
||||
logicalStart = runs[i].start;
|
||||
if ((logicalIndex >= logicalStart) && (logicalIndex < (logicalStart + length))) {
|
||||
return i;
|
||||
}
|
||||
visualStart += length;
|
||||
}
|
||||
/* we should never get here */
|
||||
throw new IllegalStateException("Internal ICU error in getRunFromLogicalIndex");
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute the runs array from the levels array. After getRuns() returns true,
|
||||
* runCount is guaranteed to be >0 and the runs are reordered. Odd-level runs
|
||||
* have visualStart on their visual right edge and they progress visually to the
|
||||
* left. If option OPTION_INSERT_MARKS is set, insertRemove will contain the sum
|
||||
* of appropriate LRM/RLM_BEFORE/AFTER flags. If option OPTION_REMOVE_CONTROLS
|
||||
* is set, insertRemove will contain the negative number of BiDi control
|
||||
* characters within this run.
|
||||
*/
|
||||
static void getRuns(BidiBase bidiBase) {
|
||||
/*
|
||||
* This method returns immediately if the runs are already set. This includes
|
||||
* the case of length==0 (handled in setPara)..
|
||||
*/
|
||||
if (bidiBase.runCount >= 0) {
|
||||
return;
|
||||
}
|
||||
if (bidiBase.direction != BidiBase.MIXED) {
|
||||
/* simple, single-run case - this covers length==0 */
|
||||
/* bidiBase.paraLevel is ok even for contextual multiple paragraphs */
|
||||
getSingleRun(bidiBase, bidiBase.paraLevel);
|
||||
} else /* BidiBase.MIXED, length>0 */ {
|
||||
/* mixed directionality */
|
||||
int length = bidiBase.length, limit;
|
||||
byte[] levels = bidiBase.levels;
|
||||
int i, runCount;
|
||||
byte level = -1; /* initialize with no valid level */
|
||||
/*
|
||||
* If there are WS characters at the end of the line and the run preceding them
|
||||
* has a level different from paraLevel, then they will form their own run at
|
||||
* paraLevel (L1). Count them separately. We need some special treatment for
|
||||
* this in order to not modify the levels array which a line Bidi object shares
|
||||
* with its paragraph parent and its other line siblings. In other words, for
|
||||
* the trailing WS, it may be levels[]!=paraLevel but we have to treat it like
|
||||
* it were so.
|
||||
*/
|
||||
limit = bidiBase.trailingWSStart;
|
||||
/* count the runs, there is at least one non-WS run, and limit>0 */
|
||||
runCount = 0;
|
||||
for (i = 0; i < limit; ++i) {
|
||||
/* increment runCount at the start of each run */
|
||||
if (levels[i] != level) {
|
||||
++runCount;
|
||||
level = levels[i];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We don't need to see if the last run can be merged with a trailing WS run
|
||||
* because setTrailingWSStart() would have done that.
|
||||
*/
|
||||
if (runCount == 1 && limit == length) {
|
||||
/* There is only one non-WS run and no trailing WS-run. */
|
||||
getSingleRun(bidiBase, levels[0]);
|
||||
} else /* runCount>1 || limit<length */ {
|
||||
/* allocate and set the runs */
|
||||
BidiRun[] runs;
|
||||
int runIndex, start;
|
||||
byte minLevel = BidiBase.MAX_EXPLICIT_LEVEL + 1;
|
||||
byte maxLevel = 0;
|
||||
|
||||
/* now, count a (non-mergeable) WS run */
|
||||
if (limit < length) {
|
||||
++runCount;
|
||||
}
|
||||
|
||||
/* runCount > 1 */
|
||||
bidiBase.getRunsMemory(runCount);
|
||||
runs = bidiBase.runsMemory;
|
||||
|
||||
/* set the runs */
|
||||
/*
|
||||
* FOOD FOR THOUGHT: this could be optimized, e.g.: 464->444, 484->444,
|
||||
* 575->555, 595->555 However, that would take longer. Check also how it would
|
||||
* interact with BiDi control removal and inserting Marks.
|
||||
*/
|
||||
runIndex = 0;
|
||||
|
||||
/*
|
||||
* search for the run limits and initialize visualLimit values with the run
|
||||
* lengths
|
||||
*/
|
||||
i = 0;
|
||||
do {
|
||||
/* prepare this run */
|
||||
start = i;
|
||||
level = levels[i];
|
||||
if (level < minLevel) {
|
||||
minLevel = level;
|
||||
}
|
||||
if (level > maxLevel) {
|
||||
maxLevel = level;
|
||||
}
|
||||
|
||||
/* look for the run limit */
|
||||
while (++i < limit && levels[i] == level) {
|
||||
}
|
||||
|
||||
/* i is another run limit */
|
||||
runs[runIndex] = new BidiRun(start, i - start, level);
|
||||
++runIndex;
|
||||
} while (i < limit);
|
||||
|
||||
if (limit < length) {
|
||||
/* there is a separate WS run */
|
||||
runs[runIndex] = new BidiRun(limit, length - limit, bidiBase.paraLevel);
|
||||
/*
|
||||
* For the trailing WS run, bidiBase.paraLevel is ok even if contextual multiple
|
||||
* paragraphs.
|
||||
*/
|
||||
if (bidiBase.paraLevel < minLevel) {
|
||||
minLevel = bidiBase.paraLevel;
|
||||
}
|
||||
}
|
||||
|
||||
/* set the object fields */
|
||||
bidiBase.runs = runs;
|
||||
bidiBase.runCount = runCount;
|
||||
|
||||
reorderLine(bidiBase, minLevel, maxLevel);
|
||||
|
||||
/* now add the direction flags and adjust the visualLimit's to be just that */
|
||||
/* this loop will also handle the trailing WS run */
|
||||
limit = 0;
|
||||
for (i = 0; i < runCount; ++i) {
|
||||
runs[i].level = levels[runs[i].start];
|
||||
limit = (runs[i].limit += limit);
|
||||
}
|
||||
|
||||
/* Set the embedding level for the trailing WS run. */
|
||||
/* For a RTL paragraph, it will be the *first* run in visual order. */
|
||||
/*
|
||||
* For the trailing WS run, bidiBase.paraLevel is ok even if contextual multiple
|
||||
* paragraphs.
|
||||
*/
|
||||
if (runIndex < runCount) {
|
||||
int trailingRun = ((bidiBase.paraLevel & 1) != 0) ? 0 : runIndex;
|
||||
runs[trailingRun].level = bidiBase.paraLevel;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* handle insert LRM/RLM BEFORE/AFTER run */
|
||||
if (bidiBase.insertPoints.size > 0) {
|
||||
BidiBase.Point point;
|
||||
int runIndex, ip;
|
||||
for (ip = 0; ip < bidiBase.insertPoints.size; ip++) {
|
||||
point = bidiBase.insertPoints.points[ip];
|
||||
runIndex = getRunFromLogicalIndex(bidiBase, point.pos);
|
||||
bidiBase.runs[runIndex].insertRemove |= point.flag;
|
||||
}
|
||||
}
|
||||
|
||||
/* handle remove BiDi control characters */
|
||||
if (bidiBase.controlCount > 0) {
|
||||
int runIndex, ic;
|
||||
char c;
|
||||
for (ic = 0; ic < bidiBase.length; ic++) {
|
||||
c = bidiBase.text[ic];
|
||||
if (BidiBase.IsBidiControlChar(c)) {
|
||||
runIndex = getRunFromLogicalIndex(bidiBase, ic);
|
||||
bidiBase.runs[runIndex].insertRemove--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int[] prepareReorder(byte[] levels, byte[] pMinLevel, byte[] pMaxLevel) {
|
||||
int start;
|
||||
byte level, minLevel, maxLevel;
|
||||
|
||||
if (levels == null || levels.length <= 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
/* determine minLevel and maxLevel */
|
||||
minLevel = BidiBase.MAX_EXPLICIT_LEVEL + 1;
|
||||
maxLevel = 0;
|
||||
for (start = levels.length; start > 0;) {
|
||||
level = levels[--start];
|
||||
if (level < 0 || level > (BidiBase.MAX_EXPLICIT_LEVEL + 1)) {
|
||||
return null;
|
||||
}
|
||||
if (level < minLevel) {
|
||||
minLevel = level;
|
||||
}
|
||||
if (level > maxLevel) {
|
||||
maxLevel = level;
|
||||
}
|
||||
}
|
||||
pMinLevel[0] = minLevel;
|
||||
pMaxLevel[0] = maxLevel;
|
||||
|
||||
/* initialize the index map */
|
||||
int[] indexMap = new int[levels.length];
|
||||
for (start = levels.length; start > 0;) {
|
||||
--start;
|
||||
indexMap[start] = start;
|
||||
}
|
||||
|
||||
return indexMap;
|
||||
}
|
||||
|
||||
static int[] reorderVisual(byte[] levels) {
|
||||
byte[] aMinLevel = new byte[1];
|
||||
byte[] aMaxLevel = new byte[1];
|
||||
int start, end, limit, temp;
|
||||
byte minLevel, maxLevel;
|
||||
|
||||
int[] indexMap = prepareReorder(levels, aMinLevel, aMaxLevel);
|
||||
if (indexMap == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
minLevel = aMinLevel[0];
|
||||
maxLevel = aMaxLevel[0];
|
||||
|
||||
/* nothing to do? */
|
||||
if (minLevel == maxLevel && (minLevel & 1) == 0) {
|
||||
return indexMap;
|
||||
}
|
||||
|
||||
/* reorder only down to the lowest odd level */
|
||||
minLevel |= 1;
|
||||
|
||||
/* loop maxLevel..minLevel */
|
||||
do {
|
||||
start = 0;
|
||||
|
||||
/* loop for all sequences of levels to reorder at the current maxLevel */
|
||||
for (;;) {
|
||||
/* look for a sequence of levels that are all at >=maxLevel */
|
||||
/* look for the first index of such a sequence */
|
||||
while (start < levels.length && levels[start] < maxLevel) {
|
||||
++start;
|
||||
}
|
||||
if (start >= levels.length) {
|
||||
break; /* no more such runs */
|
||||
}
|
||||
|
||||
/* look for the limit of such a sequence (the index behind it) */
|
||||
for (limit = start; ++limit < levels.length && levels[limit] >= maxLevel;) {
|
||||
}
|
||||
|
||||
/*
|
||||
* Swap the entire interval of indexes from start to limit-1. We don't need to
|
||||
* swap the levels for the purpose of this algorithm: the sequence of levels
|
||||
* that we look at does not move anyway.
|
||||
*/
|
||||
end = limit - 1;
|
||||
while (start < end) {
|
||||
temp = indexMap[start];
|
||||
indexMap[start] = indexMap[end];
|
||||
indexMap[end] = temp;
|
||||
|
||||
++start;
|
||||
--end;
|
||||
}
|
||||
|
||||
if (limit == levels.length) {
|
||||
break; /* no more such sequences */
|
||||
} else {
|
||||
start = limit + 1;
|
||||
}
|
||||
}
|
||||
} while (--maxLevel >= minLevel);
|
||||
|
||||
return indexMap;
|
||||
}
|
||||
|
||||
static int[] getVisualMap(BidiBase bidiBase) {
|
||||
/* fill a visual-to-logical index map using the runs[] */
|
||||
BidiRun[] runs = bidiBase.runs;
|
||||
int logicalStart, visualStart, visualLimit;
|
||||
int allocLength = bidiBase.length > bidiBase.resultLength ? bidiBase.length : bidiBase.resultLength;
|
||||
int[] indexMap = new int[allocLength];
|
||||
|
||||
visualStart = 0;
|
||||
int idx = 0;
|
||||
for (int j = 0; j < bidiBase.runCount; ++j) {
|
||||
logicalStart = runs[j].start;
|
||||
visualLimit = runs[j].limit;
|
||||
if (runs[j].isEvenRun()) {
|
||||
do { /* LTR */
|
||||
indexMap[idx++] = logicalStart++;
|
||||
} while (++visualStart < visualLimit);
|
||||
} else {
|
||||
logicalStart += visualLimit - visualStart; /* logicalLimit */
|
||||
do { /* RTL */
|
||||
indexMap[idx++] = --logicalStart;
|
||||
} while (++visualStart < visualLimit);
|
||||
}
|
||||
/* visualStart==visualLimit; */
|
||||
}
|
||||
|
||||
if (bidiBase.insertPoints.size > 0) {
|
||||
int markFound = 0, runCount = bidiBase.runCount;
|
||||
int insertRemove, i, j, k;
|
||||
runs = bidiBase.runs;
|
||||
/* count all inserted marks */
|
||||
for (i = 0; i < runCount; i++) {
|
||||
insertRemove = runs[i].insertRemove;
|
||||
if ((insertRemove & (BidiBase.LRM_BEFORE | BidiBase.RLM_BEFORE)) > 0) {
|
||||
markFound++;
|
||||
}
|
||||
if ((insertRemove & (BidiBase.LRM_AFTER | BidiBase.RLM_AFTER)) > 0) {
|
||||
markFound++;
|
||||
}
|
||||
}
|
||||
/* move back indexes by number of preceding marks */
|
||||
k = bidiBase.resultLength;
|
||||
for (i = runCount - 1; i >= 0 && markFound > 0; i--) {
|
||||
insertRemove = runs[i].insertRemove;
|
||||
if ((insertRemove & (BidiBase.LRM_AFTER | BidiBase.RLM_AFTER)) > 0) {
|
||||
indexMap[--k] = BidiBase.MAP_NOWHERE;
|
||||
markFound--;
|
||||
}
|
||||
visualStart = i > 0 ? runs[i - 1].limit : 0;
|
||||
for (j = runs[i].limit - 1; j >= visualStart && markFound > 0; j--) {
|
||||
indexMap[--k] = indexMap[j];
|
||||
}
|
||||
if ((insertRemove & (BidiBase.LRM_BEFORE | BidiBase.RLM_BEFORE)) > 0) {
|
||||
indexMap[--k] = BidiBase.MAP_NOWHERE;
|
||||
markFound--;
|
||||
}
|
||||
}
|
||||
} else if (bidiBase.controlCount > 0) {
|
||||
int runCount = bidiBase.runCount, logicalEnd;
|
||||
int insertRemove, length, i, j, k, m;
|
||||
char uchar;
|
||||
boolean evenRun;
|
||||
runs = bidiBase.runs;
|
||||
visualStart = 0;
|
||||
/* move forward indexes by number of preceding controls */
|
||||
k = 0;
|
||||
for (i = 0; i < runCount; i++, visualStart += length) {
|
||||
length = runs[i].limit - visualStart;
|
||||
insertRemove = runs[i].insertRemove;
|
||||
/* if no control found yet, nothing to do in this run */
|
||||
if ((insertRemove == 0) && (k == visualStart)) {
|
||||
k += length;
|
||||
continue;
|
||||
}
|
||||
/* if no control in this run */
|
||||
if (insertRemove == 0) {
|
||||
visualLimit = runs[i].limit;
|
||||
for (j = visualStart; j < visualLimit; j++) {
|
||||
indexMap[k++] = indexMap[j];
|
||||
}
|
||||
continue;
|
||||
}
|
||||
logicalStart = runs[i].start;
|
||||
evenRun = runs[i].isEvenRun();
|
||||
logicalEnd = logicalStart + length - 1;
|
||||
for (j = 0; j < length; j++) {
|
||||
m = evenRun ? logicalStart + j : logicalEnd - j;
|
||||
uchar = bidiBase.text[m];
|
||||
if (!BidiBase.IsBidiControlChar(uchar)) {
|
||||
indexMap[k++] = m;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (allocLength == bidiBase.resultLength) {
|
||||
return indexMap;
|
||||
}
|
||||
int[] newMap = new int[bidiBase.resultLength];
|
||||
System.arraycopy(indexMap, 0, newMap, 0, bidiBase.resultLength);
|
||||
return newMap;
|
||||
}
|
||||
|
||||
}
|
123
sources/main/java/jdk_internal/icu/text/BidiRun.java
Normal file
123
sources/main/java/jdk_internal/icu/text/BidiRun.java
Normal file
@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
/* Written by Simon Montagu, Matitiahu Allouche
|
||||
* (ported from C code written by Markus W. Scherer)
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
/**
|
||||
* A BidiRun represents a sequence of characters at the same embedding level.
|
||||
* The Bidi algorithm decomposes a piece of text into sequences of characters at
|
||||
* the same embedding level, each such sequence is called a "run".
|
||||
*
|
||||
* <p>
|
||||
* A BidiRun represents such a run by storing its essential properties, but does
|
||||
* not duplicate the characters which form the run.
|
||||
*
|
||||
* <p>
|
||||
* The "limit" of the run is the position just after the last
|
||||
* character, i.e., one more than that position.
|
||||
*
|
||||
* <p>
|
||||
* This class has no public constructor, and its members cannot be modified by
|
||||
* users.
|
||||
*
|
||||
* @see com.ibm.icu.text.Bidi
|
||||
*/
|
||||
class BidiRun {
|
||||
|
||||
int start; /* first logical position of the run */
|
||||
int limit; /* last visual position of the run +1 */
|
||||
int insertRemove; /*
|
||||
* if >0, flags for inserting LRM/RLM before/after run, if <0, count of bidi
|
||||
* controls within run
|
||||
*/
|
||||
byte level;
|
||||
|
||||
/*
|
||||
* Default constructor
|
||||
*
|
||||
* Note that members start and limit of a run instance have different meanings
|
||||
* depending whether the run is part of the runs array of a Bidi object, or if
|
||||
* it is a reference returned by getVisualRun() or getLogicalRun(). For a member
|
||||
* of the runs array of a Bidi object, - start is the first logical position of
|
||||
* the run in the source text. - limit is one after the last visual position of
|
||||
* the run. For a reference returned by getLogicalRun() or getVisualRun(), -
|
||||
* start is the first logical position of the run in the source text. - limit is
|
||||
* one after the last logical position of the run.
|
||||
*/
|
||||
BidiRun() {
|
||||
this(0, 0, (byte) 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Constructor
|
||||
*/
|
||||
BidiRun(int start, int limit, byte embeddingLevel) {
|
||||
this.start = start;
|
||||
this.limit = limit;
|
||||
this.level = embeddingLevel;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the content of a BidiRun instance
|
||||
*/
|
||||
void copyFrom(BidiRun run) {
|
||||
this.start = run.start;
|
||||
this.limit = run.limit;
|
||||
this.level = run.level;
|
||||
this.insertRemove = run.insertRemove;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get level of run
|
||||
*/
|
||||
byte getEmbeddingLevel() {
|
||||
return level;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if run level is even
|
||||
*
|
||||
* @return true if the embedding level of this run is even, i.e. it is a
|
||||
* left-to-right run.
|
||||
*/
|
||||
boolean isEvenRun() {
|
||||
return (level & 1) == 0;
|
||||
}
|
||||
|
||||
}
|
425
sources/main/java/jdk_internal/icu/text/BidiWriter.java
Normal file
425
sources/main/java/jdk_internal/icu/text/BidiWriter.java
Normal file
@ -0,0 +1,425 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
/* Written by Simon Montagu, Matitiahu Allouche
|
||||
* (ported from C code written by Markus W. Scherer)
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.icu.lang.UCharacter;
|
||||
|
||||
final class BidiWriter {
|
||||
|
||||
/** Bidi control code points */
|
||||
static final char LRM_CHAR = 0x200e;
|
||||
static final char RLM_CHAR = 0x200f;
|
||||
static final int MASK_R_AL = (1 << UCharacter.RIGHT_TO_LEFT | 1 << UCharacter.RIGHT_TO_LEFT_ARABIC);
|
||||
|
||||
private static boolean IsCombining(int type) {
|
||||
return ((1 << type & (1 << UCharacter.NON_SPACING_MARK | 1 << UCharacter.COMBINING_SPACING_MARK
|
||||
| 1 << UCharacter.ENCLOSING_MARK)) != 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* When we have OUTPUT_REVERSE set on writeReordered(), then we semantically
|
||||
* write RTL runs in reverse and later reverse them again. Instead, we actually
|
||||
* write them in forward order to begin with. However, if the RTL run was to be
|
||||
* mirrored, we need to mirror here now since the implicit second reversal must
|
||||
* not do it. It looks strange to do mirroring in LTR output, but it is only
|
||||
* because we are writing RTL output in reverse.
|
||||
*/
|
||||
private static String doWriteForward(String src, int options) {
|
||||
/* optimize for several combinations of options */
|
||||
switch (options & (BidiBase.REMOVE_BIDI_CONTROLS | BidiBase.DO_MIRRORING)) {
|
||||
case 0: {
|
||||
/* simply return the LTR run */
|
||||
return src;
|
||||
}
|
||||
case BidiBase.DO_MIRRORING: {
|
||||
StringBuffer dest = new StringBuffer(src.length());
|
||||
|
||||
/* do mirroring */
|
||||
int i = 0;
|
||||
int c;
|
||||
|
||||
do {
|
||||
c = UTF16.charAt(src, i);
|
||||
i += UTF16.getCharCount(c);
|
||||
UTF16.append(dest, UCharacter.getMirror(c));
|
||||
} while (i < src.length());
|
||||
return dest.toString();
|
||||
}
|
||||
case BidiBase.REMOVE_BIDI_CONTROLS: {
|
||||
StringBuilder dest = new StringBuilder(src.length());
|
||||
|
||||
/* copy the LTR run and remove any Bidi control characters */
|
||||
int i = 0;
|
||||
char c;
|
||||
do {
|
||||
c = src.charAt(i++);
|
||||
if (!BidiBase.IsBidiControlChar(c)) {
|
||||
dest.append(c);
|
||||
}
|
||||
} while (i < src.length());
|
||||
return dest.toString();
|
||||
}
|
||||
default: {
|
||||
StringBuffer dest = new StringBuffer(src.length());
|
||||
|
||||
/* remove Bidi control characters and do mirroring */
|
||||
int i = 0;
|
||||
int c;
|
||||
do {
|
||||
c = UTF16.charAt(src, i);
|
||||
i += UTF16.getCharCount(c);
|
||||
if (!BidiBase.IsBidiControlChar(c)) {
|
||||
UTF16.append(dest, UCharacter.getMirror(c));
|
||||
}
|
||||
} while (i < src.length());
|
||||
return dest.toString();
|
||||
}
|
||||
} /* end of switch */
|
||||
}
|
||||
|
||||
private static String doWriteForward(char[] text, int start, int limit, int options) {
|
||||
return doWriteForward(new String(text, start, limit - start), options);
|
||||
}
|
||||
|
||||
static String writeReverse(String src, int options) {
|
||||
/*
|
||||
* RTL run -
|
||||
*
|
||||
* RTL runs need to be copied to the destination in reverse order of code
|
||||
* points, not code units, to keep Unicode characters intact.
|
||||
*
|
||||
* The general strategy for this is to read the source text in backward order,
|
||||
* collect all code units for a code point (and optionally following combining
|
||||
* characters, see below), and copy all these code units in ascending order to
|
||||
* the destination for this run.
|
||||
*
|
||||
* Several options request whether combining characters should be kept after
|
||||
* their base characters, whether Bidi control characters should be removed, and
|
||||
* whether characters should be replaced by their mirror-image equivalent
|
||||
* Unicode characters.
|
||||
*/
|
||||
StringBuffer dest = new StringBuffer(src.length());
|
||||
|
||||
/* optimize for several combinations of options */
|
||||
switch (options & (BidiBase.REMOVE_BIDI_CONTROLS | BidiBase.DO_MIRRORING | BidiBase.KEEP_BASE_COMBINING)) {
|
||||
|
||||
case 0:
|
||||
/*
|
||||
* With none of the "complicated" options set, the destination run will have the
|
||||
* same length as the source run, and there is no mirroring and no keeping
|
||||
* combining characters with their base characters.
|
||||
*
|
||||
* XXX: or dest = UTF16.reverse(new StringBuffer(src));
|
||||
*/
|
||||
|
||||
int srcLength = src.length();
|
||||
|
||||
/* preserve character integrity */
|
||||
do {
|
||||
/*
|
||||
* i is always after the last code unit known to need to be kept in this segment
|
||||
*/
|
||||
int i = srcLength;
|
||||
|
||||
/* collect code units for one base character */
|
||||
srcLength -= UTF16.getCharCount(UTF16.charAt(src, srcLength - 1));
|
||||
|
||||
/* copy this base character */
|
||||
dest.append(src.substring(srcLength, i));
|
||||
} while (srcLength > 0);
|
||||
break;
|
||||
|
||||
case BidiBase.KEEP_BASE_COMBINING:
|
||||
/*
|
||||
* Here, too, the destination run will have the same length as the source run,
|
||||
* and there is no mirroring. We do need to keep combining characters with their
|
||||
* base characters.
|
||||
*/
|
||||
srcLength = src.length();
|
||||
|
||||
/* preserve character integrity */
|
||||
do {
|
||||
/*
|
||||
* i is always after the last code unit known to need to be kept in this segment
|
||||
*/
|
||||
int c;
|
||||
int i = srcLength;
|
||||
|
||||
/*
|
||||
* collect code units and modifier letters for one base character
|
||||
*/
|
||||
do {
|
||||
c = UTF16.charAt(src, srcLength - 1);
|
||||
srcLength -= UTF16.getCharCount(c);
|
||||
} while (srcLength > 0 && IsCombining(UCharacter.getType(c)));
|
||||
|
||||
/* copy this "user character" */
|
||||
dest.append(src.substring(srcLength, i));
|
||||
} while (srcLength > 0);
|
||||
break;
|
||||
|
||||
default:
|
||||
/*
|
||||
* With several "complicated" options set, this is the most general and the
|
||||
* slowest copying of an RTL run. We will do mirroring, remove Bidi controls,
|
||||
* and keep combining characters with their base characters as requested.
|
||||
*/
|
||||
srcLength = src.length();
|
||||
|
||||
/* preserve character integrity */
|
||||
do {
|
||||
/*
|
||||
* i is always after the last code unit known to need to be kept in this segment
|
||||
*/
|
||||
int i = srcLength;
|
||||
|
||||
/* collect code units for one base character */
|
||||
int c = UTF16.charAt(src, srcLength - 1);
|
||||
srcLength -= UTF16.getCharCount(c);
|
||||
if ((options & BidiBase.KEEP_BASE_COMBINING) != 0) {
|
||||
/* collect modifier letters for this base character */
|
||||
while (srcLength > 0 && IsCombining(UCharacter.getType(c))) {
|
||||
c = UTF16.charAt(src, srcLength - 1);
|
||||
srcLength -= UTF16.getCharCount(c);
|
||||
}
|
||||
}
|
||||
|
||||
if ((options & BidiBase.REMOVE_BIDI_CONTROLS) != 0 && BidiBase.IsBidiControlChar(c)) {
|
||||
/* do not copy this Bidi control character */
|
||||
continue;
|
||||
}
|
||||
|
||||
/* copy this "user character" */
|
||||
int j = srcLength;
|
||||
if ((options & BidiBase.DO_MIRRORING) != 0) {
|
||||
/* mirror only the base character */
|
||||
c = UCharacter.getMirror(c);
|
||||
UTF16.append(dest, c);
|
||||
j += UTF16.getCharCount(c);
|
||||
}
|
||||
dest.append(src.substring(j, i));
|
||||
} while (srcLength > 0);
|
||||
break;
|
||||
} /* end of switch */
|
||||
|
||||
return dest.toString();
|
||||
}
|
||||
|
||||
static String doWriteReverse(char[] text, int start, int limit, int options) {
|
||||
return writeReverse(new String(text, start, limit - start), options);
|
||||
}
|
||||
|
||||
static String writeReordered(BidiBase bidi, int options) {
|
||||
int run, runCount;
|
||||
StringBuilder dest;
|
||||
char[] text = bidi.text;
|
||||
runCount = bidi.countRuns();
|
||||
|
||||
/*
|
||||
* Option "insert marks" implies BidiBase.INSERT_LRM_FOR_NUMERIC if the
|
||||
* reordering mode (checked below) is appropriate.
|
||||
*/
|
||||
if ((bidi.reorderingOptions & BidiBase.OPTION_INSERT_MARKS) != 0) {
|
||||
options |= BidiBase.INSERT_LRM_FOR_NUMERIC;
|
||||
options &= ~BidiBase.REMOVE_BIDI_CONTROLS;
|
||||
}
|
||||
/*
|
||||
* Option "remove controls" implies BidiBase.REMOVE_BIDI_CONTROLS and cancels
|
||||
* BidiBase.INSERT_LRM_FOR_NUMERIC.
|
||||
*/
|
||||
if ((bidi.reorderingOptions & BidiBase.OPTION_REMOVE_CONTROLS) != 0) {
|
||||
options |= BidiBase.REMOVE_BIDI_CONTROLS;
|
||||
options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
|
||||
}
|
||||
/*
|
||||
* If we do not perform the "inverse Bidi" algorithm, then we don't need to
|
||||
* insert any LRMs, and don't need to test for it.
|
||||
*/
|
||||
if ((bidi.reorderingMode != BidiBase.REORDER_INVERSE_NUMBERS_AS_L)
|
||||
&& (bidi.reorderingMode != BidiBase.REORDER_INVERSE_LIKE_DIRECT)
|
||||
&& (bidi.reorderingMode != BidiBase.REORDER_INVERSE_FOR_NUMBERS_SPECIAL)
|
||||
&& (bidi.reorderingMode != BidiBase.REORDER_RUNS_ONLY)) {
|
||||
options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
|
||||
}
|
||||
dest = new StringBuilder((options & BidiBase.INSERT_LRM_FOR_NUMERIC) != 0 ? bidi.length * 2 : bidi.length);
|
||||
/*
|
||||
* Iterate through all visual runs and copy the run text segments to the
|
||||
* destination, according to the options.
|
||||
*
|
||||
* The tests for where to insert LRMs ignore the fact that there may be BN codes
|
||||
* or non-BMP code points at the beginning and end of a run; they may insert
|
||||
* LRMs unnecessarily but the tests are faster this way (this would have to be
|
||||
* improved for UTF-8).
|
||||
*/
|
||||
if ((options & BidiBase.OUTPUT_REVERSE) == 0) {
|
||||
/* forward output */
|
||||
if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) {
|
||||
/* do not insert Bidi controls */
|
||||
for (run = 0; run < runCount; ++run) {
|
||||
BidiRun bidiRun = bidi.getVisualRun(run);
|
||||
if (bidiRun.isEvenRun()) {
|
||||
dest.append(
|
||||
doWriteForward(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
|
||||
} else {
|
||||
dest.append(doWriteReverse(text, bidiRun.start, bidiRun.limit, options));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* insert Bidi controls for "inverse Bidi" */
|
||||
byte[] dirProps = bidi.dirProps;
|
||||
char uc;
|
||||
int markFlag;
|
||||
|
||||
for (run = 0; run < runCount; ++run) {
|
||||
BidiRun bidiRun = bidi.getVisualRun(run);
|
||||
markFlag = 0;
|
||||
/* check if something relevant in insertPoints */
|
||||
markFlag = bidi.runs[run].insertRemove;
|
||||
if (markFlag < 0) { /* bidi controls count */
|
||||
markFlag = 0;
|
||||
}
|
||||
if (bidiRun.isEvenRun()) {
|
||||
if (bidi.isInverse() && dirProps[bidiRun.start] != BidiBase.L) {
|
||||
markFlag |= BidiBase.LRM_BEFORE;
|
||||
}
|
||||
if ((markFlag & BidiBase.LRM_BEFORE) != 0) {
|
||||
uc = LRM_CHAR;
|
||||
} else if ((markFlag & BidiBase.RLM_BEFORE) != 0) {
|
||||
uc = RLM_CHAR;
|
||||
} else {
|
||||
uc = 0;
|
||||
}
|
||||
if (uc != 0) {
|
||||
dest.append(uc);
|
||||
}
|
||||
dest.append(
|
||||
doWriteForward(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
|
||||
|
||||
if (bidi.isInverse() && dirProps[bidiRun.limit - 1] != BidiBase.L) {
|
||||
markFlag |= BidiBase.LRM_AFTER;
|
||||
}
|
||||
if ((markFlag & BidiBase.LRM_AFTER) != 0) {
|
||||
uc = LRM_CHAR;
|
||||
} else if ((markFlag & BidiBase.RLM_AFTER) != 0) {
|
||||
uc = RLM_CHAR;
|
||||
} else {
|
||||
uc = 0;
|
||||
}
|
||||
if (uc != 0) {
|
||||
dest.append(uc);
|
||||
}
|
||||
} else { /* RTL run */
|
||||
if (bidi.isInverse() && !bidi.testDirPropFlagAt(MASK_R_AL, bidiRun.limit - 1)) {
|
||||
markFlag |= BidiBase.RLM_BEFORE;
|
||||
}
|
||||
if ((markFlag & BidiBase.LRM_BEFORE) != 0) {
|
||||
uc = LRM_CHAR;
|
||||
} else if ((markFlag & BidiBase.RLM_BEFORE) != 0) {
|
||||
uc = RLM_CHAR;
|
||||
} else {
|
||||
uc = 0;
|
||||
}
|
||||
if (uc != 0) {
|
||||
dest.append(uc);
|
||||
}
|
||||
dest.append(doWriteReverse(text, bidiRun.start, bidiRun.limit, options));
|
||||
|
||||
if (bidi.isInverse() && (MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
|
||||
markFlag |= BidiBase.RLM_AFTER;
|
||||
}
|
||||
if ((markFlag & BidiBase.LRM_AFTER) != 0) {
|
||||
uc = LRM_CHAR;
|
||||
} else if ((markFlag & BidiBase.RLM_AFTER) != 0) {
|
||||
uc = RLM_CHAR;
|
||||
} else {
|
||||
uc = 0;
|
||||
}
|
||||
if (uc != 0) {
|
||||
dest.append(uc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* reverse output */
|
||||
if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) {
|
||||
/* do not insert Bidi controls */
|
||||
for (run = runCount; --run >= 0;) {
|
||||
BidiRun bidiRun = bidi.getVisualRun(run);
|
||||
if (bidiRun.isEvenRun()) {
|
||||
dest.append(
|
||||
doWriteReverse(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
|
||||
} else {
|
||||
dest.append(doWriteForward(text, bidiRun.start, bidiRun.limit, options));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* insert Bidi controls for "inverse Bidi" */
|
||||
|
||||
byte[] dirProps = bidi.dirProps;
|
||||
|
||||
for (run = runCount; --run >= 0;) {
|
||||
/* reverse output */
|
||||
BidiRun bidiRun = bidi.getVisualRun(run);
|
||||
if (bidiRun.isEvenRun()) {
|
||||
if (dirProps[bidiRun.limit - 1] != BidiBase.L) {
|
||||
dest.append(LRM_CHAR);
|
||||
}
|
||||
|
||||
dest.append(
|
||||
doWriteReverse(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
|
||||
|
||||
if (dirProps[bidiRun.start] != BidiBase.L) {
|
||||
dest.append(LRM_CHAR);
|
||||
}
|
||||
} else {
|
||||
if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
|
||||
dest.append(RLM_CHAR);
|
||||
}
|
||||
|
||||
dest.append(doWriteForward(text, bidiRun.start, bidiRun.limit, options));
|
||||
|
||||
if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.limit - 1])) == 0) {
|
||||
dest.append(RLM_CHAR);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return dest.toString();
|
||||
}
|
||||
}
|
271
sources/main/java/jdk_internal/icu/text/FilteredNormalizer2.java
Normal file
271
sources/main/java/jdk_internal/icu/text/FilteredNormalizer2.java
Normal file
@ -0,0 +1,271 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Normalization filtered by a UnicodeSet. Normalizes portions of the text
|
||||
* contained in the filter set and leaves portions not contained in the filter
|
||||
* set unchanged. Filtering is done via UnicodeSet.span(...,
|
||||
* UnicodeSet.SpanCondition.SIMPLE). Not-in-the-filter text is treated as "is
|
||||
* normalized" and "quick check yes". This class implements all of (and only)
|
||||
* the Normalizer2 API. An instance of this class is unmodifiable/immutable.
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
* @author Markus W. Scherer
|
||||
*/
|
||||
class FilteredNormalizer2 extends Normalizer2 {
|
||||
|
||||
/**
|
||||
* Constructs a filtered normalizer wrapping any Normalizer2 instance and a
|
||||
* filter set. Both are aliased and must not be modified or deleted while this
|
||||
* object is used. The filter set should be frozen; otherwise the performance
|
||||
* will suffer greatly.
|
||||
*
|
||||
* @param n2 wrapped Normalizer2 instance
|
||||
* @param filterSet UnicodeSet which determines the characters to be normalized
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
|
||||
norm2 = n2;
|
||||
set = filterSet;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public StringBuilder normalize(CharSequence src, StringBuilder dest) {
|
||||
if (dest == src) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
dest.setLength(0);
|
||||
normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
|
||||
return dest;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
@Override
|
||||
public Appendable normalize(CharSequence src, Appendable dest) {
|
||||
if (dest == src) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
|
||||
return normalizeSecondAndAppend(first, second, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public StringBuilder append(StringBuilder first, CharSequence second) {
|
||||
return normalizeSecondAndAppend(first, second, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
@Override
|
||||
public String getDecomposition(int c) {
|
||||
return set.contains(c) ? norm2.getDecomposition(c) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 49
|
||||
*/
|
||||
@Override
|
||||
public int getCombiningClass(int c) {
|
||||
return set.contains(c) ? norm2.getCombiningClass(c) : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public boolean isNormalized(CharSequence s) {
|
||||
UnicodeSet.SpanCondition spanCondition = UnicodeSet.SpanCondition.SIMPLE;
|
||||
for (int prevSpanLimit = 0; prevSpanLimit < s.length();) {
|
||||
int spanLimit = set.span(s, prevSpanLimit, spanCondition);
|
||||
if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
|
||||
spanCondition = UnicodeSet.SpanCondition.SIMPLE;
|
||||
} else {
|
||||
if (!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
|
||||
return false;
|
||||
}
|
||||
spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
|
||||
}
|
||||
prevSpanLimit = spanLimit;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public int spanQuickCheckYes(CharSequence s) {
|
||||
UnicodeSet.SpanCondition spanCondition = UnicodeSet.SpanCondition.SIMPLE;
|
||||
for (int prevSpanLimit = 0; prevSpanLimit < s.length();) {
|
||||
int spanLimit = set.span(s, prevSpanLimit, spanCondition);
|
||||
if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
|
||||
spanCondition = UnicodeSet.SpanCondition.SIMPLE;
|
||||
} else {
|
||||
int yesLimit = prevSpanLimit + norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
|
||||
if (yesLimit < spanLimit) {
|
||||
return yesLimit;
|
||||
}
|
||||
spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
|
||||
}
|
||||
prevSpanLimit = spanLimit;
|
||||
}
|
||||
return s.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public boolean hasBoundaryBefore(int c) {
|
||||
return !set.contains(c) || norm2.hasBoundaryBefore(c);
|
||||
}
|
||||
|
||||
// Internal: No argument checking, and appends to dest.
|
||||
// Pass as input spanCondition the one that is likely to yield a non-zero
|
||||
// span length at the start of src.
|
||||
// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
|
||||
// UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
|
||||
// and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue
|
||||
// after
|
||||
// an in-filter prefix.
|
||||
private Appendable normalize(CharSequence src, Appendable dest, UnicodeSet.SpanCondition spanCondition) {
|
||||
// Don't throw away destination buffer between iterations.
|
||||
StringBuilder tempDest = new StringBuilder();
|
||||
try {
|
||||
for (int prevSpanLimit = 0; prevSpanLimit < src.length();) {
|
||||
int spanLimit = set.span(src, prevSpanLimit, spanCondition);
|
||||
int spanLength = spanLimit - prevSpanLimit;
|
||||
if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
|
||||
if (spanLength != 0) {
|
||||
dest.append(src, prevSpanLimit, spanLimit);
|
||||
}
|
||||
spanCondition = UnicodeSet.SpanCondition.SIMPLE;
|
||||
} else {
|
||||
if (spanLength != 0) {
|
||||
// Not norm2.normalizeSecondAndAppend() because we do not want
|
||||
// to modify the non-filter part of dest.
|
||||
dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
|
||||
}
|
||||
spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
|
||||
}
|
||||
prevSpanLimit = spanLimit;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new InternalError(e.toString(), e);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second, boolean doNormalize) {
|
||||
if (first == second) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
if (first.length() == 0) {
|
||||
if (doNormalize) {
|
||||
return normalize(second, first);
|
||||
} else {
|
||||
return first.append(second);
|
||||
}
|
||||
}
|
||||
// merge the in-filter suffix of the first string with the in-filter prefix of
|
||||
// the second
|
||||
int prefixLimit = set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
|
||||
if (prefixLimit != 0) {
|
||||
CharSequence prefix = second.subSequence(0, prefixLimit);
|
||||
int suffixStart = set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);
|
||||
if (suffixStart == 0) {
|
||||
if (doNormalize) {
|
||||
norm2.normalizeSecondAndAppend(first, prefix);
|
||||
} else {
|
||||
norm2.append(first, prefix);
|
||||
}
|
||||
} else {
|
||||
StringBuilder middle = new StringBuilder(first.subSequence(suffixStart, first.length()));
|
||||
if (doNormalize) {
|
||||
norm2.normalizeSecondAndAppend(middle, prefix);
|
||||
} else {
|
||||
norm2.append(middle, prefix);
|
||||
}
|
||||
first.delete(suffixStart, 0x7fffffff).append(middle);
|
||||
}
|
||||
}
|
||||
if (prefixLimit < second.length()) {
|
||||
CharSequence rest = second.subSequence(prefixLimit, second.length());
|
||||
if (doNormalize) {
|
||||
normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
|
||||
} else {
|
||||
first.append(rest);
|
||||
}
|
||||
}
|
||||
return first;
|
||||
}
|
||||
|
||||
private Normalizer2 norm2;
|
||||
private UnicodeSet set;
|
||||
};
|
288
sources/main/java/jdk_internal/icu/text/Normalizer2.java
Normal file
288
sources/main/java/jdk_internal/icu/text/Normalizer2.java
Normal file
@ -0,0 +1,288 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.icu.impl.Norm2AllModes;
|
||||
|
||||
/**
|
||||
* Unicode normalization functionality for standard Unicode normalization or for
|
||||
* using custom mapping tables. All instances of this class are
|
||||
* unmodifiable/immutable. The Normalizer2 class is not intended for public
|
||||
* subclassing.
|
||||
* <p>
|
||||
* The primary functions are to produce a normalized string and to detect
|
||||
* whether a string is already normalized. The most commonly used normalization
|
||||
* forms are those defined in
|
||||
* <a href="http://www.unicode.org/reports/tr15/">Unicode Standard Annex #15:
|
||||
* Unicode Normalization Forms</a>. However, this API supports additional
|
||||
* normalization forms for specialized purposes. For example, NFKC_Casefold is
|
||||
* provided via getInstance("nfkc_cf", COMPOSE) and can be used in
|
||||
* implementations of UTS #46.
|
||||
* <p>
|
||||
* Not only are the standard compose and decompose modes supplied, but
|
||||
* additional modes are provided as documented in the Mode enum.
|
||||
* <p>
|
||||
* Some of the functions in this class identify normalization boundaries. At a
|
||||
* normalization boundary, the portions of the string before it and starting
|
||||
* from it do not interact and can be handled independently.
|
||||
* <p>
|
||||
* The spanQuickCheckYes() stops at a normalization boundary. When the goal is a
|
||||
* normalized string, then the text before the boundary can be copied, and the
|
||||
* remainder can be processed with normalizeSecondAndAppend().
|
||||
* <p>
|
||||
* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test
|
||||
* whether a character is guaranteed to be at a normalization boundary,
|
||||
* regardless of context. This is used for moving from one normalization
|
||||
* boundary to the next or preceding boundary, and for performing iterative
|
||||
* normalization.
|
||||
* <p>
|
||||
* Iterative normalization is useful when only a small portion of a longer
|
||||
* string needs to be processed. For example, in ICU, iterative normalization is
|
||||
* used by the NormalizationTransliterator (to avoid replacing
|
||||
* already-normalized text) and ucol_nextSortKeyPart() (to process only the
|
||||
* substring for which sort key bytes are computed).
|
||||
* <p>
|
||||
* The set of normalization boundaries returned by these functions may not be
|
||||
* complete: There may be more boundaries that could be returned. Different
|
||||
* functions may return different boundaries.
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
* @author Markus W. Scherer
|
||||
*/
|
||||
public abstract class Normalizer2 {
|
||||
|
||||
/**
|
||||
* Returns a Normalizer2 instance for Unicode NFC normalization. Same as
|
||||
* getInstance(null, "nfc", Mode.COMPOSE). Returns an unmodifiable singleton
|
||||
* instance.
|
||||
*
|
||||
* @return the requested Normalizer2, if successful
|
||||
* @stable ICU 49
|
||||
*/
|
||||
public static Normalizer2 getNFCInstance() {
|
||||
return Norm2AllModes.getNFCInstance().comp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a Normalizer2 instance for Unicode NFD normalization. Same as
|
||||
* getInstance(null, "nfc", Mode.DECOMPOSE). Returns an unmodifiable singleton
|
||||
* instance.
|
||||
*
|
||||
* @return the requested Normalizer2, if successful
|
||||
* @stable ICU 49
|
||||
*/
|
||||
public static Normalizer2 getNFDInstance() {
|
||||
return Norm2AllModes.getNFCInstance().decomp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a Normalizer2 instance for Unicode NFKC normalization. Same as
|
||||
* getInstance(null, "nfkc", Mode.COMPOSE). Returns an unmodifiable singleton
|
||||
* instance.
|
||||
*
|
||||
* @return the requested Normalizer2, if successful
|
||||
* @stable ICU 49
|
||||
*/
|
||||
public static Normalizer2 getNFKCInstance() {
|
||||
return Norm2AllModes.getNFKCInstance().comp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a Normalizer2 instance for Unicode NFKD normalization. Same as
|
||||
* getInstance(null, "nfkc", Mode.DECOMPOSE). Returns an unmodifiable singleton
|
||||
* instance.
|
||||
*
|
||||
* @return the requested Normalizer2, if successful
|
||||
* @stable ICU 49
|
||||
*/
|
||||
public static Normalizer2 getNFKDInstance() {
|
||||
return Norm2AllModes.getNFKCInstance().decomp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the normalized form of the source string.
|
||||
*
|
||||
* @param src source string
|
||||
* @return normalized src
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public String normalize(CharSequence src) {
|
||||
if (src instanceof String) {
|
||||
// Fastpath: Do not construct a new String if the src is a String
|
||||
// and is already normalized.
|
||||
int spanLength = spanQuickCheckYes(src);
|
||||
if (spanLength == src.length()) {
|
||||
return (String) src;
|
||||
}
|
||||
if (spanLength != 0) {
|
||||
StringBuilder sb = new StringBuilder(src.length()).append(src, 0, spanLength);
|
||||
return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
|
||||
}
|
||||
}
|
||||
return normalize(src, new StringBuilder(src.length())).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes the normalized form of the source string to the destination string
|
||||
* (replacing its contents) and returns the destination string. The source and
|
||||
* destination strings must be different objects.
|
||||
*
|
||||
* @param src source string
|
||||
* @param dest destination string; its contents is replaced with normalized src
|
||||
* @return dest
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
|
||||
|
||||
/**
|
||||
* Writes the normalized form of the source string to the destination Appendable
|
||||
* and returns the destination Appendable. The source and destination strings
|
||||
* must be different objects.
|
||||
*
|
||||
* <p>
|
||||
* Any {@link java.io.IOException} is wrapped into a
|
||||
* {@link com.ibm.icu.util.ICUUncheckedIOException}.
|
||||
*
|
||||
* @param src source string
|
||||
* @param dest destination Appendable; gets normalized src appended
|
||||
* @return dest
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
public abstract Appendable normalize(CharSequence src, Appendable dest);
|
||||
|
||||
/**
|
||||
* Appends the normalized form of the second string to the first string (merging
|
||||
* them at the boundary) and returns the first string. The result is normalized
|
||||
* if the first string was normalized. The first and second strings must be
|
||||
* different objects.
|
||||
*
|
||||
* @param first string, should be normalized
|
||||
* @param second string, will be normalized
|
||||
* @return first
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second);
|
||||
|
||||
/**
|
||||
* Appends the second string to the first string (merging them at the boundary)
|
||||
* and returns the first string. The result is normalized if both the strings
|
||||
* were normalized. The first and second strings must be different objects.
|
||||
*
|
||||
* @param first string, should be normalized
|
||||
* @param second string, should be normalized
|
||||
* @return first
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract StringBuilder append(StringBuilder first, CharSequence second);
|
||||
|
||||
/**
|
||||
* Gets the decomposition mapping of c. Roughly equivalent to normalizing the
|
||||
* String form of c on a DECOMPOSE Normalizer2 instance, but much faster, and
|
||||
* except that this function returns null if c does not have a decomposition
|
||||
* mapping in this instance's data. This function is independent of the mode of
|
||||
* the Normalizer2.
|
||||
*
|
||||
* @param c code point
|
||||
* @return c's decomposition mapping, if any; otherwise null
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
public abstract String getDecomposition(int c);
|
||||
|
||||
/**
|
||||
* Gets the combining class of c. The default implementation returns 0 but all
|
||||
* standard implementations return the Unicode Canonical_Combining_Class value.
|
||||
*
|
||||
* @param c code point
|
||||
* @return c's combining class
|
||||
* @stable ICU 49
|
||||
*/
|
||||
public int getCombiningClass(int c) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized. Internally, in cases where the
|
||||
* quickCheck() method would return "maybe" (which is only possible for the two
|
||||
* COMPOSE modes) this method resolves to "yes" or "no" to provide a definitive
|
||||
* result, at the cost of doing more work in those cases.
|
||||
*
|
||||
* @param s input string
|
||||
* @return true if s is normalized
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract boolean isNormalized(CharSequence s);
|
||||
|
||||
/**
|
||||
* Returns the end of the normalized substring of the input string. In other
|
||||
* words, with <code>end=spanQuickCheckYes(s);</code> the substring
|
||||
* <code>s.subSequence(0, end)</code> will pass the quick check with a "yes"
|
||||
* result.
|
||||
* <p>
|
||||
* The returned end index is usually one or more characters before the "no" or
|
||||
* "maybe" character: The end index is at a normalization boundary. (See the
|
||||
* class documentation for more about normalization boundaries.)
|
||||
* <p>
|
||||
* When the goal is a normalized string and most input strings are expected to
|
||||
* be normalized already, then call this method, and if it returns a prefix
|
||||
* shorter than the input string, copy that prefix and use
|
||||
* normalizeSecondAndAppend() for the remainder.
|
||||
*
|
||||
* @param s input string
|
||||
* @return "yes" span end index
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract int spanQuickCheckYes(CharSequence s);
|
||||
|
||||
/**
|
||||
* Tests if the character always has a normalization boundary before it,
|
||||
* regardless of context. If true, then the character does not
|
||||
* normalization-interact with preceding characters. In other words, a string
|
||||
* containing this character can be normalized by processing portions before
|
||||
* this character and starting from this character independently. This is used
|
||||
* for iterative normalization. See the class documentation for details.
|
||||
*
|
||||
* @param c character to test
|
||||
* @return true if c has a normalization boundary before it
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract boolean hasBoundaryBefore(int c);
|
||||
|
||||
/**
|
||||
* Sole constructor. (For invocation by subclass constructors, typically
|
||||
* implicit.)
|
||||
*
|
||||
* @internal deprecated This API is ICU internal only.
|
||||
*/
|
||||
protected Normalizer2() {
|
||||
}
|
||||
}
|
791
sources/main/java/jdk_internal/icu/text/NormalizerBase.java
Normal file
791
sources/main/java/jdk_internal/icu/text/NormalizerBase.java
Normal file
@ -0,0 +1,791 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2000-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.bidi.CharacterIterator;
|
||||
import jdk_internal.bidi.Normalizer;
|
||||
import jdk_internal.icu.impl.Norm2AllModes;
|
||||
|
||||
/**
|
||||
* Unicode Normalization
|
||||
*
|
||||
* <h2>Unicode normalization API</h2>
|
||||
*
|
||||
* <code>normalize</code> transforms Unicode text into an equivalent composed or
|
||||
* decomposed form, allowing for easier sorting and searching of text.
|
||||
* <code>normalize</code> supports the standard normalization forms described in
|
||||
* <a href="http://www.unicode.org/reports/tr15/" target="unicode"> Unicode
|
||||
* Standard Annex #15 — Unicode Normalization Forms</a>.
|
||||
*
|
||||
* Characters with accents or other adornments can be encoded in several
|
||||
* different ways in Unicode. For example, take the character A-acute. In
|
||||
* Unicode, this can be encoded as a single character (the "composed" form):
|
||||
*
|
||||
* <pre>
|
||||
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
|
||||
* </pre>
|
||||
*
|
||||
* or as two separate characters (the "decomposed" form):
|
||||
*
|
||||
* <pre>
|
||||
* 0041 LATIN CAPITAL LETTER A
|
||||
* 0301 COMBINING ACUTE ACCENT
|
||||
* </pre>
|
||||
*
|
||||
* To a user of your program, however, both of these sequences should be treated
|
||||
* as the same "user-level" character "A with acute accent". When you are
|
||||
* searching or comparing text, you must ensure that these two sequences are
|
||||
* treated equivalently. In addition, you must handle characters with more than
|
||||
* one accent. Sometimes the order of a character's combining accents is
|
||||
* significant, while in other cases accent sequences in different orders are
|
||||
* really equivalent.
|
||||
*
|
||||
* Similarly, the string "ffi" can be encoded as three separate letters:
|
||||
*
|
||||
* <pre>
|
||||
* 0066 LATIN SMALL LETTER F
|
||||
* 0066 LATIN SMALL LETTER F
|
||||
* 0069 LATIN SMALL LETTER I
|
||||
* </pre>
|
||||
*
|
||||
* or as the single character
|
||||
*
|
||||
* <pre>
|
||||
* FB03 LATIN SMALL LIGATURE FFI
|
||||
* </pre>
|
||||
*
|
||||
* The ffi ligature is not a distinct semantic character, and strictly speaking
|
||||
* it shouldn't be in Unicode at all, but it was included for compatibility with
|
||||
* existing character sets that already provided it. The Unicode standard
|
||||
* identifies such characters by giving them "compatibility" decompositions into
|
||||
* the corresponding semantic characters. When sorting and searching, you will
|
||||
* often want to use these mappings.
|
||||
*
|
||||
* <code>normalize</code> helps solve these problems by transforming text into
|
||||
* the canonical composed and decomposed forms as shown in the first example
|
||||
* above. In addition, you can have it perform compatibility decompositions so
|
||||
* that you can treat compatibility characters the same as their equivalents.
|
||||
* Finally, <code>normalize</code> rearranges accents into the proper canonical
|
||||
* order, so that you do not have to worry about accent rearrangement on your
|
||||
* own.
|
||||
*
|
||||
* Form FCD, "Fast C or D", is also designed for collation. It allows working on
|
||||
* strings that are not necessarily normalized with an algorithm (like in
|
||||
* collation) that works under "canonical closure", i.e., it treats precomposed
|
||||
* characters and their decomposed equivalents the same.
|
||||
*
|
||||
* It is not a normalization form because it does not provide for uniqueness of
|
||||
* representation. Multiple strings may be canonically equivalent (their NFDs
|
||||
* are identical) and may all conform to FCD without being identical themselves.
|
||||
*
|
||||
* The form is defined such that the "raw decomposition", the recursive
|
||||
* canonical decomposition of each character, results in a string that is
|
||||
* canonically ordered. This means that precomposed characters are allowed for
|
||||
* as long as their decompositions do not need canonical reordering.
|
||||
*
|
||||
* Its advantage for a process like collation is that all NFD and most NFC texts
|
||||
* - and many unnormalized texts - already conform to FCD and do not need to be
|
||||
* normalized (NFD) for such a process. The FCD quick check will return YES for
|
||||
* most strings in practice.
|
||||
*
|
||||
* normalize(FCD) may be implemented with NFD.
|
||||
*
|
||||
* For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence
|
||||
* in Applications): http://www.unicode.org/notes/tn5/#FCD
|
||||
*
|
||||
* ICU collation performs either NFD or FCD normalization automatically if
|
||||
* normalization is turned on for the collator object. Beyond collation and
|
||||
* string search, normalized strings may be useful for string equivalence
|
||||
* comparisons, transliteration/transcription, unique representations, etc.
|
||||
*
|
||||
* The W3C generally recommends exchanging texts in NFC. Note also that most
|
||||
* legacy character encodings use only precomposed forms and often do not encode
|
||||
* any combining marks by themselves. For conversion to such character encodings
|
||||
* the Unicode text needs to be normalized to NFC. For more usage examples, see
|
||||
* the Unicode Standard Annex.
|
||||
*
|
||||
* Note: The Normalizer class also provides API for iterative normalization.
|
||||
* While the setIndex() and getIndex() refer to indices in the underlying
|
||||
* Unicode input text, the next() and previous() methods iterate through
|
||||
* characters in the normalized output. This means that there is not necessarily
|
||||
* a one-to-one correspondence between characters returned by next() and
|
||||
* previous() and the indices passed to and returned from setIndex() and
|
||||
* getIndex(). It is for this reason that Normalizer does not implement the
|
||||
* CharacterIterator interface.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
// Original filename in ICU4J: Normalizer.java
|
||||
public final class NormalizerBase implements Cloneable {
|
||||
|
||||
// The input text and our position in it
|
||||
private UCharacterIterator text;
|
||||
private Normalizer2 norm2;
|
||||
private Mode mode;
|
||||
private int options;
|
||||
|
||||
// The normalization buffer is the result of normalization
|
||||
// of the source in [currentIndex..nextIndex].
|
||||
private int currentIndex;
|
||||
private int nextIndex;
|
||||
|
||||
// A buffer for holding intermediate results
|
||||
private StringBuilder buffer;
|
||||
private int bufferPos;
|
||||
|
||||
// Helper classes to defer loading of normalization data.
|
||||
private static final class ModeImpl {
|
||||
private ModeImpl(Normalizer2 n2) {
|
||||
normalizer2 = n2;
|
||||
}
|
||||
|
||||
private final Normalizer2 normalizer2;
|
||||
}
|
||||
|
||||
// Holder for the NFD ModeImpl: the instance is created in this class's static
// initializer, so the NFD data is only requested once this holder is first used.
private static final class NFDModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
|
||||
}
|
||||
|
||||
// Holder for the NFKD ModeImpl: the instance is created in this class's static
// initializer, so the NFKD data is only requested once this holder is first used.
private static final class NFKDModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
|
||||
}
|
||||
|
||||
// Holder for the NFC ModeImpl: the instance is created in this class's static
// initializer, so the NFC data is only requested once this holder is first used.
private static final class NFCModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
|
||||
}
|
||||
|
||||
// Holder for the NFKC ModeImpl: the instance is created in this class's static
// initializer, so the NFKC data is only requested once this holder is first used.
private static final class NFKCModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
|
||||
}
|
||||
|
||||
// Holder for the frozen UnicodeSet built from the pattern "[:age=3.2:]".
// The *32ModeImpl holders in this file pass it to FilteredNormalizer2 as the filter set.
private static final class Unicode32 {
|
||||
private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
|
||||
}
|
||||
|
||||
// Holder for the Unicode 3.2 flavour of NFD: the NFD normalizer wrapped in a
// FilteredNormalizer2 together with the Unicode32 set. Created on first use of this holder.
private static final class NFD32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFDInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
// Holder for the Unicode 3.2 flavour of NFKD: the NFKD normalizer wrapped in a
// FilteredNormalizer2 together with the Unicode32 set. Created on first use of this holder.
private static final class NFKD32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFKDInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
// Holder for the Unicode 3.2 flavour of NFC: the NFC normalizer wrapped in a
// FilteredNormalizer2 together with the Unicode32 set. Created on first use of this holder.
private static final class NFC32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFCInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
// Holder for the Unicode 3.2 flavour of NFKC: the NFKC normalizer wrapped in a
// FilteredNormalizer2 together with the Unicode32 set. Created on first use of this holder.
private static final class NFKC32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFKCInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Options bit set value to select Unicode 3.2 normalization (except
|
||||
* NormalizationCorrections). At most one Unicode version can be selected at a
|
||||
* time.
|
||||
*
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final int UNICODE_3_2 = 0x20;
|
||||
|
||||
public static final int UNICODE_3_2_0_ORIGINAL = UNICODE_3_2;
|
||||
|
||||
/*
|
||||
* Default option for the latest Unicode normalization. This option is provided
|
||||
* mainly for testing. The value zero means that normalization is done with the
|
||||
* fixes for - Corrigendum 4 (Five CJK Canonical Mapping Errors) - Corrigendum 5
|
||||
* (Normalization Idempotency)
|
||||
*/
|
||||
public static final int UNICODE_LATEST = 0x00;
|
||||
|
||||
/**
|
||||
* Constant indicating that the end of the iteration has been reached. This is
|
||||
* guaranteed to have the same value as {@link UCharacterIterator#DONE}.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final int DONE = UCharacterIterator.DONE;
|
||||
|
||||
/**
|
||||
* Constants for normalization modes.
|
||||
* <p>
|
||||
* The Mode class is not intended for public subclassing. Only the Mode
|
||||
* constants provided by the Normalizer class should be used, and any fields or
|
||||
* methods should not be called or overridden by users.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public abstract static class Mode {
|
||||
|
||||
/**
|
||||
* Sole constructor. Takes no arguments and performs no work.
|
||||
*
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected Mode() {
|
||||
}
|
||||
|
||||
/**
* Returns the Normalizer2 that implements this mode for the given options.
* The concrete modes in this file that inspect {@code options} test only the
* {@link #UNICODE_3_2} bit.
*
* @param options bit set of normalization options
* @return the Normalizer2 to use for this mode
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected abstract Normalizer2 getNormalizer2(int options);
|
||||
}
|
||||
|
||||
private static Mode toMode(Normalizer.Form form) {
|
||||
switch (form) {
|
||||
case NFC:
|
||||
return NFC;
|
||||
case NFD:
|
||||
return NFD;
|
||||
case NFKC:
|
||||
return NFKC;
|
||||
case NFKD:
|
||||
return NFKD;
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("Unexpected normalization form: " + form);
|
||||
}
|
||||
|
||||
private static final class NONEMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return Norm2AllModes.NOOP_NORMALIZER2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFDMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFKDMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFKD32ModeImpl.INSTANCE.normalizer2
|
||||
: NFKDModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFCMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFKCMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFKC32ModeImpl.INSTANCE.normalizer2
|
||||
: NFKCModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* No decomposition/composition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NONE = new NONEMode();
|
||||
|
||||
/**
|
||||
* Canonical decomposition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NFD = new NFDMode();
|
||||
|
||||
/**
|
||||
* Compatibility decomposition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NFKD = new NFKDMode();
|
||||
|
||||
/**
|
||||
* Canonical decomposition followed by canonical composition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NFC = new NFCMode();
|
||||
|
||||
/**
 * Compatibility decomposition followed by canonical composition.
 */
public static final Mode NFKC = new NFKCMode();
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Iterator constructors
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Creates a new {@code NormalizerBase} object for iterating over the normalized
|
||||
* form of a given string.
|
||||
* <p>
|
||||
* The {@code options} parameter specifies which optional {@code NormalizerBase}
|
||||
* features are to be enabled for this object.
|
||||
* <p>
|
||||
*
|
||||
* @param str The string to be normalized. The normalization will start at the
|
||||
* beginning of the string.
|
||||
*
|
||||
* @param mode The normalization mode.
|
||||
*
|
||||
* @param opt Any optional features to be enabled. Currently the only available
|
||||
* option is {@link #UNICODE_3_2}. If you want the default behavior
|
||||
* corresponding to one of the standard Unicode Normalization Forms,
|
||||
* use 0 for this argument.
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public NormalizerBase(String str, Mode mode, int opt) {
    // Wrap the input first, then resolve the normalizer for the mode/options pair.
    text = UCharacterIterator.getInstance(str);
    norm2 = mode.getNormalizer2(opt);
    this.mode = mode;
    options = opt;
    // Starts out empty; filled while iterating.
    buffer = new StringBuilder();
}
|
||||
|
||||
/**
 * Creates a new {@code NormalizerBase} over a string with no optional features
 * enabled; same as passing 0 as the options argument of the three-argument
 * constructor.
 *
 * @param str  The string to be normalized.
 * @param mode The normalization mode.
 */
public NormalizerBase(String str, Mode mode) {
    this(str, mode, UNICODE_LATEST);
}
|
||||
|
||||
/**
|
||||
* Creates a new {@code NormalizerBase} object for iterating over the normalized
|
||||
* form of the given text.
|
||||
* <p>
|
||||
*
|
||||
* @param iter The input text to be normalized. The normalization will start at
|
||||
* the beginning of the string.
|
||||
*
|
||||
* @param mode The normalization mode.
|
||||
*
|
||||
* @param opt Any optional features to be enabled. Currently the only available
|
||||
* option is {@link #UNICODE_3_2}. If you want the default behavior
|
||||
* corresponding to one of the standard Unicode Normalization Forms,
|
||||
* use 0 for this argument.
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
    // Work on a clone so that the caller's iterator is never touched.
    CharacterIterator privateCopy = (CharacterIterator) iter.clone();
    text = UCharacterIterator.getInstance(privateCopy);
    norm2 = mode.getNormalizer2(opt);
    this.mode = mode;
    options = opt;
    // Starts out empty; filled while iterating.
    buffer = new StringBuilder();
}
|
||||
|
||||
/**
 * Creates a new {@code NormalizerBase} over the given text with no optional
 * features enabled; same as passing 0 as the options argument of the
 * three-argument constructor.
 *
 * @param iter The input text to be normalized.
 * @param mode The normalization mode.
 */
public NormalizerBase(CharacterIterator iter, Mode mode) {
    this(iter, mode, UNICODE_LATEST);
}
|
||||
|
||||
/**
|
||||
* Clones this {@code NormalizerBase} object. All properties of this object are
|
||||
* duplicated in the new object, including the cloning of any
|
||||
* {@link CharacterIterator} that was passed in to the constructor or to
|
||||
* {@link #setText(CharacterIterator) setText}. However, the text storage
|
||||
* underlying the {@code CharacterIterator} is not duplicated unless the
|
||||
* iterator's {@code clone} method does so.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public Object clone() {
|
||||
try {
|
||||
NormalizerBase copy = (NormalizerBase) super.clone();
|
||||
copy.text = (UCharacterIterator) text.clone();
|
||||
copy.mode = mode;
|
||||
copy.options = options;
|
||||
copy.norm2 = norm2;
|
||||
copy.buffer = new StringBuilder(buffer);
|
||||
copy.bufferPos = bufferPos;
|
||||
copy.currentIndex = currentIndex;
|
||||
copy.nextIndex = nextIndex;
|
||||
return copy;
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new InternalError(e.toString(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes a {@code String} using the given normalization operation.
|
||||
* <p>
|
||||
* The {@code options} parameter specifies which optional {@code NormalizerBase}
|
||||
* features are to be enabled for this operation. Currently the only available
|
||||
* option is {@link #UNICODE_3_2}. If you want the default behavior
|
||||
* corresponding to one of the standard Unicode Normalization Forms, use 0 for
|
||||
* this argument.
|
||||
* <p>
|
||||
*
|
||||
* @param str the input string to be normalized.
|
||||
* @param mode the normalization mode
|
||||
* @param options the optional features to be enabled.
|
||||
* @return String the normalized string
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static String normalize(String str, Mode mode, int options) {
|
||||
return mode.getNormalizer2(options).normalize(str);
|
||||
}
|
||||
|
||||
public static String normalize(String str, Normalizer.Form form) {
|
||||
return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST);
|
||||
}
|
||||
|
||||
public static String normalize(String str, Normalizer.Form form, int options) {
|
||||
return NormalizerBase.normalize(str, toMode(form), options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test if a string is in a given normalization form. This is semantically
|
||||
* equivalent to source.equals(normalize(source, mode)).
|
||||
*
|
||||
* Unlike quickCheck(), this function returns a definitive result, never a
|
||||
* "maybe". For NFD, NFKD, and FCD, both functions work exactly the same. For
|
||||
* NFC and NFKC where quickCheck may return "maybe", this function will perform
|
||||
* further tests to arrive at a true/false result.
|
||||
*
|
||||
* @param str the input string to be checked to see if it is normalized
|
||||
* @param mode the normalization mode
|
||||
* @param options Options for use with exclusion set and tailored Normalization
|
||||
* The only option that is currently recognized is UNICODE_3_2
|
||||
* @see #isNormalized
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static boolean isNormalized(String str, Mode mode, int options) {
|
||||
return mode.getNormalizer2(options).isNormalized(str);
|
||||
}
|
||||
|
||||
public static boolean isNormalized(String str, Normalizer.Form form) {
|
||||
return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST);
|
||||
}
|
||||
|
||||
public static boolean isNormalized(String str, Normalizer.Form form, int options) {
|
||||
return NormalizerBase.isNormalized(str, toMode(form), options);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Iteration API
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return the current character in the normalized text.
|
||||
*
|
||||
* @return The codepoint as an int
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int current() {
|
||||
if (bufferPos < buffer.length() || nextNormalize()) {
|
||||
return buffer.codePointAt(bufferPos);
|
||||
} else {
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the next character in the normalized text and advance the iteration
|
||||
* position by one. If the end of the text has already been reached,
|
||||
* {@link #DONE} is returned.
|
||||
*
|
||||
* @return The codepoint as an int
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int next() {
|
||||
if (bufferPos < buffer.length() || nextNormalize()) {
|
||||
int c = buffer.codePointAt(bufferPos);
|
||||
bufferPos += Character.charCount(c);
|
||||
return c;
|
||||
} else {
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the previous character in the normalized text and decrement the
|
||||
* iteration position by one. If the beginning of the text has already been
|
||||
* reached, {@link #DONE} is returned.
|
||||
*
|
||||
* @return The codepoint as an int
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int previous() {
|
||||
if (bufferPos > 0 || previousNormalize()) {
|
||||
int c = buffer.codePointBefore(bufferPos);
|
||||
bufferPos -= Character.charCount(c);
|
||||
return c;
|
||||
} else {
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset the index to the beginning of the text. This is equivalent to
|
||||
* setIndexOnly(startIndex)).
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void reset() {
|
||||
text.setIndex(0);
|
||||
currentIndex = nextIndex = 0;
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the iteration position in the input text that is being normalized,
|
||||
* without any immediate normalization. After setIndexOnly(), getIndex() will
|
||||
* return the same index that is specified here.
|
||||
*
|
||||
* @param index the desired index in the input text.
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setIndexOnly(int index) {
|
||||
text.setIndex(index); // validates index
|
||||
currentIndex = nextIndex = index;
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the iteration position in the input text that is being normalized and
|
||||
* return the first normalized character at that position.
|
||||
* <p>
|
||||
* <b>Note:</b> This method sets the position in the <em>input</em> text, while
|
||||
* {@link #next} and {@link #previous} iterate through characters in the
|
||||
* normalized <em>output</em>. This means that there is not necessarily a
|
||||
* one-to-one correspondence between characters returned by {@code next} and
|
||||
* {@code previous} and the indices passed to and returned from {@code setIndex}
|
||||
* and {@link #getIndex}.
|
||||
* <p>
|
||||
*
|
||||
* @param index the desired index in the input text.
|
||||
*
|
||||
* @return the first normalized character that is the result of iterating
|
||||
* forward starting at the given index.
|
||||
*
|
||||
* @throws IllegalArgumentException if the given index is less than
|
||||
* {@link #getBeginIndex} or greater than
|
||||
* {@link #getEndIndex}. deprecated ICU 3.2
|
||||
* @obsolete ICU 3.2
|
||||
*/
|
||||
public int setIndex(int index) {
|
||||
setIndexOnly(index);
|
||||
return current();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the start of the input text. This is the begin index of
|
||||
* the {@code CharacterIterator} or the start (i.e. 0) of the {@code String}
|
||||
* over which this {@code NormalizerBase} is iterating
|
||||
*
|
||||
* @deprecated ICU 2.2. Use startIndex() instead.
|
||||
* @return The codepoint as an int
|
||||
* @see #startIndex
|
||||
*/
|
||||
@Deprecated
|
||||
public int getBeginIndex() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the end of the input text. This is the end index of the
|
||||
* {@code CharacterIterator} or the length of the {@code String} over which this
|
||||
* {@code NormalizerBase} is iterating
|
||||
*
|
||||
* @deprecated ICU 2.2. Use endIndex() instead.
|
||||
* @return The codepoint as an int
|
||||
* @see #endIndex
|
||||
*/
|
||||
@Deprecated
|
||||
public int getEndIndex() {
|
||||
return endIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the current iteration position in the input text that is being
|
||||
* normalized. This method is useful in applications such as searching, where
|
||||
* you need to be able to determine the position in the input text that
|
||||
* corresponds to a given normalized output character.
|
||||
* <p>
|
||||
* <b>Note:</b> This method sets the position in the <em>input</em>, while
|
||||
* {@link #next} and {@link #previous} iterate through characters in the
|
||||
* <em>output</em>. This means that there is not necessarily a one-to-one
|
||||
* correspondence between characters returned by {@code next} and
|
||||
* {@code previous} and the indices passed to and returned from {@code setIndex}
|
||||
* and {@link #getIndex}.
|
||||
*
|
||||
* @return The current iteration position
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int getIndex() {
|
||||
if (bufferPos < buffer.length()) {
|
||||
return currentIndex;
|
||||
} else {
|
||||
return nextIndex;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the end of the input text. This is the end index of the
|
||||
* {@code CharacterIterator} or the length of the {@code String} over which this
|
||||
* {@code NormalizerBase} is iterating
|
||||
*
|
||||
* @return The current iteration position
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int endIndex() {
|
||||
return text.getLength();
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Iterator attributes
|
||||
// -------------------------------------------------------------------------
|
||||
/**
|
||||
* Set the normalization mode for this object.
|
||||
* <p>
|
||||
* <b>Note:</b>If the normalization mode is changed while iterating over a
|
||||
* string, calls to {@link #next} and {@link #previous} may return previously
|
||||
* buffers characters in the old normalization mode until the iteration is able
|
||||
* to re-sync at the next base character. It is safest to call {@link #setText
|
||||
* setText()}, {@link #first}, {@link #last}, etc. after calling
|
||||
* {@code setMode}.
|
||||
* <p>
|
||||
*
|
||||
* @param newMode the new mode for this {@code NormalizerBase}. The supported
|
||||
* modes are:
|
||||
* <ul>
|
||||
* <li>{@link #NFC} - Unicode canonical decompositiion followed
|
||||
* by canonical composition.
|
||||
* <li>{@link #NFKC} - Unicode compatibility decompositiion
|
||||
* follwed by canonical composition.
|
||||
* <li>{@link #NFD} - Unicode canonical decomposition
|
||||
* <li>{@link #NFKD} - Unicode compatibility decomposition.
|
||||
* <li>{@link #NONE} - Do nothing but return characters from the
|
||||
* underlying input text.
|
||||
* </ul>
|
||||
*
|
||||
* @see #getMode
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setMode(Mode newMode) {
|
||||
mode = newMode;
|
||||
norm2 = mode.getNormalizer2(options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the basic operation performed by this {@code NormalizerBase}
|
||||
*
|
||||
* @see #setMode
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public Mode getMode() {
|
||||
return mode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the input text over which this {@code NormalizerBase} will iterate. The
|
||||
* iteration position is set to the beginning of the input text.
|
||||
*
|
||||
* @param newText The new string to be normalized.
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setText(String newText) {
|
||||
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
|
||||
if (newIter == null) {
|
||||
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
||||
}
|
||||
text = newIter;
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the input text over which this {@code NormalizerBase} will iterate. The
|
||||
* iteration position is set to the beginning of the input text.
|
||||
*
|
||||
* @param newText The new string to be normalized.
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setText(CharacterIterator newText) {
|
||||
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
|
||||
if (newIter == null) {
|
||||
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
||||
}
|
||||
text = newIter;
|
||||
currentIndex = nextIndex = 0;
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
private void clearBuffer() {
|
||||
buffer.setLength(0);
|
||||
bufferPos = 0;
|
||||
}
|
||||
|
||||
private boolean nextNormalize() {
|
||||
clearBuffer();
|
||||
currentIndex = nextIndex;
|
||||
text.setIndex(nextIndex);
|
||||
// Skip at least one character so we make progress.
|
||||
int c = text.nextCodePoint();
|
||||
if (c < 0) {
|
||||
return false;
|
||||
}
|
||||
StringBuilder segment = new StringBuilder().appendCodePoint(c);
|
||||
while ((c = text.nextCodePoint()) >= 0) {
|
||||
if (norm2.hasBoundaryBefore(c)) {
|
||||
text.moveCodePointIndex(-1);
|
||||
break;
|
||||
}
|
||||
segment.appendCodePoint(c);
|
||||
}
|
||||
nextIndex = text.getIndex();
|
||||
norm2.normalize(segment, buffer);
|
||||
return buffer.length() != 0;
|
||||
}
|
||||
|
||||
private boolean previousNormalize() {
|
||||
clearBuffer();
|
||||
nextIndex = currentIndex;
|
||||
text.setIndex(currentIndex);
|
||||
StringBuilder segment = new StringBuilder();
|
||||
int c;
|
||||
while ((c = text.previousCodePoint()) >= 0) {
|
||||
if (c <= 0xffff) {
|
||||
segment.insert(0, (char) c);
|
||||
} else {
|
||||
segment.insert(0, Character.toChars(c));
|
||||
}
|
||||
if (norm2.hasBoundaryBefore(c)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
currentIndex = text.getIndex();
|
||||
norm2.normalize(segment, buffer);
|
||||
bufferPos = buffer.length();
|
||||
return buffer.length() != 0;
|
||||
}
|
||||
|
||||
}
|
124
sources/main/java/jdk_internal/icu/text/Replaceable.java
Normal file
124
sources/main/java/jdk_internal/icu/text/Replaceable.java
Normal file
@ -0,0 +1,124 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
/**
 * <code>Replaceable</code> is an interface representing a string of characters
 * that supports the replacement of a range of itself with a new string of
 * characters. It is used by APIs that change a piece of text while retaining
 * metadata. Metadata is data other than the Unicode characters themselves. One
 * example of metadata is style attributes; another is an edit history, marking
 * each character with an author and revision number.
 *
 * <p>
 * An implicit aspect of the <code>Replaceable</code> API is that during a
 * replace operation, new characters take on the metadata of the old characters.
 * For example, if the string "the <b>bold</b> font" has range (4, 8) replaced
 * with "strong", then it becomes "the <b>strong</b> font".
 *
 * <p>
 * <code>Replaceable</code> specifies ranges using a start offset and a limit
 * offset. The range of characters thus specified includes the characters at
 * offset start..limit-1. That is, the start offset is inclusive, and the limit
 * offset is exclusive.
 *
 * <p>
 * As declared here, the interface exposes only read access to the characters
 * in the string: <code>length()</code>, <code>charAt()</code>, and
 * <code>getChars()</code>. No replace operation is declared in this version.
 *
 * <p>
 * For an implementation that does offer a replace operation and supports
 * metadata, typical behavior of that operation is the following:
 * <ul>
 * <li>Set the metadata of the new text to the metadata of the first character
 * replaced</li>
 * <li>If no characters are replaced, use the metadata of the previous
 * character</li>
 * <li>If there is no previous character (i.e. start == 0), use the following
 * character</li>
 * <li>If there is no following character (i.e. the replaceable was empty), use
 * default metadata</li>
 * <li>If the code point U+FFFF is seen, it should be interpreted as a special
 * marker having no metadata</li>
 * </ul>
 * If this is not the behavior, the implementation should document any
 * differences.
 *
 * <p>
 * Copyright © IBM Corporation 1999. All rights reserved.
 *
 * @author Alan Liu
 * @stable ICU 2.0
 */
public interface Replaceable {
    /**
     * Returns the number of 16-bit code units in the text.
     *
     * @return number of 16-bit code units in text
     * @stable ICU 2.0
     */
    int length();

    /**
     * Returns the 16-bit code unit at the given offset into the text.
     *
     * @param offset an integer between 0 and <code>length()</code>-1 inclusive
     * @return 16-bit code unit of text at given offset
     * @stable ICU 2.0
     */
    char charAt(int offset);

    /**
     * Copies characters from this object into the destination character array. The
     * first character to be copied is at index <code>srcStart</code>; the last
     * character to be copied is at index <code>srcLimit-1</code> (thus the total
     * number of characters to be copied is <code>srcLimit-srcStart</code>). The
     * characters are copied into the subarray of <code>dst</code> starting at index
     * <code>dstStart</code> and ending at index
     * <code>dstStart + (srcLimit-srcStart) - 1</code>.
     *
     * @param srcStart the beginning index to copy, inclusive;
     *                 {@code 0 <= srcStart <= srcLimit}.
     * @param srcLimit the ending index to copy, exclusive;
     *                 {@code srcStart <= srcLimit <= length()}.
     * @param dst      the destination array.
     * @param dstStart the start offset in the destination array.
     * @stable ICU 2.0
     */
    void getChars(int srcStart, int srcLimit, char[] dst, int dstStart);
}
|
121
sources/main/java/jdk_internal/icu/text/ReplaceableString.java
Normal file
121
sources/main/java/jdk_internal/icu/text/ReplaceableString.java
Normal file
@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2009, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
/**
|
||||
* <code>ReplaceableString</code> is an adapter class that implements the
|
||||
* <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
|
||||
*
|
||||
* <p>
|
||||
* <em>Note:</em> This class does not support attributes and is not intended for
|
||||
* general use. Most clients will need to implement {@link Replaceable} in their
|
||||
* text representation class.
|
||||
*
|
||||
* <p>
|
||||
* Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @see Replaceable
|
||||
* @author Alan Liu
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public class ReplaceableString implements Replaceable {
|
||||
|
||||
private StringBuffer buf;
|
||||
|
||||
/**
|
||||
* Construct a new object with the given initial contents.
|
||||
*
|
||||
* @param str initial contents
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public ReplaceableString(String str) {
|
||||
buf = new StringBuffer(str);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new object using <code>buf</code> for internal storage. The
|
||||
* contents of <code>buf</code> at the time of construction are used as the
|
||||
* initial contents. <em>Note! Modifications to <code>buf</code> will modify
|
||||
* this object, and vice versa.</em>
|
||||
*
|
||||
* @param buf object to be used as internal storage
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public ReplaceableString(StringBuffer buf) {
|
||||
this.buf = buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of characters contained in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
*
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int length() {
|
||||
return buf.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the character at the given position in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
*
|
||||
* @param offset offset into the contents, from 0 to <code>length()</code> - 1
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public char charAt(int offset) {
|
||||
return buf.charAt(offset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies characters from this object into the destination character array. The
|
||||
* first character to be copied is at index <code>srcStart</code>; the last
|
||||
* character to be copied is at index <code>srcLimit-1</code> (thus the total
|
||||
* number of characters to be copied is <code>srcLimit-srcStart</code>). The
|
||||
* characters are copied into the subarray of <code>dst</code> starting at index
|
||||
* <code>dstStart</code> and ending at index
|
||||
* <code>dstStart + (srcLimit-srcStart) - 1</code>.
|
||||
*
|
||||
* @param srcStart the beginning index to copy, inclusive;
|
||||
* {@code 0 <= start <= limit}.
|
||||
* @param srcLimit the ending index to copy, exclusive;
|
||||
* {@code start <= limit <= length()}.
|
||||
* @param dst the destination array.
|
||||
* @param dstStart the start offset in the destination array.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
|
||||
if (srcStart != srcLimit) {
|
||||
buf.getChars(srcStart, srcLimit, dst, dstStart);
|
||||
}
|
||||
}
|
||||
}
|
493
sources/main/java/jdk_internal/icu/text/StringPrep.java
Normal file
493
sources/main/java/jdk_internal/icu/text/StringPrep.java
Normal file
@ -0,0 +1,493 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2004, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
//
|
||||
// CHANGELOG
|
||||
// 2005-05-19 Edward Wang
|
||||
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
|
||||
// - move from package com.ibm.icu.text to package sun.net.idn
|
||||
// - use ParseException instead of StringPrepParseException
|
||||
// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
|
||||
// - remove all @deprecated tag to make compiler happy
|
||||
// 2007-08-14 Martin Buchholz
|
||||
// - remove redundant casts
|
||||
//
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import jdk_internal.bidi.Normalizer;
|
||||
import jdk_internal.bidi.ParseException;
|
||||
import jdk_internal.bidi.SunNormalizer;
|
||||
import jdk_internal.icu.impl.CharTrie;
|
||||
import jdk_internal.icu.impl.StringPrepDataReader;
|
||||
import jdk_internal.icu.impl.Trie;
|
||||
import jdk_internal.icu.lang.UCharacter;
|
||||
import jdk_internal.icu.lang.UCharacterDirection;
|
||||
import jdk_internal.icu.util.VersionInfo;
|
||||
|
||||
/**
|
||||
* StringPrep API implements the StringPrep framework as described by
|
||||
* <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>. StringPrep
|
||||
* prepares Unicode strings for use in network protocols. Profiles of StringPrep
|
||||
* are set of rules and data according to which the Unicode Strings are
|
||||
* prepared. Each profiles contains tables which describe how a code point
|
||||
* should be treated. The tables are broadly classified into
|
||||
* <ul>
|
||||
* <li>Unassigned Table: Contains code points that are unassigned in the Unicode
|
||||
* Version supported by StringPrep. Currently RFC 3454 supports Unicode 3.2.
|
||||
* </li>
|
||||
* <li>Prohibited Table: Contains code points that are prohibited from the output
|
||||
* of the StringPrep processing function.</li>
|
||||
* <li>Mapping Table: Contains code points that are deleted from the output or
|
||||
* case mapped.</li>
|
||||
* </ul>
|
||||
*
|
||||
* The procedure for preparing Unicode strings:
|
||||
* <ol>
|
||||
* <li>Map: For each character in the input, check if it has a mapping and, if
|
||||
* so, replace it with its mapping.</li>
|
||||
* <li>Normalize: Possibly normalize the result of step 1 using Unicode
|
||||
* normalization.</li>
|
||||
* <li>Prohibit: Check for any characters that are not allowed in the output. If
|
||||
* any are found, return an error.</li>
|
||||
* <li>Check bidi: Possibly check for right-to-left characters, and if any are
|
||||
* found, make sure that the whole string satisfies the requirements for
|
||||
* bidirectional strings. If the string does not satisfy the requirements for
|
||||
* bidirectional strings, return an error.</li>
|
||||
* </ol>
|
||||
*
|
||||
* @author Ram Viswanadha
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public final class StringPrep {
|
||||
/**
|
||||
* Option to prohibit processing of unassigned code points in the input
|
||||
*
|
||||
* @see #prepare
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public static final int DEFAULT = 0x0000;
|
||||
|
||||
/**
|
||||
* Option to allow processing of unassigned code points in the input
|
||||
*
|
||||
* @see #prepare
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public static final int ALLOW_UNASSIGNED = 0x0001;
|
||||
|
||||
private static final int UNASSIGNED = 0x0000;
|
||||
private static final int MAP = 0x0001;
|
||||
private static final int PROHIBITED = 0x0002;
|
||||
private static final int DELETE = 0x0003;
|
||||
private static final int TYPE_LIMIT = 0x0004;
|
||||
|
||||
private static final int NORMALIZATION_ON = 0x0001;
|
||||
private static final int CHECK_BIDI_ON = 0x0002;
|
||||
|
||||
private static final int TYPE_THRESHOLD = 0xFFF0;
|
||||
private static final int MAX_INDEX_VALUE = 0x3FBF; /* 16139 */
|
||||
private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
|
||||
|
||||
/* indexes[] value names */
|
||||
private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
|
||||
private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */
|
||||
private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /*
|
||||
* The index of Unicode version of last entry in
|
||||
* NormalizationCorrections.txt
|
||||
*/
|
||||
private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /*
|
||||
* The starting index of 1 UChar mapping index in the
|
||||
* mapping data array
|
||||
*/
|
||||
private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /*
|
||||
* The starting index of 2 UChars mapping index in
|
||||
* the mapping data array
|
||||
*/
|
||||
private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;
|
||||
private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;
|
||||
private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
|
||||
private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
|
||||
|
||||
/**
|
||||
* Default buffer size of datafile
|
||||
*/
|
||||
private static final int DATA_BUFFER_SIZE = 25000;
|
||||
|
||||
/* Wrappers for Trie implementations */
|
||||
private static final class StringPrepTrieImpl implements Trie.DataManipulate {
|
||||
private CharTrie sprepTrie = null;
|
||||
|
||||
/**
|
||||
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's data the
|
||||
* index array offset of the indexes for that lead surrogate.
|
||||
*
|
||||
* @param property data value for a surrogate from the trie, including the
|
||||
* folding offset
|
||||
* @return data offset or 0 if there is no data for the lead surrogate
|
||||
*/
|
||||
public int getFoldingOffset(int value) {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
||||
// CharTrie implementation for reading the trie data
|
||||
private StringPrepTrieImpl sprepTrieImpl;
|
||||
// Indexes read from the data file
|
||||
private int[] indexes;
|
||||
// mapping data read from the data file
|
||||
private char[] mappingData;
|
||||
// format version of the data file
|
||||
private byte[] formatVersion;
|
||||
// the version of Unicode supported by the data file
|
||||
private VersionInfo sprepUniVer;
|
||||
// the Unicode version of last entry in the
|
||||
// NormalizationCorrections.txt file if normalization
|
||||
// is turned on
|
||||
private VersionInfo normCorrVer;
|
||||
// Option to turn on Normalization
|
||||
private boolean doNFKC;
|
||||
// Option to turn on checking for BiDi rules
|
||||
private boolean checkBiDi;
|
||||
|
||||
private char getCodePointValue(int ch) {
|
||||
return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
|
||||
}
|
||||
|
||||
private static VersionInfo getVersionInfo(int comp) {
|
||||
int micro = comp & 0xFF;
|
||||
int milli = (comp >> 8) & 0xFF;
|
||||
int minor = (comp >> 16) & 0xFF;
|
||||
int major = (comp >> 24) & 0xFF;
|
||||
return VersionInfo.getInstance(major, minor, milli, micro);
|
||||
}
|
||||
|
||||
private static VersionInfo getVersionInfo(byte[] version) {
|
||||
if (version.length != 4) {
|
||||
return null;
|
||||
}
|
||||
return VersionInfo.getInstance((int) version[0], (int) version[1], (int) version[2], (int) version[3]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an StringPrep object after reading the input stream. The object does
|
||||
* not hold a reference to the input steam, so the stream can be closed after
|
||||
* the method returns.
|
||||
*
|
||||
* @param inputStream The stream for reading the StringPrep profile binarySun
|
||||
* @throws IOException
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public StringPrep(InputStream inputStream) throws IOException {
|
||||
|
||||
BufferedInputStream b = new BufferedInputStream(inputStream, DATA_BUFFER_SIZE);
|
||||
|
||||
StringPrepDataReader reader = new StringPrepDataReader(b);
|
||||
|
||||
// read the indexes
|
||||
indexes = reader.readIndexes(INDEX_TOP);
|
||||
|
||||
byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
|
||||
|
||||
// indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
|
||||
mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE] / 2];
|
||||
// load the rest of the data and initialize the data members
|
||||
reader.read(sprepBytes, mappingData);
|
||||
|
||||
sprepTrieImpl = new StringPrepTrieImpl();
|
||||
sprepTrieImpl.sprepTrie = new CharTrie(new ByteArrayInputStream(sprepBytes), sprepTrieImpl);
|
||||
|
||||
// get the data format version
|
||||
formatVersion = reader.getDataFormatVersion();
|
||||
|
||||
// get the options
|
||||
doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
|
||||
checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
|
||||
sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
|
||||
normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
|
||||
VersionInfo normUniVer = UCharacter.getUnicodeVersion();
|
||||
if (normUniVer.compareTo(sprepUniVer) < 0 && /*
|
||||
* the Unicode version of SPREP file must be less than the
|
||||
* Unicode Vesion of the normalization data
|
||||
*/
|
||||
normUniVer.compareTo(normCorrVer) < 0
|
||||
&& /*
|
||||
* the Unicode version of the NormalizationCorrections.txt file should be less
|
||||
* than the Unicode Vesion of the normalization data
|
||||
*/
|
||||
((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on */
|
||||
) {
|
||||
throw new IOException("Normalization Correction version not supported");
|
||||
}
|
||||
b.close();
|
||||
}
|
||||
|
||||
private static final class Values {
|
||||
boolean isIndex;
|
||||
int value;
|
||||
int type;
|
||||
|
||||
public void reset() {
|
||||
isIndex = false;
|
||||
value = 0;
|
||||
type = -1;
|
||||
}
|
||||
}
|
||||
|
||||
private static final void getValues(char trieWord, Values values) {
|
||||
values.reset();
|
||||
if (trieWord == 0) {
|
||||
/*
|
||||
* Initial value stored in the mapping table just return TYPE_LIMIT .. so that
|
||||
* the source codepoint is copied to the destination
|
||||
*/
|
||||
values.type = TYPE_LIMIT;
|
||||
} else if (trieWord >= TYPE_THRESHOLD) {
|
||||
values.type = (trieWord - TYPE_THRESHOLD);
|
||||
} else {
|
||||
/* get the type */
|
||||
values.type = MAP;
|
||||
/* ascertain if the value is index or delta */
|
||||
if ((trieWord & 0x02) > 0) {
|
||||
values.isIndex = true;
|
||||
values.value = trieWord >> 2; // mask off the lower 2 bits and shift
|
||||
|
||||
} else {
|
||||
values.isIndex = false;
|
||||
values.value = (trieWord << 16) >> 16;
|
||||
values.value = (values.value >> 2);
|
||||
|
||||
}
|
||||
|
||||
if ((trieWord >> 2) == MAX_INDEX_VALUE) {
|
||||
values.type = DELETE;
|
||||
values.isIndex = false;
|
||||
values.value = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private StringBuffer map(UCharacterIterator iter, int options) throws ParseException {
|
||||
|
||||
Values val = new Values();
|
||||
char result = 0;
|
||||
int ch = UCharacterIterator.DONE;
|
||||
StringBuffer dest = new StringBuffer();
|
||||
boolean allowUnassigned = ((options & ALLOW_UNASSIGNED) > 0);
|
||||
|
||||
while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
|
||||
|
||||
result = getCodePointValue(ch);
|
||||
getValues(result, val);
|
||||
|
||||
// check if the source codepoint is unassigned
|
||||
if (val.type == UNASSIGNED && allowUnassigned == false) {
|
||||
throw new ParseException("An unassigned code point was found in the input " + iter.getText(),
|
||||
iter.getIndex());
|
||||
} else if ((val.type == MAP)) {
|
||||
int index, length;
|
||||
|
||||
if (val.isIndex) {
|
||||
index = val.value;
|
||||
if (index >= indexes[ONE_UCHAR_MAPPING_INDEX_START]
|
||||
&& index < indexes[TWO_UCHARS_MAPPING_INDEX_START]) {
|
||||
length = 1;
|
||||
} else if (index >= indexes[TWO_UCHARS_MAPPING_INDEX_START]
|
||||
&& index < indexes[THREE_UCHARS_MAPPING_INDEX_START]) {
|
||||
length = 2;
|
||||
} else if (index >= indexes[THREE_UCHARS_MAPPING_INDEX_START]
|
||||
&& index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]) {
|
||||
length = 3;
|
||||
} else {
|
||||
length = mappingData[index++];
|
||||
}
|
||||
/* copy mapping to destination */
|
||||
dest.append(mappingData, index, length);
|
||||
continue;
|
||||
|
||||
} else {
|
||||
ch -= val.value;
|
||||
}
|
||||
} else if (val.type == DELETE) {
|
||||
// just consume the codepoint and contine
|
||||
continue;
|
||||
}
|
||||
// copy the source into destination
|
||||
UTF16.append(dest, ch);
|
||||
}
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
private StringBuffer normalize(StringBuffer src) {
|
||||
/*
|
||||
* Option UNORM_BEFORE_PRI_29:
|
||||
*
|
||||
* IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
|
||||
* requires strict adherence to Unicode 3.2 normalization, including buggy
|
||||
* composition from before fixing Public Review Issue #29. Note that this
|
||||
* results in some valid but nonsensical text to be either corrupted or
|
||||
* rejected, depending on the text. See
|
||||
* http://www.unicode.org/review/resolved-pri.html#pri29 See unorm.cpp and
|
||||
* cnormtst.c
|
||||
*/
|
||||
return new StringBuffer(
|
||||
SunNormalizer.normalize(src.toString(), Normalizer.Form.NFKC, SunNormalizer.UNICODE_3_2));
|
||||
}
|
||||
|
||||
/*
|
||||
* boolean isLabelSeparator(int ch){ int result = getCodePointValue(ch); if(
|
||||
* (result & 0x07) == LABEL_SEPARATOR){ return true; } return false; }
|
||||
*/
|
||||
/*
|
||||
* 1) Map -- For each character in the input, check if it has a mapping and, if
|
||||
* so, replace it with its mapping.
|
||||
*
|
||||
* 2) Normalize -- Possibly normalize the result of step 1 using Unicode
|
||||
* normalization.
|
||||
*
|
||||
* 3) Prohibit -- Check for any characters that are not allowed in the output.
|
||||
* If any are found, return an error.
|
||||
*
|
||||
* 4) Check bidi -- Possibly check for right-to-left characters, and if any are
|
||||
* found, make sure that the whole string satisfies the requirements for
|
||||
* bidirectional strings. If the string does not satisfy the requirements for
|
||||
* bidirectional strings, return an error. [Unicode3.2] defines several
|
||||
* bidirectional categories; each character has one bidirectional category
|
||||
* assigned to it. For the purposes of the requirements below, an
|
||||
* "RandALCat character" is a character that has Unicode bidirectional
|
||||
* categories "R" or "AL"; an "LCat character" is a character that has Unicode
|
||||
* bidirectional category "L". Note
|
||||
*
|
||||
*
|
||||
* that there are many characters which fall in neither of the above
|
||||
* definitions; Latin digits (<U+0030> through <U+0039>) are examples of this
|
||||
* because they have bidirectional category "EN".
|
||||
*
|
||||
* In any profile that specifies bidirectional character handling, all three of
|
||||
* the following requirements MUST be met:
|
||||
*
|
||||
* 1) The characters in section 5.8 MUST be prohibited.
|
||||
*
|
||||
* 2) If a string contains any RandALCat character, the string MUST NOT contain
|
||||
* any LCat character.
|
||||
*
|
||||
* 3) If a string contains any RandALCat character, a RandALCat character MUST
|
||||
* be the first character of the string, and a RandALCat character MUST be the
|
||||
* last character of the string.
|
||||
*/
|
||||
/**
|
||||
* Prepare the input buffer for use in applications with the given profile. This
|
||||
* operation maps, normalizes (NFKC), checks for prohibited and BiDi characters in
|
||||
* the order defined by RFC 3454 depending on the options specified in the
|
||||
* profile.
|
||||
*
|
||||
* @param src A UCharacterIterator object containing the source string
|
||||
* @param options A bit set of options:
|
||||
*
|
||||
* - StringPrep.NONE Prohibit processing of unassigned code
|
||||
* points in the input
|
||||
*
|
||||
* - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points
|
||||
* in the input as normal Unicode code points.
|
||||
*
|
||||
* @return StringBuffer A StringBuffer containing the output
|
||||
* @throws ParseException
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public StringBuffer prepare(UCharacterIterator src, int options) throws ParseException {
|
||||
|
||||
// map
|
||||
StringBuffer mapOut = map(src, options);
|
||||
StringBuffer normOut = mapOut;// initialize
|
||||
|
||||
if (doNFKC) {
|
||||
// normalize
|
||||
normOut = normalize(mapOut);
|
||||
}
|
||||
|
||||
int ch;
|
||||
char result;
|
||||
UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
|
||||
Values val = new Values();
|
||||
int direction = UCharacterDirection.CHAR_DIRECTION_COUNT,
|
||||
firstCharDir = UCharacterDirection.CHAR_DIRECTION_COUNT;
|
||||
int rtlPos = -1, ltrPos = -1;
|
||||
boolean rightToLeft = false, leftToRight = false;
|
||||
|
||||
while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
|
||||
result = getCodePointValue(ch);
|
||||
getValues(result, val);
|
||||
|
||||
if (val.type == PROHIBITED) {
|
||||
throw new ParseException("A prohibited code point was found in the input" + iter.getText(), val.value);
|
||||
}
|
||||
|
||||
direction = UCharacter.getDirection(ch);
|
||||
if (firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT) {
|
||||
firstCharDir = direction;
|
||||
}
|
||||
if (direction == UCharacterDirection.LEFT_TO_RIGHT) {
|
||||
leftToRight = true;
|
||||
ltrPos = iter.getIndex() - 1;
|
||||
}
|
||||
if (direction == UCharacterDirection.RIGHT_TO_LEFT
|
||||
|| direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) {
|
||||
rightToLeft = true;
|
||||
rtlPos = iter.getIndex() - 1;
|
||||
}
|
||||
}
|
||||
if (checkBiDi == true) {
|
||||
// satisfy 2
|
||||
if (leftToRight == true && rightToLeft == true) {
|
||||
throw new ParseException(
|
||||
"The input does not conform to the rules for BiDi code points." + iter.getText(),
|
||||
(rtlPos > ltrPos) ? rtlPos : ltrPos);
|
||||
}
|
||||
|
||||
// satisfy 3
|
||||
if (rightToLeft == true && !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT
|
||||
|| firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)
|
||||
&& (direction == UCharacterDirection.RIGHT_TO_LEFT
|
||||
|| direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))) {
|
||||
throw new ParseException(
|
||||
"The input does not conform to the rules for BiDi code points." + iter.getText(),
|
||||
(rtlPos > ltrPos) ? rtlPos : ltrPos);
|
||||
}
|
||||
}
|
||||
return normOut;
|
||||
|
||||
}
|
||||
}
|
326
sources/main/java/jdk_internal/icu/text/UCharacterIterator.java
Normal file
326
sources/main/java/jdk_internal/icu/text/UCharacterIterator.java
Normal file
@ -0,0 +1,326 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.bidi.CharacterIterator;
|
||||
import jdk_internal.icu.impl.CharacterIteratorWrapper;
|
||||
import jdk_internal.icu.impl.ReplaceableUCharacterIterator;
|
||||
import jdk_internal.icu.impl.UCharacterProperty;
|
||||
|
||||
/**
|
||||
* Abstract class that defines an API for iteration on text objects. This is an
|
||||
* interface for forward and backward iteration and random access into a text
|
||||
* object. Forward iteration is done with post-increment and backward iteration
|
||||
* is done with pre-decrement semantics, while the
|
||||
* <code>java.text.CharacterIterator</code> interface methods provided forward
|
||||
* iteration with "pre-increment" and backward iteration with pre-decrement
|
||||
* semantics. This API is more efficient for forward iteration over code points.
|
||||
* The other major difference is that this API can do both code unit and code
|
||||
* point iteration, <code>java.text.CharacterIterator</code> can only iterate
|
||||
* over code units and is limited to BMP (0 - 0xFFFF)
|
||||
*
|
||||
* @author Ram
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract class UCharacterIterator implements Cloneable {
|
||||
|
||||
/**
|
||||
* Protected default constructor for the subclasses
|
||||
*
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
protected UCharacterIterator() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicator that we have reached the ends of the UTF16 text. Moved from
|
||||
* UForwardCharacterIterator.java
|
||||
*
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int DONE = -1;
|
||||
|
||||
// static final methods ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns a <code>UCharacterIterator</code> object given a source string.
|
||||
*
|
||||
* @param source a string
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(String source) {
|
||||
return new ReplaceableUCharacterIterator(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <code>UCharacterIterator</code> object given a source StringBuffer.
|
||||
*
|
||||
* @param source an string buffer of UTF-16 code units
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(StringBuffer source) {
|
||||
return new ReplaceableUCharacterIterator(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <code>UCharacterIterator</code> object given a CharacterIterator.
|
||||
*
|
||||
* @param source a valid CharacterIterator object.
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(CharacterIterator source) {
|
||||
return new CharacterIteratorWrapper(source);
|
||||
}
|
||||
|
||||
// public methods ----------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the length of the text
|
||||
*
|
||||
* @return length of the text
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int getLength();
|
||||
|
||||
/**
|
||||
* Gets the current index in text.
|
||||
*
|
||||
* @return current index in text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int getIndex();
|
||||
|
||||
/**
|
||||
* Returns the UTF16 code unit at index, and increments to the next code unit
|
||||
* (post-increment semantics). If index is out of range, DONE is returned, and
|
||||
* the iterator is reset to the limit of the text.
|
||||
*
|
||||
* @return the next UTF16 code unit, or DONE if the index is at the limit of the
|
||||
* text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int next();
|
||||
|
||||
/**
|
||||
* Returns the code point at index, and increments to the next code point
|
||||
* (post-increment semantics). If index does not point to a valid surrogate
|
||||
* pair, the behavior is the same as <code>next()</code>. Otherwise the iterator
|
||||
* is incremented past the surrogate pair, and the code point represented by the
|
||||
* pair is returned.
|
||||
*
|
||||
* @return the next codepoint in text, or DONE if the index is at the limit of
|
||||
* the text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public int nextCodePoint() {
|
||||
int ch1 = next();
|
||||
if (UTF16.isLeadSurrogate((char) ch1)) {
|
||||
int ch2 = next();
|
||||
if (UTF16.isTrailSurrogate((char) ch2)) {
|
||||
return UCharacterProperty.getRawSupplementary((char) ch1, (char) ch2);
|
||||
} else if (ch2 != DONE) {
|
||||
// unmatched surrogate so back out
|
||||
previous();
|
||||
}
|
||||
}
|
||||
return ch1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decrement to the position of the previous code unit in the text, and return
|
||||
* it (pre-decrement semantics). If the resulting index is less than 0, the
|
||||
* index is reset to 0 and DONE is returned.
|
||||
*
|
||||
* @return the previous code unit in the text, or DONE if the new index is
|
||||
* before the start of the text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int previous();
|
||||
|
||||
/**
|
||||
* Retreat to the start of the previous code point in the text, and return it
|
||||
* (pre-decrement semantics). If the index is not preceeded by a valid surrogate
|
||||
* pair, the behavior is the same as <code>previous()</code>. Otherwise the
|
||||
* iterator is decremented to the start of the surrogate pair, and the code
|
||||
* point represented by the pair is returned.
|
||||
*
|
||||
* @return the previous code point in the text, or DONE if the new index is
|
||||
* before the start of the text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public int previousCodePoint() {
|
||||
int ch1 = previous();
|
||||
if (UTF16.isTrailSurrogate((char) ch1)) {
|
||||
int ch2 = previous();
|
||||
if (UTF16.isLeadSurrogate((char) ch2)) {
|
||||
return UCharacterProperty.getRawSupplementary((char) ch2, (char) ch1);
|
||||
} else if (ch2 != DONE) {
|
||||
// unmatched trail surrogate so back out
|
||||
next();
|
||||
}
|
||||
}
|
||||
return ch1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the index to the specified index in the text.
|
||||
*
|
||||
* @param index the index within the text.
|
||||
* @exception IndexOutOfBoundsException is thrown if an invalid index is
|
||||
* supplied
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract void setIndex(int index);
|
||||
|
||||
/**
|
||||
* Sets the current index to the start.
|
||||
*
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public void setToStart() {
|
||||
setIndex(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills the buffer with the underlying text storage of the iterator If the
|
||||
* buffer capacity is not enough a exception is thrown. The capacity of the fill
|
||||
* in buffer should at least be equal to length of text in the iterator obtained
|
||||
* by calling <code>getLength()</code>. <b>Usage:</b>
|
||||
*
|
||||
* <pre>{@code
|
||||
* UChacterIterator iter = new UCharacterIterator.getInstance(text);
|
||||
* char[] buf = new char[iter.getLength()];
|
||||
* iter.getText(buf);
|
||||
*
|
||||
* OR
|
||||
* char[] buf= new char[1];
|
||||
* int len = 0;
|
||||
* for(;;){
|
||||
* try{
|
||||
* len = iter.getText(buf);
|
||||
* break;
|
||||
* }catch(IndexOutOfBoundsException e){
|
||||
* buf = new char[iter.getLength()];
|
||||
* }
|
||||
* }
|
||||
* }</pre>
|
||||
*
|
||||
* @param fillIn an array of chars to fill with the underlying UTF-16 code
|
||||
* units.
|
||||
* @param offset the position within the array to start putting the data.
|
||||
* @return the number of code units added to fillIn, as a convenience
|
||||
* @exception IndexOutOfBoundsException exception if there is not enough room
|
||||
* after offset in the array, or if offset
|
||||
* < 0.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int getText(char[] fillIn, int offset);
|
||||
|
||||
/**
|
||||
* Convenience override for <code>getText(char[], int)</code> that provides an
|
||||
* offset of 0.
|
||||
*
|
||||
* @param fillIn an array of chars to fill with the underlying UTF-16 code
|
||||
* units.
|
||||
* @return the number of code units added to fillIn, as a convenience
|
||||
* @exception IndexOutOfBoundsException exception if there is not enough room in
|
||||
* the array.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public final int getText(char[] fillIn) {
|
||||
return getText(fillIn, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method for returning the underlying text storage as a string
|
||||
*
|
||||
* @return the underlying text storage in the iterator as a string
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public String getText() {
|
||||
char[] text = new char[getLength()];
|
||||
getText(text);
|
||||
return new String(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves the current position by the number of code points specified, either
|
||||
* forward or backward depending on the sign of delta (positive or negative
|
||||
* respectively). If the current index is at a trail surrogate then the first
|
||||
* adjustment is by code unit, and the remaining adjustments are by code points.
|
||||
* If the resulting index would be less than zero, the index is set to zero, and
|
||||
* if the resulting index would be greater than limit, the index is set to
|
||||
* limit.
|
||||
*
|
||||
* @param delta the number of code units to move the current index.
|
||||
* @return the new index
|
||||
* @exception IndexOutOfBoundsException is thrown if an invalid delta is
|
||||
* supplied
|
||||
* @stable ICU 2.4
|
||||
*
|
||||
*/
|
||||
public int moveCodePointIndex(int delta) {
|
||||
if (delta > 0) {
|
||||
while (delta > 0 && nextCodePoint() != DONE) {
|
||||
delta--;
|
||||
}
|
||||
} else {
|
||||
while (delta < 0 && previousCodePoint() != DONE) {
|
||||
delta++;
|
||||
}
|
||||
}
|
||||
if (delta != 0) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
|
||||
return getIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a copy of this iterator, independent from other iterators. If it is
|
||||
* not possible to clone the iterator, returns null.
|
||||
*
|
||||
* @return copy of this iterator
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
}
|
609
sources/main/java/jdk_internal/icu/text/UTF16.java
Normal file
609
sources/main/java/jdk_internal/icu/text/UTF16.java
Normal file
@ -0,0 +1,609 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.icu.impl.UCharacterProperty;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Standalone utility class providing UTF16 character conversions and indexing
|
||||
* conversions.
|
||||
* <p>
|
||||
* Code that uses strings alone rarely needs modification. By design, UTF-16 does
|
||||
* not allow overlap, so searching for strings is a safe operation. Similarly,
|
||||
* concatenation is always safe. Substringing is safe if the start and end are
|
||||
* both on UTF-32 boundaries. In normal code, the values for start and end are
|
||||
* on those boundaries, since they arose from operations like searching. If not,
|
||||
* the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
|
||||
* <strong>Examples:</strong>
|
||||
* <p>
|
||||
* The following examples illustrate use of some of these methods.
|
||||
*
|
||||
* <pre>{@code
|
||||
* // iteration forwards: Original
|
||||
* for (int i = 0; i < s.length(); ++i) {
|
||||
* char ch = s.charAt(i);
|
||||
* doSomethingWith(ch);
|
||||
* }
|
||||
*
|
||||
* // iteration forwards: Changes for UTF-32
|
||||
* int ch;
|
||||
* for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
|
||||
* ch = UTF16.charAt(s, i);
|
||||
* doSomethingWith(ch);
|
||||
* }
|
||||
*
|
||||
* // iteration backwards: Original
|
||||
* for (int i = s.length() - 1; i >= 0; --i) {
|
||||
* char ch = s.charAt(i);
|
||||
* doSomethingWith(ch);
|
||||
* }
|
||||
*
|
||||
* // iteration backwards: Changes for UTF-32
|
||||
* int ch;
|
||||
* for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
|
||||
* ch = UTF16.charAt(s, i);
|
||||
* doSomethingWith(ch);
|
||||
* }
|
||||
* }</pre>
|
||||
*
|
||||
* <strong>Notes:</strong>
|
||||
* <ul>
|
||||
* <li><strong>Naming:</strong> For clarity, High and Low surrogates are called
|
||||
* <code>Lead</code> and <code>Trail</code> in the API, which gives a better
|
||||
* sense of their ordering in a string. <code>offset16</code> and
|
||||
* <code>offset32</code> are used to distinguish offsets to UTF-16 boundaries vs
|
||||
* offsets to UTF-32 boundaries. <code>int char32</code> is used to contain
|
||||
* UTF-32 characters, as opposed to <code>char16</code>, which is a UTF-16 code
|
||||
* unit.</li>
|
||||
* <li><strong>Roundtripping Offsets:</strong> You can always roundtrip from a
|
||||
* UTF-32 offset to a UTF-16 offset and back. Because of the difference in
|
||||
* structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and back
|
||||
* if and only if <code>bounds(string, offset16) != TRAIL</code>.</li>
|
||||
* <li><strong>Exceptions:</strong> The error checking will throw an exception
|
||||
* if indices are out of bounds. Other than that, all methods will behave
|
||||
* reasonably, even if unmatched surrogates or out-of-bounds UTF-32 values are
|
||||
* present. <code>UCharacter.isLegal()</code> can be used to check for validity
|
||||
* if desired.</li>
|
||||
* <li><strong>Unmatched Surrogates:</strong> If the string contains unmatched
|
||||
* surrogates, then these are counted as one UTF-32 value. This matches their
|
||||
* iteration behavior, which is vital. It also matches common display practice
|
||||
* as missing glyphs (see the Unicode Standard Section 5.4, 5.5).</li>
|
||||
* <li><strong>Optimization:</strong> The method implementations may need
|
||||
* optimization if the compiler doesn't fold static final methods. Since
|
||||
* surrogate pairs will form an exceedingly small percentage of all the text in
|
||||
* the world, the singleton case should always be optimized for.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @author Mark Davis, with help from Markus Scherer
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
|
||||
public final class UTF16 {
|
||||
// public variables ---------------------------------------------------
|
||||
|
||||
/**
|
||||
* The lowest Unicode code point value.
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int CODEPOINT_MIN_VALUE = 0;
|
||||
/**
|
||||
* The highest Unicode code point value (scalar value) according to the Unicode
|
||||
* Standard.
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
|
||||
/**
|
||||
* The minimum value for Supplementary code points
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
|
||||
/**
|
||||
* Lead surrogate minimum value
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
|
||||
/**
|
||||
* Trail surrogate minimum value
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
|
||||
/**
|
||||
* Lead surrogate maximum value
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
|
||||
/**
|
||||
* Trail surrogate maximum value
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
|
||||
/**
|
||||
* Surrogate minimum value
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
|
||||
/**
|
||||
* Lead surrogate bitmask
|
||||
*/
|
||||
private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
|
||||
/**
|
||||
* Trail surrogate bitmask
|
||||
*/
|
||||
private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
|
||||
/**
|
||||
* Surrogate bitmask
|
||||
*/
|
||||
private static final int SURROGATE_BITMASK = 0xFFFFF800;
|
||||
/**
|
||||
* Lead surrogate bits
|
||||
*/
|
||||
private static final int LEAD_SURROGATE_BITS = 0xD800;
|
||||
/**
|
||||
* Trail surrogate bits
|
||||
*/
|
||||
private static final int TRAIL_SURROGATE_BITS = 0xDC00;
|
||||
/**
|
||||
* Surrogate bits
|
||||
*/
|
||||
private static final int SURROGATE_BITS = 0xD800;
|
||||
|
||||
// constructor --------------------------------------------------------
|
||||
|
||||
// /CLOVER:OFF
|
||||
/**
|
||||
* Prevent instance from being created.
|
||||
*/
|
||||
private UTF16() {
    // not instantiable: the constructor is private and intentionally empty
}
|
||||
|
||||
// /CLOVER:ON
|
||||
// public method ------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Extract a single UTF-32 value from a string. Used when iterating forwards or
|
||||
* backwards (with <code>UTF16.getCharCount()</code>, as well as random access.
|
||||
* If a validity check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">
|
||||
* UCharacter.isLegal()</a></code> on the return value. If the char retrieved is
|
||||
* part of a surrogate pair, its supplementary character will be returned. If a
|
||||
* complete supplementary character is not found the incomplete character will
|
||||
* be returned
|
||||
*
|
||||
* @param source array of UTF-16 chars
|
||||
* @param offset16 UTF-16 offset to the start of the character.
|
||||
* @return UTF-32 value for the UTF-32 value that contains the char at offset16.
|
||||
* The boundaries of that codepoint are the same as in
|
||||
* <code>bounds32()</code>.
|
||||
* @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int charAt(String source, int offset16) {
|
||||
char single = source.charAt(offset16);
|
||||
if (single < LEAD_SURROGATE_MIN_VALUE) {
|
||||
return single;
|
||||
}
|
||||
return _charAt(source, offset16, single);
|
||||
}
|
||||
|
||||
private static int _charAt(String source, int offset16, char single) {
|
||||
if (single > TRAIL_SURROGATE_MAX_VALUE) {
|
||||
return single;
|
||||
}
|
||||
|
||||
// Convert the UTF-16 surrogate pair if necessary.
|
||||
// For simplicity in usage, and because the frequency of pairs is
|
||||
// low, look both directions.
|
||||
|
||||
if (single <= LEAD_SURROGATE_MAX_VALUE) {
|
||||
++offset16;
|
||||
if (source.length() != offset16) {
|
||||
char trail = source.charAt(offset16);
|
||||
if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
|
||||
return UCharacterProperty.getRawSupplementary(single, trail);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
--offset16;
|
||||
if (offset16 >= 0) {
|
||||
// single is a trail surrogate so
|
||||
char lead = source.charAt(offset16);
|
||||
if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
|
||||
return UCharacterProperty.getRawSupplementary(lead, single);
|
||||
}
|
||||
}
|
||||
}
|
||||
return single; // return unmatched surrogate
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract a single UTF-32 value from a string. Used when iterating forwards or
|
||||
* backwards (with <code>UTF16.getCharCount()</code>, as well as random access.
|
||||
* If a validity check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
|
||||
* </a></code> on the return value. If the char retrieved is part of a surrogate
|
||||
* pair, its supplementary character will be returned. If a complete
|
||||
* supplementary character is not found the incomplete character will be
|
||||
* returned
|
||||
*
|
||||
* @param source array of UTF-16 chars
|
||||
* @param offset16 UTF-16 offset to the start of the character.
|
||||
* @return UTF-32 value for the UTF-32 value that contains the char at offset16.
|
||||
* The boundaries of that codepoint are the same as in
|
||||
* <code>bounds32()</code>.
|
||||
* @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int charAt(CharSequence source, int offset16) {
|
||||
char single = source.charAt(offset16);
|
||||
if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
|
||||
return single;
|
||||
}
|
||||
return _charAt(source, offset16, single);
|
||||
}
|
||||
|
||||
private static int _charAt(CharSequence source, int offset16, char single) {
|
||||
if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
|
||||
return single;
|
||||
}
|
||||
|
||||
// Convert the UTF-16 surrogate pair if necessary.
|
||||
// For simplicity in usage, and because the frequency of pairs is
|
||||
// low, look both directions.
|
||||
|
||||
if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
|
||||
++offset16;
|
||||
if (source.length() != offset16) {
|
||||
char trail = source.charAt(offset16);
|
||||
if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
|
||||
return UCharacterProperty.getRawSupplementary(single, trail);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
--offset16;
|
||||
if (offset16 >= 0) {
|
||||
// single is a trail surrogate so
|
||||
char lead = source.charAt(offset16);
|
||||
if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
|
||||
return UCharacterProperty.getRawSupplementary(lead, single);
|
||||
}
|
||||
}
|
||||
}
|
||||
return single; // return unmatched surrogate
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract a single UTF-32 value from a substring. Used when iterating forwards
|
||||
* or backwards (with <code>UTF16.getCharCount()</code>, as well as random
|
||||
* access. If a validity check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
|
||||
* </a></code> on the return value. If the char retrieved is part of a surrogate
|
||||
* pair, its supplementary character will be returned. If a complete
|
||||
* supplementary character is not found the incomplete character will be
|
||||
* returned
|
||||
*
|
||||
* @param source Array of UTF-16 chars
|
||||
* @param start Offset to substring in the source array for analyzing
|
||||
* @param limit Offset to substring in the source array for analyzing
|
||||
* @param offset16 UTF-16 offset relative to start
|
||||
* @return UTF-32 value for the UTF-32 value that contains the char at offset16.
|
||||
* The boundaries of that codepoint are the same as in
|
||||
* <code>bounds32()</code>.
|
||||
* @exception IndexOutOfBoundsException Thrown if offset16 is not within the
|
||||
* range of start and limit.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int charAt(char source[], int start, int limit, int offset16) {
|
||||
offset16 += start;
|
||||
if (offset16 < start || offset16 >= limit) {
|
||||
throw new ArrayIndexOutOfBoundsException(offset16);
|
||||
}
|
||||
|
||||
char single = source[offset16];
|
||||
if (!isSurrogate(single)) {
|
||||
return single;
|
||||
}
|
||||
|
||||
// Convert the UTF-16 surrogate pair if necessary.
|
||||
// For simplicity in usage, and because the frequency of pairs is
|
||||
// low, look both directions.
|
||||
if (single <= LEAD_SURROGATE_MAX_VALUE) {
|
||||
offset16++;
|
||||
if (offset16 >= limit) {
|
||||
return single;
|
||||
}
|
||||
char trail = source[offset16];
|
||||
if (isTrailSurrogate(trail)) {
|
||||
return UCharacterProperty.getRawSupplementary(single, trail);
|
||||
}
|
||||
} else { // isTrailSurrogate(single), so
|
||||
if (offset16 == start) {
|
||||
return single;
|
||||
}
|
||||
offset16--;
|
||||
char lead = source[offset16];
|
||||
if (isLeadSurrogate(lead))
|
||||
return UCharacterProperty.getRawSupplementary(lead, single);
|
||||
}
|
||||
return single; // return unmatched surrogate
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines how many chars this char32 requires. If a validity check is
|
||||
* required, use <code>
|
||||
* <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
|
||||
* char32 before calling.
|
||||
*
|
||||
* @param char32 the input codepoint.
|
||||
* @return 2 if is in supplementary space, otherwise 1.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int getCharCount(int char32) {
|
||||
if (char32 < SUPPLEMENTARY_MIN_VALUE) {
|
||||
return 1;
|
||||
}
|
||||
return 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the code value is a surrogate.
|
||||
*
|
||||
* @param char16 the input character.
|
||||
* @return true if the input character is a surrogate.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static boolean isSurrogate(char char16) {
|
||||
return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the character is a trail surrogate.
|
||||
*
|
||||
* @param char16 the input character.
|
||||
* @return true if the input character is a trail surrogate.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static boolean isTrailSurrogate(char char16) {
|
||||
return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the character is a lead surrogate.
|
||||
*
|
||||
* @param char16 the input character.
|
||||
* @return true if the input character is a lead surrogate
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static boolean isLeadSurrogate(char char16) {
|
||||
return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the lead surrogate. If a validity check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
|
||||
* char32 before calling.
|
||||
*
|
||||
* @param char32 the input character.
|
||||
* @return lead surrogate if the getCharCount(ch) is 2; <br>
|
||||
* and 0 otherwise (note: 0 is not a valid lead surrogate).
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static char getLeadSurrogate(int char32) {
|
||||
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
|
||||
return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the trail surrogate. If a validity check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
|
||||
* char32 before calling.
|
||||
*
|
||||
* @param char32 the input character.
|
||||
* @return the trail surrogate if the getCharCount(ch) is 2; <br>
|
||||
* otherwise the character itself
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static char getTrailSurrogate(int char32) {
|
||||
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
|
||||
return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
|
||||
}
|
||||
|
||||
return (char) char32;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method corresponding to String.valueOf(char). Returns a one or
|
||||
* two char string containing the UTF-32 value in UTF16 format. If a validity
|
||||
* check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
|
||||
* char32 before calling.
|
||||
*
|
||||
* @param char32 the input character.
|
||||
* @return string value of char32 in UTF16 format
|
||||
* @exception IllegalArgumentException thrown if char32 is a invalid codepoint.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static String valueOf(int char32) {
|
||||
if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
|
||||
throw new IllegalArgumentException("Illegal codepoint");
|
||||
}
|
||||
return toString(char32);
|
||||
}
|
||||
|
||||
/**
|
||||
* Append a single UTF-32 value to the end of a StringBuffer. If a validity
|
||||
* check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
|
||||
* char32 before calling.
|
||||
*
|
||||
* @param target the buffer to append to
|
||||
* @param char32 value to append.
|
||||
* @return the updated StringBuffer
|
||||
* @exception IllegalArgumentException thrown when char32 does not lie within
|
||||
* the range of the Unicode codepoints
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static StringBuffer append(StringBuffer target, int char32) {
|
||||
// Check for irregular values
|
||||
if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
|
||||
throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
|
||||
}
|
||||
|
||||
// Write the UTF-16 values
|
||||
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
|
||||
target.append(getLeadSurrogate(char32));
|
||||
target.append(getTrailSurrogate(char32));
|
||||
} else {
|
||||
target.append((char) char32);
|
||||
}
|
||||
return target;
|
||||
}
|
||||
|
||||
/**
|
||||
* Shifts offset16 by the argument number of codepoints within a subarray.
|
||||
*
|
||||
* @param source char array
|
||||
* @param start position of the subarray to be performed on
|
||||
* @param limit position of the subarray to be performed on
|
||||
* @param offset16 UTF16 position to shift relative to start
|
||||
* @param shift32 number of codepoints to shift
|
||||
* @return new shifted offset16 relative to start
|
||||
* @exception IndexOutOfBoundsException if the new offset16 is out of bounds
|
||||
* with respect to the subarray or the
|
||||
* subarray bounds are out of range.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32) {
|
||||
int size = source.length;
|
||||
int count;
|
||||
char ch;
|
||||
int result = offset16 + start;
|
||||
if (start < 0 || limit < start) {
|
||||
throw new StringIndexOutOfBoundsException(start);
|
||||
}
|
||||
if (limit > size) {
|
||||
throw new StringIndexOutOfBoundsException(limit);
|
||||
}
|
||||
if (offset16 < 0 || result > limit) {
|
||||
throw new StringIndexOutOfBoundsException(offset16);
|
||||
}
|
||||
if (shift32 > 0) {
|
||||
if (shift32 + result > size) {
|
||||
throw new StringIndexOutOfBoundsException(result);
|
||||
}
|
||||
count = shift32;
|
||||
while (result < limit && count > 0) {
|
||||
ch = source[result];
|
||||
if (isLeadSurrogate(ch) && (result + 1 < limit) && isTrailSurrogate(source[result + 1])) {
|
||||
result++;
|
||||
}
|
||||
count--;
|
||||
result++;
|
||||
}
|
||||
} else {
|
||||
if (result + shift32 < start) {
|
||||
throw new StringIndexOutOfBoundsException(result);
|
||||
}
|
||||
for (count = -shift32; count > 0; count--) {
|
||||
result--;
|
||||
if (result < start) {
|
||||
break;
|
||||
}
|
||||
ch = source[result];
|
||||
if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
|
||||
result--;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (count != 0) {
|
||||
throw new StringIndexOutOfBoundsException(shift32);
|
||||
}
|
||||
result -= start;
|
||||
return result;
|
||||
}
|
||||
|
||||
// private data members -------------------------------------------------
|
||||
|
||||
/**
|
||||
* Shift value for lead surrogate to form a supplementary character.
|
||||
*/
|
||||
private static final int LEAD_SURROGATE_SHIFT_ = 10;
|
||||
|
||||
/**
|
||||
* Mask to retrieve the significant value from a trail surrogate.
|
||||
*/
|
||||
private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
|
||||
|
||||
/**
|
||||
* Value that all lead surrogate starts with
|
||||
*/
|
||||
private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
|
||||
- (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
|
||||
|
||||
// private methods ------------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Converts argument code point and returns a String object representing the
|
||||
* code point's value in UTF16 format.
|
||||
* <p>
|
||||
* This method does not check for the validity of the codepoint, the results are
|
||||
* not guaranteed if a invalid codepoint is passed as argument.
|
||||
* <p>
|
||||
* The result is a string whose length is 1 for non-supplementary code points, 2
|
||||
* otherwise.
|
||||
*
|
||||
* @param ch code point
|
||||
* @return string representation of the code point
|
||||
*/
|
||||
private static String toString(int ch) {
|
||||
if (ch < SUPPLEMENTARY_MIN_VALUE) {
|
||||
return String.valueOf((char) ch);
|
||||
}
|
||||
|
||||
StringBuilder result = new StringBuilder();
|
||||
result.append(getLeadSurrogate(ch));
|
||||
result.append(getTrailSurrogate(ch));
|
||||
return result.toString();
|
||||
}
|
||||
}
|
1515
sources/main/java/jdk_internal/icu/text/UnicodeSet.java
Normal file
1515
sources/main/java/jdk_internal/icu/text/UnicodeSet.java
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user