Update #47 - Singleplayer lag fixes

This commit is contained in:
lax1dude
2025-01-19 13:26:27 -08:00
parent 3f5ee57068
commit 1f0d593a8c
2052 changed files with 133581 additions and 2339 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,821 @@
/*
* Copyright (c) 2009, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2001-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
package jdk_internal.bidi.icu.text;
import java.util.Arrays;
import jdk_internal.bidi.Bidi;
final class BidiLine {
/*
* General remarks about the functions in this file:
*
* These functions deal with the aspects of potentially mixed-directional text
* in a single paragraph or in a line of a single paragraph which has already
* been processed according to the Unicode 3.0 Bidi algorithm as defined in <a
* href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9: Unicode
* Bidirectional Algorithm</a>, version 13, also described in The Unicode
* Standard, Version 4.0.1 .
*
* This means that there is a Bidi object with a levels and a dirProps array.
* paraLevel and direction are also set. Only if the length of the text is zero,
* then levels==dirProps==NULL.
*
* The overall directionality of the paragraph or line is used to bypass the
* reordering steps if possible. Even purely RTL text does not need reordering
* there because the getLogical/VisualIndex() methods can compute the index on
* the fly in such a case.
*
* The implementation of the access to same-level-runs and of the reordering do
* attempt to provide better performance and less memory usage compared to a
* direct implementation of especially rule (L2) with an array of one (32-bit)
* integer per text character.
*
* Here, the levels array is scanned as soon as necessary, and a vector of
* same-level-runs is created. Reordering then is done on this vector. For each
* run of text positions that were resolved to the same level, only 8 bytes are
* stored: the first text position of the run and the visual position behind the
* run after reordering. One sign bit is used to hold the directionality of the
* run. This is inefficient if there are many very short runs. If the average
* run length is <2, then this uses more memory.
*
* In a further attempt to save memory, the levels array is never changed after
* all the resolution rules (Xn, Wn, Nn, In). Many methods have to consider the
* field trailingWSStart: if it is less than length, then there is an implicit
* trailing run at the paraLevel, which is not reflected in the levels array.
* This allows a line Bidi object to use the same levels array as its paragraph
* parent object.
*
* When a Bidi object is created for a line of a paragraph, then the paragraph's
* levels and dirProps arrays are reused by way of setting a pointer into them,
* not by copying. This again saves memory and forbids to change the now shared
* levels for (L1).
*/
/* handle trailing WS (L1) -------------------------------------------------- */
/*
* setTrailingWSStart() sets the start index for a trailing run of WS in the
* line. This is necessary because we do not modify the paragraph's levels array
* that we just point into. Using trailingWSStart is another form of performing
* (L1).
*
* To make subsequent operations easier, we also include the run before the WS
* if it is at the paraLevel - we merge the two here.
*
* This method is called only from setLine(), so paraLevel is set correctly for
* the line even when contextual multiple paragraphs.
*/
/**
 * Computes {@code bidiBase.trailingWSStart} for a line: the index from which
 * every character is treated as being at the paragraph level (rule L1).
 *
 * <p>
 * Reads {@code dirProps}, {@code levels}, {@code length} and {@code paraLevel}
 * of the given object and writes only {@code trailingWSStart}; the levels array
 * itself is left untouched.
 *
 * <p>
 * An empty line ({@code length == 0}) yields {@code trailingWSStart == 0}
 * instead of indexing {@code dirProps[-1]}.
 *
 * @param bidiBase the line object to update
 */
static void setTrailingWSStart(BidiBase bidiBase) {
    byte[] dirProps = bidiBase.dirProps;
    byte[] levels = bidiBase.levels;
    int start = bidiBase.length;
    byte paraLevel = bidiBase.paraLevel;
    /*
     * If the line is terminated by a block separator, all preceding WS etc... are
     * already set to paragraph level. Setting trailingWSStart to the line length
     * will avoid changing the level of B chars from 0 to paraLevel in getLevels
     * when orderParagraphsLTR==TRUE.
     *
     * The start > 0 guard keeps an empty line from reading dirProps[-1]; in that
     * case both loops below are skipped and trailingWSStart becomes 0.
     */
    if (start > 0 && dirProps[start - 1] == BidiBase.B) {
        bidiBase.trailingWSStart = start; /* currently == bidiBase.length */
        return;
    }
    /* go backwards across all WS, BN, explicit codes */
    while (start > 0 && (BidiBase.DirPropFlag(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) {
        --start;
    }
    /* if the WS run can be merged with the previous run then do so here */
    while (start > 0 && levels[start - 1] == paraLevel) {
        --start;
    }
    bidiBase.trailingWSStart = start;
}
/**
 * Initializes {@code lineBidi} so that it describes the line
 * {@code [start, limit)} of the already-resolved paragraph {@code paraBidi}.
 *
 * <p>
 * The method copies the line's slice of the paragraph's text, dirProps and
 * levels into {@code lineBidi}, counts the Bidi control characters inside the
 * line (only when the paragraph has any), and then derives the line's
 * {@code trailingWSStart}, {@code direction} and, for a uniform line, a
 * normalized {@code paraLevel}. {@code runCount} is set to -1 so that the runs
 * are computed lazily by getRuns().
 *
 * @param paraBidi the paragraph object the line is taken from
 * @param newBidi  returned as-is; this method neither reads nor modifies it
 * @param lineBidi the object that receives the line state
 * @param start    index of the first character of the line in the paragraph
 * @param limit    index just behind the last character of the line
 * @return {@code newBidi}
 */
static Bidi setLine(BidiBase paraBidi, Bidi newBidi, BidiBase lineBidi, int start, int limit) {
int length;
/* set the values in lineBidi from its paraBidi parent */
/* class members are already initialized to 0 */
// lineBidi.paraBidi = null; /* mark unfinished setLine */
// lineBidi.flags = 0;
// lineBidi.controlCount = 0;
length = lineBidi.length = lineBidi.originalLength = lineBidi.resultLength = limit - start;
/* the line gets its own copy of the text slice */
lineBidi.text = new char[length];
System.arraycopy(paraBidi.text, start, lineBidi.text, 0, length);
lineBidi.paraLevel = paraBidi.GetParaLevelAt(start);
lineBidi.paraCount = paraBidi.paraCount;
lineBidi.runs = new BidiRun[0];
lineBidi.reorderingMode = paraBidi.reorderingMode;
lineBidi.reorderingOptions = paraBidi.reorderingOptions;
/* only scan for controls if the paragraph contains any at all */
if (paraBidi.controlCount > 0) {
int j;
for (j = start; j < limit; j++) {
if (BidiBase.IsBidiControlChar(paraBidi.text[j])) {
lineBidi.controlCount++;
}
}
/* resultLength is the line length without its Bidi control characters */
lineBidi.resultLength -= lineBidi.controlCount;
}
/*
 * NOTE: in this implementation dirProps and levels are copied into the line's
 * own memory (System.arraycopy below) rather than shared with the paragraph.
 */
/* copy proper subset of DirProps */
lineBidi.getDirPropsMemory(length);
lineBidi.dirProps = lineBidi.dirPropsMemory;
System.arraycopy(paraBidi.dirProps, start, lineBidi.dirProps, 0, length);
/* copy proper subset of Levels */
lineBidi.getLevelsMemory(length);
lineBidi.levels = lineBidi.levelsMemory;
System.arraycopy(paraBidi.levels, start, lineBidi.levels, 0, length);
/* -1 marks the runs as not yet computed (see getRuns()) */
lineBidi.runCount = -1;
if (paraBidi.direction != BidiBase.MIXED) {
/* the parent is already trivial */
lineBidi.direction = paraBidi.direction;
/*
* The parent's levels are all either implicitly or explicitly ==paraLevel; do
* the same here.
*/
/* translate the paragraph's trailingWSStart into line coordinates, clamped to [0, length] */
if (paraBidi.trailingWSStart <= start) {
lineBidi.trailingWSStart = 0;
} else if (paraBidi.trailingWSStart < limit) {
lineBidi.trailingWSStart = paraBidi.trailingWSStart - start;
} else {
lineBidi.trailingWSStart = length;
}
} else {
byte[] levels = lineBidi.levels;
int i, trailingWSStart;
byte level;
setTrailingWSStart(lineBidi);
trailingWSStart = lineBidi.trailingWSStart;
/* recalculate lineBidiBase.direction */
if (trailingWSStart == 0) {
/* all levels are at paraLevel */
lineBidi.direction = (byte) (lineBidi.paraLevel & 1);
} else {
/* get the level of the first character */
/* only the low bit (the direction) of the level is kept */
level = (byte) (levels[0] & 1);
/*
* if there is anything of a different level, then the line is mixed
*/
if (trailingWSStart < length && (lineBidi.paraLevel & 1) != level) {
/*
* the trailing WS is at paraLevel, which differs from levels[0]
*/
lineBidi.direction = BidiBase.MIXED;
} else {
/*
* see if levels[1..trailingWSStart-1] have the same direction as levels[0] and
* paraLevel
*/
/* trailingWSStart >= 1 in this branch, so the unbounded loop always terminates */
for (i = 1;; i++) {
if (i == trailingWSStart) {
/* the direction values match those in level */
lineBidi.direction = level;
break;
} else if ((levels[i] & 1) != level) {
lineBidi.direction = BidiBase.MIXED;
break;
}
}
}
}
switch (lineBidi.direction) {
case Bidi.DIRECTION_LEFT_TO_RIGHT:
/* make sure paraLevel is even */
/* (x + 1) & ~1 rounds an odd level up to the next even one and keeps an even one */
lineBidi.paraLevel = (byte) ((lineBidi.paraLevel + 1) & ~1);
/*
* all levels are implicitly at paraLevel (important for getLevels())
*/
lineBidi.trailingWSStart = 0;
break;
case Bidi.DIRECTION_RIGHT_TO_LEFT:
/* make sure paraLevel is odd */
lineBidi.paraLevel |= 1;
/*
* all levels are implicitly at paraLevel (important for getLevels())
*/
lineBidi.trailingWSStart = 0;
break;
default:
/* MIXED: keep paraLevel and trailingWSStart as computed above */
break;
}
}
lineBidi.paraBidi = paraBidi; /* mark successful setLine */
return newBidi;
}
/**
 * Returns the level of the character at {@code charIndex}.
 *
 * <p>
 * The stored level is used only for a mixed-direction object and only in front
 * of the trailing WS run; everywhere else the paragraph level at that index is
 * returned.
 *
 * @param bidiBase  the Bidi object to query
 * @param charIndex index of the character
 * @return the level that applies to the character
 */
static byte getLevelAt(BidiBase bidiBase, int charIndex) {
    if (bidiBase.direction == BidiBase.MIXED && charIndex < bidiBase.trailingWSStart) {
        /* a real, individually resolved level */
        return bidiBase.levels[charIndex];
    }
    /* uniform direction, or inside the implicit trailing WS run */
    return bidiBase.GetParaLevelAt(charIndex);
}
/**
 * Returns the levels of all characters of the object, one byte per character.
 *
 * <p>
 * If the object still has an implicit trailing WS run
 * ({@code trailingWSStart != length}), that run is first written into
 * {@code bidiBase.levels} at {@code paraLevel} and {@code trailingWSStart} is
 * moved to {@code length}, so the stored array reflects every level.
 *
 * @param bidiBase the Bidi object to query (its levels array and
 *                 trailingWSStart may be updated)
 * @return the stored levels array itself when it is exactly {@code length}
 *         long, otherwise a copy truncated to {@code length} entries
 */
static byte[] getLevels(BidiBase bidiBase) {
    final int lineLength = bidiBase.length;
    final int wsStart = bidiBase.trailingWSStart;
    if (wsStart != lineLength) {
        /*
         * Materialize the implicit trailing WS run. bidiBase.paraLevel is ok even
         * with contextual multiple paragraphs, since such an object is a line.
         */
        Arrays.fill(bidiBase.levels, wsStart, lineLength, bidiBase.paraLevel);
        /* from now on the levels array fully describes the line */
        bidiBase.trailingWSStart = lineLength;
    }
    final byte[] stored = bidiBase.levels;
    /* the backing array may be longer than the text; hand out only the used part */
    return lineLength < stored.length ? Arrays.copyOf(stored, lineLength) : stored;
}
/**
 * Builds a stand-alone description of the run at visual position
 * {@code runIndex}.
 *
 * <p>
 * In the runs array {@code limit} is a cumulative visual limit; the returned
 * run instead carries {@code start + runLength} as its limit, where the run
 * length is the difference to the preceding run's visual limit (or to 0 for the
 * first run).
 *
 * @param bidiBase the Bidi object whose runs array is read
 * @param runIndex index into the runs array
 * @return a new BidiRun with the run's start, its logical limit and its level
 */
static BidiRun getVisualRun(BidiBase bidiBase, int runIndex) {
    final BidiRun run = bidiBase.runs[runIndex];
    /* visual limit of the run in front of this one; nothing precedes run 0 */
    final int precedingVisualLimit = runIndex > 0 ? bidiBase.runs[runIndex - 1].limit : 0;
    return new BidiRun(run.start, run.start + run.limit - precedingVisualLimit, run.level);
}
/**
 * Sets up the trivial case of exactly one run covering the whole text; called
 * by getRuns().
 *
 * <p>
 * The object's preallocated {@code simpleRuns} array becomes its runs array and
 * slot 0 receives a fresh run {@code [0, length)} at the given level.
 *
 * @param bidiBase the Bidi object to update
 * @param level    the level of the single run
 */
private static void getSingleRun(BidiBase bidiBase, byte level) {
    final BidiRun[] single = bidiBase.simpleRuns;
    bidiBase.runs = single;
    bidiBase.runCount = 1;
    /* one run spanning everything; a single run needs no reordering */
    single[0] = new BidiRun(0, bidiBase.length, level);
}
/* reorder the runs array (L2) ---------------------------------------------- */
/*
* Reorder the same-level runs in the runs array. Here, runCount>1 and
* maxLevel>=minLevel>=paraLevel. All the visualStart fields=logical start
* before reordering. The "odd" bits are not set yet.
*
* Reordering with this data structure lends itself to some handy shortcuts:
*
* Since each run is moved but not modified, and since at the initial maxLevel
* each sequence of same-level runs consists of only one run each, we don't need
* to do anything there and can predecrement maxLevel. In many simple cases, the
* reordering is thus done entirely in the index mapping. Also, reordering
* occurs only down to the lowest odd level that occurs, which is minLevel|1.
* However, if the lowest level itself is odd, then in the last reordering the
* sequence of the runs at this level or higher will be all runs, and we don't
* need the elaborate loop to search for them. This is covered by ++minLevel
* instead of minLevel|=1 followed by an extra reorder-all after the
* reorder-some loop. About a trailing WS run: Such a run would need special
* treatment because its level is not reflected in levels[] if this is not a
* paragraph object. Instead, all characters from trailingWSStart on are
* implicitly at paraLevel. However, for all maxLevel>paraLevel, this run will
* never be reordered and does not need to be taken into account.
* maxLevel==paraLevel is only reordered if minLevel==paraLevel is odd, which is
* done in the extra segment. This means that for the main reordering loop we
* don't need to consider this run and can --runCount. If it is later part of
* the all-runs reordering, then runCount is adjusted accordingly.
*/
/**
 * Reorders the entries of {@code bidiBase.runs} in place from logical to visual
 * order (rule L2).
 *
 * <p>
 * For every level from {@code maxLevel - 1} down to {@code minLevel + 1}, each
 * maximal sequence of neighbouring runs whose level is at least that level is
 * reversed. A separate trailing WS run is left out of these passes. If the
 * original {@code minLevel} is odd, all runs are finally reversed as a whole.
 *
 * @param bidiBase the Bidi object whose runs array is reordered
 * @param minLevel the lowest level occurring in the runs
 * @param maxLevel the highest level occurring in the runs
 */
private static void reorderLine(BidiBase bidiBase, byte minLevel, byte maxLevel) {
    /* nothing to do? */
    if (maxLevel <= (minLevel | 1)) {
        return;
    }
    /*
     * Reorder only down to the lowest odd level; an odd minLevel itself is handled
     * by the reverse-everything step at the end, hence +1 rather than |1.
     */
    final byte lowestPassLevel = (byte) (minLevel + 1);
    final BidiRun[] runs = bidiBase.runs;
    final byte[] levels = bidiBase.levels;
    int runCount = bidiBase.runCount;
    /* the WS run at paraLevel<=old minLevel takes no part in the per-level passes */
    if (bidiBase.trailingWSStart < bidiBase.length) {
        --runCount;
    }
    for (int passLevel = maxLevel - 1; passLevel >= lowestPassLevel; --passLevel) {
        int pos = 0;
        while (pos < runCount) {
            /* skip runs below the current level */
            if (levels[runs[pos].start] < passLevel) {
                ++pos;
                continue;
            }
            /* find the run behind the sequence of runs at >=passLevel */
            int seqLimit = pos + 1;
            while (seqLimit < runCount && levels[runs[seqLimit].start] >= passLevel) {
                ++seqLimit;
            }
            reverseRuns(runs, pos, seqLimit - 1);
            /* runs[seqLimit] is known to be below passLevel (or does not exist) */
            pos = seqLimit + 1;
        }
    }
    /* now do maxLevel==old minLevel (==odd!) */
    if ((lowestPassLevel & 1) == 0) {
        /*
         * When there is no separate trailing WS run, runCount still equals the
         * full count here and must become the index of the last run.
         */
        if (bidiBase.trailingWSStart == bidiBase.length) {
            --runCount;
        }
        /* reverse all runs; runCount is now the index of the last one */
        reverseRuns(runs, 0, runCount);
    }
}

/* Reverses runs[first..last] (both inclusive) in place. */
private static void reverseRuns(BidiRun[] runs, int first, int last) {
    for (int lo = first, hi = last; lo < hi; ++lo, --hi) {
        final BidiRun held = runs[lo];
        runs[lo] = runs[hi];
        runs[hi] = held;
    }
}
/* compute the runs array --------------------------------------------------- */
/**
 * Finds the run that contains the given logical index.
 *
 * <p>
 * Runs are visited in visual order; each run's length is the difference
 * between its cumulative visual limit and the previous run's limit.
 *
 * @param bidiBase     the Bidi object whose runs array is searched
 * @param logicalIndex logical index of a character
 * @return the index of the containing run in the runs array
 * @throws IllegalStateException if no run contains {@code logicalIndex}
 */
static int getRunFromLogicalIndex(BidiBase bidiBase, int logicalIndex) {
    final BidiRun[] runs = bidiBase.runs;
    final int total = bidiBase.runCount;
    int precedingLimit = 0;
    for (int runIdx = 0; runIdx < total; ++runIdx) {
        final BidiRun run = runs[runIdx];
        final int runLength = run.limit - precedingLimit;
        if (logicalIndex >= run.start && logicalIndex < run.start + runLength) {
            return runIdx;
        }
        /* advance to this run's visual limit */
        precedingLimit += runLength;
    }
    /* we should never get here */
    throw new IllegalStateException("Internal ICU error in getRunFromLogicalIndex");
}
/*
* Compute the runs array from the levels array. After getRuns() returns,
* runCount is guaranteed to be >0 and the runs are reordered. Odd-level runs
* have visualStart on their visual right edge and they progress visually to the
* left. If option OPTION_INSERT_MARKS is set, insertRemove will contain the sum
* of appropriate LRM/RLM_BEFORE/AFTER flags. If option OPTION_REMOVE_CONTROLS
* is set, insertRemove will contain the negative number of BiDi control
* characters within this run.
*/
/**
 * Computes {@code bidiBase.runs} and {@code bidiBase.runCount} from the levels
 * array, unless they are already set ({@code runCount >= 0}).
 *
 * <p>
 * A non-mixed object, or a mixed one with a single level and no separate
 * trailing WS run, gets one run via getSingleRun(). Otherwise one run per
 * same-level stretch is created (plus one for a separate trailing WS run at
 * paraLevel), the runs are put into visual order by reorderLine(), and each
 * run's {@code limit} is converted from a run length into a cumulative visual
 * limit. Finally the insert-mark flags and the per-run count of Bidi control
 * characters are folded into each run's {@code insertRemove}.
 *
 * @param bidiBase the Bidi object whose runs are computed
 */
static void getRuns(BidiBase bidiBase) {
/*
* This method returns immediately if the runs are already set. This includes
* the case of length==0 (handled in setPara).
*/
if (bidiBase.runCount >= 0) {
return;
}
if (bidiBase.direction != BidiBase.MIXED) {
/* simple, single-run case - this covers length==0 */
/* bidiBase.paraLevel is ok even for contextual multiple paragraphs */
getSingleRun(bidiBase, bidiBase.paraLevel);
} else /* BidiBase.MIXED, length>0 */ {
/* mixed directionality */
int length = bidiBase.length, limit;
byte[] levels = bidiBase.levels;
int i, runCount;
byte level = -1; /* initialize with no valid level */
/*
* If there are WS characters at the end of the line and the run preceding them
* has a level different from paraLevel, then they will form their own run at
* paraLevel (L1). Count them separately. We need some special treatment for
* this in order to not modify the levels array which a line Bidi object shares
* with its paragraph parent and its other line siblings. In other words, for
* the trailing WS, it may be levels[]!=paraLevel but we have to treat it like
* it were so.
*/
limit = bidiBase.trailingWSStart;
/* count the runs, there is at least one non-WS run, and limit>0 */
runCount = 0;
for (i = 0; i < limit; ++i) {
/* increment runCount at the start of each run */
if (levels[i] != level) {
++runCount;
level = levels[i];
}
}
/*
* We don't need to see if the last run can be merged with a trailing WS run
* because setTrailingWSStart() would have done that.
*/
if (runCount == 1 && limit == length) {
/* There is only one non-WS run and no trailing WS-run. */
getSingleRun(bidiBase, levels[0]);
} else /* runCount>1 || limit<length */ {
/* allocate and set the runs */
BidiRun[] runs;
int runIndex, start;
/* start above every accepted level so that the first run lowers it */
byte minLevel = BidiBase.MAX_EXPLICIT_LEVEL + 1;
byte maxLevel = 0;
/* now, count a (non-mergeable) WS run */
if (limit < length) {
++runCount;
}
/* runCount > 1 */
bidiBase.getRunsMemory(runCount);
runs = bidiBase.runsMemory;
/* set the runs */
/*
* FOOD FOR THOUGHT: this could be optimized, e.g.: 464->444, 484->444,
* 575->555, 595->555 However, that would take longer. Check also how it would
* interact with BiDi control removal and inserting Marks.
*/
runIndex = 0;
/*
* search for the run limits and initialize visualLimit values with the run
* lengths
*/
i = 0;
do {
/* prepare this run */
start = i;
level = levels[i];
if (level < minLevel) {
minLevel = level;
}
if (level > maxLevel) {
maxLevel = level;
}
/* look for the run limit */
while (++i < limit && levels[i] == level) {
}
/* i is another run limit */
/* at this point the "limit" field temporarily holds the run length (i - start) */
runs[runIndex] = new BidiRun(start, i - start, level);
++runIndex;
} while (i < limit);
if (limit < length) {
/* there is a separate WS run */
runs[runIndex] = new BidiRun(limit, length - limit, bidiBase.paraLevel);
/*
* For the trailing WS run, bidiBase.paraLevel is ok even if contextual multiple
* paragraphs.
*/
if (bidiBase.paraLevel < minLevel) {
minLevel = bidiBase.paraLevel;
}
}
/* set the object fields */
bidiBase.runs = runs;
bidiBase.runCount = runCount;
reorderLine(bidiBase, minLevel, maxLevel);
/* now add the direction flags and adjust the visualLimit's to be just that */
/* this loop will also handle the trailing WS run */
limit = 0;
for (i = 0; i < runCount; ++i) {
runs[i].level = levels[runs[i].start];
/* running sum of the run lengths: limit becomes the cumulative visual limit */
limit = (runs[i].limit += limit);
}
/* Set the embedding level for the trailing WS run. */
/* For a RTL paragraph, it will be the *first* run in visual order. */
/*
* For the trailing WS run, bidiBase.paraLevel is ok even if contextual multiple
* paragraphs.
*/
/* runIndex counts only the non-WS runs, so it is < runCount exactly when a WS run was added */
if (runIndex < runCount) {
int trailingRun = ((bidiBase.paraLevel & 1) != 0) ? 0 : runIndex;
runs[trailingRun].level = bidiBase.paraLevel;
}
}
}
/* handle insert LRM/RLM BEFORE/AFTER run */
if (bidiBase.insertPoints.size > 0) {
BidiBase.Point point;
int runIndex, ip;
for (ip = 0; ip < bidiBase.insertPoints.size; ip++) {
point = bidiBase.insertPoints.points[ip];
runIndex = getRunFromLogicalIndex(bidiBase, point.pos);
/* accumulate the point's flag bits on the run that contains its position */
bidiBase.runs[runIndex].insertRemove |= point.flag;
}
}
/* handle remove BiDi control characters */
if (bidiBase.controlCount > 0) {
int runIndex, ic;
char c;
for (ic = 0; ic < bidiBase.length; ic++) {
c = bidiBase.text[ic];
if (BidiBase.IsBidiControlChar(c)) {
runIndex = getRunFromLogicalIndex(bidiBase, ic);
/* a negative insertRemove counts the control characters inside the run */
bidiBase.runs[runIndex].insertRemove--;
}
}
}
}
/**
 * Validates a levels array, reports its lowest and highest level, and creates
 * an identity index map of the same length.
 *
 * @param levels    the levels to inspect
 * @param pMinLevel one-element output array; slot 0 receives the lowest level
 * @param pMaxLevel one-element output array; slot 0 receives the highest level
 * @return a new array with {@code map[i] == i}, or {@code null} if
 *         {@code levels} is null or empty, or contains a level that is negative
 *         or greater than {@code BidiBase.MAX_EXPLICIT_LEVEL + 1} (the output
 *         arrays are not written in that case)
 */
static int[] prepareReorder(byte[] levels, byte[] pMinLevel, byte[] pMaxLevel) {
    if (levels == null || levels.length == 0) {
        return null;
    }
    /* determine the lowest and highest level, rejecting out-of-range values */
    byte lowest = BidiBase.MAX_EXPLICIT_LEVEL + 1;
    byte highest = 0;
    for (byte current : levels) {
        if (current < 0 || current > (BidiBase.MAX_EXPLICIT_LEVEL + 1)) {
            return null;
        }
        if (current < lowest) {
            lowest = current;
        }
        if (current > highest) {
            highest = current;
        }
    }
    pMinLevel[0] = lowest;
    pMaxLevel[0] = highest;
    /* initialize the index map to the identity mapping */
    final int[] identity = new int[levels.length];
    for (int i = 0; i < identity.length; ++i) {
        identity[i] = i;
    }
    return identity;
}
/**
 * Computes a visual-to-logical index map for the given levels (rule L2).
 *
 * <p>
 * Starting from the identity map, for every level from the highest one down to
 * the lowest odd one, each maximal stretch of positions whose level is at least
 * that level has its map entries reversed. The levels array itself is not
 * modified.
 *
 * @param levels one level per position
 * @return the index map, or {@code null} if prepareReorder() rejects
 *         {@code levels}
 */
static int[] reorderVisual(byte[] levels) {
    final byte[] lowestHolder = new byte[1];
    final byte[] highestHolder = new byte[1];
    final int[] visualToLogical = prepareReorder(levels, lowestHolder, highestHolder);
    if (visualToLogical == null) {
        return null;
    }
    final byte lowest = lowestHolder[0];
    final byte highest = highestHolder[0];
    /* a single even level means pure LTR: the identity map is already right */
    if (lowest == highest && (lowest & 1) == 0) {
        return visualToLogical;
    }
    /* reorder only down to the lowest odd level */
    final int lowestOdd = lowest | 1;
    final int count = levels.length;
    for (int passLevel = highest; passLevel >= lowestOdd; --passLevel) {
        int pos = 0;
        while (pos < count) {
            /* skip positions below the current level */
            if (levels[pos] < passLevel) {
                ++pos;
                continue;
            }
            /* find the index behind the stretch of levels at >=passLevel */
            int seqLimit = pos + 1;
            while (seqLimit < count && levels[seqLimit] >= passLevel) {
                ++seqLimit;
            }
            /*
             * Reverse the map entries of the stretch. The levels need not be swapped
             * along: the sequence of levels that we look at does not move anyway.
             */
            for (int lo = pos, hi = seqLimit - 1; lo < hi; ++lo, --hi) {
                final int held = visualToLogical[lo];
                visualToLogical[lo] = visualToLogical[hi];
                visualToLogical[hi] = held;
            }
            /* levels[seqLimit] is known to be below passLevel (or does not exist) */
            pos = seqLimit + 1;
        }
    }
    return visualToLogical;
}
/**
 * Builds the visual-to-logical index map from {@code bidiBase.runs}.
 *
 * <p>
 * First every run is expanded into logical indexes (ascending for an even-level
 * run, descending for an odd-level one). Then, if insert points exist, the
 * entries are shifted towards the end and {@code BidiBase.MAP_NOWHERE} is stored
 * at the position of each inserted mark; otherwise, if the text contains Bidi
 * control characters, their entries are squeezed out.
 *
 * <p>
 * NOTE(review): this reads runs and runCount directly, so the runs presumably
 * have to be computed (getRuns()) before the call - confirm against callers.
 *
 * @param bidiBase the Bidi object whose runs are mapped
 * @return an array of {@code resultLength} entries holding the logical index
 *         (or MAP_NOWHERE) for each visual position
 */
static int[] getVisualMap(BidiBase bidiBase) {
/* fill a visual-to-logical index map using the runs[] */
BidiRun[] runs = bidiBase.runs;
int logicalStart, visualStart, visualLimit;
/* work in an array big enough for both the plain and the adjusted mapping */
int allocLength = bidiBase.length > bidiBase.resultLength ? bidiBase.length : bidiBase.resultLength;
int[] indexMap = new int[allocLength];
visualStart = 0;
int idx = 0;
for (int j = 0; j < bidiBase.runCount; ++j) {
logicalStart = runs[j].start;
visualLimit = runs[j].limit;
if (runs[j].isEvenRun()) {
do { /* LTR */
indexMap[idx++] = logicalStart++;
} while (++visualStart < visualLimit);
} else {
logicalStart += visualLimit - visualStart; /* logicalLimit */
do { /* RTL */
indexMap[idx++] = --logicalStart;
} while (++visualStart < visualLimit);
}
/* visualStart==visualLimit; */
}
if (bidiBase.insertPoints.size > 0) {
int markFound = 0, runCount = bidiBase.runCount;
int insertRemove, i, j, k;
runs = bidiBase.runs;
/* count all inserted marks */
for (i = 0; i < runCount; i++) {
insertRemove = runs[i].insertRemove;
if ((insertRemove & (BidiBase.LRM_BEFORE | BidiBase.RLM_BEFORE)) > 0) {
markFound++;
}
if ((insertRemove & (BidiBase.LRM_AFTER | BidiBase.RLM_AFTER)) > 0) {
markFound++;
}
}
/* move back indexes by number of preceding marks */
/* walk from the last run to the first, filling the map from its end (k) backwards */
k = bidiBase.resultLength;
for (i = runCount - 1; i >= 0 && markFound > 0; i--) {
insertRemove = runs[i].insertRemove;
if ((insertRemove & (BidiBase.LRM_AFTER | BidiBase.RLM_AFTER)) > 0) {
indexMap[--k] = BidiBase.MAP_NOWHERE;
markFound--;
}
visualStart = i > 0 ? runs[i - 1].limit : 0;
/* once no marks remain in front, the remaining entries are already in place */
for (j = runs[i].limit - 1; j >= visualStart && markFound > 0; j--) {
indexMap[--k] = indexMap[j];
}
if ((insertRemove & (BidiBase.LRM_BEFORE | BidiBase.RLM_BEFORE)) > 0) {
indexMap[--k] = BidiBase.MAP_NOWHERE;
markFound--;
}
}
} else if (bidiBase.controlCount > 0) {
int runCount = bidiBase.runCount, logicalEnd;
int insertRemove, length, i, j, k, m;
char uchar;
boolean evenRun;
runs = bidiBase.runs;
visualStart = 0;
/* move forward indexes by number of preceding controls */
/* k is the write position; it lags behind visualStart once a control was dropped */
k = 0;
for (i = 0; i < runCount; i++, visualStart += length) {
length = runs[i].limit - visualStart;
insertRemove = runs[i].insertRemove;
/* if no control found yet, nothing to do in this run */
if ((insertRemove == 0) && (k == visualStart)) {
k += length;
continue;
}
/* if no control in this run */
if (insertRemove == 0) {
visualLimit = runs[i].limit;
for (j = visualStart; j < visualLimit; j++) {
indexMap[k++] = indexMap[j];
}
continue;
}
/* this run contains controls: regenerate its entries, skipping the controls */
logicalStart = runs[i].start;
evenRun = runs[i].isEvenRun();
logicalEnd = logicalStart + length - 1;
for (j = 0; j < length; j++) {
m = evenRun ? logicalStart + j : logicalEnd - j;
uchar = bidiBase.text[m];
if (!BidiBase.IsBidiControlChar(uchar)) {
indexMap[k++] = m;
}
}
}
}
if (allocLength == bidiBase.resultLength) {
return indexMap;
}
/* the work array was longer than the result: return only the used prefix */
int[] newMap = new int[bidiBase.resultLength];
System.arraycopy(indexMap, 0, newMap, 0, bidiBase.resultLength);
return newMap;
}
}

View File

@ -0,0 +1,123 @@
/*
* Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
package jdk_internal.bidi.icu.text;
/**
* A BidiRun represents a sequence of characters at the same embedding level.
* The Bidi algorithm decomposes a piece of text into sequences of characters at
* the same embedding level, each such sequence is called a "run".
*
* <p>
* A BidiRun represents such a run by storing its essential properties, but does
* not duplicate the characters which form the run.
*
* <p>
* The &quot;limit&quot; of the run is the position just after the last
* character, i.e., one more than that position.
*
* <p>
* This class has no public constructor, and its members cannot be modified by
* users.
*
* @see com.ibm.icu.text.Bidi
*/
class BidiRun {
    /** First logical position of the run. */
    int start;
    /** Last visual position of the run + 1. */
    int limit;
    /**
     * If &gt;0, flags for inserting LRM/RLM before/after the run; if &lt;0, count of
     * bidi controls within the run.
     */
    int insertRemove;
    /** Embedding level of the run. */
    byte level;

    /*
     * Creates a run with start, limit, level and insertRemove all zero.
     *
     * Note that members start and limit of a run instance have different meanings
     * depending whether the run is part of the runs array of a Bidi object, or if
     * it is a reference returned by getVisualRun() or getLogicalRun().
     *
     * For a member of the runs array of a Bidi object:
     * - start is the first logical position of the run in the source text.
     * - limit is one after the last visual position of the run.
     *
     * For a reference returned by getLogicalRun() or getVisualRun():
     * - start is the first logical position of the run in the source text.
     * - limit is one after the last logical position of the run.
     */
    BidiRun() {
        /* every field keeps its Java default value of zero */
    }

    /*
     * Creates a run from its start, its limit and its embedding level;
     * insertRemove starts out as zero.
     */
    BidiRun(int start, int limit, byte embeddingLevel) {
        this.level = embeddingLevel;
        this.limit = limit;
        this.start = start;
    }

    /*
     * Overwrites all four fields of this run with those of the given run.
     */
    void copyFrom(BidiRun run) {
        this.insertRemove = run.insertRemove;
        this.level = run.level;
        this.limit = run.limit;
        this.start = run.start;
    }

    /**
     * Get level of run
     *
     * @return the embedding level of this run
     */
    byte getEmbeddingLevel() {
        return this.level;
    }

    /**
     * Check if run level is even
     *
     * @return true if the embedding level of this run is even, i.e. it is a
     *         left-to-right run.
     */
    boolean isEvenRun() {
        /* the lowest bit of the level is set for odd levels only */
        return (this.level & 1) != 1;
    }
}

View File

@ -0,0 +1,425 @@
/*
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
package jdk_internal.bidi.icu.text;
import jdk_internal.bidi.icu.lang.UCharacter;
final class BidiWriter {
/** Bidi control code points */
/* U+200E, the code point written for an LRM */
static final char LRM_CHAR = 0x200e;
/* U+200F, the code point written for an RLM */
static final char RLM_CHAR = 0x200f;
/*
 * Bit set with one bit per direction value: the bits numbered
 * UCharacter.RIGHT_TO_LEFT and UCharacter.RIGHT_TO_LEFT_ARABIC are set.
 */
static final int MASK_R_AL = (1 << UCharacter.RIGHT_TO_LEFT | 1 << UCharacter.RIGHT_TO_LEFT_ARABIC);
/**
 * Tells whether a character type value is one of the three mark types
 * (non-spacing, combining spacing, enclosing).
 *
 * @param type a character type value, used as a bit number
 * @return true if {@code type} is NON_SPACING_MARK, COMBINING_SPACING_MARK or
 *         ENCLOSING_MARK
 */
private static boolean IsCombining(int type) {
    /* one bit per mark type */
    final int combiningMask = (1 << UCharacter.NON_SPACING_MARK)
            | (1 << UCharacter.COMBINING_SPACING_MARK)
            | (1 << UCharacter.ENCLOSING_MARK);
    final int typeBit = 1 << type;
    return (typeBit & combiningMask) != 0;
}
/*
* When we have OUTPUT_REVERSE set on writeReordered(), then we semantically
* write RTL runs in reverse and later reverse them again. Instead, we actually
* write them in forward order to begin with. However, if the RTL run was to be
* mirrored, we need to mirror here now since the implicit second reversal must
* not do it. It looks strange to do mirroring in LTR output, but it is only
* because we are writing RTL output in reverse.
*/
private static String doWriteForward(String src, int options) {
/* optimize for several combinations of options */
switch (options & (BidiBase.REMOVE_BIDI_CONTROLS | BidiBase.DO_MIRRORING)) {
case 0: {
/* simply return the LTR run */
return src;
}
case BidiBase.DO_MIRRORING: {
StringBuffer dest = new StringBuffer(src.length());
/* do mirroring */
int i = 0;
int c;
do {
c = UTF16.charAt(src, i);
i += UTF16.getCharCount(c);
UTF16.append(dest, UCharacter.getMirror(c));
} while (i < src.length());
return dest.toString();
}
case BidiBase.REMOVE_BIDI_CONTROLS: {
StringBuilder dest = new StringBuilder(src.length());
/* copy the LTR run and remove any Bidi control characters */
int i = 0;
char c;
do {
c = src.charAt(i++);
if (!BidiBase.IsBidiControlChar(c)) {
dest.append(c);
}
} while (i < src.length());
return dest.toString();
}
default: {
StringBuffer dest = new StringBuffer(src.length());
/* remove Bidi control characters and do mirroring */
int i = 0;
int c;
do {
c = UTF16.charAt(src, i);
i += UTF16.getCharCount(c);
if (!BidiBase.IsBidiControlChar(c)) {
UTF16.append(dest, UCharacter.getMirror(c));
}
} while (i < src.length());
return dest.toString();
}
} /* end of switch */
}
private static String doWriteForward(char[] text, int start, int limit, int options) {
return doWriteForward(new String(text, start, limit - start), options);
}
static String writeReverse(String src, int options) {
/*
* RTL run -
*
* RTL runs need to be copied to the destination in reverse order of code
* points, not code units, to keep Unicode characters intact.
*
* The general strategy for this is to read the source text in backward order,
* collect all code units for a code point (and optionally following combining
* characters, see below), and copy all these code units in ascending order to
* the destination for this run.
*
* Several options request whether combining characters should be kept after
* their base characters, whether Bidi control characters should be removed, and
* whether characters should be replaced by their mirror-image equivalent
* Unicode characters.
*/
StringBuffer dest = new StringBuffer(src.length());
/* optimize for several combinations of options */
switch (options & (BidiBase.REMOVE_BIDI_CONTROLS | BidiBase.DO_MIRRORING | BidiBase.KEEP_BASE_COMBINING)) {
case 0:
/*
* With none of the "complicated" options set, the destination run will have the
* same length as the source run, and there is no mirroring and no keeping
* combining characters with their base characters.
*
* XXX: or dest = UTF16.reverse(new StringBuffer(src));
*/
int srcLength = src.length();
/* preserve character integrity */
do {
/*
* i is always after the last code unit known to need to be kept in this segment
*/
int i = srcLength;
/* collect code units for one base character */
srcLength -= UTF16.getCharCount(UTF16.charAt(src, srcLength - 1));
/* copy this base character */
dest.append(src.substring(srcLength, i));
} while (srcLength > 0);
break;
case BidiBase.KEEP_BASE_COMBINING:
/*
* Here, too, the destination run will have the same length as the source run,
* and there is no mirroring. We do need to keep combining characters with their
* base characters.
*/
srcLength = src.length();
/* preserve character integrity */
do {
/*
* i is always after the last code unit known to need to be kept in this segment
*/
int c;
int i = srcLength;
/*
* collect code units and modifier letters for one base character
*/
do {
c = UTF16.charAt(src, srcLength - 1);
srcLength -= UTF16.getCharCount(c);
} while (srcLength > 0 && IsCombining(UCharacter.getType(c)));
/* copy this "user character" */
dest.append(src.substring(srcLength, i));
} while (srcLength > 0);
break;
default:
/*
* With several "complicated" options set, this is the most general and the
* slowest copying of an RTL run. We will do mirroring, remove Bidi controls,
* and keep combining characters with their base characters as requested.
*/
srcLength = src.length();
/* preserve character integrity */
do {
/*
* i is always after the last code unit known to need to be kept in this segment
*/
int i = srcLength;
/* collect code units for one base character */
int c = UTF16.charAt(src, srcLength - 1);
srcLength -= UTF16.getCharCount(c);
if ((options & BidiBase.KEEP_BASE_COMBINING) != 0) {
/* collect modifier letters for this base character */
while (srcLength > 0 && IsCombining(UCharacter.getType(c))) {
c = UTF16.charAt(src, srcLength - 1);
srcLength -= UTF16.getCharCount(c);
}
}
if ((options & BidiBase.REMOVE_BIDI_CONTROLS) != 0 && BidiBase.IsBidiControlChar(c)) {
/* do not copy this Bidi control character */
continue;
}
/* copy this "user character" */
int j = srcLength;
if ((options & BidiBase.DO_MIRRORING) != 0) {
/* mirror only the base character */
c = UCharacter.getMirror(c);
UTF16.append(dest, c);
j += UTF16.getCharCount(c);
}
dest.append(src.substring(j, i));
} while (srcLength > 0);
break;
} /* end of switch */
return dest.toString();
}
static String doWriteReverse(char[] text, int start, int limit, int options) {
return writeReverse(new String(text, start, limit - start), options);
}
static String writeReordered(BidiBase bidi, int options) {
int run, runCount;
StringBuilder dest;
char[] text = bidi.text;
runCount = bidi.countRuns();
/*
* Option "insert marks" implies BidiBase.INSERT_LRM_FOR_NUMERIC if the
* reordering mode (checked below) is appropriate.
*/
if ((bidi.reorderingOptions & BidiBase.OPTION_INSERT_MARKS) != 0) {
options |= BidiBase.INSERT_LRM_FOR_NUMERIC;
options &= ~BidiBase.REMOVE_BIDI_CONTROLS;
}
/*
* Option "remove controls" implies BidiBase.REMOVE_BIDI_CONTROLS and cancels
* BidiBase.INSERT_LRM_FOR_NUMERIC.
*/
if ((bidi.reorderingOptions & BidiBase.OPTION_REMOVE_CONTROLS) != 0) {
options |= BidiBase.REMOVE_BIDI_CONTROLS;
options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
}
/*
* If we do not perform the "inverse Bidi" algorithm, then we don't need to
* insert any LRMs, and don't need to test for it.
*/
if ((bidi.reorderingMode != BidiBase.REORDER_INVERSE_NUMBERS_AS_L)
&& (bidi.reorderingMode != BidiBase.REORDER_INVERSE_LIKE_DIRECT)
&& (bidi.reorderingMode != BidiBase.REORDER_INVERSE_FOR_NUMBERS_SPECIAL)
&& (bidi.reorderingMode != BidiBase.REORDER_RUNS_ONLY)) {
options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
}
dest = new StringBuilder((options & BidiBase.INSERT_LRM_FOR_NUMERIC) != 0 ? bidi.length * 2 : bidi.length);
/*
* Iterate through all visual runs and copy the run text segments to the
* destination, according to the options.
*
* The tests for where to insert LRMs ignore the fact that there may be BN codes
* or non-BMP code points at the beginning and end of a run; they may insert
* LRMs unnecessarily but the tests are faster this way (this would have to be
* improved for UTF-8).
*/
if ((options & BidiBase.OUTPUT_REVERSE) == 0) {
/* forward output */
if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) {
/* do not insert Bidi controls */
for (run = 0; run < runCount; ++run) {
BidiRun bidiRun = bidi.getVisualRun(run);
if (bidiRun.isEvenRun()) {
dest.append(
doWriteForward(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
} else {
dest.append(doWriteReverse(text, bidiRun.start, bidiRun.limit, options));
}
}
} else {
/* insert Bidi controls for "inverse Bidi" */
byte[] dirProps = bidi.dirProps;
char uc;
int markFlag;
for (run = 0; run < runCount; ++run) {
BidiRun bidiRun = bidi.getVisualRun(run);
markFlag = 0;
/* check if something relevant in insertPoints */
markFlag = bidi.runs[run].insertRemove;
if (markFlag < 0) { /* bidi controls count */
markFlag = 0;
}
if (bidiRun.isEvenRun()) {
if (bidi.isInverse() && dirProps[bidiRun.start] != BidiBase.L) {
markFlag |= BidiBase.LRM_BEFORE;
}
if ((markFlag & BidiBase.LRM_BEFORE) != 0) {
uc = LRM_CHAR;
} else if ((markFlag & BidiBase.RLM_BEFORE) != 0) {
uc = RLM_CHAR;
} else {
uc = 0;
}
if (uc != 0) {
dest.append(uc);
}
dest.append(
doWriteForward(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
if (bidi.isInverse() && dirProps[bidiRun.limit - 1] != BidiBase.L) {
markFlag |= BidiBase.LRM_AFTER;
}
if ((markFlag & BidiBase.LRM_AFTER) != 0) {
uc = LRM_CHAR;
} else if ((markFlag & BidiBase.RLM_AFTER) != 0) {
uc = RLM_CHAR;
} else {
uc = 0;
}
if (uc != 0) {
dest.append(uc);
}
} else { /* RTL run */
if (bidi.isInverse() && !bidi.testDirPropFlagAt(MASK_R_AL, bidiRun.limit - 1)) {
markFlag |= BidiBase.RLM_BEFORE;
}
if ((markFlag & BidiBase.LRM_BEFORE) != 0) {
uc = LRM_CHAR;
} else if ((markFlag & BidiBase.RLM_BEFORE) != 0) {
uc = RLM_CHAR;
} else {
uc = 0;
}
if (uc != 0) {
dest.append(uc);
}
dest.append(doWriteReverse(text, bidiRun.start, bidiRun.limit, options));
if (bidi.isInverse() && (MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
markFlag |= BidiBase.RLM_AFTER;
}
if ((markFlag & BidiBase.LRM_AFTER) != 0) {
uc = LRM_CHAR;
} else if ((markFlag & BidiBase.RLM_AFTER) != 0) {
uc = RLM_CHAR;
} else {
uc = 0;
}
if (uc != 0) {
dest.append(uc);
}
}
}
}
} else {
/* reverse output */
if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) {
/* do not insert Bidi controls */
for (run = runCount; --run >= 0;) {
BidiRun bidiRun = bidi.getVisualRun(run);
if (bidiRun.isEvenRun()) {
dest.append(
doWriteReverse(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
} else {
dest.append(doWriteForward(text, bidiRun.start, bidiRun.limit, options));
}
}
} else {
/* insert Bidi controls for "inverse Bidi" */
byte[] dirProps = bidi.dirProps;
for (run = runCount; --run >= 0;) {
/* reverse output */
BidiRun bidiRun = bidi.getVisualRun(run);
if (bidiRun.isEvenRun()) {
if (dirProps[bidiRun.limit - 1] != BidiBase.L) {
dest.append(LRM_CHAR);
}
dest.append(
doWriteReverse(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
if (dirProps[bidiRun.start] != BidiBase.L) {
dest.append(LRM_CHAR);
}
} else {
if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
dest.append(RLM_CHAR);
}
dest.append(doWriteForward(text, bidiRun.start, bidiRun.limit, options));
if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.limit - 1])) == 0) {
dest.append(RLM_CHAR);
}
}
}
}
}
return dest.toString();
}
}

View File

@ -0,0 +1,271 @@
/*
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package jdk_internal.bidi.icu.text;
import java.io.IOException;
/**
* Normalization filtered by a UnicodeSet. Normalizes portions of the text
* contained in the filter set and leaves portions not contained in the filter
* set unchanged. Filtering is done via UnicodeSet.span(...,
* UnicodeSet.SpanCondition.SIMPLE). Not-in-the-filter text is treated as "is
* normalized" and "quick check yes". This class implements all of (and only)
* the Normalizer2 API. An instance of this class is unmodifiable/immutable.
*
* @stable ICU 4.4
* @author Markus W. Scherer
*/
class FilteredNormalizer2 extends Normalizer2 {

    /** The wrapped normalizer; applied only to text inside the filter set. */
    private final Normalizer2 norm2;

    /** The filter set deciding which characters are handed to {@link #norm2}. */
    private final UnicodeSet set;

    /**
     * Constructs a filtered normalizer wrapping any Normalizer2 instance and a
     * filter set. Both are aliased and must not be modified or deleted while this
     * object is used. The filter set should be frozen; otherwise the performance
     * will suffer greatly.
     *
     * @param n2        wrapped Normalizer2 instance
     * @param filterSet UnicodeSet which determines the characters to be normalized
     * @stable ICU 4.4
     */
    public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
        norm2 = n2;
        set = filterSet;
    }

    /**
     * {@inheritDoc}
     *
     * @throws IllegalArgumentException if {@code dest} and {@code src} are the
     *                                  same object
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder normalize(CharSequence src, StringBuilder dest) {
        if (dest == src) {
            throw new IllegalArgumentException();
        }
        dest.setLength(0);
        normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
        return dest;
    }

    /**
     * {@inheritDoc}
     *
     * @throws IllegalArgumentException if {@code dest} and {@code src} are the
     *                                  same object
     * @stable ICU 4.6
     */
    @Override
    public Appendable normalize(CharSequence src, Appendable dest) {
        if (dest == src) {
            throw new IllegalArgumentException();
        }
        return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
    }

    /**
     * {@inheritDoc}
     *
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
        return normalizeSecondAndAppend(first, second, true);
    }

    /**
     * {@inheritDoc}
     *
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder append(StringBuilder first, CharSequence second) {
        return normalizeSecondAndAppend(first, second, false);
    }

    /**
     * {@inheritDoc}
     * Characters outside the filter set have no decomposition here.
     *
     * @stable ICU 4.6
     */
    @Override
    public String getDecomposition(int c) {
        return set.contains(c) ? norm2.getDecomposition(c) : null;
    }

    /**
     * {@inheritDoc}
     * Characters outside the filter set report combining class 0.
     *
     * @stable ICU 49
     */
    @Override
    public int getCombiningClass(int c) {
        return set.contains(c) ? norm2.getCombiningClass(c) : 0;
    }

    /**
     * {@inheritDoc}
     *
     * @stable ICU 4.4
     */
    @Override
    public boolean isNormalized(CharSequence s) {
        // Alternate between in-filter spans (checked by norm2) and
        // out-of-filter spans (treated as normalized).
        UnicodeSet.SpanCondition spanCondition = UnicodeSet.SpanCondition.SIMPLE;
        for (int prevSpanLimit = 0; prevSpanLimit < s.length();) {
            int spanLimit = set.span(s, prevSpanLimit, spanCondition);
            if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
                spanCondition = UnicodeSet.SpanCondition.SIMPLE;
            } else {
                if (!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
                    return false;
                }
                spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
            }
            prevSpanLimit = spanLimit;
        }
        return true;
    }

    /**
     * {@inheritDoc}
     *
     * @stable ICU 4.4
     */
    @Override
    public int spanQuickCheckYes(CharSequence s) {
        // Alternate between in-filter spans (checked by norm2) and
        // out-of-filter spans (treated as "quick check yes").
        UnicodeSet.SpanCondition spanCondition = UnicodeSet.SpanCondition.SIMPLE;
        for (int prevSpanLimit = 0; prevSpanLimit < s.length();) {
            int spanLimit = set.span(s, prevSpanLimit, spanCondition);
            if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
                spanCondition = UnicodeSet.SpanCondition.SIMPLE;
            } else {
                int yesLimit = prevSpanLimit + norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
                if (yesLimit < spanLimit) {
                    return yesLimit;
                }
                spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
            }
            prevSpanLimit = spanLimit;
        }
        return s.length();
    }

    /**
     * {@inheritDoc}
     * Characters outside the filter set always have a boundary before them.
     *
     * @stable ICU 4.4
     */
    @Override
    public boolean hasBoundaryBefore(int c) {
        return !set.contains(c) || norm2.hasBoundaryBefore(c);
    }

    /**
     * Internal worker: no argument checking, and appends to {@code dest} without
     * clearing it first.
     *
     * Pass as input spanCondition the one that is likely to yield a non-zero span
     * length at the start of src. For set=[:age=3.2:], since almost all common
     * characters were in Unicode 3.2, UnicodeSet.SpanCondition.SIMPLE should be
     * passed in for the start of src and UnicodeSet.SpanCondition.NOT_CONTAINED
     * should be passed in if we continue after an in-filter prefix.
     *
     * @param src           text to normalize
     * @param dest          receives out-of-filter spans verbatim and in-filter
     *                      spans normalized by the wrapped normalizer
     * @param spanCondition the span condition to start with
     * @return dest
     * @throws InternalError wrapping any IOException raised by {@code dest}
     */
    private Appendable normalize(CharSequence src, Appendable dest, UnicodeSet.SpanCondition spanCondition) {
        // Don't throw away destination buffer between iterations.
        StringBuilder tempDest = new StringBuilder();
        try {
            for (int prevSpanLimit = 0; prevSpanLimit < src.length();) {
                int spanLimit = set.span(src, prevSpanLimit, spanCondition);
                int spanLength = spanLimit - prevSpanLimit;
                if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
                    if (spanLength != 0) {
                        dest.append(src, prevSpanLimit, spanLimit);
                    }
                    spanCondition = UnicodeSet.SpanCondition.SIMPLE;
                } else {
                    if (spanLength != 0) {
                        // Not norm2.normalizeSecondAndAppend() because we do not want
                        // to modify the non-filter part of dest.
                        dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
                    }
                    spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
                }
                prevSpanLimit = spanLimit;
            }
        } catch (IOException e) {
            throw new InternalError(e.toString(), e);
        }
        return dest;
    }

    /**
     * Shared implementation of {@link #normalizeSecondAndAppend(StringBuilder, CharSequence)}
     * and {@link #append(StringBuilder, CharSequence)}.
     *
     * @param first       destination; modified in place
     * @param second      text to append
     * @param doNormalize true to normalize {@code second} while appending, false
     *                    to append it as is (still merging at the boundary)
     * @return first
     * @throws IllegalArgumentException if {@code first} and {@code second} are the
     *                                  same object
     */
    private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second, boolean doNormalize) {
        if (first == second) {
            throw new IllegalArgumentException();
        }
        if (first.length() == 0) {
            if (doNormalize) {
                return normalize(second, first);
            } else {
                return first.append(second);
            }
        }
        // merge the in-filter suffix of the first string with the in-filter prefix of
        // the second
        int prefixLimit = set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
        if (prefixLimit != 0) {
            CharSequence prefix = second.subSequence(0, prefixLimit);
            // Integer.MAX_VALUE: start spanning back from the very end of first.
            int suffixStart = set.spanBack(first, Integer.MAX_VALUE, UnicodeSet.SpanCondition.SIMPLE);
            if (suffixStart == 0) {
                // all of first is in the filter: let norm2 work on it directly
                if (doNormalize) {
                    norm2.normalizeSecondAndAppend(first, prefix);
                } else {
                    norm2.append(first, prefix);
                }
            } else {
                // only hand the in-filter tail of first to norm2, then splice it back
                StringBuilder middle = new StringBuilder(first.subSequence(suffixStart, first.length()));
                if (doNormalize) {
                    norm2.normalizeSecondAndAppend(middle, prefix);
                } else {
                    norm2.append(middle, prefix);
                }
                first.delete(suffixStart, Integer.MAX_VALUE).append(middle);
            }
        }
        if (prefixLimit < second.length()) {
            CharSequence rest = second.subSequence(prefixLimit, second.length());
            if (doNormalize) {
                // rest starts right after an in-filter prefix, so begin with NOT_CONTAINED
                normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
            } else {
                first.append(rest);
            }
        }
        return first;
    }
}

View File

@ -0,0 +1,288 @@
/*
* Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package jdk_internal.bidi.icu.text;
import jdk_internal.bidi.icu.impl.Norm2AllModes;
/**
* Unicode normalization functionality for standard Unicode normalization or for
* using custom mapping tables. All instances of this class are
* unmodifiable/immutable. The Normalizer2 class is not intended for public
* subclassing.
* <p>
* The primary functions are to produce a normalized string and to detect
* whether a string is already normalized. The most commonly used normalization
* forms are those defined in
* <a href="http://www.unicode.org/reports/tr15/">Unicode Standard Annex #15:
* Unicode Normalization Forms</a>. However, this API supports additional
* normalization forms for specialized purposes. For example, NFKC_Casefold is
* provided via getInstance("nfkc_cf", COMPOSE) and can be used in
* implementations of UTS #46.
* <p>
* Not only are the standard compose and decompose modes supplied, but
* additional modes are provided as documented in the Mode enum.
* <p>
* Some of the functions in this class identify normalization boundaries. At a
* normalization boundary, the portions of the string before it and starting
* from it do not interact and can be handled independently.
* <p>
* The spanQuickCheckYes() stops at a normalization boundary. When the goal is a
* normalized string, then the text before the boundary can be copied, and the
* remainder can be processed with normalizeSecondAndAppend().
* <p>
* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test
* whether a character is guaranteed to be at a normalization boundary,
* regardless of context. This is used for moving from one normalization
* boundary to the next or preceding boundary, and for performing iterative
* normalization.
* <p>
* Iterative normalization is useful when only a small portion of a longer
* string needs to be processed. For example, in ICU, iterative normalization is
* used by the NormalizationTransliterator (to avoid replacing
* already-normalized text) and ucol_nextSortKeyPart() (to process only the
* substring for which sort key bytes are computed).
* <p>
* The set of normalization boundaries returned by these functions may not be
* complete: There may be more boundaries that could be returned. Different
* functions may return different boundaries.
*
* @stable ICU 4.4
* @author Markus W. Scherer
*/
public abstract class Normalizer2 {

    /**
     * Returns the Normalizer2 instance for Unicode NFC normalization, i.e. the
     * composing mode of the NFC data.
     *
     * @return the requested Normalizer2, if successful
     * @stable ICU 49
     */
    public static Normalizer2 getNFCInstance() {
        return Norm2AllModes.getNFCInstance().comp;
    }

    /**
     * Returns the Normalizer2 instance for Unicode NFD normalization, i.e. the
     * decomposing mode of the NFC data.
     *
     * @return the requested Normalizer2, if successful
     * @stable ICU 49
     */
    public static Normalizer2 getNFDInstance() {
        return Norm2AllModes.getNFCInstance().decomp;
    }

    /**
     * Returns the Normalizer2 instance for Unicode NFKC normalization, i.e. the
     * composing mode of the NFKC data.
     *
     * @return the requested Normalizer2, if successful
     * @stable ICU 49
     */
    public static Normalizer2 getNFKCInstance() {
        return Norm2AllModes.getNFKCInstance().comp;
    }

    /**
     * Returns the Normalizer2 instance for Unicode NFKD normalization, i.e. the
     * decomposing mode of the NFKC data.
     *
     * @return the requested Normalizer2, if successful
     * @stable ICU 49
     */
    public static Normalizer2 getNFKDInstance() {
        return Norm2AllModes.getNFKCInstance().decomp;
    }

    /**
     * Returns the normalized form of the source string.
     *
     * @param src source string
     * @return normalized src; {@code src} itself when it is a {@code String} that
     *         passes the quick check over its whole length
     * @stable ICU 4.4
     */
    public String normalize(CharSequence src) {
        final int length = src.length();
        if (src instanceof String) {
            // Fastpath for String input: avoid building a new String when the
            // text is already normalized, and otherwise copy the normalized
            // prefix unchanged and only process the remainder.
            final int yesLimit = spanQuickCheckYes(src);
            if (yesLimit == length) {
                return (String) src;
            }
            if (yesLimit != 0) {
                StringBuilder prefix = new StringBuilder(length);
                prefix.append(src, 0, yesLimit);
                CharSequence remainder = src.subSequence(yesLimit, length);
                return normalizeSecondAndAppend(prefix, remainder).toString();
            }
        }
        StringBuilder result = normalize(src, new StringBuilder(length));
        return result.toString();
    }

    /**
     * Writes the normalized form of the source string to the destination string
     * (replacing its contents) and returns the destination string. The source and
     * destination strings must be different objects.
     *
     * @param src  source string
     * @param dest destination string; its contents are replaced with normalized
     *             src
     * @return dest
     * @stable ICU 4.4
     */
    public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);

    /**
     * Writes the normalized form of the source string to the destination
     * Appendable and returns the destination Appendable. The source and
     * destination strings must be different objects.
     *
     * <p>
     * This method does not declare {@link java.io.IOException}; implementations
     * are expected to rethrow one raised by {@code dest} as an unchecked
     * throwable (for example, {@code FilteredNormalizer2} wraps it in an
     * {@link InternalError}).
     *
     * @param src  source string
     * @param dest destination Appendable; gets normalized src appended
     * @return dest
     * @stable ICU 4.6
     */
    public abstract Appendable normalize(CharSequence src, Appendable dest);

    /**
     * Appends the normalized form of the second string to the first string
     * (merging them at the boundary) and returns the first string. The result is
     * normalized if the first string was normalized. The first and second strings
     * must be different objects.
     *
     * @param first  string, should be normalized
     * @param second string, will be normalized
     * @return first
     * @stable ICU 4.4
     */
    public abstract StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second);

    /**
     * Appends the second string to the first string (merging them at the
     * boundary) and returns the first string. The result is normalized if both
     * strings were normalized. The first and second strings must be different
     * objects.
     *
     * @param first  string, should be normalized
     * @param second string, should be normalized
     * @return first
     * @stable ICU 4.4
     */
    public abstract StringBuilder append(StringBuilder first, CharSequence second);

    /**
     * Gets the decomposition mapping of c. Roughly equivalent to normalizing the
     * String form of c on a DECOMPOSE Normalizer2 instance, but much faster, and
     * except that this function returns null if c does not have a decomposition
     * mapping in this instance's data. This function is independent of the mode
     * of the Normalizer2.
     *
     * @param c code point
     * @return c's decomposition mapping, if any; otherwise null
     * @stable ICU 4.6
     */
    public abstract String getDecomposition(int c);

    /**
     * Gets the combining class of c. This base implementation always returns 0;
     * the standard implementations are documented to return the Unicode
     * Canonical_Combining_Class value instead.
     *
     * @param c code point
     * @return c's combining class
     * @stable ICU 49
     */
    public int getCombiningClass(int c) {
        return 0;
    }

    /**
     * Tests if the string is normalized. Internally, in cases where the
     * quickCheck() method would return "maybe" (which is only possible for the
     * two COMPOSE modes) this method resolves to "yes" or "no" to provide a
     * definitive result, at the cost of doing more work in those cases.
     *
     * @param s input string
     * @return true if s is normalized
     * @stable ICU 4.4
     */
    public abstract boolean isNormalized(CharSequence s);

    /**
     * Returns the end of the normalized substring of the input string. In other
     * words, with <code>end=spanQuickCheckYes(s);</code> the substring
     * <code>s.subSequence(0, end)</code> will pass the quick check with a "yes"
     * result.
     * <p>
     * The returned end index is usually one or more characters before the "no" or
     * "maybe" character: The end index is at a normalization boundary. (See the
     * class documentation for more about normalization boundaries.)
     * <p>
     * When the goal is a normalized string and most input strings are expected to
     * be normalized already, then call this method, and if it returns a prefix
     * shorter than the input string, copy that prefix and use
     * normalizeSecondAndAppend() for the remainder.
     *
     * @param s input string
     * @return "yes" span end index
     * @stable ICU 4.4
     */
    public abstract int spanQuickCheckYes(CharSequence s);

    /**
     * Tests if the character always has a normalization boundary before it,
     * regardless of context. If true, then the character does not
     * normalization-interact with preceding characters. In other words, a string
     * containing this character can be normalized by processing portions before
     * this character and starting from this character independently. This is used
     * for iterative normalization. See the class documentation for details.
     *
     * @param c character to test
     * @return true if c has a normalization boundary before it
     * @stable ICU 4.4
     */
    public abstract boolean hasBoundaryBefore(int c);

    /**
     * Sole constructor. (For invocation by subclass constructors, typically
     * implicit.)
     *
     * @internal deprecated This API is ICU internal only.
     */
    protected Normalizer2() {
    }
}

View File

@ -0,0 +1,791 @@
/*
* Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2000-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package jdk_internal.bidi.icu.text;
import jdk_internal.bidi.CharacterIterator;
import jdk_internal.bidi.Normalizer;
import jdk_internal.bidi.icu.impl.Norm2AllModes;
/**
* Unicode Normalization
*
* <h2>Unicode normalization API</h2>
*
* <code>normalize</code> transforms Unicode text into an equivalent composed or
* decomposed form, allowing for easier sorting and searching of text.
* <code>normalize</code> supports the standard normalization forms described in
* <a href="http://www.unicode.org/reports/tr15/" target="unicode"> Unicode
* Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
*
* Characters with accents or other adornments can be encoded in several
* different ways in Unicode. For example, take the character A-acute. In
* Unicode, this can be encoded as a single character (the "composed" form):
*
* <pre>
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
* </pre>
*
* or as two separate characters (the "decomposed" form):
*
* <pre>
* 0041 LATIN CAPITAL LETTER A
* 0301 COMBINING ACUTE ACCENT
* </pre>
*
* To a user of your program, however, both of these sequences should be treated
* as the same "user-level" character "A with acute accent". When you are
* searching or comparing text, you must ensure that these two sequences are
* treated equivalently. In addition, you must handle characters with more than
* one accent. Sometimes the order of a character's combining accents is
* significant, while in other cases accent sequences in different orders are
* really equivalent.
*
* Similarly, the string "ffi" can be encoded as three separate letters:
*
* <pre>
* 0066 LATIN SMALL LETTER F
* 0066 LATIN SMALL LETTER F
* 0069 LATIN SMALL LETTER I
* </pre>
*
* or as the single character
*
* <pre>
* FB03 LATIN SMALL LIGATURE FFI
* </pre>
*
* The ffi ligature is not a distinct semantic character, and strictly speaking
* it shouldn't be in Unicode at all, but it was included for compatibility with
* existing character sets that already provided it. The Unicode standard
* identifies such characters by giving them "compatibility" decompositions into
* the corresponding semantic characters. When sorting and searching, you will
* often want to use these mappings.
*
* <code>normalize</code> helps solve these problems by transforming text into
* the canonical composed and decomposed forms as shown in the first example
* above. In addition, you can have it perform compatibility decompositions so
* that you can treat compatibility characters the same as their equivalents.
* Finally, <code>normalize</code> rearranges accents into the proper canonical
* order, so that you do not have to worry about accent rearrangement on your
* own.
*
* Form FCD, "Fast C or D", is also designed for collation. It makes it possible
* to work on strings that are not necessarily normalized, using an algorithm
* (as in collation) that works under "canonical closure", i.e., one that treats
* precomposed characters and their decomposed equivalents the same.
*
* It is not a normalization form because it does not provide for uniqueness of
* representation. Multiple strings may be canonically equivalent (their NFDs
* are identical) and may all conform to FCD without being identical themselves.
*
* The form is defined such that the "raw decomposition", the recursive
* canonical decomposition of each character, results in a string that is
* canonically ordered. This means that precomposed characters are allowed for
* as long as their decompositions do not need canonical reordering.
*
* Its advantage for a process like collation is that all NFD and most NFC texts
* - and many unnormalized texts - already conform to FCD and do not need to be
* normalized (NFD) for such a process. The FCD quick check will return YES for
* most strings in practice.
*
* normalize(FCD) may be implemented with NFD.
*
* For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence
* in Applications): http://www.unicode.org/notes/tn5/#FCD
*
* ICU collation performs either NFD or FCD normalization automatically if
* normalization is turned on for the collator object. Beyond collation and
* string search, normalized strings may be useful for string equivalence
* comparisons, transliteration/transcription, unique representations, etc.
*
* The W3C generally recommends exchanging text in NFC. Note also that most
* legacy character encodings use only precomposed forms and often do not encode
* any combining marks by themselves. For conversion to such character encodings
* the Unicode text needs to be normalized to NFC. For more usage examples, see
* the Unicode Standard Annex.
*
* Note: The Normalizer class also provides API for iterative normalization.
* While the setIndex() and getIndex() refer to indices in the underlying
* Unicode input text, the next() and previous() methods iterate through
* characters in the normalized output. This means that there is not necessarily
* a one-to-one correspondence between characters returned by next() and
* previous() and the indices passed to and returned from setIndex() and
* getIndex(). It is for this reason that Normalizer does not implement the
* CharacterIterator interface.
*
* @stable ICU 2.8
*/
// Original filename in ICU4J: Normalizer.java
public final class NormalizerBase implements Cloneable {
// The input text and our position in it
private UCharacterIterator text;
// Normalizer obtained from mode.getNormalizer2(options); refreshed by setMode()
private Normalizer2 norm2;
// Currently selected normalization mode
private Mode mode;
// Option bits handed to Mode.getNormalizer2 (e.g. UNICODE_3_2)
private int options;
// The normalization buffer is the result of normalization
// of the source in [currentIndex..nextIndex] .
private int currentIndex;
private int nextIndex;
// A buffer for holding intermediate results: the normalized form of the
// current source segment
private StringBuilder buffer;
// Read position inside buffer, counted in chars (UTF-16 code units)
private int bufferPos;
// Helper classes to defer loading of normalization data.
// Immutable wrapper around one Normalizer2 instance; the per-mode holder
// classes each keep a single ModeImpl.
private static final class ModeImpl {
    private final Normalizer2 normalizer2;

    private ModeImpl(Normalizer2 impl) {
        this.normalizer2 = impl;
    }
}
// Holder for the NFD normalizer. INSTANCE is a static field of this nested
// class, so Normalizer2.getNFDInstance() runs only when the holder class is
// initialized.
private static final class NFDModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
}
// Holder for the NFKD normalizer (same deferral scheme).
private static final class NFKDModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
}
// Holder for the NFC normalizer (same deferral scheme).
private static final class NFCModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
}
// Holder for the NFKC normalizer (same deferral scheme).
private static final class NFKCModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
}
// Holder for the frozen set built from the pattern "[:age=3.2:]"; it is the
// filter shared by the four Unicode 3.2 normalizers below.
private static final class Unicode32 {
private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
}
// Holder for NFD wrapped in a FilteredNormalizer2 using the Unicode 3.2 set.
private static final class NFD32ModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(
new FilteredNormalizer2(Normalizer2.getNFDInstance(), Unicode32.INSTANCE));
}
// Holder for NFKD wrapped in a FilteredNormalizer2 using the Unicode 3.2 set.
private static final class NFKD32ModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(
new FilteredNormalizer2(Normalizer2.getNFKDInstance(), Unicode32.INSTANCE));
}
// Holder for NFC wrapped in a FilteredNormalizer2 using the Unicode 3.2 set.
private static final class NFC32ModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(
new FilteredNormalizer2(Normalizer2.getNFCInstance(), Unicode32.INSTANCE));
}
// Holder for NFKC wrapped in a FilteredNormalizer2 using the Unicode 3.2 set.
private static final class NFKC32ModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(
new FilteredNormalizer2(Normalizer2.getNFKCInstance(), Unicode32.INSTANCE));
}
/**
* Options bit set value to select Unicode 3.2 normalization (except
* NormalizationCorrections). At most one Unicode version can be selected at a
* time.
*
* @stable ICU 2.6
*/
public static final int UNICODE_3_2 = 0x20;
/**
* Alias with the same value as {@link #UNICODE_3_2}.
*/
public static final int UNICODE_3_2_0_ORIGINAL = UNICODE_3_2;
/**
* Default option for the latest Unicode normalization. This option is provided
* mainly for testing. The value zero means that normalization is done with the
* fixes for:
* <ul>
* <li>Corrigendum 4 (Five CJK Canonical Mapping Errors)</li>
* <li>Corrigendum 5 (Normalization Idempotency)</li>
* </ul>
*/
public static final int UNICODE_LATEST = 0x00;
/**
* Constant indicating that the end of the iteration has been reached. This is
* guaranteed to have the same value as {@link UCharacterIterator#DONE}.
*
* @stable ICU 2.8
*/
public static final int DONE = UCharacterIterator.DONE;
/**
* Base type of the normalization mode constants ({@link NormalizerBase#NONE},
* {@link NormalizerBase#NFD}, {@link NormalizerBase#NFKD},
* {@link NormalizerBase#NFC} and {@link NormalizerBase#NFKC}).
* <p>
* The Mode class is not intended for public subclassing. Only the Mode
* constants provided by the enclosing class should be used, and any fields or
* methods should not be called or overridden by users.
*
* @stable ICU 2.8
*/
public abstract static class Mode {
/**
* Sole constructor
*
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected Mode() {
}
/**
* Returns the {@link Normalizer2} that implements this mode for the given
* option bits.
*
* @param options option bits, e.g. {@link NormalizerBase#UNICODE_3_2}
* @return the normalizer to use for this mode
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected abstract Normalizer2 getNormalizer2(int options);
}
/**
 * Maps a {@link Normalizer.Form} onto the matching {@link Mode} constant.
 *
 * @param form the normalization form to translate
 * @return the {@code Mode} of the same name
 * @throws IllegalArgumentException if {@code form} is none of NFC, NFD, NFKC,
 *                                  NFKD
 */
private static Mode toMode(Normalizer.Form form) {
    final Mode selected;
    switch (form) {
    case NFD:
        selected = NFD;
        break;
    case NFC:
        selected = NFC;
        break;
    case NFKD:
        selected = NFKD;
        break;
    case NFKC:
        selected = NFKC;
        break;
    default:
        throw new IllegalArgumentException("Unexpected normalization form: " + form);
    }
    return selected;
}
private static final class NONEMode extends Mode {
protected Normalizer2 getNormalizer2(int options) {
return Norm2AllModes.NOOP_NORMALIZER2;
}
}
private static final class NFDMode extends Mode {
protected Normalizer2 getNormalizer2(int options) {
return (options & UNICODE_3_2) != 0 ? NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
}
}
private static final class NFKDMode extends Mode {
protected Normalizer2 getNormalizer2(int options) {
return (options & UNICODE_3_2) != 0 ? NFKD32ModeImpl.INSTANCE.normalizer2
: NFKDModeImpl.INSTANCE.normalizer2;
}
}
private static final class NFCMode extends Mode {
protected Normalizer2 getNormalizer2(int options) {
return (options & UNICODE_3_2) != 0 ? NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
}
}
private static final class NFKCMode extends Mode {
protected Normalizer2 getNormalizer2(int options) {
return (options & UNICODE_3_2) != 0 ? NFKC32ModeImpl.INSTANCE.normalizer2
: NFKCModeImpl.INSTANCE.normalizer2;
}
}
/**
* No decomposition/composition.
*
* @stable ICU 2.8
*/
public static final Mode NONE = new NONEMode();
/**
* Canonical decomposition.
*
* @stable ICU 2.8
*/
public static final Mode NFD = new NFDMode();
/**
* Compatibility decomposition.
*
* @stable ICU 2.8
*/
public static final Mode NFKD = new NFKDMode();
/**
* Canonical decomposition followed by canonical composition.
*
* @stable ICU 2.8
*/
public static final Mode NFC = new NFCMode();
/**
* Compatibility decomposition followed by canonical composition.
*/
public static final Mode NFKC = new NFKCMode();
// -------------------------------------------------------------------------
// Iterator constructors
// -------------------------------------------------------------------------
/**
 * Builds a {@code NormalizerBase} that iterates over the normalized form of a
 * string, starting at the beginning of that string.
 *
 * @param str  the string to be normalized
 * @param mode the normalization mode
 * @param opt  option bits. {@link #UNICODE_3_2} is currently the only option;
 *             pass 0 for the default behavior of the standard Unicode
 *             Normalization Forms.
 * @stable ICU 2.6
 */
public NormalizerBase(String str, Mode mode, int opt) {
    text = UCharacterIterator.getInstance(str);
    buffer = new StringBuilder();
    this.mode = mode;
    options = opt;
    norm2 = mode.getNormalizer2(options);
}
/**
 * Builds a {@code NormalizerBase} over a string with no options set; same as
 * calling the three-argument constructor with {@code opt == 0}.
 *
 * @param str  the string to be normalized
 * @param mode the normalization mode
 */
public NormalizerBase(String str, Mode mode) {
    this(str, mode, 0);
}
/**
* Creates a new {@code NormalizerBase} object for iterating over the normalized
* form of the given text.
* <p>
*
* @param iter The input text to be normalized. The normalization will start at
* the beginning of the string.
*
* @param mode The normalization mode.
*
* @param opt Any optional features to be enabled. Currently the only available
* option is {@link #UNICODE_3_2}. If you want the default behavior
* corresponding to one of the standard Unicode Normalization Forms,
* use 0 for this argument.
* @stable ICU 2.6
*/
public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
this.text = UCharacterIterator.getInstance((CharacterIterator) iter.clone());
this.mode = mode;
this.options = opt;
norm2 = mode.getNormalizer2(opt);
buffer = new StringBuilder();
}
public NormalizerBase(CharacterIterator iter, Mode mode) {
this(iter, mode, 0);
}
/**
 * Produces an independent copy of this {@code NormalizerBase}. The text
 * iterator is cloned and the normalization buffer is copied; whether the text
 * storage behind the iterator is duplicated depends on that iterator's own
 * {@code clone} method.
 *
 * @return the copy
 * @stable ICU 2.8
 */
@Override
public Object clone() {
    try {
        // super.clone() already carries over mode, options, norm2 and the
        // index/position fields; only the mutable members need fresh copies.
        final NormalizerBase duplicate = (NormalizerBase) super.clone();
        duplicate.text = (UCharacterIterator) text.clone();
        duplicate.buffer = new StringBuilder(buffer);
        return duplicate;
    } catch (CloneNotSupportedException e) {
        throw new InternalError(e.toString(), e);
    }
}
/**
 * Normalizes a string with the given mode and option bits.
 *
 * @param str     the input string to be normalized
 * @param mode    the normalization mode
 * @param options option bits. {@link #UNICODE_3_2} is currently the only
 *                option; pass 0 for the default behavior of the standard
 *                Unicode Normalization Forms.
 * @return the normalized string
 * @stable ICU 2.6
 */
public static String normalize(String str, Mode mode, int options) {
    final Normalizer2 impl = mode.getNormalizer2(options);
    return impl.normalize(str);
}
/**
 * Normalizes a string to the given form using {@link #UNICODE_LATEST}.
 *
 * @param str  the input string to be normalized
 * @param form the target normalization form
 * @return the normalized string
 */
public static String normalize(String str, Normalizer.Form form) {
    return normalize(str, toMode(form), UNICODE_LATEST);
}
/**
 * Normalizes a string to the given form with explicit option bits.
 *
 * @param str     the input string to be normalized
 * @param form    the target normalization form
 * @param options option bits, see {@link #UNICODE_3_2}
 * @return the normalized string
 */
public static String normalize(String str, Normalizer.Form form, int options) {
    return normalize(str, toMode(form), options);
}
/**
 * Tests whether a string is already in the given normalization form. This is
 * semantically equivalent to {@code str.equals(normalize(str, mode, options))}
 * and always yields a definitive answer, never a "maybe".
 *
 * @param str     the input string to check
 * @param mode    the normalization mode
 * @param options option bits; the only one currently recognized is
 *                {@link #UNICODE_3_2}
 * @return true if {@code str} is normalized according to {@code mode}
 * @stable ICU 2.6
 */
public static boolean isNormalized(String str, Mode mode, int options) {
    final Normalizer2 impl = mode.getNormalizer2(options);
    return impl.isNormalized(str);
}
/**
 * Tests whether a string is in the given form using {@link #UNICODE_LATEST}.
 *
 * @param str  the input string to check
 * @param form the normalization form to test against
 * @return true if {@code str} is normalized according to {@code form}
 */
public static boolean isNormalized(String str, Normalizer.Form form) {
    return isNormalized(str, toMode(form), UNICODE_LATEST);
}
/**
 * Tests whether a string is in the given form with explicit option bits.
 *
 * @param str     the input string to check
 * @param form    the normalization form to test against
 * @param options option bits, see {@link #UNICODE_3_2}
 * @return true if {@code str} is normalized according to {@code form}
 */
public static boolean isNormalized(String str, Normalizer.Form form, int options) {
    return isNormalized(str, toMode(form), options);
}
// -------------------------------------------------------------------------
// Iteration API
// -------------------------------------------------------------------------
/**
 * Returns the code point at the current position of the normalized text
 * without moving the position. If the buffer has been used up, the next
 * source segment is normalized first.
 *
 * @return the code point as an int, or {@link #DONE} if there is none
 * @stable ICU 2.8
 */
public int current() {
    if (bufferPos >= buffer.length() && !nextNormalize()) {
        return DONE;
    }
    return buffer.codePointAt(bufferPos);
}
/**
 * Returns the next code point of the normalized text and moves the position
 * past it. If the end of the text has already been reached, {@link #DONE} is
 * returned.
 *
 * @return the code point as an int, or {@link #DONE}
 * @stable ICU 2.8
 */
public int next() {
    if (bufferPos >= buffer.length() && !nextNormalize()) {
        return DONE;
    }
    final int codePoint = buffer.codePointAt(bufferPos);
    // Step over one or two chars, depending on the code point.
    bufferPos += Character.charCount(codePoint);
    return codePoint;
}
/**
 * Returns the code point before the current position of the normalized text
 * and moves the position back over it. If the beginning of the text has
 * already been reached, {@link #DONE} is returned.
 *
 * @return the code point as an int, or {@link #DONE}
 * @stable ICU 2.8
 */
public int previous() {
    if (bufferPos <= 0 && !previousNormalize()) {
        return DONE;
    }
    final int codePoint = buffer.codePointBefore(bufferPos);
    // Step back over one or two chars, depending on the code point.
    bufferPos -= Character.charCount(codePoint);
    return codePoint;
}
/**
 * Moves the iteration back to the beginning of the text and discards any
 * buffered normalized output. The effect is the same as
 * {@code setIndexOnly(0)}.
 *
 * @stable ICU 2.8
 */
public void reset() {
    text.setIndex(0);
    currentIndex = 0;
    nextIndex = 0;
    clearBuffer();
}
/**
 * Moves the iteration position in the input text without normalizing
 * anything yet. Afterwards {@link #getIndex} returns the index given here.
 *
 * @param index the desired index in the input text
 * @stable ICU 2.8
 */
public void setIndexOnly(int index) {
    // The underlying iterator validates the index, so call it first.
    text.setIndex(index);
    currentIndex = index;
    nextIndex = index;
    clearBuffer();
}
/**
 * Moves the iteration position in the input text and returns the first
 * normalized character found there.
 * <p>
 * <b>Note:</b> the index refers to the <em>input</em> text, whereas
 * {@link #next} and {@link #previous} walk the normalized <em>output</em>.
 * Characters returned by those methods therefore do not necessarily
 * correspond one-to-one to the indices used by {@code setIndex} and
 * {@link #getIndex}.
 *
 * @param index the desired index in the input text
 * @return the first normalized character obtained by iterating forward from
 *         {@code index}
 * @throws IllegalArgumentException if the index is rejected; the check is
 *                                  delegated to the underlying text iterator
 *                                  and is expected to fail for indices outside
 *                                  {@link #getBeginIndex} ..
 *                                  {@link #getEndIndex}. deprecated ICU 3.2
 * @obsolete ICU 3.2
 */
public int setIndex(int index) {
    setIndexOnly(index);
    final int firstNormalized = current();
    return firstNormalized;
}
/**
* Retrieve the index of the start of the input text. This is the begin index of
* the {@code CharacterIterator} or the start (i.e. 0) of the {@code String}
* over which this {@code NormalizerBase} is iterating. This implementation
* always reports 0.
*
* @deprecated ICU 2.2. ICU names startIndex() as the replacement, but this
*             class declares no such method.
* @return the begin index of the input text, always 0
*/
@Deprecated
public int getBeginIndex() {
return 0;
}
/**
* Retrieve the index of the end of the input text. This is the end index of the
* {@code CharacterIterator} or the length of the {@code String} over which this
* {@code NormalizerBase} is iterating. Simply delegates to {@link #endIndex()}.
*
* @deprecated ICU 2.2. Use endIndex() instead.
* @return the end index of the input text
* @see #endIndex
*/
@Deprecated
public int getEndIndex() {
return endIndex();
}
/**
 * Reports the current iteration position in the input text. This is useful
 * in applications such as searching, where the position in the input text
 * that corresponds to a given normalized output character is needed.
 * <p>
 * <b>Note:</b> the returned index refers to the <em>input</em> text, whereas
 * {@link #next} and {@link #previous} walk the normalized <em>output</em>.
 * Characters returned by those methods therefore do not necessarily
 * correspond one-to-one to the indices used by {@code setIndex} and
 * {@code getIndex}.
 *
 * @return the start of the current source segment while buffered output is
 *         still unread, otherwise the start of the next segment
 * @stable ICU 2.8
 */
public int getIndex() {
    return bufferPos < buffer.length() ? currentIndex : nextIndex;
}
/**
* Retrieve the index of the end of the input text. This is the end index of the
* {@code CharacterIterator} or the length of the {@code String} over which this
* {@code NormalizerBase} is iterating
*
* @return the length reported by the underlying text iterator
* @stable ICU 2.8
*/
public int endIndex() {
return text.getLength();
}
// -------------------------------------------------------------------------
// Iterator attributes
// -------------------------------------------------------------------------
/**
 * Changes the normalization mode of this object and re-selects the
 * normalizer for the current option bits.
 * <p>
 * <b>Note:</b> if the mode is changed in the middle of an iteration,
 * {@link #next} and {@link #previous} may keep returning characters that were
 * buffered under the old mode until the iteration re-syncs at the next base
 * character. It is safest to call {@link #setText setText()} or
 * {@link #reset} after calling {@code setMode}.
 *
 * @param newMode the new mode for this {@code NormalizerBase}. The supported
 *                modes are:
 *                <ul>
 *                <li>{@link #NFC} - Unicode canonical decomposition followed
 *                by canonical composition.
 *                <li>{@link #NFKC} - Unicode compatibility decomposition
 *                followed by canonical composition.
 *                <li>{@link #NFD} - Unicode canonical decomposition.
 *                <li>{@link #NFKD} - Unicode compatibility decomposition.
 *                <li>{@link #NONE} - Do nothing but return characters from the
 *                underlying input text.
 *                </ul>
 *
 * @see #getMode
 * @stable ICU 2.8
 */
public void setMode(Mode newMode) {
    this.mode = newMode;
    this.norm2 = newMode.getNormalizer2(this.options);
}
/**
* Return the normalization mode currently used by this {@code NormalizerBase}.
*
* @return the mode given to the constructor or to the latest {@link #setMode}
* call
* @see #setMode
* @stable ICU 2.8
*/
public Mode getMode() {
return mode;
}
/**
 * Replaces the input text with a new string and moves the iteration position
 * to the beginning of it.
 *
 * @param newText the new string to be normalized
 * @throws IllegalStateException if no iterator could be created for the text
 * @stable ICU 2.8
 */
public void setText(String newText) {
    final UCharacterIterator replacement = UCharacterIterator.getInstance(newText);
    if (replacement == null) {
        throw new IllegalStateException("Could not create a new UCharacterIterator");
    }
    this.text = replacement;
    reset();
}
/**
 * Replaces the input text with the text of a {@code CharacterIterator} and
 * moves the iteration position to the beginning of it. The iterator is
 * handed to {@code UCharacterIterator.getInstance} as is; this method does
 * not clone it first.
 *
 * @param newText the new text to be normalized
 * @throws IllegalStateException if no iterator could be created for the text
 * @stable ICU 2.8
 */
public void setText(CharacterIterator newText) {
    final UCharacterIterator replacement = UCharacterIterator.getInstance(newText);
    if (replacement == null) {
        throw new IllegalStateException("Could not create a new UCharacterIterator");
    }
    this.text = replacement;
    currentIndex = 0;
    nextIndex = 0;
    clearBuffer();
}
// Empties the normalization buffer and moves its read position back to the
// start.
private void clearBuffer() {
buffer.setLength(0);
bufferPos = 0;
}
/**
 * Normalizes the source segment that starts at {@code nextIndex} into the
 * buffer. The segment runs up to, but not including, the next code point
 * that has a normalization boundary before it. On success
 * {@code currentIndex} and {@code nextIndex} delimit the segment.
 *
 * @return true if the buffer now holds normalized output, false if the end of
 *         the text was reached or the segment normalized to nothing
 */
private boolean nextNormalize() {
    clearBuffer();
    currentIndex = nextIndex;
    text.setIndex(nextIndex);

    // Always take at least one code point so the iteration makes progress.
    final int first = text.nextCodePoint();
    if (first < 0) {
        return false;
    }
    final StringBuilder pending = new StringBuilder();
    pending.appendCodePoint(first);

    for (int cp = text.nextCodePoint(); cp >= 0; cp = text.nextCodePoint()) {
        if (norm2.hasBoundaryBefore(cp)) {
            // cp opens the following segment: push it back and stop.
            text.moveCodePointIndex(-1);
            break;
        }
        pending.appendCodePoint(cp);
    }

    nextIndex = text.getIndex();
    norm2.normalize(pending, buffer);
    return buffer.length() > 0;
}
/**
 * Normalizes the source segment that ends at {@code currentIndex} into the
 * buffer. Code points are collected backwards until one with a normalization
 * boundary before it has been included. The buffer position is left at the
 * end of the buffer so that {@link #previous} can walk it backwards.
 *
 * @return true if the buffer now holds normalized output, false if the
 *         beginning of the text was reached or the segment normalized to
 *         nothing
 */
private boolean previousNormalize() {
    clearBuffer();
    nextIndex = currentIndex;
    text.setIndex(currentIndex);

    final StringBuilder pending = new StringBuilder();
    for (int cp = text.previousCodePoint(); cp >= 0; cp = text.previousCodePoint()) {
        // Prepend, because the text is being read back to front.
        if (cp > 0xffff) {
            pending.insert(0, Character.toChars(cp));
        } else {
            pending.insert(0, (char) cp);
        }
        if (norm2.hasBoundaryBefore(cp)) {
            break;
        }
    }

    currentIndex = text.getIndex();
    norm2.normalize(pending, buffer);
    bufferPos = buffer.length();
    return bufferPos != 0;
}
}

View File

@ -0,0 +1,124 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package jdk_internal.bidi.icu.text;
/**
* <code>Replaceable</code> is an interface representing a string of characters
* that supports the replacement of a range of itself with a new string of
* characters. It is used by APIs that change a piece of text while retaining
* metadata. Metadata is data other than the Unicode characters returned by
* char32At(). One example of metadata is style attributes; another is an edit
* history, marking each character with an author and revision number.
*
* <p>
* An implicit aspect of the <code>Replaceable</code> API is that during a
* replace operation, new characters take on the metadata of the old characters.
* For example, if the string "the <b>bold</b> font" has range (4, 8) replaced
* with "strong", then it becomes "the <b>strong</b> font".
*
* <p>
* <code>Replaceable</code> specifies ranges using a start offset and a limit
* offset. The range of characters thus specified includes the characters at
* offset start..limit-1. That is, the start offset is inclusive, and the limit
* offset is exclusive.
*
* <p>
* <code>Replaceable</code> also includes API to access characters in the
* string: <code>length()</code>, <code>charAt()</code>,
* <code>char32At()</code>, and <code>extractBetween()</code>.
*
* <p>
* For a subclass to support metadata, typical behavior of
* <code>replace()</code> is the following:
* <ul>
* <li>Set the metadata of the new text to the metadata of the first character
* replaced</li>
* <li>If no characters are replaced, use the metadata of the previous
* character</li>
* <li>If there is no previous character (i.e. start == 0), use the following
* character</li>
* <li>If there is no following character (i.e. the replaceable was empty), use
* default metadata</li>
* <li>If the code point U+FFFF is seen, it should be interpreted as a special
* marker having no metadata</li>
* </ul>
* If this is not the behavior, the subclass should document any differences.
*
* <p>
* Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @stable ICU 2.0
*/
public interface Replaceable {
    /**
     * Reports how many 16-bit code units the text contains.
     *
     * @return the length of the text in 16-bit code units
     * @stable ICU 2.0
     */
    int length();

    /**
     * Fetches a single 16-bit code unit of the text.
     *
     * @param offset position of the code unit, from 0 to
     *               <code>length()</code>-1 inclusive
     * @return the 16-bit code unit found at {@code offset}
     * @stable ICU 2.0
     */
    char charAt(int offset);

    /**
     * Copies a range of this text into a character array. Characters
     * <code>srcStart</code> through <code>srcLimit-1</code> are copied, i.e.
     * <code>srcLimit-srcStart</code> characters in total, and they are stored
     * in <code>dst</code> at indices <code>dstStart</code> through
     * <code>dstStart + (srcLimit-srcStart) - 1</code>.
     *
     * @param srcStart index of the first character to copy (inclusive);
     *                 {@code 0 <= start <= limit}.
     * @param srcLimit index after the last character to copy (exclusive);
     *                 {@code start <= limit <= length()}.
     * @param dst      the array that receives the characters
     * @param dstStart index in {@code dst} at which the first character is
     *                 stored
     * @stable ICU 2.0
     */
    void getChars(int srcStart, int srcLimit, char[] dst, int dstStart);
}

View File

@ -0,0 +1,121 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package jdk_internal.bidi.icu.text;
/**
* <code>ReplaceableString</code> is an adapter class that implements the
* <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
*
* <p>
* <em>Note:</em> This class does not support attributes and is not intended for
* general use. Most clients will need to implement {@link Replaceable} in their
* text representation class.
*
* <p>
* Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @see Replaceable
* @author Alan Liu
* @stable ICU 2.0
*/
public class ReplaceableString implements Replaceable {
    /** Backing storage; shared with the caller when the StringBuffer constructor is used. */
    private StringBuffer buf;

    /**
     * Creates an instance whose text starts out as a copy of {@code str}.
     *
     * @param str initial contents
     * @stable ICU 2.0
     */
    public ReplaceableString(String str) {
        this.buf = new StringBuffer(str);
    }

    /**
     * Creates an instance that wraps {@code buf} directly instead of copying it.
     * Whatever {@code buf} holds at construction time becomes the initial
     * contents. <em>Because the buffer is shared, later changes made through
     * {@code buf} are visible through this object, and vice versa.</em>
     *
     * @param buf object to be used as internal storage
     * @stable ICU 2.0
     */
    public ReplaceableString(StringBuffer buf) {
        this.buf = buf;
    }

    /**
     * Reports how many characters this object currently holds.
     * <code>Replaceable</code> API.
     *
     * @return the length of the backing buffer
     * @stable ICU 2.0
     */
    public int length() {
        return this.buf.length();
    }

    /**
     * Fetches the character stored at the given position.
     * <code>Replaceable</code> API.
     *
     * @param offset offset into the contents, from 0 to <code>length()</code> - 1
     * @return the character at {@code offset} in the backing buffer
     * @stable ICU 2.0
     */
    public char charAt(int offset) {
        return this.buf.charAt(offset);
    }

    /**
     * Copies the characters in the range {@code [srcStart, srcLimit)} into
     * {@code dst}, beginning at {@code dst[dstStart]}; in total
     * {@code srcLimit - srcStart} characters are written, the last of them at
     * index {@code dstStart + (srcLimit - srcStart) - 1}.
     * <p>
     * When {@code srcStart} equals {@code srcLimit} the method returns without
     * touching the backing buffer or {@code dst}.
     *
     * @param srcStart the beginning index to copy, inclusive;
     *                 {@code 0 <= start <= limit}.
     * @param srcLimit the ending index to copy, exclusive;
     *                 {@code start <= limit <= length()}.
     * @param dst      the destination array.
     * @param dstStart the start offset in the destination array.
     * @stable ICU 2.0
     */
    public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
        if (srcStart == srcLimit) {
            // empty range: nothing to copy
            return;
        }
        this.buf.getChars(srcStart, srcLimit, dst, dstStart);
    }
}

View File

@ -0,0 +1,493 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
/*
*******************************************************************************
* Copyright (C) 2003-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
//
// CHANGELOG
// 2005-05-19 Edward Wang
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
// - move from package com.ibm.icu.text to package sun.net.idn
// - use ParseException instead of StringPrepParseException
// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
// - remove all @deprecated tag to make compiler happy
// 2007-08-14 Martin Buchholz
// - remove redundant casts
//
package jdk_internal.bidi.icu.text;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import jdk_internal.bidi.Normalizer;
import jdk_internal.bidi.ParseException;
import jdk_internal.bidi.SunNormalizer;
import jdk_internal.bidi.icu.impl.CharTrie;
import jdk_internal.bidi.icu.impl.StringPrepDataReader;
import jdk_internal.bidi.icu.impl.Trie;
import jdk_internal.bidi.icu.lang.UCharacter;
import jdk_internal.bidi.icu.lang.UCharacterDirection;
import jdk_internal.bidi.icu.util.VersionInfo;
/**
* StringPrep API implements the StringPrep framework as described by
* <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>. StringPrep
* prepares Unicode strings for use in network protocols. Profiles of StringPrep
* are sets of rules and data according to which the Unicode strings are
* prepared. Each profile contains tables which describe how a code point
* should be treated. The tables are broadly classified into
* <ul>
* <li>Unassigned Table: Contains code points that are unassigned in the Unicode
* Version supported by StringPrep. Currently RFC 3454 supports Unicode 3.2.
* </li>
* <li>Prohibited Table: Contains code points that are prohibited from the output
* of the StringPrep processing function.</li>
* <li>Mapping Table: Contains code points that are deleted from the output or
* case mapped.</li>
* </ul>
*
* The procedure for preparing Unicode strings:
* <ol>
* <li>Map: For each character in the input, check if it has a mapping and, if
* so, replace it with its mapping.</li>
* <li>Normalize: Possibly normalize the result of step 1 using Unicode
* normalization.</li>
* <li>Prohibit: Check for any characters that are not allowed in the output. If
* any are found, return an error.</li>
* <li>Check bidi: Possibly check for right-to-left characters, and if any are
* found, make sure that the whole string satisfies the requirements for
* bidirectional strings. If the string does not satisfy the requirements for
* bidirectional strings, return an error.</li>
* </ol>
*
* @author Ram Viswanadha
* @draft ICU 2.8
*/
public final class StringPrep {
/**
* Option to prohibit processing of unassigned code points in the input
*
* @see #prepare
* @draft ICU 2.8
*/
public static final int DEFAULT = 0x0000;
/**
* Option to allow processing of unassigned code points in the input
*
* @see #prepare
* @draft ICU 2.8
*/
public static final int ALLOW_UNASSIGNED = 0x0001;
/*
* Code point classifications stored in Values.type by getValues(). TYPE_LIMIT
* is assigned when the trie word is 0, in which case the code point is copied
* to the output unchanged.
*/
private static final int UNASSIGNED = 0x0000;
private static final int MAP = 0x0001;
private static final int PROHIBITED = 0x0002;
private static final int DELETE = 0x0003;
private static final int TYPE_LIMIT = 0x0004;
/* Bit flags tested against indexes[OPTIONS] when a profile is loaded */
private static final int NORMALIZATION_ON = 0x0001;
private static final int CHECK_BIDI_ON = 0x0002;
/*
* Trie words at or above this value encode a type (word - TYPE_THRESHOLD)
* instead of a mapping
*/
private static final int TYPE_THRESHOLD = 0xFFF0;
/* A trie word whose value shifted right by 2 equals this marks a deleted code point */
private static final int MAX_INDEX_VALUE = 0x3FBF; /* 16139 */
private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
/* indexes[] value names */
private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
private static final int INDEX_MAPPING_DATA_SIZE = 1; /* size in bytes of the array that contains the mapping */
private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /*
* The index of Unicode version of last entry in
* NormalizationCorrections.txt
*/
private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /*
* The starting index of 1 UChar mapping index in the
* mapping data array
*/
private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /*
* The starting index of 2 UChars mapping index in
* the mapping data array
*/
/* The starting indexes of the 3 and 4 UChars mapping ranges in the mapping data array */
private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;
private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;
private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
/**
* Default buffer size of datafile; used as the BufferedInputStream buffer size
* when a profile is read
*/
private static final int DATA_BUFFER_SIZE = 25000;
/* Wrappers for Trie implementations */
private static final class StringPrepTrieImpl implements Trie.DataManipulate {
// Trie holding the profile data; assigned by the StringPrep constructor
// after this wrapper has been created
private CharTrie sprepTrie = null;
/**
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's data the
* index array offset of the indexes for that lead surrogate.
*
* @param value data value for a surrogate from the trie, including the
* folding offset
* @return the given value, unchanged
*/
public int getFoldingOffset(int value) {
return value;
}
}
// Wrapper around the CharTrie used for reading the trie data
private StringPrepTrieImpl sprepTrieImpl;
// Indexes read from the data file (addressed with the index constants declared in this class)
private int[] indexes;
// mapping data read from the data file
private char[] mappingData;
// format version of the data file
private byte[] formatVersion;
// the version of Unicode supported by the data file
private VersionInfo sprepUniVer;
// the Unicode version of last entry in the
// NormalizationCorrections.txt file if normalization
// is turned on
private VersionInfo normCorrVer;
// Option to turn on Normalization (NORMALIZATION_ON bit of indexes[OPTIONS])
private boolean doNFKC;
// Option to turn on checking for BiDi rules (CHECK_BIDI_ON bit of indexes[OPTIONS])
private boolean checkBiDi;
/**
* Looks up the raw trie word stored for a code point in the loaded profile. The
* word is decoded by getValues().
*
* @param ch the code point to look up
* @return the trie word for ch
*/
private char getCodePointValue(int ch) {
return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
}
/**
 * Unpacks a version stored as four bytes in one int, most significant byte
 * first (major, minor, milli, micro).
 *
 * @param comp the packed version
 * @return the corresponding VersionInfo
 */
private static VersionInfo getVersionInfo(int comp) {
    return VersionInfo.getInstance(
            (comp >> 24) & 0xFF, // major
            (comp >> 16) & 0xFF, // minor
            (comp >> 8) & 0xFF, // milli
            comp & 0xFF); // micro
}
/**
 * Converts a four-element version array (major, minor, milli, micro) into a
 * VersionInfo. Each byte is widened to int as-is, i.e. as a signed value.
 *
 * @param version the version bytes
 * @return the corresponding VersionInfo, or null when the array does not hold
 *         exactly four elements
 */
private static VersionInfo getVersionInfo(byte[] version) {
    if (version.length == 4) {
        final int major = version[0];
        final int minor = version[1];
        final int milli = version[2];
        final int micro = version[3];
        return VersionInfo.getInstance(major, minor, milli, micro);
    }
    return null;
}
/**
 * Creates a StringPrep object by reading a profile from the input stream. The
 * object does not hold a reference to the input stream. The stream is wrapped
 * in a BufferedInputStream that is closed before this constructor returns,
 * whether it completes normally or throws; closing the wrapper also closes
 * {@code inputStream}.
 *
 * @param inputStream The stream for reading the StringPrep profile binary
 * @throws IOException if the data cannot be read, or if normalization is
 *                     enabled in the profile and the available Unicode version
 *                     is older than both the profile's Unicode version and its
 *                     NormalizationCorrections version
 * @draft ICU 2.8
 */
public StringPrep(InputStream inputStream) throws IOException {
    // try-with-resources guarantees the stream is released even when reading
    // or validating the profile fails part-way through
    try (BufferedInputStream b = new BufferedInputStream(inputStream, DATA_BUFFER_SIZE)) {
        StringPrepDataReader reader = new StringPrepDataReader(b);
        // read the indexes
        indexes = reader.readIndexes(INDEX_TOP);
        byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
        // indexes[INDEX_MAPPING_DATA_SIZE] stores the size of mappingData in bytes
        mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE] / 2];
        // load the rest of the data and initialize the data members
        reader.read(sprepBytes, mappingData);
        sprepTrieImpl = new StringPrepTrieImpl();
        sprepTrieImpl.sprepTrie = new CharTrie(new ByteArrayInputStream(sprepBytes), sprepTrieImpl);
        // get the data format version
        formatVersion = reader.getDataFormatVersion();
        // get the options
        doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
        checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
        sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
        normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
        VersionInfo normUniVer = UCharacter.getUnicodeVersion();
        // Reject the profile when normalization is on and the Unicode version of
        // the normalization data is lower than both the profile's Unicode
        // version and the version of its NormalizationCorrections.txt entry.
        if (normUniVer.compareTo(sprepUniVer) < 0
                && normUniVer.compareTo(normCorrVer) < 0
                && doNFKC) {
            throw new IOException("Normalization Correction version not supported");
        }
    }
}
/**
* Mutable holder for the decoded form of one trie word; filled in by
* getValues().
*/
private static final class Values {
// true when value is an index into the mapping data, false when it is a delta
boolean isIndex;
// index or delta, depending on isIndex; 0 when not applicable
int value;
// one of the type constants (UNASSIGNED, MAP, PROHIBITED, DELETE, TYPE_LIMIT); -1 after reset()
int type;
/** Restores the initial state: no index, value 0, type -1. */
public void reset() {
isIndex = false;
value = 0;
type = -1;
}
}
/**
 * Decodes a trie word into {@code values}. The holder is reset first, then:
 * <ul>
 * <li>a word of 0 yields type TYPE_LIMIT (the code point is copied as is);</li>
 * <li>a word at or above TYPE_THRESHOLD yields type
 * {@code trieWord - TYPE_THRESHOLD};</li>
 * <li>a word whose value shifted right by 2 equals MAX_INDEX_VALUE yields type
 * DELETE with no index and value 0;</li>
 * <li>any other word yields type MAP, where bit 1 tells whether the remaining
 * bits are an index (unsigned) or a delta (signed).</li>
 * </ul>
 *
 * @param trieWord the raw word read from the trie
 * @param values   receives the decoded result
 */
private static final void getValues(char trieWord, Values values) {
    values.reset();
    if (trieWord == 0) {
        // initial value of the mapping table: the source code point is copied
        // to the destination
        values.type = TYPE_LIMIT;
        return;
    }
    if (trieWord >= TYPE_THRESHOLD) {
        values.type = (trieWord - TYPE_THRESHOLD);
        return;
    }
    if ((trieWord >> 2) == MAX_INDEX_VALUE) {
        // isIndex and value keep the false / 0 set by reset()
        values.type = DELETE;
        return;
    }
    values.type = MAP;
    if ((trieWord & 0x02) != 0) {
        // index: drop the two flag bits
        values.isIndex = true;
        values.value = trieWord >> 2;
    } else {
        // delta: sign-extend the 16-bit word before dropping the two flag bits
        values.value = ((short) trieWord) >> 2;
    }
}
/**
 * Applies the mapping step: every code point of the input is looked up in the
 * profile and is either copied, shifted by a delta, replaced by a sequence from
 * the mapping data, or dropped.
 *
 * @param iter    iterator over the source text
 * @param options bit set; when ALLOW_UNASSIGNED is not set, an unassigned code
 *                point is an error
 * @return a new buffer holding the mapped text
 * @throws ParseException if an unassigned code point is found and unassigned
 *                        code points are not allowed
 */
private StringBuffer map(UCharacterIterator iter, int options) throws ParseException {
    final Values decoded = new Values();
    final StringBuffer out = new StringBuffer();
    final boolean unassignedAllowed = (options & ALLOW_UNASSIGNED) > 0;
    for (int cp = iter.nextCodePoint(); cp != UCharacterIterator.DONE; cp = iter.nextCodePoint()) {
        getValues(getCodePointValue(cp), decoded);
        if (decoded.type == UNASSIGNED && !unassignedAllowed) {
            throw new ParseException("An unassigned code point was found in the input " + iter.getText(),
                    iter.getIndex());
        }
        if (decoded.type == DELETE) {
            // just consume the code point
            continue;
        }
        if (decoded.type == MAP) {
            if (decoded.isIndex) {
                final int start = decoded.value;
                final int offset;
                final int count;
                if (start >= indexes[ONE_UCHAR_MAPPING_INDEX_START]
                        && start < indexes[TWO_UCHARS_MAPPING_INDEX_START]) {
                    offset = start;
                    count = 1;
                } else if (start >= indexes[TWO_UCHARS_MAPPING_INDEX_START]
                        && start < indexes[THREE_UCHARS_MAPPING_INDEX_START]) {
                    offset = start;
                    count = 2;
                } else if (start >= indexes[THREE_UCHARS_MAPPING_INDEX_START]
                        && start < indexes[FOUR_UCHARS_MAPPING_INDEX_START]) {
                    offset = start;
                    count = 3;
                } else {
                    // the length is stored in front of the mapping itself
                    count = mappingData[start];
                    offset = start + 1;
                }
                // copy the mapping to the destination
                out.append(mappingData, offset, count);
                continue;
            }
            // delta mapping
            cp -= decoded.value;
        }
        // copy the (possibly shifted) source code point into the destination
        UTF16.append(out, cp);
    }
    return out;
}
/**
 * Normalizes the text to NFKC using the Unicode 3.2 rules.
 *
 * @param src the text to normalize
 * @return a new buffer holding the normalized text
 */
private StringBuffer normalize(StringBuffer src) {
    /*
     * Option UNORM_BEFORE_PRI_29:
     *
     * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
     * requires strict adherence to Unicode 3.2 normalization, including buggy
     * composition from before fixing Public Review Issue #29. Note that this
     * results in some valid but nonsensical text to be either corrupted or
     * rejected, depending on the text. See
     * http://www.unicode.org/review/resolved-pri.html#pri29 See unorm.cpp and
     * cnormtst.c
     */
    final String normalized = SunNormalizer.normalize(src.toString(), Normalizer.Form.NFKC,
            SunNormalizer.UNICODE_3_2);
    return new StringBuffer(normalized);
}
/*
* boolean isLabelSeparator(int ch){ int result = getCodePointValue(ch); if(
* (result & 0x07) == LABEL_SEPARATOR){ return true; } return false; }
*/
/*
* 1) Map -- For each character in the input, check if it has a mapping and, if
* so, replace it with its mapping.
*
* 2) Normalize -- Possibly normalize the result of step 1 using Unicode
* normalization.
*
* 3) Prohibit -- Check for any characters that are not allowed in the output.
* If any are found, return an error.
*
* 4) Check bidi -- Possibly check for right-to-left characters, and if any are
* found, make sure that the whole string satisfies the requirements for
* bidirectional strings. If the string does not satisfy the requirements for
* bidirectional strings, return an error. [Unicode3.2] defines several
* bidirectional categories; each character has one bidirectional category
* assigned to it. For the purposes of the requirements below, an
* "RandALCat character" is a character that has Unicode bidirectional
* categories "R" or "AL"; an "LCat character" is a character that has Unicode
* bidirectional category "L". Note that there are many characters which fall
* in neither of the above
* definitions; Latin digits (<U+0030> through <U+0039>) are examples of this
* because they have bidirectional category "EN".
*
* In any profile that specifies bidirectional character handling, all three of
* the following requirements MUST be met:
*
* 1) The characters in section 5.8 MUST be prohibited.
*
* 2) If a string contains any RandALCat character, the string MUST NOT contain
* any LCat character.
*
* 3) If a string contains any RandALCat character, a RandALCat character MUST
* be the first character of the string, and a RandALCat character MUST be the
* last character of the string.
*/
/**
 * Prepare the input buffer for use in applications with the given profile. This
 * operation maps, normalizes (NFKC), and checks for prohibited and BiDi
 * characters in the order defined by RFC 3454, depending on the options
 * specified in the profile.
 *
 * @param src     A UCharacterIterator object containing the source string
 * @param options A bit set of options:
 *
 *                - StringPrep.DEFAULT Prohibit processing of unassigned code
 *                points in the input
 *
 *                - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points
 *                in the input as normal Unicode code points.
 *
 * @return StringBuffer A StringBuffer containing the output
 * @throws ParseException if the input contains an unassigned code point that is
 *                        not allowed, a prohibited code point (the error offset
 *                        is the index of that code point in the mapped and
 *                        normalized text), or, when the profile checks BiDi,
 *                        text that violates the BiDi requirements
 * @draft ICU 2.8
 */
public StringBuffer prepare(UCharacterIterator src, int options) throws ParseException {
    // steps 1 and 2: map, then normalize if the profile asks for it
    StringBuffer mapOut = map(src, options);
    StringBuffer normOut = doNFKC ? normalize(mapOut) : mapOut;

    UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
    Values val = new Values();
    // CHAR_DIRECTION_COUNT serves as the "no character seen yet" marker
    int direction = UCharacterDirection.CHAR_DIRECTION_COUNT;
    int firstCharDir = UCharacterDirection.CHAR_DIRECTION_COUNT;
    int rtlPos = -1;
    int ltrPos = -1;
    boolean rightToLeft = false;
    boolean leftToRight = false;
    int ch;
    while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
        getValues(getCodePointValue(ch), val);
        // step 3: prohibit
        if (val.type == PROHIBITED) {
            // The iterator already sits behind ch, so step back over its code
            // units to report where the offending code point starts. (val.value
            // is always 0 for this type and carries no position.)
            throw new ParseException("A prohibited code point was found in the input" + iter.getText(),
                    iter.getIndex() - Character.charCount(ch));
        }
        direction = UCharacter.getDirection(ch);
        if (firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT) {
            firstCharDir = direction;
        }
        if (direction == UCharacterDirection.LEFT_TO_RIGHT) {
            leftToRight = true;
            ltrPos = iter.getIndex() - 1;
        }
        if (direction == UCharacterDirection.RIGHT_TO_LEFT
                || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) {
            rightToLeft = true;
            rtlPos = iter.getIndex() - 1;
        }
    }
    // step 4: check bidi; after the loop "direction" is that of the last character
    if (checkBiDi) {
        // requirement 2: RandALCat and LCat characters must not be mixed
        if (leftToRight && rightToLeft) {
            throw new ParseException(
                    "The input does not conform to the rules for BiDi code points." + iter.getText(),
                    (rtlPos > ltrPos) ? rtlPos : ltrPos);
        }
        // requirement 3: with any RandALCat character present, the first and
        // the last character must both be RandALCat
        boolean firstIsRandAL = firstCharDir == UCharacterDirection.RIGHT_TO_LEFT
                || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC;
        boolean lastIsRandAL = direction == UCharacterDirection.RIGHT_TO_LEFT
                || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC;
        if (rightToLeft && !(firstIsRandAL && lastIsRandAL)) {
            throw new ParseException(
                    "The input does not conform to the rules for BiDi code points." + iter.getText(),
                    (rtlPos > ltrPos) ? rtlPos : ltrPos);
        }
    }
    return normOut;
}
}

View File

@ -0,0 +1,326 @@
/*
* Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package jdk_internal.bidi.icu.text;
import jdk_internal.bidi.CharacterIterator;
import jdk_internal.bidi.icu.impl.CharacterIteratorWrapper;
import jdk_internal.bidi.icu.impl.ReplaceableUCharacterIterator;
import jdk_internal.bidi.icu.impl.UCharacterProperty;
/**
* Abstract class that defines an API for iteration on text objects. This is an
* interface for forward and backward iteration and random access into a text
* object. Forward iteration is done with post-increment and backward iteration
* is done with pre-decrement semantics, while the
* <code>java.text.CharacterIterator</code> interface methods provided forward
* iteration with "pre-increment" and backward iteration with pre-decrement
* semantics. This API is more efficient for forward iteration over code points.
* The other major difference is that this API can do both code unit and code
* point iteration, <code>java.text.CharacterIterator</code> can only iterate
* over code units and is limited to BMP (0 - 0xFFFF)
*
* @author Ram
* @stable ICU 2.4
*/
public abstract class UCharacterIterator implements Cloneable {
/**
* Protected default constructor for the subclasses
*
* @stable ICU 2.4
*/
protected UCharacterIterator() {
}
/**
* Indicator that we have reached the ends of the UTF16 text. Moved from
* UForwardCharacterIterator.java. The code point methods of this class
* (nextCodePoint, previousCodePoint, moveCodePointIndex) compare the values
* they read against this constant to detect the end of iteration.
*
* @stable ICU 2.4
*/
public static final int DONE = -1;
// static final methods ----------------------------------------------------
/**
* Returns a <code>UCharacterIterator</code> object given a source string.
*
* @param source a string
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
* (NOTE(review): the null check is not made here; presumably it is done
* by the ReplaceableUCharacterIterator constructor - confirm)
* @stable ICU 2.4
*/
public static final UCharacterIterator getInstance(String source) {
return new ReplaceableUCharacterIterator(source);
}
/**
* Returns a <code>UCharacterIterator</code> object given a source StringBuffer.
*
* @param source a string buffer of UTF-16 code units
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
* (NOTE(review): the null check is not made here; presumably it is done
* by the ReplaceableUCharacterIterator constructor - confirm)
* @stable ICU 2.4
*/
public static final UCharacterIterator getInstance(StringBuffer source) {
return new ReplaceableUCharacterIterator(source);
}
/**
* Returns a <code>UCharacterIterator</code> object given a CharacterIterator.
*
* @param source a valid CharacterIterator object.
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
* (NOTE(review): the null check is not made here; presumably it is done
* by the CharacterIteratorWrapper constructor - confirm)
* @stable ICU 2.4
*/
public static final UCharacterIterator getInstance(CharacterIterator source) {
return new CharacterIteratorWrapper(source);
}
// public methods ----------------------------------------------------------
/**
* Returns the length of the text. The no-argument getText() of this class
* allocates a char array of exactly this size to receive the text, so the
* length is counted in UTF-16 code units.
*
* @return length of the text
* @stable ICU 2.4
*/
public abstract int getLength();
/**
* Gets the current index in text.
*
* @return current index in text.
* @stable ICU 2.4
*/
public abstract int getIndex();
/**
* Returns the UTF16 code unit at index, and increments to the next code unit
* (post-increment semantics). If index is out of range, DONE is returned, and
* the iterator is reset to the limit of the text.
*
* @return the next UTF16 code unit, or DONE if the index is at the limit of the
* text.
* @stable ICU 2.4
*/
public abstract int next();
/**
 * Reads the code point at the current index and advances past it
 * (post-increment semantics). A lead surrogate followed by a trail surrogate is
 * combined into one supplementary code point and the iterator ends up behind
 * the pair. In every other case the result is the same as that of
 * <code>next()</code>: a lone lead surrogate is returned on its own and the
 * code unit that followed it is left to be read by the next call.
 *
 * @return the next codepoint in text, or DONE if the index is at the limit of
 *         the text.
 * @stable ICU 2.4
 */
public int nextCodePoint() {
    final int lead = next();
    if (!UTF16.isLeadSurrogate((char) lead)) {
        return lead;
    }
    final int trail = next();
    if (UTF16.isTrailSurrogate((char) trail)) {
        return UCharacterProperty.getRawSupplementary((char) lead, (char) trail);
    }
    if (trail != DONE) {
        // unmatched lead surrogate: give back the unit that was read ahead
        previous();
    }
    return lead;
}
/**
* Decrement to the position of the previous code unit in the text, and return
* it (pre-decrement semantics). If the resulting index is less than 0, the
* index is reset to 0 and DONE is returned. nextCodePoint() also calls this
* method to step back over a code unit it read ahead after an unmatched lead
* surrogate.
*
* @return the previous code unit in the text, or DONE if the new index is
* before the start of the text.
* @stable ICU 2.4
*/
public abstract int previous();
/**
 * Moves back to the start of the code point in front of the current index and
 * returns it (pre-decrement semantics). A trail surrogate preceded by a lead
 * surrogate is combined into one supplementary code point and the iterator ends
 * up at the start of the pair. In every other case the result is the same as
 * that of <code>previous()</code>: a lone trail surrogate is returned on its
 * own and the code unit in front of it is left to be read by the next call.
 *
 * @return the previous code point in the text, or DONE if the new index is
 *         before the start of the text.
 * @stable ICU 2.4
 */
public int previousCodePoint() {
    final int trail = previous();
    if (!UTF16.isTrailSurrogate((char) trail)) {
        return trail;
    }
    final int lead = previous();
    if (UTF16.isLeadSurrogate((char) lead)) {
        return UCharacterProperty.getRawSupplementary((char) lead, (char) trail);
    }
    if (lead != DONE) {
        // unmatched trail surrogate: step forward over the unit that was read
        next();
    }
    return trail;
}
/**
* Sets the index to the specified index in the text.
*
* @param index the index within the text.
* @exception IndexOutOfBoundsException is thrown if an invalid index is
* supplied
* @stable ICU 2.4
*/
public abstract void setIndex(int index);
/**
* Sets the current index to the start. Equivalent to <code>setIndex(0)</code>.
*
* @stable ICU 2.4
*/
public void setToStart() {
setIndex(0);
}
/**
* Fills the buffer with the underlying text storage of the iterator. If the
* buffer capacity is not enough an exception is thrown. The capacity of the fill
* in buffer should at least be equal to length of text in the iterator obtained
* by calling <code>getLength()</code>. <b>Usage:</b>
*
* <pre>{@code
* UCharacterIterator iter = UCharacterIterator.getInstance(text);
* char[] buf = new char[iter.getLength()];
* iter.getText(buf);
*
* OR
* char[] buf= new char[1];
* int len = 0;
* for(;;){
* try{
* len = iter.getText(buf);
* break;
* }catch(IndexOutOfBoundsException e){
* buf = new char[iter.getLength()];
* }
* }
* }</pre>
*
* @param fillIn an array of chars to fill with the underlying UTF-16 code
* units.
* @param offset the position within the array to start putting the data.
* @return the number of code units added to fillIn, as a convenience
* @exception IndexOutOfBoundsException exception if there is not enough room
* after offset in the array, or if offset
* < 0.
* @stable ICU 2.4
*/
public abstract int getText(char[] fillIn, int offset);
/**
* Convenience overload of <code>getText(char[], int)</code> that provides an
* offset of 0.
*
* @param fillIn an array of chars to fill with the underlying UTF-16 code
* units.
* @return the number of code units added to fillIn, as a convenience
* @exception IndexOutOfBoundsException exception if there is not enough room in
* the array.
* @stable ICU 2.4
*/
public final int getText(char[] fillIn) {
return getText(fillIn, 0);
}
/**
 * Returns the whole underlying text of the iterator as a new string. A char
 * array of <code>getLength()</code> elements is filled from offset 0 and then
 * converted.
 *
 * @return the underlying text storage in the iterator as a string
 * @stable ICU 2.4
 */
public String getText() {
    final char[] chars = new char[getLength()];
    // same call the one-argument getText(char[]) overload makes
    getText(chars, 0);
    return new String(chars);
}
/**
 * Moves the current position by the given number of code points: forward for a
 * positive delta (using <code>nextCodePoint()</code>), backward for a negative
 * one (using <code>previousCodePoint()</code>). A delta of zero leaves the
 * position untouched.
 * <p>
 * If either end of the text is reached before the requested number of code
 * points has been skipped, an <code>IndexOutOfBoundsException</code> is thrown;
 * the iterator is not moved back in that case.
 *
 * @param delta the number of code points to move the current index.
 * @return the new index
 * @exception IndexOutOfBoundsException is thrown if the text ends before
 *                                      delta code points could be skipped
 * @stable ICU 2.4
 *
 */
public int moveCodePointIndex(int delta) {
    int remaining = delta;
    // at most one of the two loops runs, depending on the sign of delta
    while (remaining > 0 && nextCodePoint() != DONE) {
        remaining--;
    }
    while (remaining < 0 && previousCodePoint() != DONE) {
        remaining++;
    }
    if (remaining == 0) {
        return getIndex();
    }
    throw new IndexOutOfBoundsException();
}
/**
* Creates a copy of this iterator, independent from other iterators. This
* implementation simply delegates to <code>super.clone()</code>; it does not
* itself return null.
*
* @return copy of this iterator
* @exception CloneNotSupportedException if <code>super.clone()</code> throws it
* @stable ICU 2.4
*/
public Object clone() throws CloneNotSupportedException {
return super.clone();
}
}

View File

@ -0,0 +1,609 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package jdk_internal.bidi.icu.text;
import jdk_internal.bidi.icu.impl.UCharacterProperty;
/**
* <p>
* Standalone utility class providing UTF16 character conversions and indexing
* conversions.
* <p>
* Code that uses strings alone rarely need modification. By design, UTF-16 does
* not allow overlap, so searching for strings is a safe operation. Similarly,
* concatenation is always safe. Substringing is safe if the start and end are
* both on UTF-32 boundaries. In normal code, the values for start and end are
* on those boundaries, since they arose from operations like searching. If not,
* the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
* <strong>Examples:</strong>
* <p>
* The following examples illustrate use of some of these methods.
*
* <pre>{@code
* // iteration forwards: Original
* for (int i = 0; i < s.length(); ++i) {
* char ch = s.charAt(i);
* doSomethingWith(ch);
* }
*
* // iteration forwards: Changes for UTF-32
* int ch;
* for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
* ch = UTF16.charAt(s, i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Original
* for (int i = s.length() - 1; i >= 0; --i) {
* char ch = s.charAt(i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Changes for UTF-32
* int ch;
* for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
* ch = UTF16.charAt(s, i);
* doSomethingWith(ch);
* }
* }</pre>
*
* <strong>Notes:</strong>
* <ul>
* <li><strong>Naming:</strong> For clarity, High and Low surrogates are called
* <code>Lead</code> and <code>Trail</code> in the API, which gives a better
* sense of their ordering in a string. <code>offset16</code> and
* <code>offset32</code> are used to distinguish offsets to UTF-16 boundaries vs
* offsets to UTF-32 boundaries. <code>int char32</code> is used to contain
* UTF-32 characters, as opposed to <code>char16</code>, which is a UTF-16 code
* unit.</li>
* <li><strong>Roundtripping Offsets:</strong> You can always roundtrip from a
* UTF-32 offset to a UTF-16 offset and back. Because of the difference in
* structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and back
* if and only if <code>bounds(string, offset16) != TRAIL</code>.</li>
* <li><strong>Exceptions:</strong> The error checking will throw an exception
* if indices are out of bounds. Other than that, all methods will behave
* reasonably, even if unmatched surrogates or out-of-bounds UTF-32 values are
* present. <code>UCharacter.isLegal()</code> can be used to check for validity
* if desired.</li>
* <li><strong>Unmatched Surrogates:</strong> If the string contains unmatched
* surrogates, then these are counted as one UTF-32 value. This matches their
* iteration behavior, which is vital. It also matches common display practice
* as missing glyphs (see the Unicode Standard Section 5.4, 5.5).</li>
* <li><strong>Optimization:</strong> The method implementations may need
* optimization if the compiler doesn't fold static final methods. Since
* surrogate pairs will form an exceeding small percentage of all the text in
* the world, the singleton case should always be optimized for.</li>
* </ul>
*
* @author Mark Davis, with help from Markus Scherer
* @stable ICU 2.1
*/
public final class UTF16 {
    // public constants ---------------------------------------------------
    /**
     * Smallest Unicode code point.
     *
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MIN_VALUE = 0;
    /**
     * Largest Unicode code point (scalar value) defined by the Unicode Standard.
     *
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
    /**
     * First code point of the supplementary range, i.e. the first value that needs
     * two UTF-16 code units.
     *
     * @stable ICU 2.1
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
    /**
     * Smallest lead (high) surrogate code unit.
     *
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
    /**
     * Smallest trail (low) surrogate code unit.
     *
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
    /**
     * Largest lead (high) surrogate code unit.
     *
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
    /**
     * Largest trail (low) surrogate code unit.
     *
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
    /**
     * Smallest surrogate code unit of either kind; equal to
     * {@link #LEAD_SURROGATE_MIN_VALUE}.
     *
     * @stable ICU 2.1
     */
    public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

    // private constants --------------------------------------------------
    /**
     * Number of bits a supplementary code point is shifted right by when its lead
     * surrogate is computed.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;
    /**
     * Mask selecting the low ten bits of a code point, which are the bits carried
     * by the trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
    /**
     * Constant added to <code>char32 &gt;&gt; LEAD_SURROGATE_SHIFT_</code> to
     * obtain the lead surrogate of a supplementary code point.
     */
    private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    // constructor --------------------------------------------------------
    // /CLOVER:OFF
    /**
     * Static utility class; never instantiated.
     */
    private UTF16() {
    }
    // /CLOVER:ON

    // public methods -----------------------------------------------------
    /**
     * Returns the UTF-32 value that contains the code unit at
     * <code>offset16</code>. Usable for forward or backward iteration (together
     * with <code>UTF16.getCharCount()</code>) as well as for random access. When
     * the code unit is one half of a surrogate pair, the neighbouring unit is
     * inspected and the supplementary value of the pair is returned; an unmatched
     * surrogate is returned as is. No validity check is performed on the result;
     * use <code>UCharacter.isLegal()</code> on it if one is required.
     *
     * @param source   string of UTF-16 code units
     * @param offset16 UTF-16 offset of the code unit to look at
     * @return the UTF-32 value containing the code unit at offset16
     * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
     * @stable ICU 2.1
     */
    public static int charAt(String source, int offset16) {
        final char unit = source.charAt(offset16);
        // Fast path: everything below the surrogate range is a complete code point.
        return unit < LEAD_SURROGATE_MIN_VALUE ? unit : pairedCodePoint(source, offset16, unit);
    }

    /**
     * Returns the UTF-32 value that contains the code unit at
     * <code>offset16</code>. Usable for forward or backward iteration (together
     * with <code>UTF16.getCharCount()</code>) as well as for random access. When
     * the code unit is one half of a surrogate pair, the neighbouring unit is
     * inspected and the supplementary value of the pair is returned; an unmatched
     * surrogate is returned as is. No validity check is performed on the result;
     * use <code>UCharacter.isLegal()</code> on it if one is required.
     *
     * @param source   sequence of UTF-16 code units
     * @param offset16 UTF-16 offset of the code unit to look at
     * @return the UTF-32 value containing the code unit at offset16
     * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
     * @stable ICU 2.1
     */
    public static int charAt(CharSequence source, int offset16) {
        final char unit = source.charAt(offset16);
        // Fast path: everything below the surrogate range is a complete code point.
        return unit < LEAD_SURROGATE_MIN_VALUE ? unit : pairedCodePoint(source, offset16, unit);
    }

    /**
     * Slow path shared by both <code>charAt</code> overloads for code units at or
     * above the start of the surrogate range.
     *
     * @param source sequence the unit was read from
     * @param index  position of <code>unit</code> within <code>source</code>
     * @param unit   the code unit found at <code>index</code>
     * @return <code>unit</code> itself when it lies above the surrogate range or
     *         has no matching partner; otherwise the result of
     *         <code>UCharacterProperty.getRawSupplementary(lead, trail)</code> for
     *         the pair it belongs to
     */
    private static int pairedCodePoint(CharSequence source, int index, char unit) {
        if (unit > TRAIL_SURROGATE_MAX_VALUE) {
            return unit; // above the surrogate block: ordinary BMP character
        }
        if (unit <= LEAD_SURROGATE_MAX_VALUE) {
            // Lead surrogate: its partner, if any, is the next code unit.
            final int next = index + 1;
            if (next != source.length()) {
                final char trail = source.charAt(next);
                if (isTrailSurrogate(trail)) {
                    return UCharacterProperty.getRawSupplementary(unit, trail);
                }
            }
            return unit; // unmatched lead surrogate
        }
        // Trail surrogate: its partner, if any, is the previous code unit.
        final int previous = index - 1;
        if (previous >= 0) {
            final char lead = source.charAt(previous);
            if (isLeadSurrogate(lead)) {
                return UCharacterProperty.getRawSupplementary(lead, unit);
            }
        }
        return unit; // unmatched trail surrogate
    }

    /**
     * Returns the UTF-32 value that contains the code unit at
     * <code>offset16</code> of a sub-array. Usable for forward or backward
     * iteration (together with <code>UTF16.getCharCount()</code>) as well as for
     * random access. Surrogate partners are only looked for inside
     * <code>[start, limit)</code>; an unmatched surrogate is returned as is. No
     * validity check is performed on the result; use
     * <code>UCharacter.isLegal()</code> on it if one is required.
     *
     * @param source   array of UTF-16 code units
     * @param start    index in the array where the sub-array begins (inclusive)
     * @param limit    index in the array where the sub-array ends (exclusive)
     * @param offset16 UTF-16 offset relative to start
     * @return the UTF-32 value containing the code unit at offset16
     * @exception IndexOutOfBoundsException thrown if offset16 is not within the
     *                                      range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char source[], int start, int limit, int offset16) {
        final int index = start + offset16;
        if (index < start || index >= limit) {
            throw new ArrayIndexOutOfBoundsException(index);
        }
        final char unit = source[index];
        if (!isSurrogate(unit)) {
            return unit;
        }
        if (isLeadSurrogate(unit)) {
            // Partner of a lead surrogate is the following unit, if still in range.
            final int next = index + 1;
            if (next < limit) {
                final char trail = source[next];
                if (isTrailSurrogate(trail)) {
                    return UCharacterProperty.getRawSupplementary(unit, trail);
                }
            }
        } else if (index != start) {
            // Partner of a trail surrogate is the preceding unit, if still in range.
            final char lead = source[index - 1];
            if (isLeadSurrogate(lead)) {
                return UCharacterProperty.getRawSupplementary(lead, unit);
            }
        }
        return unit; // unmatched surrogate
    }

    /**
     * Tells how many UTF-16 code units are needed for <code>char32</code>. No
     * validity check is performed; use <code>UCharacter.isLegal()</code>
     * beforehand if one is required.
     *
     * @param char32 the input codepoint.
     * @return 1 if char32 is below {@link #SUPPLEMENTARY_MIN_VALUE}, otherwise 2.
     * @stable ICU 2.1
     */
    public static int getCharCount(int char32) {
        return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
    }

    /**
     * Tells whether the code unit is a surrogate of either kind.
     *
     * @param char16 the input character.
     * @return true if the input character lies in the range 0xD800..0xDFFF.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16) {
        return char16 >= SURROGATE_MIN_VALUE && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Tells whether the code unit is a trail surrogate.
     *
     * @param char16 the input character.
     * @return true if the input character lies in the range 0xDC00..0xDFFF.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16) {
        return char16 >= TRAIL_SURROGATE_MIN_VALUE && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Tells whether the code unit is a lead surrogate.
     *
     * @param char16 the input character.
     * @return true if the input character lies in the range 0xD800..0xDBFF.
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16) {
        return char16 >= LEAD_SURROGATE_MIN_VALUE && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Computes the lead surrogate of <code>char32</code>. No validity check is
     * performed; use <code>UCharacter.isLegal()</code> beforehand if one is
     * required.
     *
     * @param char32 the input character.
     * @return the lead surrogate if getCharCount(char32) is 2; <br>
     *         and 0 otherwise (note: 0 is not a valid lead surrogate).
     * @stable ICU 2.1
     */
    public static char getLeadSurrogate(int char32) {
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            return 0;
        }
        return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
    }

    /**
     * Computes the trail surrogate of <code>char32</code>. No validity check is
     * performed; use <code>UCharacter.isLegal()</code> beforehand if one is
     * required.
     *
     * @param char32 the input character.
     * @return the trail surrogate if getCharCount(char32) is 2; <br>
     *         otherwise the character itself
     * @stable ICU 2.1
     */
    public static char getTrailSurrogate(int char32) {
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            return (char) char32;
        }
        return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
    }

    /**
     * Code point counterpart of String.valueOf(char): builds a string of one code
     * unit for values below {@link #SUPPLEMENTARY_MIN_VALUE} and of two code units
     * (lead surrogate followed by trail surrogate) otherwise. Only the range of
     * the argument is checked; use <code>UCharacter.isLegal()</code> beforehand if
     * a stricter validity check is required.
     *
     * @param char32 the input character.
     * @return string value of char32 in UTF16 format
     * @exception IllegalArgumentException thrown if char32 is an invalid
     *                                     codepoint.
     * @stable ICU 2.1
     */
    public static String valueOf(int char32) {
        final boolean inRange = char32 >= CODEPOINT_MIN_VALUE && char32 <= CODEPOINT_MAX_VALUE;
        if (!inRange) {
            throw new IllegalArgumentException("Illegal codepoint");
        }
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            return String.valueOf((char) char32);
        }
        return new String(new char[] { getLeadSurrogate(char32), getTrailSurrogate(char32) });
    }

    /**
     * Appends one UTF-32 value to a StringBuffer, as a single code unit or as a
     * lead/trail surrogate pair. Only the range of the argument is checked; use
     * <code>UCharacter.isLegal()</code> beforehand if a stricter validity check is
     * required.
     *
     * @param target the buffer to append to
     * @param char32 value to append.
     * @return the updated StringBuffer
     * @exception IllegalArgumentException thrown when char32 does not lie within
     *                                     the range of the Unicode codepoints
     * @stable ICU 2.1
     */
    public static StringBuffer append(StringBuffer target, int char32) {
        final boolean inRange = char32 >= CODEPOINT_MIN_VALUE && char32 <= CODEPOINT_MAX_VALUE;
        if (!inRange) {
            throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
        }
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            target.append((char) char32);
            return target;
        }
        target.append(getLeadSurrogate(char32));
        target.append(getTrailSurrogate(char32));
        return target;
    }

    /**
     * Moves a UTF-16 offset by a number of code points inside a sub-array. A lead
     * surrogate immediately followed by a trail surrogate (both inside the
     * sub-array) is stepped over as one code point; every other code unit counts
     * as one code point on its own.
     *
     * @param source   char array
     * @param start    index in the array where the sub-array begins (inclusive)
     * @param limit    index in the array where the sub-array ends (exclusive)
     * @param offset16 UTF16 position to shift relative to start
     * @param shift32  number of codepoints to shift; negative values move
     *                 backwards
     * @return new shifted offset16 relative to start
     * @exception IndexOutOfBoundsException if the new offset16 is out of bounds
     *                                      with respect to the subarray or the
     *                                      subarray bounds are out of range.
     * @stable ICU 2.1
     */
    public static int moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32) {
        final int size = source.length;
        if (start < 0 || limit < start) {
            throw new StringIndexOutOfBoundsException(start);
        }
        if (limit > size) {
            throw new StringIndexOutOfBoundsException(limit);
        }
        int position = start + offset16; // absolute index into source
        if (offset16 < 0 || position > limit) {
            throw new StringIndexOutOfBoundsException(offset16);
        }

        int remaining; // code points still to be stepped over
        if (shift32 > 0) {
            // Cheap rejection: every code point takes at least one code unit.
            if (shift32 + position > size) {
                throw new StringIndexOutOfBoundsException(position);
            }
            for (remaining = shift32; remaining > 0 && position < limit; remaining--) {
                final boolean pair = isLeadSurrogate(source[position]) && position + 1 < limit
                        && isTrailSurrogate(source[position + 1]);
                position += pair ? 2 : 1;
            }
        } else {
            // Cheap rejection: every code point takes at least one code unit.
            if (position + shift32 < start) {
                throw new StringIndexOutOfBoundsException(position);
            }
            remaining = -shift32;
            while (remaining > 0) {
                position--;
                if (position < start) {
                    break; // ran off the front; reported below via remaining != 0
                }
                if (isTrailSurrogate(source[position]) && position > start
                        && isLeadSurrogate(source[position - 1])) {
                    position--; // land on the lead surrogate of the pair
                }
                remaining--;
            }
        }
        if (remaining != 0) {
            // The sub-array was exhausted before all code points were consumed.
            throw new StringIndexOutOfBoundsException(shift32);
        }
        return position - start;
    }
}

File diff suppressed because it is too large Load Diff