mirror of
https://github.com/Eaglercraft-Archive/Eaglercraftx-1.8.8-src.git
synced 2025-06-27 18:38:14 -05:00
Update #48 - Added some features from OptiFine
This commit is contained in:
530
sources/main/java/jdk_internal/icu/impl/BMPSet.java
Normal file
530
sources/main/java/jdk_internal/icu/impl/BMPSet.java
Normal file
@ -0,0 +1,530 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import jdk_internal.icu.text.UnicodeSet.SpanCondition;
|
||||
import jdk_internal.icu.util.OutputInt;
|
||||
|
||||
/**
|
||||
* Helper class for frozen UnicodeSets, implements contains() and span()
|
||||
* optimized for BMP code points.
|
||||
*
|
||||
* Latin-1: Look up bytes. 2-byte characters: Bits organized vertically. 3-byte
|
||||
* characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with
|
||||
* mixed for illegal ranges. Supplementary characters: Call contains() on the
|
||||
* parent set.
|
||||
*/
|
||||
public final class BMPSet {
|
||||
|
||||
/**
|
||||
* One boolean ('true' or 'false') per Latin-1 character.
|
||||
*/
|
||||
private boolean[] latin1Contains;
|
||||
|
||||
/**
|
||||
* One bit per code point from U+0000..U+07FF. The bits are organized
|
||||
* vertically; consecutive code points correspond to the same bit positions in
|
||||
* consecutive table words. With code point parts lead=c{10..6} trail=c{5..0} it
|
||||
* is set.contains(c)==(table7FF[trail] bit lead)
|
||||
*
|
||||
* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
|
||||
* for faster validity checking at runtime.
|
||||
*/
|
||||
private int[] table7FF;
|
||||
|
||||
/**
|
||||
* One bit per 64 BMP code points. The bits are organized vertically;
|
||||
* consecutive 64-code point blocks correspond to the same bit position in
|
||||
* consecutive table words. With code point parts lead=c{15..12} t1=c{11..6}
|
||||
* test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then
|
||||
* the lower bit indicates if contains(c) for all code points in the 64-block.
|
||||
* If the upper bit is 1, then the block is mixed and set.contains(c) must be
|
||||
* called.
|
||||
*
|
||||
* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of
|
||||
* contains(FFFD) for faster validity checking at runtime.
|
||||
*/
|
||||
private int[] bmpBlockBits;
|
||||
|
||||
/**
|
||||
* Inversion list indexes for restricted binary searches in findCodePoint(),
|
||||
* from findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000). U+0800 is
|
||||
* the first 3-byte-UTF-8 code point. Code points below U+0800 are always looked
|
||||
* up in the bit tables. The last pair of indexes is for finding supplementary
|
||||
* code points.
|
||||
*/
|
||||
private int[] list4kStarts;
|
||||
|
||||
/**
|
||||
* The inversion list of the parent set, for the slower contains()
|
||||
* implementation for mixed BMP blocks and for supplementary code points. The
|
||||
* list is terminated with list[listLength-1]=0x110000.
|
||||
*/
|
||||
private final int[] list;
|
||||
private final int listLength; // length used; list may be longer to minimize reallocs
|
||||
|
||||
public BMPSet(final int[] parentList, int parentListLength) {
|
||||
list = parentList;
|
||||
listLength = parentListLength;
|
||||
latin1Contains = new boolean[0x100];
|
||||
table7FF = new int[64];
|
||||
bmpBlockBits = new int[64];
|
||||
list4kStarts = new int[18];
|
||||
|
||||
/*
|
||||
* Set the list indexes for binary searches for U+0800, U+1000, U+2000, ..,
|
||||
* U+F000, U+10000. U+0800 is the first 3-byte-UTF-8 code point. Lower code
|
||||
* points are looked up in the bit tables. The last pair of indexes is for
|
||||
* finding supplementary code points.
|
||||
*/
|
||||
list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
|
||||
int i;
|
||||
for (i = 1; i <= 0x10; ++i) {
|
||||
list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
|
||||
}
|
||||
list4kStarts[0x11] = listLength - 1;
|
||||
|
||||
initBits();
|
||||
}
|
||||
|
||||
public boolean contains(int c) {
|
||||
if (c <= 0xff) {
|
||||
return (latin1Contains[c]);
|
||||
} else if (c <= 0x7ff) {
|
||||
return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
|
||||
} else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
|
||||
int lead = c >> 12;
|
||||
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
|
||||
if (twoBits <= 1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
return (0 != twoBits);
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
|
||||
}
|
||||
} else if (c <= 0x10ffff) {
|
||||
// surrogate or supplementary code point
|
||||
return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
|
||||
} else {
|
||||
// Out-of-range code points get false, consistent with long-standing
|
||||
// behavior of UnicodeSet.contains(c).
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Span the initial substring for which each character c has
|
||||
* spanCondition==contains(c). It must be spanCondition==0 or 1.
|
||||
*
|
||||
* @param start The start index
|
||||
* @param outCount If not null: Receives the number of code points in the span.
|
||||
* @return the limit (exclusive end) of the span
|
||||
*
|
||||
* NOTE: to reduce the overhead of function call to contains(c), it is
|
||||
* manually inlined here. Check for sufficient length for trail unit for
|
||||
* each surrogate pair. Handle single surrogates as surrogate code
|
||||
* points as usual in ICU.
|
||||
*/
|
||||
public final int span(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
|
||||
char c, c2;
|
||||
int i = start;
|
||||
int limit = s.length();
|
||||
int numSupplementary = 0;
|
||||
if (SpanCondition.NOT_CONTAINED != spanCondition) {
|
||||
// span
|
||||
while (i < limit) {
|
||||
c = s.charAt(i);
|
||||
if (c <= 0xff) {
|
||||
if (!latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if (c <= 0x7ff) {
|
||||
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
|
||||
break;
|
||||
}
|
||||
} else if (c < 0xd800 || c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00
|
||||
|| c2 >= 0xe000) {
|
||||
int lead = c >> 12;
|
||||
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
|
||||
if (twoBits <= 1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if (twoBits == 0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
|
||||
if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
++numSupplementary;
|
||||
++i;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
} else {
|
||||
// span not
|
||||
while (i < limit) {
|
||||
c = s.charAt(i);
|
||||
if (c <= 0xff) {
|
||||
if (latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if (c <= 0x7ff) {
|
||||
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
|
||||
break;
|
||||
}
|
||||
} else if (c < 0xd800 || c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00
|
||||
|| c2 >= 0xe000) {
|
||||
int lead = c >> 12;
|
||||
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
|
||||
if (twoBits <= 1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if (twoBits != 0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
|
||||
if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
++numSupplementary;
|
||||
++i;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
}
|
||||
if (outCount != null) {
|
||||
int spanLength = i - start;
|
||||
outCount.value = spanLength - numSupplementary; // number of code points
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* Symmetrical with span(). Span the trailing substring for which each character
|
||||
* c has spanCondition==contains(c). It must be s.length >= limit and
|
||||
* spanCondition==0 or 1.
|
||||
*
|
||||
* @return The string index which starts the span (i.e. inclusive).
|
||||
*/
|
||||
public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
|
||||
char c, c2;
|
||||
|
||||
if (SpanCondition.NOT_CONTAINED != spanCondition) {
|
||||
// span
|
||||
for (;;) {
|
||||
c = s.charAt(--limit);
|
||||
if (c <= 0xff) {
|
||||
if (!latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if (c <= 0x7ff) {
|
||||
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
|
||||
break;
|
||||
}
|
||||
} else if (c < 0xd800 || c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800
|
||||
|| c2 >= 0xdc00) {
|
||||
int lead = c >> 12;
|
||||
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
|
||||
if (twoBits <= 1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if (twoBits == 0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
|
||||
if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
--limit;
|
||||
}
|
||||
if (0 == limit) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// span not
|
||||
for (;;) {
|
||||
c = s.charAt(--limit);
|
||||
if (c <= 0xff) {
|
||||
if (latin1Contains[c]) {
|
||||
break;
|
||||
}
|
||||
} else if (c <= 0x7ff) {
|
||||
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
|
||||
break;
|
||||
}
|
||||
} else if (c < 0xd800 || c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800
|
||||
|| c2 >= 0xdc00) {
|
||||
int lead = c >> 12;
|
||||
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
|
||||
if (twoBits <= 1) {
|
||||
// All 64 code points with the same bits 15..6
|
||||
// are either in the set or not.
|
||||
if (twoBits != 0) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Look up the code point in its 4k block of code points.
|
||||
if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// surrogate pair
|
||||
int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
|
||||
if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
|
||||
break;
|
||||
}
|
||||
--limit;
|
||||
}
|
||||
if (0 == limit) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
return limit + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set bits in a bit rectangle in "vertical" bit organization.
|
||||
* start<limit<=0x800
|
||||
*/
|
||||
private static void set32x64Bits(int[] table, int start, int limit) {
|
||||
assert (64 == table.length);
|
||||
int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
|
||||
int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
|
||||
|
||||
// Set one bit indicating an all-one block.
|
||||
int bits = 1 << lead;
|
||||
if ((start + 1) == limit) { // Single-character shortcut.
|
||||
table[trail] |= bits;
|
||||
return;
|
||||
}
|
||||
|
||||
int limitLead = limit >> 6;
|
||||
int limitTrail = limit & 0x3f;
|
||||
|
||||
if (lead == limitLead) {
|
||||
// Partial vertical bit column.
|
||||
while (trail < limitTrail) {
|
||||
table[trail++] |= bits;
|
||||
}
|
||||
} else {
|
||||
// Partial vertical bit column,
|
||||
// followed by a bit rectangle,
|
||||
// followed by another partial vertical bit column.
|
||||
if (trail > 0) {
|
||||
do {
|
||||
table[trail++] |= bits;
|
||||
} while (trail < 64);
|
||||
++lead;
|
||||
}
|
||||
if (lead < limitLead) {
|
||||
bits = ~((1 << lead) - 1);
|
||||
if (limitLead < 0x20) {
|
||||
bits &= (1 << limitLead) - 1;
|
||||
}
|
||||
for (trail = 0; trail < 64; ++trail) {
|
||||
table[trail] |= bits;
|
||||
}
|
||||
}
|
||||
// limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
|
||||
// In that case, bits=1<<limitLead == 1<<0 == 1
|
||||
// (because Java << uses only the lower 5 bits of the shift operand)
|
||||
// but the bits value is not used because trail<limitTrail is already false.
|
||||
bits = 1 << limitLead;
|
||||
for (trail = 0; trail < limitTrail; ++trail) {
|
||||
table[trail] |= bits;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void initBits() {
|
||||
int start, limit;
|
||||
int listIndex = 0;
|
||||
|
||||
// Set latin1Contains[].
|
||||
do {
|
||||
start = list[listIndex++];
|
||||
if (listIndex < listLength) {
|
||||
limit = list[listIndex++];
|
||||
} else {
|
||||
limit = 0x110000;
|
||||
}
|
||||
if (start >= 0x100) {
|
||||
break;
|
||||
}
|
||||
do {
|
||||
latin1Contains[start++] = true;
|
||||
} while (start < limit && start < 0x100);
|
||||
} while (limit <= 0x100);
|
||||
|
||||
// Set table7FF[].
|
||||
while (start < 0x800) {
|
||||
set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
|
||||
if (limit > 0x800) {
|
||||
start = 0x800;
|
||||
break;
|
||||
}
|
||||
|
||||
start = list[listIndex++];
|
||||
if (listIndex < listLength) {
|
||||
limit = list[listIndex++];
|
||||
} else {
|
||||
limit = 0x110000;
|
||||
}
|
||||
}
|
||||
|
||||
// Set bmpBlockBits[].
|
||||
int minStart = 0x800;
|
||||
while (start < 0x10000) {
|
||||
if (limit > 0x10000) {
|
||||
limit = 0x10000;
|
||||
}
|
||||
|
||||
if (start < minStart) {
|
||||
start = minStart;
|
||||
}
|
||||
if (start < limit) { // Else: Another range entirely in a known mixed-value block.
|
||||
if (0 != (start & 0x3f)) {
|
||||
// Mixed-value block of 64 code points.
|
||||
start >>= 6;
|
||||
bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
|
||||
start = (start + 1) << 6; // Round up to the next block boundary.
|
||||
minStart = start; // Ignore further ranges in this block.
|
||||
}
|
||||
if (start < limit) {
|
||||
if (start < (limit & ~0x3f)) {
|
||||
// Multiple all-ones blocks of 64 code points each.
|
||||
set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
|
||||
}
|
||||
|
||||
if (0 != (limit & 0x3f)) {
|
||||
// Mixed-value block of 64 code points.
|
||||
limit >>= 6;
|
||||
bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
|
||||
limit = (limit + 1) << 6; // Round up to the next block boundary.
|
||||
minStart = limit; // Ignore further ranges in this block.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (limit == 0x10000) {
|
||||
break;
|
||||
}
|
||||
|
||||
start = list[listIndex++];
|
||||
if (listIndex < listLength) {
|
||||
limit = list[listIndex++];
|
||||
} else {
|
||||
limit = 0x110000;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as UnicodeSet.findCodePoint(int c) except that the binary search is
|
||||
* restricted for finding code points in a certain range.
|
||||
*
|
||||
* For restricting the search for finding in the range start..end, pass in
|
||||
* lo=findCodePoint(start) and hi=findCodePoint(end) with 0<=lo<=hi<len.
|
||||
* findCodePoint(c) defaults to lo=0 and hi=len-1.
|
||||
*
|
||||
* @param c a character in a subrange of MIN_VALUE..MAX_VALUE
|
||||
* @param lo The lowest index to be returned.
|
||||
* @param hi The highest index to be returned.
|
||||
* @return the smallest integer i in the range lo..hi, inclusive, such that c <
|
||||
* list[i]
|
||||
*/
|
||||
private int findCodePoint(int c, int lo, int hi) {
|
||||
/*
|
||||
* Examples: findCodePoint(c) set list[] c=0 1 3 4 7 8 === ==============
|
||||
* =========== [] [110000] 0 0 0 0 0 0 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2
|
||||
* 2 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 [:Any:] [0, 110000] 1 1 1 1 1 1
|
||||
*/
|
||||
|
||||
// Return the smallest i such that c < list[i]. Assume
|
||||
// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
|
||||
if (c < list[lo])
|
||||
return lo;
|
||||
// High runner test. c is often after the last range, so an
|
||||
// initial check for this condition pays off.
|
||||
if (lo >= hi || c >= list[hi - 1])
|
||||
return hi;
|
||||
// invariant: c >= list[lo]
|
||||
// invariant: c < list[hi]
|
||||
for (;;) {
|
||||
int i = (lo + hi) >>> 1;
|
||||
if (i == lo) {
|
||||
break; // Found!
|
||||
} else if (c < list[i]) {
|
||||
hi = i;
|
||||
} else {
|
||||
lo = i;
|
||||
}
|
||||
}
|
||||
return hi;
|
||||
}
|
||||
|
||||
private final boolean containsSlow(int c, int lo, int hi) {
|
||||
return (0 != (findCodePoint(c, lo, hi) & 1));
|
||||
}
|
||||
}
|
175
sources/main/java/jdk_internal/icu/impl/CharTrie.java
Normal file
175
sources/main/java/jdk_internal/icu/impl/CharTrie.java
Normal file
@ -0,0 +1,175 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStream;
|
||||
|
||||
import jdk_internal.icu.text.UTF16;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Trie implementation which stores data in char, 16 bits.
|
||||
*
|
||||
* @author synwee
|
||||
* @see com.ibm.icu.impl.Trie
|
||||
* @since release 2.1, Jan 01 2002
|
||||
*/
|
||||
|
||||
// note that i need to handle the block calculations later, since chartrie
|
||||
// in icu4c uses the same index array.
|
||||
public class CharTrie extends Trie {
|
||||
// public constructors ---------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Creates a new Trie with the settings for the trie data.
|
||||
* </p>
|
||||
* <p>
|
||||
* Unserialize the 32-bit-aligned input stream and use the data for the trie.
|
||||
* </p>
|
||||
*
|
||||
* @param inputStream file input stream to a ICU data file, containing the
|
||||
* trie
|
||||
* @param dataManipulate object which provides methods to parse the char data
|
||||
* @throws IOException thrown when data reading fails
|
||||
* @draft 2.1
|
||||
*/
|
||||
public CharTrie(InputStream inputStream, DataManipulate dataManipulate) throws IOException {
|
||||
super(inputStream, dataManipulate);
|
||||
|
||||
if (!isCharTrie()) {
|
||||
throw new IllegalArgumentException("Data given does not belong to a char trie.");
|
||||
}
|
||||
}
|
||||
|
||||
// public methods --------------------------------------------------
|
||||
|
||||
/**
|
||||
* Gets the value associated with the codepoint. If no value is associated with
|
||||
* the codepoint, a default value will be returned.
|
||||
*
|
||||
* @param ch codepoint
|
||||
* @return offset to data
|
||||
*/
|
||||
public final char getCodePointValue(int ch) {
|
||||
int offset;
|
||||
|
||||
// fastpath for U+0000..U+D7FF
|
||||
if (0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
|
||||
// copy of getRawOffset()
|
||||
offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_) + (ch & INDEX_STAGE_3_MASK_);
|
||||
return m_data_[offset];
|
||||
}
|
||||
|
||||
// handle U+D800..U+10FFFF
|
||||
offset = getCodePointOffset(ch);
|
||||
|
||||
// return -1 if there is an error, in this case we return the default
|
||||
// value: m_initialValue_
|
||||
return (offset >= 0) ? m_data_[offset] : m_initialValue_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the value to the data which this lead surrogate character points to.
|
||||
* Returned data may contain folding offset information for the next trailing
|
||||
* surrogate character. This method does not guarantee correct results for trail
|
||||
* surrogates.
|
||||
*
|
||||
* @param ch lead surrogate character
|
||||
* @return data value
|
||||
*/
|
||||
public final char getLeadValue(char ch) {
|
||||
return m_data_[getLeadOffset(ch)];
|
||||
}
|
||||
|
||||
// protected methods -----------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Parses the input stream and stores its trie content into a index and data
|
||||
* array
|
||||
* </p>
|
||||
*
|
||||
* @param inputStream data input stream containing trie data
|
||||
* @exception IOException thrown when data reading fails
|
||||
*/
|
||||
protected final void unserialize(InputStream inputStream) throws IOException {
|
||||
DataInputStream input = new DataInputStream(inputStream);
|
||||
int indexDataLength = m_dataOffset_ + m_dataLength_;
|
||||
m_index_ = new char[indexDataLength];
|
||||
for (int i = 0; i < indexDataLength; i++) {
|
||||
m_index_[i] = input.readChar();
|
||||
}
|
||||
m_data_ = m_index_;
|
||||
m_initialValue_ = m_data_[m_dataOffset_];
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the offset to the data which the surrogate pair points to.
|
||||
*
|
||||
* @param lead lead surrogate
|
||||
* @param trail trailing surrogate
|
||||
* @return offset to data
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected final int getSurrogateOffset(char lead, char trail) {
|
||||
if (m_dataManipulate_ == null) {
|
||||
throw new NullPointerException("The field DataManipulate in this Trie is null");
|
||||
}
|
||||
|
||||
// get fold position for the next trail surrogate
|
||||
int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
|
||||
|
||||
// get the real data from the folded lead/trail units
|
||||
if (offset > 0) {
|
||||
return getRawOffset(offset, (char) (trail & SURROGATE_MASK_));
|
||||
}
|
||||
|
||||
// return -1 if there is an error, in this case we return the default
|
||||
// value: m_initialValue_
|
||||
return -1;
|
||||
}
|
||||
|
||||
// private data members --------------------------------------------
|
||||
|
||||
/**
|
||||
* Default value
|
||||
*/
|
||||
private char m_initialValue_;
|
||||
/**
|
||||
* Array of char data
|
||||
*/
|
||||
private char m_data_[];
|
||||
}
|
@ -0,0 +1,148 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import jdk_internal.bidi.CharacterIterator;
|
||||
import jdk_internal.icu.text.UCharacterIterator;
|
||||
|
||||
/**
|
||||
* This class is a wrapper around CharacterIterator and implements the
|
||||
* UCharacterIterator protocol
|
||||
*
|
||||
* @author ram
|
||||
*/
|
||||
|
||||
public class CharacterIteratorWrapper extends UCharacterIterator {
|
||||
|
||||
private CharacterIterator iterator;
|
||||
|
||||
public CharacterIteratorWrapper(CharacterIterator iter) {
|
||||
if (iter == null) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
iterator = iter;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#current()
|
||||
*/
|
||||
public int current() {
|
||||
int c = iterator.current();
|
||||
if (c == CharacterIterator.DONE) {
|
||||
return DONE;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#getLength()
|
||||
*/
|
||||
public int getLength() {
|
||||
return (iterator.getEndIndex() - iterator.getBeginIndex());
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#getIndex()
|
||||
*/
|
||||
public int getIndex() {
|
||||
return iterator.getIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#next()
|
||||
*/
|
||||
public int next() {
|
||||
int i = iterator.current();
|
||||
iterator.next();
|
||||
if (i == CharacterIterator.DONE) {
|
||||
return DONE;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#previous()
|
||||
*/
|
||||
public int previous() {
|
||||
int i = iterator.previous();
|
||||
if (i == CharacterIterator.DONE) {
|
||||
return DONE;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#setIndex(int)
|
||||
*/
|
||||
public void setIndex(int index) {
|
||||
iterator.setIndex(index);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#getText(char[])
|
||||
*/
|
||||
public int getText(char[] fillIn, int offset) {
|
||||
int length = iterator.getEndIndex() - iterator.getBeginIndex();
|
||||
int currentIndex = iterator.getIndex();
|
||||
if (offset < 0 || offset + length > fillIn.length) {
|
||||
throw new IndexOutOfBoundsException(Integer.toString(length));
|
||||
}
|
||||
|
||||
for (char ch = iterator.first(); ch != CharacterIterator.DONE; ch = iterator.next()) {
|
||||
fillIn[offset++] = ch;
|
||||
}
|
||||
iterator.setIndex(currentIndex);
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a clone of this iterator. Clones the underlying character iterator.
|
||||
*
|
||||
* @see UCharacterIterator#clone()
|
||||
*/
|
||||
public Object clone() {
|
||||
try {
|
||||
CharacterIteratorWrapper result = (CharacterIteratorWrapper) super.clone();
|
||||
result.iterator = (CharacterIterator) this.iterator.clone();
|
||||
return result;
|
||||
} catch (CloneNotSupportedException e) {
|
||||
return null; // only invoked if bad underlying character iterator
|
||||
}
|
||||
}
|
||||
}
|
303
sources/main/java/jdk_internal/icu/impl/ICUBinary.java
Normal file
303
sources/main/java/jdk_internal/icu/impl/ICUBinary.java
Normal file
@ -0,0 +1,303 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.util.Arrays;
|
||||
|
||||
import jdk_internal.icu.util.VersionInfo;
|
||||
import net.lax1dude.eaglercraft.v1_8.EagRuntime;
|
||||
|
||||
public final class ICUBinary {
|
||||
|
||||
private static final class IsAcceptable implements Authenticate {
|
||||
@Override
|
||||
public boolean isDataVersionAcceptable(byte version[]) {
|
||||
return version[0] == 1;
|
||||
}
|
||||
}
|
||||
|
||||
// public inner interface ------------------------------------------------
|
||||
|
||||
/**
 * Callback through which a caller decides whether the version bytes found in
 * a data header are acceptable.
 */
public interface Authenticate {
    /**
     * Invoked by the {@code ICUBinary.readHeader()} methods to authenticate
     * the data format.
     *
     * @param version four version bytes taken from the header being read
     * @return {@code true} if the version is acceptable, {@code false}
     *         otherwise
     */
    boolean isDataVersionAcceptable(byte[] version);
}
|
||||
|
||||
// public methods --------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Loads an ICU binary data file and returns it as a ByteBuffer. The buffer
|
||||
* contents is normally read-only, but its position etc. can be modified.
|
||||
*
|
||||
* @param itemPath Relative ICU data item path, for example "root.res" or
|
||||
* "coll/ucadata.icu".
|
||||
* @return The data as a read-only ByteBuffer.
|
||||
*/
|
||||
public static ByteBuffer getRequiredData(String itemPath) {
|
||||
try (InputStream is = EagRuntime.getRequiredResourceStream(itemPath)) {
|
||||
|
||||
// is.available() may return 0, or 1, or the total number of bytes in the
|
||||
// stream,
|
||||
// or some other number.
|
||||
// Do not try to use is.available() == 0 to find the end of the stream!
|
||||
byte[] bytes;
|
||||
int avail = is.available();
|
||||
if (avail > 32) {
|
||||
// There are more bytes available than just the ICU data header length.
|
||||
// With luck, it is the total number of bytes.
|
||||
bytes = new byte[avail];
|
||||
} else {
|
||||
bytes = new byte[128]; // empty .res files are even smaller
|
||||
}
|
||||
// Call is.read(...) until one returns a negative value.
|
||||
int length = 0;
|
||||
for (;;) {
|
||||
if (length < bytes.length) {
|
||||
int numRead = is.read(bytes, length, bytes.length - length);
|
||||
if (numRead < 0) {
|
||||
break; // end of stream
|
||||
}
|
||||
length += numRead;
|
||||
} else {
|
||||
// See if we are at the end of the stream before we grow the array.
|
||||
int nextByte = is.read();
|
||||
if (nextByte < 0) {
|
||||
break;
|
||||
}
|
||||
int capacity = 2 * bytes.length;
|
||||
if (capacity < 128) {
|
||||
capacity = 128;
|
||||
} else if (capacity < 0x4000) {
|
||||
capacity *= 2; // Grow faster until we reach 16kB.
|
||||
}
|
||||
bytes = Arrays.copyOf(bytes, capacity);
|
||||
bytes[length++] = (byte) nextByte;
|
||||
}
|
||||
}
|
||||
return ByteBuffer.wrap(bytes, 0, length);
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as readHeader(), but returns a VersionInfo rather than a compact int.
|
||||
*/
|
||||
public static VersionInfo readHeaderAndDataVersion(ByteBuffer bytes, int dataFormat, Authenticate authenticate)
|
||||
throws IOException {
|
||||
return getVersionInfoFromCompactInt(readHeader(bytes, dataFormat, authenticate));
|
||||
}
|
||||
|
||||
private static final byte BIG_ENDIAN_ = 1;
|
||||
|
||||
public static final byte[] readHeader(InputStream inputStream, byte dataFormatIDExpected[],
|
||||
Authenticate authenticate) throws IOException {
|
||||
DataInputStream input = new DataInputStream(inputStream);
|
||||
char headersize = input.readChar();
|
||||
int readcount = 2;
|
||||
// reading the header format
|
||||
byte magic1 = input.readByte();
|
||||
readcount++;
|
||||
byte magic2 = input.readByte();
|
||||
readcount++;
|
||||
if (magic1 != MAGIC1 || magic2 != MAGIC2) {
|
||||
throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_);
|
||||
}
|
||||
|
||||
input.readChar(); // reading size
|
||||
readcount += 2;
|
||||
input.readChar(); // reading reserved word
|
||||
readcount += 2;
|
||||
byte bigendian = input.readByte();
|
||||
readcount++;
|
||||
byte charset = input.readByte();
|
||||
readcount++;
|
||||
byte charsize = input.readByte();
|
||||
readcount++;
|
||||
input.readByte(); // reading reserved byte
|
||||
readcount++;
|
||||
|
||||
byte dataFormatID[] = new byte[4];
|
||||
input.readFully(dataFormatID);
|
||||
readcount += 4;
|
||||
byte dataVersion[] = new byte[4];
|
||||
input.readFully(dataVersion);
|
||||
readcount += 4;
|
||||
byte unicodeVersion[] = new byte[4];
|
||||
input.readFully(unicodeVersion);
|
||||
readcount += 4;
|
||||
if (headersize < readcount) {
|
||||
throw new IOException("Internal Error: Header size error");
|
||||
}
|
||||
input.skipBytes(headersize - readcount);
|
||||
|
||||
if (bigendian != BIG_ENDIAN_ || charset != CHAR_SET_ || charsize != CHAR_SIZE_
|
||||
|| !Arrays.equals(dataFormatIDExpected, dataFormatID)
|
||||
|| (authenticate != null && !authenticate.isDataVersionAcceptable(dataVersion))) {
|
||||
throw new IOException(HEADER_AUTHENTICATION_FAILED_);
|
||||
}
|
||||
return unicodeVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads an ICU data header, checks the data format, and returns the data
|
||||
* version.
|
||||
*
|
||||
* <p>
|
||||
* Assumes that the ByteBuffer position is 0 on input. The buffer byte order is
|
||||
* set according to the data. The buffer position is advanced past the header
|
||||
* (including UDataInfo and comment).
|
||||
*
|
||||
* <p>
|
||||
* See C++ ucmndata.h and unicode/udata.h.
|
||||
*
|
||||
* @return dataVersion
|
||||
* @throws IOException if this is not a valid ICU data item of the expected
|
||||
* dataFormat
|
||||
*/
|
||||
public static int readHeader(ByteBuffer bytes, int dataFormat, Authenticate authenticate) throws IOException {
|
||||
assert bytes.position() == 0;
|
||||
byte magic1 = bytes.get(2);
|
||||
byte magic2 = bytes.get(3);
|
||||
if (magic1 != MAGIC1 || magic2 != MAGIC2) {
|
||||
throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_);
|
||||
}
|
||||
|
||||
byte isBigEndian = bytes.get(8);
|
||||
byte charsetFamily = bytes.get(9);
|
||||
byte sizeofUChar = bytes.get(10);
|
||||
if (isBigEndian < 0 || 1 < isBigEndian || charsetFamily != CHAR_SET_ || sizeofUChar != CHAR_SIZE_) {
|
||||
throw new IOException(HEADER_AUTHENTICATION_FAILED_);
|
||||
}
|
||||
bytes.order(isBigEndian != 0 ? ByteOrder.BIG_ENDIAN : ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
int headerSize = bytes.getChar(0);
|
||||
int sizeofUDataInfo = bytes.getChar(4);
|
||||
if (sizeofUDataInfo < 20 || headerSize < (sizeofUDataInfo + 4)) {
|
||||
throw new IOException("Internal Error: Header size error");
|
||||
}
|
||||
// TODO: Change Authenticate to take int major, int minor, int milli, int micro
|
||||
// to avoid array allocation.
|
||||
byte[] formatVersion = new byte[] { bytes.get(16), bytes.get(17), bytes.get(18), bytes.get(19) };
|
||||
if (bytes.get(12) != (byte) (dataFormat >> 24) || bytes.get(13) != (byte) (dataFormat >> 16)
|
||||
|| bytes.get(14) != (byte) (dataFormat >> 8) || bytes.get(15) != (byte) dataFormat
|
||||
|| (authenticate != null && !authenticate.isDataVersionAcceptable(formatVersion))) {
|
||||
throw new IOException(HEADER_AUTHENTICATION_FAILED_
|
||||
+ String.format("; data format %02x%02x%02x%02x, format version %d.%d.%d.%d", bytes.get(12),
|
||||
bytes.get(13), bytes.get(14), bytes.get(15), formatVersion[0] & 0xff,
|
||||
formatVersion[1] & 0xff, formatVersion[2] & 0xff, formatVersion[3] & 0xff));
|
||||
}
|
||||
|
||||
bytes.position(headerSize);
|
||||
return // dataVersion
|
||||
((int) bytes.get(20) << 24) | ((bytes.get(21) & 0xff) << 16) | ((bytes.get(22) & 0xff) << 8)
|
||||
| (bytes.get(23) & 0xff);
|
||||
}
|
||||
|
||||
public static void skipBytes(ByteBuffer bytes, int skipLength) {
|
||||
if (skipLength > 0) {
|
||||
bytes.position(bytes.position() + skipLength);
|
||||
}
|
||||
}
|
||||
|
||||
public static byte[] getBytes(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
byte[] dest = new byte[length];
|
||||
bytes.get(dest);
|
||||
if (additionalSkipLength > 0) {
|
||||
skipBytes(bytes, additionalSkipLength);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
public static String getString(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
CharSequence cs = bytes.asCharBuffer();
|
||||
String s = cs.subSequence(0, length).toString();
|
||||
skipBytes(bytes, length * 2 + additionalSkipLength);
|
||||
return s;
|
||||
}
|
||||
|
||||
public static char[] getChars(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
char[] dest = new char[length];
|
||||
bytes.asCharBuffer().get(dest);
|
||||
skipBytes(bytes, length * 2 + additionalSkipLength);
|
||||
return dest;
|
||||
}
|
||||
|
||||
public static int[] getInts(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
int[] dest = new int[length];
|
||||
bytes.asIntBuffer().get(dest);
|
||||
skipBytes(bytes, length * 4 + additionalSkipLength);
|
||||
return dest;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a VersionInfo for the bytes in the compact version integer.
|
||||
*/
|
||||
public static VersionInfo getVersionInfoFromCompactInt(int version) {
|
||||
return VersionInfo.getInstance(version >>> 24, (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
|
||||
}
|
||||
|
||||
// private variables -------------------------------------------------
|
||||
|
||||
/**
|
||||
* Magic numbers to authenticate the data file
|
||||
*/
|
||||
private static final byte MAGIC1 = (byte) 0xda;
|
||||
private static final byte MAGIC2 = (byte) 0x27;
|
||||
|
||||
/**
|
||||
* File format authentication values
|
||||
*/
|
||||
private static final byte CHAR_SET_ = 0;
|
||||
private static final byte CHAR_SIZE_ = 2;
|
||||
|
||||
/**
|
||||
* Error messages
|
||||
*/
|
||||
private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ = "ICUBinary data file error: Magic number authentication failed";
|
||||
private static final String HEADER_AUTHENTICATION_FAILED_ = "ICUBinary data file error: Header authentication failed";
|
||||
}
|
296
sources/main/java/jdk_internal/icu/impl/Norm2AllModes.java
Normal file
296
sources/main/java/jdk_internal/icu/impl/Norm2AllModes.java
Normal file
@ -0,0 +1,296 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import jdk_internal.icu.text.Normalizer2;
|
||||
|
||||
public final class Norm2AllModes {
|
||||
// Public API dispatch via Normalizer2 subclasses -------------------------- ***
|
||||
|
||||
// Normalizer2 implementation for the old UNORM_NONE.
|
||||
public static final class NoopNormalizer2 extends Normalizer2 {
|
||||
@Override
|
||||
public StringBuilder normalize(CharSequence src, StringBuilder dest) {
|
||||
if (dest != src) {
|
||||
dest.setLength(0);
|
||||
return dest.append(src);
|
||||
} else {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Appendable normalize(CharSequence src, Appendable dest) {
|
||||
if (dest != src) {
|
||||
try {
|
||||
return dest.append(src);
|
||||
} catch (IOException e) {
|
||||
throw new InternalError(e.toString(), e);
|
||||
}
|
||||
} else {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
|
||||
if (first != second) {
|
||||
return first.append(second);
|
||||
} else {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public StringBuilder append(StringBuilder first, CharSequence second) {
|
||||
if (first != second) {
|
||||
return first.append(second);
|
||||
} else {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getDecomposition(int c) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// No need to override the default getRawDecomposition().
|
||||
@Override
|
||||
public boolean isNormalized(CharSequence s) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int spanQuickCheckYes(CharSequence s) {
|
||||
return s.length();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasBoundaryBefore(int c) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Intermediate class:
|
||||
// Has NormalizerImpl and does boilerplate argument checking and setup.
|
||||
public abstract static class Normalizer2WithImpl extends Normalizer2 {
|
||||
public Normalizer2WithImpl(NormalizerImpl ni) {
|
||||
impl = ni;
|
||||
}
|
||||
|
||||
// normalize
|
||||
@Override
|
||||
public StringBuilder normalize(CharSequence src, StringBuilder dest) {
|
||||
if (dest == src) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
dest.setLength(0);
|
||||
normalize(src, new NormalizerImpl.ReorderingBuffer(impl, dest, src.length()));
|
||||
return dest;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Appendable normalize(CharSequence src, Appendable dest) {
|
||||
if (dest == src) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
NormalizerImpl.ReorderingBuffer buffer = new NormalizerImpl.ReorderingBuffer(impl, dest, src.length());
|
||||
normalize(src, buffer);
|
||||
buffer.flush();
|
||||
return dest;
|
||||
}
|
||||
|
||||
protected abstract void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer);
|
||||
|
||||
// normalize and append
|
||||
@Override
|
||||
public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
|
||||
return normalizeSecondAndAppend(first, second, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public StringBuilder append(StringBuilder first, CharSequence second) {
|
||||
return normalizeSecondAndAppend(first, second, false);
|
||||
}
|
||||
|
||||
public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second, boolean doNormalize) {
|
||||
if (first == second) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
normalizeAndAppend(second, doNormalize,
|
||||
new NormalizerImpl.ReorderingBuffer(impl, first, first.length() + second.length()));
|
||||
return first;
|
||||
}
|
||||
|
||||
protected abstract void normalizeAndAppend(CharSequence src, boolean doNormalize,
|
||||
NormalizerImpl.ReorderingBuffer buffer);
|
||||
|
||||
@Override
|
||||
public String getDecomposition(int c) {
|
||||
return impl.getDecomposition(c);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getCombiningClass(int c) {
|
||||
return impl.getCC(impl.getNorm16(c));
|
||||
}
|
||||
|
||||
// quick checks
|
||||
@Override
|
||||
public boolean isNormalized(CharSequence s) {
|
||||
return s.length() == spanQuickCheckYes(s);
|
||||
}
|
||||
|
||||
public final NormalizerImpl impl;
|
||||
}
|
||||
|
||||
public static final class DecomposeNormalizer2 extends Normalizer2WithImpl {
|
||||
public DecomposeNormalizer2(NormalizerImpl ni) {
|
||||
super(ni);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) {
|
||||
impl.decompose(src, 0, src.length(), buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void normalizeAndAppend(CharSequence src, boolean doNormalize,
|
||||
NormalizerImpl.ReorderingBuffer buffer) {
|
||||
impl.decomposeAndAppend(src, doNormalize, buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int spanQuickCheckYes(CharSequence s) {
|
||||
return impl.decompose(s, 0, s.length(), null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasBoundaryBefore(int c) {
|
||||
return impl.hasDecompBoundaryBefore(c);
|
||||
}
|
||||
}
|
||||
|
||||
public static final class ComposeNormalizer2 extends Normalizer2WithImpl {
|
||||
public ComposeNormalizer2(NormalizerImpl ni, boolean fcc) {
|
||||
super(ni);
|
||||
onlyContiguous = fcc;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) {
|
||||
impl.compose(src, 0, src.length(), onlyContiguous, true, buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void normalizeAndAppend(CharSequence src, boolean doNormalize,
|
||||
NormalizerImpl.ReorderingBuffer buffer) {
|
||||
impl.composeAndAppend(src, doNormalize, onlyContiguous, buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNormalized(CharSequence s) {
|
||||
// 5: small destCapacity for substring normalization
|
||||
return impl.compose(s, 0, s.length(), onlyContiguous, false,
|
||||
new NormalizerImpl.ReorderingBuffer(impl, new StringBuilder(), 5));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int spanQuickCheckYes(CharSequence s) {
|
||||
return impl.composeQuickCheck(s, 0, s.length(), onlyContiguous, true) >>> 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasBoundaryBefore(int c) {
|
||||
return impl.hasCompBoundaryBefore(c);
|
||||
}
|
||||
|
||||
private final boolean onlyContiguous;
|
||||
}
|
||||
|
||||
// instance cache ---------------------------------------------------------- ***
|
||||
|
||||
/**
 * Stores the implementation object and builds the composing normalizer (with
 * its {@code fcc} flag set to {@code false}) and the decomposing normalizer
 * on top of it.
 */
private Norm2AllModes(NormalizerImpl ni) {
    this.impl = ni;
    this.comp = new ComposeNormalizer2(ni, false);
    this.decomp = new DecomposeNormalizer2(ni);
}
|
||||
|
||||
public final NormalizerImpl impl;
|
||||
public final ComposeNormalizer2 comp;
|
||||
public final DecomposeNormalizer2 decomp;
|
||||
|
||||
private static Norm2AllModes getInstanceFromSingleton(Norm2AllModesSingleton singleton) {
|
||||
if (singleton.exception != null) {
|
||||
throw singleton.exception;
|
||||
}
|
||||
return singleton.allModes;
|
||||
}
|
||||
|
||||
public static Norm2AllModes getNFCInstance() {
|
||||
return getInstanceFromSingleton(NFCSingleton.INSTANCE);
|
||||
}
|
||||
|
||||
public static Norm2AllModes getNFKCInstance() {
|
||||
return getInstanceFromSingleton(NFKCSingleton.INSTANCE);
|
||||
}
|
||||
|
||||
public static final NoopNormalizer2 NOOP_NORMALIZER2 = new NoopNormalizer2();
|
||||
|
||||
private static final class Norm2AllModesSingleton {
|
||||
private Norm2AllModesSingleton(String name) {
|
||||
try {
|
||||
@SuppressWarnings("deprecation")
|
||||
String DATA_FILE_NAME = "/assets/eagler/icudt/" + name + ".nrm";
|
||||
NormalizerImpl impl = new NormalizerImpl().load(DATA_FILE_NAME);
|
||||
allModes = new Norm2AllModes(impl);
|
||||
} catch (RuntimeException e) {
|
||||
exception = e;
|
||||
}
|
||||
}
|
||||
|
||||
private Norm2AllModes allModes;
|
||||
private RuntimeException exception;
|
||||
}
|
||||
|
||||
private static final class NFCSingleton {
|
||||
private static final Norm2AllModesSingleton INSTANCE = new Norm2AllModesSingleton("nfc");
|
||||
}
|
||||
|
||||
private static final class NFKCSingleton {
|
||||
private static final Norm2AllModesSingleton INSTANCE = new Norm2AllModesSingleton("nfkc");
|
||||
}
|
||||
}
|
2261
sources/main/java/jdk_internal/icu/impl/NormalizerImpl.java
Normal file
2261
sources/main/java/jdk_internal/icu/impl/NormalizerImpl.java
Normal file
File diff suppressed because it is too large
Load Diff
500
sources/main/java/jdk_internal/icu/impl/Punycode.java
Normal file
500
sources/main/java/jdk_internal/icu/impl/Punycode.java
Normal file
@ -0,0 +1,500 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2004, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
//
|
||||
// CHANGELOG
|
||||
// 2005-05-19 Edward Wang
|
||||
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
|
||||
// - move from package com.ibm.icu.text to package sun.net.idn
|
||||
// - use ParseException instead of StringPrepParseException
|
||||
// 2007-08-14 Martin Buchholz
|
||||
// - remove redundant casts
|
||||
//
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.text.ParseException;
|
||||
|
||||
import jdk_internal.icu.lang.UCharacter;
|
||||
import jdk_internal.icu.text.UTF16;
|
||||
|
||||
/**
|
||||
* Ported code from ICU punycode.c
|
||||
*
|
||||
* @author ram
|
||||
*/
|
||||
|
||||
/* Package Private class */
|
||||
public final class Punycode {
|
||||
|
||||
/* Punycode parameters for Bootstring */
|
||||
private static final int BASE = 36;
|
||||
private static final int TMIN = 1;
|
||||
private static final int TMAX = 26;
|
||||
private static final int SKEW = 38;
|
||||
private static final int DAMP = 700;
|
||||
private static final int INITIAL_BIAS = 72;
|
||||
private static final int INITIAL_N = 0x80;
|
||||
|
||||
/* "Basic" Unicode/ASCII code points */
|
||||
private static final int HYPHEN = 0x2d;
|
||||
private static final int DELIMITER = HYPHEN;
|
||||
|
||||
private static final int ZERO = 0x30;
|
||||
private static final int NINE = 0x39;
|
||||
|
||||
private static final int SMALL_A = 0x61;
|
||||
private static final int SMALL_Z = 0x7a;
|
||||
|
||||
private static final int CAPITAL_A = 0x41;
|
||||
private static final int CAPITAL_Z = 0x5a;
|
||||
|
||||
// TODO: eliminate the 256 limitation
|
||||
private static final int MAX_CP_COUNT = 256;
|
||||
|
||||
private static final int UINT_MAGIC = 0x80000000;
|
||||
private static final long ULONG_MAGIC = 0x8000000000000000L;
|
||||
|
||||
private static int adaptBias(int delta, int length, boolean firstTime) {
|
||||
if (firstTime) {
|
||||
delta /= DAMP;
|
||||
} else {
|
||||
delta /= 2;
|
||||
}
|
||||
delta += delta / length;
|
||||
|
||||
int count = 0;
|
||||
for (; delta > ((BASE - TMIN) * TMAX) / 2; count += BASE) {
|
||||
delta /= (BASE - TMIN);
|
||||
}
|
||||
|
||||
return count + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
|
||||
}
|
||||
|
||||
/**
|
||||
* basicToDigit[] contains the numeric value of a basic code point (for use in
|
||||
* representing integers) in the range 0 to BASE-1, or -1 if b does not
|
||||
* represent a value.
|
||||
*/
|
||||
static final int[] basicToDigit = new int[] { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1,
|
||||
-1, -1, -1, -1, -1,
|
||||
|
||||
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1,
|
||||
-1, -1, -1,
|
||||
|
||||
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1,
|
||||
-1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1 };
|
||||
|
||||
private static char asciiCaseMap(char b, boolean uppercase) {
|
||||
if (uppercase) {
|
||||
if (SMALL_A <= b && b <= SMALL_Z) {
|
||||
b -= (SMALL_A - CAPITAL_A);
|
||||
}
|
||||
} else {
|
||||
if (CAPITAL_A <= b && b <= CAPITAL_Z) {
|
||||
b += (SMALL_A - CAPITAL_A);
|
||||
}
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
/**
|
||||
* digitToBasic() returns the basic code point whose value (when used for
|
||||
* representing integers) is d, which must be in the range 0 to BASE-1. The
|
||||
* lowercase form is used unless the uppercase flag is nonzero, in which case
|
||||
* the uppercase form is used.
|
||||
*/
|
||||
private static char digitToBasic(int digit, boolean uppercase) {
|
||||
/* 0..25 map to ASCII a..z or A..Z */
|
||||
/* 26..35 map to ASCII 0..9 */
|
||||
if (digit < 26) {
|
||||
if (uppercase) {
|
||||
return (char) (CAPITAL_A + digit);
|
||||
} else {
|
||||
return (char) (SMALL_A + digit);
|
||||
}
|
||||
} else {
|
||||
return (char) ((ZERO - 26) + digit);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts Unicode to Punycode. The input string must not contain single,
|
||||
* unpaired surrogates. The output will be represented as an array of ASCII code
|
||||
* points.
|
||||
*
|
||||
* @param src
|
||||
* @param caseFlags
|
||||
* @return
|
||||
* @throws ParseException
|
||||
*/
|
||||
public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException {
|
||||
|
||||
int[] cpBuffer = new int[MAX_CP_COUNT];
|
||||
int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
|
||||
char c, c2;
|
||||
int srcLength = src.length();
|
||||
int destCapacity = MAX_CP_COUNT;
|
||||
char[] dest = new char[destCapacity];
|
||||
StringBuffer result = new StringBuffer();
|
||||
/*
|
||||
* Handle the basic code points and convert extended ones to UTF-32 in cpBuffer
|
||||
* (caseFlag in sign bit):
|
||||
*/
|
||||
srcCPCount = destLength = 0;
|
||||
|
||||
for (j = 0; j < srcLength; ++j) {
|
||||
if (srcCPCount == MAX_CP_COUNT) {
|
||||
/* too many input code points */
|
||||
throw new ParseException("Too many input code points", -1);
|
||||
}
|
||||
c = src.charAt(j);
|
||||
if (isBasic(c)) {
|
||||
if (destLength < destCapacity) {
|
||||
cpBuffer[srcCPCount++] = 0;
|
||||
dest[destLength] = caseFlags != null ? asciiCaseMap(c, caseFlags[j]) : c;
|
||||
}
|
||||
++destLength;
|
||||
} else {
|
||||
n = ((caseFlags != null && caseFlags[j]) ? 1 : 0) << 31L;
|
||||
if (!UTF16.isSurrogate(c)) {
|
||||
n |= c;
|
||||
} else if (UTF16.isLeadSurrogate(c) && (j + 1) < srcLength
|
||||
&& UTF16.isTrailSurrogate(c2 = src.charAt(j + 1))) {
|
||||
++j;
|
||||
|
||||
n |= UCharacter.getCodePoint(c, c2);
|
||||
} else {
|
||||
/* error: unmatched surrogate */
|
||||
throw new ParseException("Illegal char found", -1);
|
||||
}
|
||||
cpBuffer[srcCPCount++] = n;
|
||||
}
|
||||
}
|
||||
|
||||
/* Finish the basic string - if it is not empty - with a delimiter. */
|
||||
basicLength = destLength;
|
||||
if (basicLength > 0) {
|
||||
if (destLength < destCapacity) {
|
||||
dest[destLength] = DELIMITER;
|
||||
}
|
||||
++destLength;
|
||||
}
|
||||
|
||||
/*
|
||||
* handledCPCount is the number of code points that have been handled
|
||||
* basicLength is the number of basic code points destLength is the number of
|
||||
* chars that have been output
|
||||
*/
|
||||
|
||||
/* Initialize the state: */
|
||||
n = INITIAL_N;
|
||||
delta = 0;
|
||||
bias = INITIAL_BIAS;
|
||||
|
||||
/* Main encoding loop: */
|
||||
for (handledCPCount = basicLength; handledCPCount < srcCPCount; /* no op */) {
|
||||
/*
|
||||
* All non-basic code points < n have been handled already. Find the next larger
|
||||
* one:
|
||||
*/
|
||||
for (m = 0x7fffffff, j = 0; j < srcCPCount; ++j) {
|
||||
q = cpBuffer[j] & 0x7fffffff; /* remove case flag from the sign bit */
|
||||
if (n <= q && q < m) {
|
||||
m = q;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Increase delta enough to advance the decoder's <n,i> state to <m,0>, but
|
||||
* guard against overflow:
|
||||
*/
|
||||
if (m - n > (0x7fffffff - MAX_CP_COUNT - delta) / (handledCPCount + 1)) {
|
||||
throw new RuntimeException("Internal program error");
|
||||
}
|
||||
delta += (m - n) * (handledCPCount + 1);
|
||||
n = m;
|
||||
|
||||
/* Encode a sequence of same code points n */
|
||||
for (j = 0; j < srcCPCount; ++j) {
|
||||
q = cpBuffer[j] & 0x7fffffff; /* remove case flag from the sign bit */
|
||||
if (q < n) {
|
||||
++delta;
|
||||
} else if (q == n) {
|
||||
/* Represent delta as a generalized variable-length integer: */
|
||||
for (q = delta, k = BASE; /* no condition */; k += BASE) {
|
||||
|
||||
/**
|
||||
* RAM: comment out the old code for conformance with
|
||||
* draft-ietf-idn-punycode-03.txt
|
||||
*
|
||||
* t=k-bias; if(t<TMIN) { t=TMIN; } else if(t>TMAX) { t=TMAX; }
|
||||
*/
|
||||
|
||||
t = k - bias;
|
||||
if (t < TMIN) {
|
||||
t = TMIN;
|
||||
} else if (k >= (bias + TMAX)) {
|
||||
t = TMAX;
|
||||
}
|
||||
|
||||
if (q < t) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (destLength < destCapacity) {
|
||||
dest[destLength++] = digitToBasic(t + (q - t) % (BASE - t), false);
|
||||
}
|
||||
q = (q - t) / (BASE - t);
|
||||
}
|
||||
|
||||
if (destLength < destCapacity) {
|
||||
dest[destLength++] = digitToBasic(q, (cpBuffer[j] < 0));
|
||||
}
|
||||
bias = adaptBias(delta, handledCPCount + 1, (handledCPCount == basicLength));
|
||||
delta = 0;
|
||||
++handledCPCount;
|
||||
}
|
||||
}
|
||||
|
||||
++delta;
|
||||
++n;
|
||||
}
|
||||
|
||||
return result.append(dest, 0, destLength);
|
||||
}
|
||||
|
||||
private static boolean isBasic(int ch) {
|
||||
return (ch < INITIAL_N);
|
||||
}
|
||||
|
||||
private static boolean isBasicUpperCase(int ch) {
|
||||
return (CAPITAL_A <= ch && ch <= CAPITAL_Z);
|
||||
}
|
||||
|
||||
private static boolean isSurrogate(int ch) {
|
||||
return (((ch) & 0xfffff800) == 0xd800);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts Punycode to Unicode. The Unicode string will be at most as long as
|
||||
* the Punycode string.
|
||||
*
|
||||
* @param src
|
||||
* @param caseFlags
|
||||
* @return
|
||||
* @throws ParseException
|
||||
*/
|
||||
public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) throws ParseException {
|
||||
int srcLength = src.length();
|
||||
StringBuffer result = new StringBuffer();
|
||||
int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t, destCPCount, firstSupplementaryIndex,
|
||||
cpLength;
|
||||
char b;
|
||||
int destCapacity = MAX_CP_COUNT;
|
||||
char[] dest = new char[destCapacity];
|
||||
|
||||
/*
|
||||
* Handle the basic code points: Let basicLength be the number of input code
|
||||
* points before the last delimiter, or 0 if there is none, then copy the first
|
||||
* basicLength code points to the output.
|
||||
*
|
||||
* The two following loops iterate backward.
|
||||
*/
|
||||
for (j = srcLength; j > 0;) {
|
||||
if (src.charAt(--j) == DELIMITER) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
destLength = basicLength = destCPCount = j;
|
||||
|
||||
while (j > 0) {
|
||||
b = src.charAt(--j);
|
||||
if (!isBasic(b)) {
|
||||
throw new ParseException("Illegal char found", -1);
|
||||
}
|
||||
|
||||
if (j < destCapacity) {
|
||||
dest[j] = b;
|
||||
|
||||
if (caseFlags != null) {
|
||||
caseFlags[j] = isBasicUpperCase(b);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Initialize the state: */
|
||||
n = INITIAL_N;
|
||||
i = 0;
|
||||
bias = INITIAL_BIAS;
|
||||
firstSupplementaryIndex = 1000000000;
|
||||
|
||||
/*
|
||||
* Main decoding loop: Start just after the last delimiter if any basic code
|
||||
* points were copied; start at the beginning otherwise.
|
||||
*/
|
||||
for (in = basicLength > 0 ? basicLength + 1 : 0; in < srcLength; /* no op */) {
|
||||
/*
|
||||
* in is the index of the next character to be consumed, and destCPCount is the
|
||||
* number of code points in the output array.
|
||||
*
|
||||
* Decode a generalized variable-length integer into delta, which gets added to
|
||||
* i. The overflow checking is easier if we increase i as we go, then subtract
|
||||
* off its starting value at the end to obtain delta.
|
||||
*/
|
||||
for (oldi = i, w = 1, k = BASE; /* no condition */; k += BASE) {
|
||||
if (in >= srcLength) {
|
||||
throw new ParseException("Illegal char found", -1);
|
||||
}
|
||||
|
||||
digit = basicToDigit[(byte) src.charAt(in++)];
|
||||
if (digit < 0) {
|
||||
throw new ParseException("Invalid char found", -1);
|
||||
}
|
||||
if (digit > (0x7fffffff - i) / w) {
|
||||
/* integer overflow */
|
||||
throw new ParseException("Illegal char found", -1);
|
||||
}
|
||||
|
||||
i += digit * w;
|
||||
t = k - bias;
|
||||
if (t < TMIN) {
|
||||
t = TMIN;
|
||||
} else if (k >= (bias + TMAX)) {
|
||||
t = TMAX;
|
||||
}
|
||||
if (digit < t) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (w > 0x7fffffff / (BASE - t)) {
|
||||
/* integer overflow */
|
||||
throw new ParseException("Illegal char found", -1);
|
||||
}
|
||||
w *= BASE - t;
|
||||
}
|
||||
|
||||
/*
|
||||
* Modification from sample code: Increments destCPCount here, where needed
|
||||
* instead of in for() loop tail.
|
||||
*/
|
||||
++destCPCount;
|
||||
bias = adaptBias(i - oldi, destCPCount, (oldi == 0));
|
||||
|
||||
/*
|
||||
* i was supposed to wrap around from (incremented) destCPCount to 0,
|
||||
* incrementing n each time, so we'll fix that now:
|
||||
*/
|
||||
if (i / destCPCount > (0x7fffffff - n)) {
|
||||
/* integer overflow */
|
||||
throw new ParseException("Illegal char found", -1);
|
||||
}
|
||||
|
||||
n += i / destCPCount;
|
||||
i %= destCPCount;
|
||||
/* not needed for Punycode: */
|
||||
/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
|
||||
|
||||
if (n > 0x10ffff || isSurrogate(n)) {
|
||||
/* Unicode code point overflow */
|
||||
throw new ParseException("Illegal char found", -1);
|
||||
}
|
||||
|
||||
/* Insert n at position i of the output: */
|
||||
cpLength = UTF16.getCharCount(n);
|
||||
if ((destLength + cpLength) < destCapacity) {
|
||||
int codeUnitIndex;
|
||||
|
||||
/*
|
||||
* Handle indexes when supplementary code points are present.
|
||||
*
|
||||
* In almost all cases, there will be only BMP code points before i and even in
|
||||
* the entire string. This is handled with the same efficiency as with UTF-32.
|
||||
*
|
||||
* Only the rare cases with supplementary code points are handled more slowly -
|
||||
* but not too bad since this is an insertion anyway.
|
||||
*/
|
||||
if (i <= firstSupplementaryIndex) {
|
||||
codeUnitIndex = i;
|
||||
if (cpLength > 1) {
|
||||
firstSupplementaryIndex = codeUnitIndex;
|
||||
} else {
|
||||
++firstSupplementaryIndex;
|
||||
}
|
||||
} else {
|
||||
codeUnitIndex = firstSupplementaryIndex;
|
||||
codeUnitIndex = UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i - codeUnitIndex);
|
||||
}
|
||||
|
||||
/* use the UChar index codeUnitIndex instead of the code point index i */
|
||||
if (codeUnitIndex < destLength) {
|
||||
System.arraycopy(dest, codeUnitIndex, dest, codeUnitIndex + cpLength, (destLength - codeUnitIndex));
|
||||
if (caseFlags != null) {
|
||||
System.arraycopy(caseFlags, codeUnitIndex, caseFlags, codeUnitIndex + cpLength,
|
||||
destLength - codeUnitIndex);
|
||||
}
|
||||
}
|
||||
if (cpLength == 1) {
|
||||
/* BMP, insert one code unit */
|
||||
dest[codeUnitIndex] = (char) n;
|
||||
} else {
|
||||
/* supplementary character, insert two code units */
|
||||
dest[codeUnitIndex] = UTF16.getLeadSurrogate(n);
|
||||
dest[codeUnitIndex + 1] = UTF16.getTrailSurrogate(n);
|
||||
}
|
||||
if (caseFlags != null) {
|
||||
/* Case of last character determines uppercase flag: */
|
||||
caseFlags[codeUnitIndex] = isBasicUpperCase(src.charAt(in - 1));
|
||||
if (cpLength == 2) {
|
||||
caseFlags[codeUnitIndex + 1] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
destLength += cpLength;
|
||||
++i;
|
||||
}
|
||||
result.append(dest, 0, destLength);
|
||||
return result;
|
||||
}
|
||||
}
|
@ -0,0 +1,198 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import jdk_internal.icu.text.Replaceable;
|
||||
import jdk_internal.icu.text.ReplaceableString;
|
||||
import jdk_internal.icu.text.UCharacterIterator;
|
||||
|
||||
/**
|
||||
* DLF docs must define behavior when Replaceable is mutated underneath the
|
||||
* iterator.
|
||||
*
|
||||
* This and ICUCharacterIterator share some code, maybe they should share an
|
||||
* implementation, or the common state and implementation should be moved up
|
||||
* into UCharacterIterator.
|
||||
*
|
||||
* What are first, last, and getBeginIndex doing here?!?!?!
|
||||
*/
|
||||
public class ReplaceableUCharacterIterator extends UCharacterIterator {
|
||||
|
||||
// public constructor ------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Public constructor
|
||||
*
|
||||
* @param str text which the iterator will be based on
|
||||
*/
|
||||
public ReplaceableUCharacterIterator(String str) {
|
||||
if (str == null) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.replaceable = new ReplaceableString(str);
|
||||
this.currentIndex = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Public constructor
|
||||
*
|
||||
* @param buf buffer of text on which the iterator will be based
|
||||
*/
|
||||
public ReplaceableUCharacterIterator(StringBuffer buf) {
|
||||
if (buf == null) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.replaceable = new ReplaceableString(buf);
|
||||
this.currentIndex = 0;
|
||||
}
|
||||
|
||||
// public methods ----------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Creates a copy of this iterator, does not clone the underlying
|
||||
* <code>Replaceable</code>object
|
||||
*
|
||||
* @return copy of this iterator
|
||||
*/
|
||||
public Object clone() {
|
||||
try {
|
||||
return super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
return null; // never invoked
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current UTF16 character.
|
||||
*
|
||||
* @return current UTF16 character
|
||||
*/
|
||||
public int current() {
|
||||
if (currentIndex < replaceable.length()) {
|
||||
return replaceable.charAt(currentIndex);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the length of the text
|
||||
*
|
||||
* @return length of the text
|
||||
*/
|
||||
public int getLength() {
|
||||
return replaceable.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the current currentIndex in text.
|
||||
*
|
||||
* @return current currentIndex in text.
|
||||
*/
|
||||
public int getIndex() {
|
||||
return currentIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns next UTF16 character and increments the iterator's currentIndex by 1.
|
||||
* If the resulting currentIndex is greater or equal to the text length, the
|
||||
* currentIndex is reset to the text length and a value of DONECODEPOINT is
|
||||
* returned.
|
||||
*
|
||||
* @return next UTF16 character in text or DONE if the new currentIndex is off
|
||||
* the end of the text range.
|
||||
*/
|
||||
public int next() {
|
||||
if (currentIndex < replaceable.length()) {
|
||||
return replaceable.charAt(currentIndex++);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns previous UTF16 character and decrements the iterator's currentIndex
|
||||
* by 1. If the resulting currentIndex is less than 0, the currentIndex is reset
|
||||
* to 0 and a value of DONECODEPOINT is returned.
|
||||
*
|
||||
* @return next UTF16 character in text or DONE if the new currentIndex is off
|
||||
* the start of the text range.
|
||||
*/
|
||||
public int previous() {
|
||||
if (currentIndex > 0) {
|
||||
return replaceable.charAt(--currentIndex);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the currentIndex to the specified currentIndex in the text and returns
|
||||
* that single UTF16 character at currentIndex. This assumes the text is stored
|
||||
* as 16-bit code units.
|
||||
*
|
||||
* @param currentIndex the currentIndex within the text.
|
||||
* @exception IllegalArgumentException is thrown if an invalid currentIndex is
|
||||
* supplied. i.e. currentIndex is out of
|
||||
* bounds.
|
||||
*/
|
||||
public void setIndex(int currentIndex) {
|
||||
if (currentIndex < 0 || currentIndex > replaceable.length()) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.currentIndex = currentIndex;
|
||||
}
|
||||
|
||||
public int getText(char[] fillIn, int offset) {
|
||||
int length = replaceable.length();
|
||||
if (offset < 0 || offset + length > fillIn.length) {
|
||||
throw new IndexOutOfBoundsException(Integer.toString(length));
|
||||
}
|
||||
replaceable.getChars(0, length, fillIn, offset);
|
||||
return length;
|
||||
}
|
||||
|
||||
// private data members ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Replaceable object
|
||||
*/
|
||||
private Replaceable replaceable;
|
||||
/**
|
||||
* Current currentIndex
|
||||
*/
|
||||
private int currentIndex;
|
||||
|
||||
}
|
@ -0,0 +1,120 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 2003, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
******************************************************************************
|
||||
*
|
||||
* Created on May 2, 2003
|
||||
*
|
||||
* To change the template for this generated file go to
|
||||
* Window>Preferences>Java>Code Generation>Code and Comments
|
||||
*/
|
||||
// CHANGELOG
|
||||
// 2005-05-19 Edward Wang
|
||||
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/impl/StringPrepDataReader.java
|
||||
// - move from package com.ibm.icu.impl to package sun.net.idn
|
||||
//
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* @author ram
|
||||
*
|
||||
* To change the template for this generated type comment go to
|
||||
* Window>Preferences>Java>Code Generation>Code and Comments
|
||||
*/
|
||||
public final class StringPrepDataReader implements ICUBinary.Authenticate {
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* private constructor.
|
||||
* </p>
|
||||
*
|
||||
* @param inputStream ICU uprop.dat file input stream
|
||||
* @exception IOException throw if data file fails authentication
|
||||
* @draft 2.1
|
||||
*/
|
||||
public StringPrepDataReader(InputStream inputStream) throws IOException {
|
||||
|
||||
unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
|
||||
|
||||
dataInputStream = new DataInputStream(inputStream);
|
||||
|
||||
}
|
||||
|
||||
public void read(byte[] idnaBytes, char[] mappingTable) throws IOException {
|
||||
|
||||
// Read the bytes that make up the idnaTrie
|
||||
dataInputStream.read(idnaBytes);
|
||||
|
||||
// Read the extra data
|
||||
for (int i = 0; i < mappingTable.length; i++) {
|
||||
mappingTable[i] = dataInputStream.readChar();
|
||||
}
|
||||
}
|
||||
|
||||
public byte[] getDataFormatVersion() {
|
||||
return DATA_FORMAT_VERSION;
|
||||
}
|
||||
|
||||
public boolean isDataVersionAcceptable(byte version[]) {
|
||||
return version[0] == DATA_FORMAT_VERSION[0] && version[2] == DATA_FORMAT_VERSION[2]
|
||||
&& version[3] == DATA_FORMAT_VERSION[3];
|
||||
}
|
||||
|
||||
public int[] readIndexes(int length) throws IOException {
|
||||
int[] indexes = new int[length];
|
||||
// Read the indexes
|
||||
for (int i = 0; i < length; i++) {
|
||||
indexes[i] = dataInputStream.readInt();
|
||||
}
|
||||
return indexes;
|
||||
}
|
||||
|
||||
public byte[] getUnicodeVersion() {
|
||||
return unicodeVersion;
|
||||
}
|
||||
// private data members -------------------------------------------------
|
||||
|
||||
/**
|
||||
* ICU data file input stream
|
||||
*/
|
||||
private DataInputStream dataInputStream;
|
||||
private byte[] unicodeVersion;
|
||||
/**
|
||||
* File format version that this class understands. No guarantees are made if a
|
||||
* older version is used see store.c of gennorm for more information and values
|
||||
*/
|
||||
/// * dataFormat="SPRP" 0x53, 0x50, 0x52, 0x50 */
|
||||
private static final byte DATA_FORMAT_ID[] = { (byte) 0x53, (byte) 0x50, (byte) 0x52, (byte) 0x50 };
|
||||
private static final byte DATA_FORMAT_VERSION[] = { (byte) 0x3, (byte) 0x2, (byte) 0x5, (byte) 0x2 };
|
||||
|
||||
}
|
368
sources/main/java/jdk_internal/icu/impl/Trie.java
Normal file
368
sources/main/java/jdk_internal/icu/impl/Trie.java
Normal file
@ -0,0 +1,368 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStream;
|
||||
|
||||
import jdk_internal.icu.lang.UCharacter;
|
||||
import jdk_internal.icu.text.UTF16;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * <p>
 * A trie is a kind of compressed, serializable table of values associated with
 * Unicode code points (0..0x10ffff).
 * </p>
 * <p>
 * This class defines the basic structure of a trie and provides methods to
 * <b>retrieve the offsets to the actual data</b>. It reads the common header
 * and the index array; the data array itself is left to subclasses.
 * </p>
 * <p>
 * Data will be the form of an array of basic types, char or int.
 * </p>
 * <p>
 * The actual data format will have to be specified by the user in the inner
 * static interface {@link DataManipulate}.
 * </p>
 * <p>
 * This trie implementation is optimized for getting offset while walking
 * forward through a UTF-16 string. Therefore, the simplest and fastest access
 * method is getLeadOffset(). getBMPOffset() is a little more complicated: it
 * gets offsets even for lead surrogate code points, while getLeadOffset() gets
 * special "folded" offsets for lead surrogate code units if there is relevant
 * data associated with them. From such a folded value, an offset needs to be
 * extracted to locate the data of the trail surrogate. To handle such
 * supplementary code points, some offset information is kept in the data.
 * </p>
 * <p>
 * {@link DataManipulate#getFoldingOffset(int)} is called to retrieve that
 * offset from the folded value for the lead surrogate unit.
 * </p>
 * <p>
 * For examples of use, see the CharTrie and IntTrie subclasses of upstream
 * ICU4J (com.ibm.icu.impl).
 * </p>
 *
 * @author synwee
 * @since release 2.1, Jan 01 2002
 */
public abstract class Trie {
    // public class declaration ----------------------------------------

    /**
     * Character data in a Trie has different user-specified formats for
     * different purposes. This interface specifies the method to be implemented
     * in order for Trie to extract the surrogate offset information encapsulated
     * within the data.
     */
    public static interface DataManipulate {
        /**
         * Called by Trie to extract from a lead surrogate's data the index array
         * offset of the indexes for that lead surrogate.
         *
         * @param value data value for a surrogate from the trie, including the folding
         *              offset
         * @return data offset or 0 if there is no data for the lead surrogate
         */
        public int getFoldingOffset(int value);
    }

    // default implementation: the data value itself is the folding offset
    private static class DefaultGetFoldingOffset implements DataManipulate {
        public int getFoldingOffset(int value) {
            return value;
        }
    }

    // protected constructor -------------------------------------------

    /**
     * Trie constructor for CharTrie use. Reads, in stream order, the signature,
     * the options, the index length and the data length (four big-endian ints),
     * then calls {@link #unserialize(InputStream)} to read the index array.
     *
     * @param inputStream    ICU data file input stream which contains the trie
     * @param dataManipulate object containing the information to parse the trie
     *                       data; if {@code null}, a default that returns the data
     *                       value unchanged is used
     * @throws IOException              thrown when reading from the stream fails
     * @throws IllegalArgumentException thrown when the signature or the shift
     *                                  options in the header do not match this
     *                                  implementation
     */
    protected Trie(InputStream inputStream, DataManipulate dataManipulate) throws IOException {
        DataInputStream input = new DataInputStream(inputStream);
        // Magic number to authenticate the data.
        int signature = input.readInt();
        // Options must be stored before checkHeader(), which reads m_options_.
        m_options_ = input.readInt();

        if (!checkHeader(signature)) {
            throw new IllegalArgumentException(
                    "ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
        }

        if (dataManipulate != null) {
            m_dataManipulate_ = dataManipulate;
        } else {
            m_dataManipulate_ = new DefaultGetFoldingOffset();
        }
        m_isLatin1Linear_ = (m_options_ & HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
        m_dataOffset_ = input.readInt();
        m_dataLength_ = input.readInt();
        // Overridable call: a subclass implementation runs before the subclass
        // constructor body has executed.
        unserialize(inputStream);
    }

    // protected data members ------------------------------------------

    /**
     * Lead surrogate code points' index displacement in the index array.
     *
     * <pre>{@code
     * 0x10000-0xd800=0x2800
     * 0x2800 >> INDEX_STAGE_1_SHIFT_
     * }</pre>
     */
    protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
    /**
     * Shift size for shifting right the input index. 1..9
     */
    protected static final int INDEX_STAGE_1_SHIFT_ = 5;
    /**
     * Shift size for shifting left the index array values. Increases possible data
     * size with 16-bit index values at the cost of compactability. This requires
     * blocks of stage 2 data to be aligned by DATA_GRANULARITY.
     * 0..INDEX_STAGE_1_SHIFT
     */
    protected static final int INDEX_STAGE_2_SHIFT_ = 2;
    /**
     * Number of data values in a stage 2 (data array) block.
     */
    protected static final int DATA_BLOCK_LENGTH = 1 << INDEX_STAGE_1_SHIFT_;
    /**
     * Mask for getting the lower bits from the input index. DATA_BLOCK_LENGTH - 1.
     */
    protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
    /**
     * Surrogate mask to use when shifting offset to retrieve supplementary values
     * (keeps the low 10 bits).
     */
    protected static final int SURROGATE_MASK_ = 0x3FF;
    /**
     * Index array, read from the stream as 16-bit chars by unserialize().
     */
    protected char m_index_[];
    /**
     * Internal TrieValue which handles the parsing of the data value. This class is
     * to be implemented by the user
     */
    protected DataManipulate m_dataManipulate_;
    /**
     * Start index of the data portion of the trie. CharTrie combines index and data
     * into a char array, so this is used to indicate the initial offset to the data
     * portion. Note this index always points to the initial value. It is also
     * the number of index entries read by unserialize().
     */
    protected int m_dataOffset_;
    /**
     * Length of the data array
     */
    protected int m_dataLength_;

    // protected methods -----------------------------------------------

    /**
     * Gets the offset to the data which the surrogate pair points to.
     *
     * @param lead  lead surrogate
     * @param trail trailing surrogate
     * @return offset to data
     */
    protected abstract int getSurrogateOffset(char lead, char trail);

    /**
     * Gets the offset to the data which the index ch after variable offset points
     * to. Note for locating a non-supplementary character data offset, calling
     * <p>
     * getRawOffset(0, ch);
     * </p>
     * will do. Otherwise, if it is a supplementary character formed by surrogates
     * lead and trail, getRawOffset() has to be called with the folding index
     * offset. See getSurrogateOffset().
     *
     * @param offset index offset which ch is to start from
     * @param ch     index to be used after offset
     * @return offset to the data
     */
    protected final int getRawOffset(int offset, char ch) {
        // index entry for ch's block, scaled to a data position, plus ch's
        // position inside the block
        return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] << INDEX_STAGE_2_SHIFT_) + (ch & INDEX_STAGE_3_MASK_);
    }

    /**
     * Gets the offset to data which the BMP character points to. Treats a lead
     * surrogate as a normal code point: for U+D800..U+DBFF the index entries
     * displaced by LEAD_INDEX_OFFSET_ are used instead of the ones at offset 0.
     *
     * @param ch BMP character
     * @return offset to data
     */
    protected final int getBMPOffset(char ch) {
        return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
                ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
                : getRawOffset(0, ch);
        // original author's note: using a getRawOffset(ch) makes no diff
    }

    /**
     * Gets the offset to the data which this lead surrogate character points to.
     * Data at the returned offset may contain folding offset information for the
     * next trailing surrogate character.
     *
     * @param ch lead surrogate character
     * @return offset to data
     */
    protected final int getLeadOffset(char ch) {
        return getRawOffset(0, ch);
    }

    /**
     * Internal trie getter from a code point. Could be faster(?) but longer with
     * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }}
     * Gets the offset to data which the codepoint points to
     *
     * @param ch codepoint
     * @return offset to data, or -1 if ch is negative or above UCharacter.MAX_VALUE
     */
    protected final int getCodePointOffset(int ch) {
        // if ((ch >> 16) == 0) slower
        if (ch < 0) {
            return -1;
        } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
            // fastpath for the part of the BMP below surrogates (D800) where getRawOffset()
            // works
            return getRawOffset(0, (char) ch);
        } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
            // BMP codepoint
            return getBMPOffset((char) ch);
        } else if (ch <= UCharacter.MAX_VALUE) {
            // look at the construction of supplementary characters
            // trail forms the ends of it: only the low 10 bits of ch are passed
            return getSurrogateOffset(UTF16.getLeadSurrogate(ch), (char) (ch & SURROGATE_MASK_));
        } else {
            // out of range: report the error as -1
            return -1;
        }
    }

    /**
     * <p>
     * Parses the inputstream and creates the trie index with it.
     * </p>
     * <p>
     * This is overridden by the child classes. This base implementation reads
     * m_dataOffset_ 16-bit chars into m_index_.
     * </p>
     *
     * @param inputStream input stream containing the trie information
     * @exception IOException thrown when data reading fails.
     */
    protected void unserialize(InputStream inputStream) throws IOException {
        // indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
        m_index_ = new char[m_dataOffset_];
        DataInputStream input = new DataInputStream(inputStream);
        for (int i = 0; i < m_dataOffset_; i++) {
            m_index_[i] = input.readChar();
        }
    }

    /**
     * Determines if this is a 16 bit trie
     *
     * @return true if this is a 16 bit trie, i.e. the 32-bit data option bit is
     *         not set
     */
    protected final boolean isCharTrie() {
        return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
    }

    // header constants and private data members -----------------------

    /**
     * Latin 1 option mask
     */
    protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
    /**
     * Constant number to authenticate the byte block
     */
    protected static final int HEADER_SIGNATURE_ = 0x54726965;
    /**
     * Header option formatting: mask for one 4-bit shift field of the options
     */
    private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
    /**
     * Position of the second 4-bit shift field within the options
     */
    protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
    /**
     * Option bit that is set when the data is 32-bit rather than 16-bit
     */
    protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;

    /**
     * Flag indicator for Latin quick access data block. Set by the constructor;
     * not read anywhere in this class.
     */
    private boolean m_isLatin1Linear_;

    /**
     * <p>
     * Trie options field.
     * </p>
     * <p>
     * options bit field, as checked by checkHeader():<br>
     * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
     * 8 0 = 16-bit data, 1=32-bit data<br>
     * 7..4 INDEX_STAGE_2_SHIFT // 0..INDEX_STAGE_1_SHIFT<br>
     * 3..0 INDEX_STAGE_1_SHIFT // 1..9<br>
     * </p>
     */
    private int m_options_;

    // private methods ---------------------------------------------------

    /**
     * Authenticates raw data header. Checking the header information, signature and
     * options. Reads m_options_, so that field must be set before this is called.
     *
     * @param signature the magic number read from the start of the header
     * @return true if the header is authenticated valid
     */
    private final boolean checkHeader(int signature) {
        // check the signature
        // Trie in big-endian US-ASCII (0x54726965).
        // Magic number to authenticate the data.
        if (signature != HEADER_SIGNATURE_) {
            return false;
        }

        // The low nibble must equal INDEX_STAGE_1_SHIFT_ and the next nibble
        // INDEX_STAGE_2_SHIFT_; other shift sizes are not supported.
        if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != INDEX_STAGE_1_SHIFT_
                || ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & HEADER_OPTIONS_SHIFT_MASK_) != INDEX_STAGE_2_SHIFT_) {
            return false;
        }
        return true;
    }
}
|
652
sources/main/java/jdk_internal/icu/impl/Trie2.java
Normal file
652
sources/main/java/jdk_internal/icu/impl/Trie2.java
Normal file
@ -0,0 +1,652 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* This is the interface and common implementation of a Unicode Trie2. It is a
|
||||
* kind of compressed table that maps from Unicode code points (0..0x10ffff) to
|
||||
* 16- or 32-bit integer values. It works best when there are ranges of
|
||||
* characters with the same value, which is generally the case with Unicode
|
||||
* character properties.
|
||||
*
|
||||
* This is the second common version of a Unicode trie (hence the name Trie2).
|
||||
*
|
||||
*/
|
||||
abstract class Trie2 implements Iterable<Trie2.Range> {
|
||||
|
||||
/**
|
||||
* Create a Trie2 from its serialized form. Inverse of utrie2_serialize().
|
||||
*
|
||||
* Reads from the current position and leaves the buffer after the end of the
|
||||
* trie.
|
||||
*
|
||||
* The serialized format is identical between ICU4C and ICU4J, so this function
|
||||
* will work with serialized Trie2s from either.
|
||||
*
|
||||
* The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32,
|
||||
* depending on the width of the data.
|
||||
*
|
||||
* To obtain the width of the Trie2, check the actual class type of the returned
|
||||
* Trie2. Or use the createFromSerialized() function of Trie2_16 or Trie2_32,
|
||||
* which will return only Tries of their specific type/size.
|
||||
*
|
||||
* The serialized Trie2 on the stream may be in either little or big endian byte
|
||||
* order. This allows using serialized Tries from ICU4C without needing to
|
||||
* consider the byte order of the system that created them.
|
||||
*
|
||||
* @param bytes a byte buffer to the serialized form of a UTrie2.
|
||||
* @return An unserialized Trie2, ready for use.
|
||||
* @throws IllegalArgumentException if the stream does not contain a serialized
|
||||
* Trie2.
|
||||
* @throws IOException if a read error occurs in the buffer.
|
||||
*
|
||||
*/
|
||||
public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException {
|
||||
// From ICU4C utrie2_impl.h
|
||||
// * Trie2 data structure in serialized form:
|
||||
// *
|
||||
// * UTrie2Header header;
|
||||
// * uint16_t index[header.index2Length];
|
||||
// * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...]
|
||||
// * @internal
|
||||
// */
|
||||
// typedef struct UTrie2Header {
|
||||
// /** "Tri2" in big-endian US-ASCII (0x54726932) */
|
||||
// uint32_t signature;
|
||||
|
||||
// /**
|
||||
// * options bit field:
|
||||
// * 15.. 4 reserved (0)
|
||||
// * 3.. 0 UTrie2ValueBits valueBits
|
||||
// */
|
||||
// uint16_t options;
|
||||
//
|
||||
// /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */
|
||||
// uint16_t indexLength;
|
||||
//
|
||||
// /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */
|
||||
// uint16_t shiftedDataLength;
|
||||
//
|
||||
// /** Null index and data blocks, not shifted. */
|
||||
// uint16_t index2NullOffset, dataNullOffset;
|
||||
//
|
||||
// /**
|
||||
// * First code point of the single-value range ending with U+10ffff,
|
||||
// * rounded up and then shifted right by UTRIE2_SHIFT_1.
|
||||
// */
|
||||
// uint16_t shiftedHighStart;
|
||||
// } UTrie2Header;
|
||||
|
||||
ByteOrder outerByteOrder = bytes.order();
|
||||
try {
|
||||
UTrie2Header header = new UTrie2Header();
|
||||
|
||||
/* check the signature */
|
||||
header.signature = bytes.getInt();
|
||||
switch (header.signature) {
|
||||
case 0x54726932:
|
||||
// The buffer is already set to the trie data byte order.
|
||||
break;
|
||||
case 0x32697254:
|
||||
// Temporarily reverse the byte order.
|
||||
boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN;
|
||||
bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN);
|
||||
header.signature = 0x54726932;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2");
|
||||
}
|
||||
|
||||
header.options = bytes.getChar();
|
||||
header.indexLength = bytes.getChar();
|
||||
header.shiftedDataLength = bytes.getChar();
|
||||
header.index2NullOffset = bytes.getChar();
|
||||
header.dataNullOffset = bytes.getChar();
|
||||
header.shiftedHighStart = bytes.getChar();
|
||||
|
||||
if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) != 0) {
|
||||
throw new IllegalArgumentException("UTrie2 serialized format error.");
|
||||
}
|
||||
|
||||
Trie2 This;
|
||||
This = new Trie2_16();
|
||||
This.header = header;
|
||||
|
||||
/* get the length values and offsets */
|
||||
This.indexLength = header.indexLength;
|
||||
This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT;
|
||||
This.index2NullOffset = header.index2NullOffset;
|
||||
This.dataNullOffset = header.dataNullOffset;
|
||||
This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1;
|
||||
This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY;
|
||||
This.highValueIndex += This.indexLength;
|
||||
|
||||
// Allocate the Trie2 index array. If the data width is 16 bits, the array also
|
||||
// includes the space for the data.
|
||||
|
||||
int indexArraySize = This.indexLength;
|
||||
indexArraySize += This.dataLength;
|
||||
This.index = new char[indexArraySize];
|
||||
|
||||
/* Read in the index */
|
||||
int i;
|
||||
for (i = 0; i < This.indexLength; i++) {
|
||||
This.index[i] = bytes.getChar();
|
||||
}
|
||||
|
||||
/*
|
||||
* Read in the data. 16 bit data goes in the same array as the index. 32 bit
|
||||
* data goes in its own separate data array.
|
||||
*/
|
||||
This.data16 = This.indexLength;
|
||||
for (i = 0; i < This.dataLength; i++) {
|
||||
This.index[This.data16 + i] = bytes.getChar();
|
||||
}
|
||||
|
||||
This.data32 = null;
|
||||
This.initialValue = This.index[This.dataNullOffset];
|
||||
This.errorValue = This.index[This.data16 + UTRIE2_BAD_UTF8_DATA_OFFSET];
|
||||
|
||||
return This;
|
||||
} finally {
|
||||
bytes.order(outerByteOrder);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the value for a code point as stored in the Trie2.
|
||||
*
|
||||
* @param codePoint the code point
|
||||
* @return the value
|
||||
*/
|
||||
public abstract int get(int codePoint);
|
||||
|
||||
/**
|
||||
* Get the trie value for a UTF-16 code unit.
|
||||
*
|
||||
* A Trie2 stores two distinct values for input in the lead surrogate range, one
|
||||
* for lead surrogates, which is the value that will be returned by this
|
||||
* function, and a second value that is returned by Trie2.get().
|
||||
*
|
||||
* For code units outside of the lead surrogate range, this function returns the
|
||||
* same result as Trie2.get().
|
||||
*
|
||||
* This function, together with the alternate value for lead surrogates, makes
|
||||
* possible very efficient processing of UTF-16 strings without first converting
|
||||
* surrogate pairs to their corresponding 32 bit code point values.
|
||||
*
|
||||
* At build-time, enumerate the contents of the Trie2 to see if there is
|
||||
* non-trivial (non-initialValue) data for any of the supplementary code points
|
||||
* associated with a lead surrogate. If so, then set a special
|
||||
* (application-specific) value for the lead surrogate code _unit_, with
|
||||
* Trie2Writable.setForLeadSurrogateCodeUnit().
|
||||
*
|
||||
* At runtime, use Trie2.getFromU16SingleLead(). If there is non-trivial data
|
||||
* and the code unit is a lead surrogate, then check if a trail surrogate
|
||||
* follows. If so, assemble the supplementary code point and look up its value
|
||||
* with Trie2.get(); otherwise reset the lead surrogate's value or do a code
|
||||
* point lookup for it.
|
||||
*
|
||||
* If there is only trivial data for lead and trail surrogates, then processing
|
||||
* can often skip them. For example, in normalization or case mapping all
|
||||
* characters that do not have any mappings are simply copied as is.
|
||||
*
|
||||
* @param c the code point or lead surrogate value.
|
||||
* @return the value
|
||||
*/
|
||||
public abstract int getFromU16SingleLead(char c);
|
||||
|
||||
/**
|
||||
* When iterating over the contents of a Trie2, Elements of this type are
|
||||
* produced. The iterator will return one item for each contiguous range of
|
||||
* codepoints having the same value.
|
||||
*
|
||||
* When iterating, the same Trie2EnumRange object will be reused and returned
|
||||
* for each range. If you need to retain complete iteration results, clone each
|
||||
* returned Trie2EnumRange, or save the range in some other way, before
|
||||
* advancing to the next iteration step.
|
||||
*/
|
||||
public static class Range {
|
||||
public int startCodePoint;
|
||||
public int endCodePoint; // Inclusive.
|
||||
public int value;
|
||||
public boolean leadSurrogate;
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == null || !(other.getClass().equals(getClass()))) {
|
||||
return false;
|
||||
}
|
||||
Range tother = (Range) other;
|
||||
return this.startCodePoint == tother.startCodePoint && this.endCodePoint == tother.endCodePoint
|
||||
&& this.value == tother.value && this.leadSurrogate == tother.leadSurrogate;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int h = initHash();
|
||||
h = hashUChar32(h, startCodePoint);
|
||||
h = hashUChar32(h, endCodePoint);
|
||||
h = hashInt(h, value);
|
||||
h = hashByte(h, leadSurrogate ? 1 : 0);
|
||||
return h;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an iterator over the value ranges in this Trie2. Values from the Trie2
|
||||
* are not remapped or filtered, but are returned as they are stored in the
|
||||
* Trie2.
|
||||
*
|
||||
* @return an Iterator
|
||||
*/
|
||||
public Iterator<Range> iterator() {
|
||||
return iterator(defaultValueMapper);
|
||||
}
|
||||
|
||||
private static ValueMapper defaultValueMapper = new ValueMapper() {
|
||||
public int map(int in) {
|
||||
return in;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Create an iterator over the value ranges from this Trie2. Values from the
|
||||
* Trie2 are passed through a caller-supplied remapping function, and it is the
|
||||
* remapped values that determine the ranges that will be produced by the
|
||||
* iterator.
|
||||
*
|
||||
*
|
||||
* @param mapper provides a function to remap values obtained from the Trie2.
|
||||
* @return an Iterator
|
||||
*/
|
||||
public Iterator<Range> iterator(ValueMapper mapper) {
|
||||
return new Trie2Iterator(mapper);
|
||||
}
|
||||
|
||||
/**
 * Remaps the values read from a Trie2 while iterating over its contents. The
 * remapped values are used both to determine the ranges of code points and
 * as the value reported for each range.
 *
 * Example of use, with an anonymous implementation of ValueMapper:
 *
 * <pre>
 * ValueMapper m = new ValueMapper() {
 *     public int map(int in) {
 *         return in &amp; 0x1f;
 *     }
 * };
 * for (Iterator&lt;Range&gt; iter = trie.iterator(m); iter.hasNext();) {
 *     Range r = iter.next();
 *     // ... do something with the range r.
 * }
 * </pre>
 */
public interface ValueMapper {
    /**
     * Maps one value obtained from the Trie2.
     *
     * @param originalVal the value as stored in the Trie2
     * @return the value to use in its place
     */
    int map(int originalVal);
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Below this point are internal implementation items. No further public API.
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Trie2 data structure in serialized form:
|
||||
*
|
||||
* UTrie2Header header; uint16_t index[header.index2Length]; uint16_t
|
||||
* data[header.shiftedDataLength<<2]; -- or uint32_t data[...]
|
||||
*
|
||||
* For Java, this is read from the stream into an instance of UTrie2Header. (The
|
||||
* C version just places a struct over the raw serialized data.)
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
static class UTrie2Header {
|
||||
/** "Tri2" in big-endian US-ASCII (0x54726932) */
|
||||
int signature;
|
||||
|
||||
/**
|
||||
* options bit field (uint16_t): 15.. 4 reserved (0) 3.. 0 UTrie2ValueBits
|
||||
* valueBits
|
||||
*/
|
||||
int options;
|
||||
|
||||
/** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */
|
||||
int indexLength;
|
||||
|
||||
/**
|
||||
* (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT
|
||||
* (uint16_t)
|
||||
*/
|
||||
int shiftedDataLength;
|
||||
|
||||
/** Null index and data blocks, not shifted. (uint16_t) */
|
||||
int index2NullOffset, dataNullOffset;
|
||||
|
||||
/**
|
||||
* First code point of the single-value range ending with U+10ffff, rounded up
|
||||
* and then shifted right by UTRIE2_SHIFT_1. (uint16_t)
|
||||
*/
|
||||
int shiftedHighStart;
|
||||
}
|
||||
|
||||
//
|
||||
// Data members of UTrie2.
|
||||
//
|
||||
UTrie2Header header;
|
||||
char index[]; // Index array. Includes data for 16 bit Tries.
|
||||
int data16; // Offset to data portion of the index array, if 16 bit data.
|
||||
// zero if 32 bit data.
|
||||
int data32[]; // NULL if 16b data is used via index
|
||||
|
||||
int indexLength;
|
||||
int dataLength;
|
||||
int index2NullOffset; // 0xffff if there is no dedicated index-2 null block
|
||||
int initialValue;
|
||||
|
||||
/** Value returned for out-of-range code points and illegal UTF-8. */
|
||||
int errorValue;
|
||||
|
||||
/* Start of the last range which ends at U+10ffff, and its value. */
|
||||
int highStart;
|
||||
int highValueIndex;
|
||||
|
||||
int dataNullOffset;
|
||||
|
||||
/**
|
||||
* Trie2 constants, defining shift widths, index array lengths, etc.
|
||||
*
|
||||
* These are needed for the runtime macros but users can treat these as
|
||||
* implementation details and skip to the actual public API further below.
|
||||
*/
|
||||
|
||||
static final int UTRIE2_OPTIONS_VALUE_BITS_MASK = 0x000f;
|
||||
|
||||
/** Shift size for getting the index-1 table offset. */
|
||||
static final int UTRIE2_SHIFT_1 = 6 + 5;
|
||||
|
||||
/** Shift size for getting the index-2 table offset. */
|
||||
static final int UTRIE2_SHIFT_2 = 5;
|
||||
|
||||
/**
|
||||
* Difference between the two shift sizes, for getting an index-1 offset from an
|
||||
* index-2 offset. 6=11-5
|
||||
*/
|
||||
static final int UTRIE2_SHIFT_1_2 = UTRIE2_SHIFT_1 - UTRIE2_SHIFT_2;
|
||||
|
||||
/**
|
||||
* Number of index-1 entries for the BMP. 32=0x20 This part of the index-1 table
|
||||
* is omitted from the serialized form.
|
||||
*/
|
||||
static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> UTRIE2_SHIFT_1;
|
||||
|
||||
/** Number of entries in an index-2 block. 64=0x40 */
|
||||
static final int UTRIE2_INDEX_2_BLOCK_LENGTH = 1 << UTRIE2_SHIFT_1_2;
|
||||
|
||||
/** Mask for getting the lower bits for the in-index-2-block offset. */
|
||||
static final int UTRIE2_INDEX_2_MASK = UTRIE2_INDEX_2_BLOCK_LENGTH - 1;
|
||||
|
||||
/** Number of entries in a data block. 32=0x20 */
|
||||
static final int UTRIE2_DATA_BLOCK_LENGTH = 1 << UTRIE2_SHIFT_2;
|
||||
|
||||
/** Mask for getting the lower bits for the in-data-block offset. */
|
||||
static final int UTRIE2_DATA_MASK = UTRIE2_DATA_BLOCK_LENGTH - 1;
|
||||
|
||||
/**
|
||||
* Shift size for shifting left the index array values. Increases possible data
|
||||
* size with 16-bit index values at the cost of compactability. This requires
|
||||
* data blocks to be aligned by UTRIE2_DATA_GRANULARITY.
|
||||
*/
|
||||
static final int UTRIE2_INDEX_SHIFT = 2;
|
||||
|
||||
/** The alignment size of a data block. Also the granularity for compaction. */
|
||||
static final int UTRIE2_DATA_GRANULARITY = 1 << UTRIE2_INDEX_SHIFT;
|
||||
|
||||
/**
|
||||
* The part of the index-2 table for U+D800..U+DBFF stores values for lead
|
||||
* surrogate code _units_ not code _points_. Values for lead surrogate code
|
||||
* _points_ are indexed with this portion of the table.
|
||||
* Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.)
|
||||
*/
|
||||
static final int UTRIE2_LSCP_INDEX_2_OFFSET = 0x10000 >> UTRIE2_SHIFT_2;
|
||||
static final int UTRIE2_LSCP_INDEX_2_LENGTH = 0x400 >> UTRIE2_SHIFT_2;
|
||||
|
||||
/** Count the lengths of both BMP pieces. 2080=0x820 */
|
||||
static final int UTRIE2_INDEX_2_BMP_LENGTH = UTRIE2_LSCP_INDEX_2_OFFSET + UTRIE2_LSCP_INDEX_2_LENGTH;
|
||||
|
||||
/**
|
||||
* The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
|
||||
* Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2.
|
||||
*/
|
||||
static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET = UTRIE2_INDEX_2_BMP_LENGTH;
|
||||
static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH = 0x800 >> 6; /* U+0800 is the first code point after 2-byte UTF-8 */
|
||||
|
||||
/**
|
||||
* The index-1 table, only used for supplementary code points, at offset
|
||||
* 2112=0x840. Variable length, for code points up to highStart, where the last
|
||||
* single-value range starts. Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1.
|
||||
* (For 0x100000 supplementary code points U+10000..U+10ffff.)
|
||||
*
|
||||
* The part of the index-2 table for supplementary code points starts after this
|
||||
* index-1 table.
|
||||
*
|
||||
* Both the index-1 table and the following part of the index-2 table are
|
||||
* omitted completely if there is only BMP data.
|
||||
*/
|
||||
static final int UTRIE2_INDEX_1_OFFSET = UTRIE2_UTF8_2B_INDEX_2_OFFSET + UTRIE2_UTF8_2B_INDEX_2_LENGTH;
|
||||
|
||||
/**
|
||||
* The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80.
|
||||
* Used with linear access for single bytes 0..0xbf for simple error handling.
|
||||
* Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH.
|
||||
*/
|
||||
static final int UTRIE2_BAD_UTF8_DATA_OFFSET = 0x80;
|
||||
|
||||
/**
|
||||
* Implementation class for an iterator over a Trie2.
|
||||
*
|
||||
* Iteration over a Trie2 first returns all of the ranges that are indexed by
|
||||
* code points, then returns the special alternate values for the lead
|
||||
* surrogates
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
class Trie2Iterator implements Iterator<Range> {
|
||||
|
||||
// The normal constructor that configures the iterator to cover the complete
|
||||
// contents of the Trie2
|
||||
Trie2Iterator(ValueMapper vm) {
|
||||
mapper = vm;
|
||||
nextStart = 0;
|
||||
limitCP = 0x110000;
|
||||
doLeadSurrogates = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* The main next() function for Trie2 iterators
|
||||
*
|
||||
*/
|
||||
public Range next() {
|
||||
if (!hasNext()) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
if (nextStart >= limitCP) {
|
||||
// Switch over from iterating normal code point values to
|
||||
// doing the alternate lead-surrogate values.
|
||||
doingCodePoints = false;
|
||||
nextStart = 0xd800;
|
||||
}
|
||||
int endOfRange = 0;
|
||||
int val = 0;
|
||||
int mappedVal = 0;
|
||||
|
||||
if (doingCodePoints) {
|
||||
// Iteration over code point values.
|
||||
val = get(nextStart);
|
||||
mappedVal = mapper.map(val);
|
||||
endOfRange = rangeEnd(nextStart, limitCP, val);
|
||||
// Loop once for each range in the Trie2 with the same raw (unmapped) value.
|
||||
// Loop continues so long as the mapped values are the same.
|
||||
for (;;) {
|
||||
if (endOfRange >= limitCP - 1) {
|
||||
break;
|
||||
}
|
||||
val = get(endOfRange + 1);
|
||||
if (mapper.map(val) != mappedVal) {
|
||||
break;
|
||||
}
|
||||
endOfRange = rangeEnd(endOfRange + 1, limitCP, val);
|
||||
}
|
||||
} else {
|
||||
// Iteration over the alternate lead surrogate values.
|
||||
val = getFromU16SingleLead((char) nextStart);
|
||||
mappedVal = mapper.map(val);
|
||||
endOfRange = rangeEndLS((char) nextStart);
|
||||
// Loop once for each range in the Trie2 with the same raw (unmapped) value.
|
||||
// Loop continues so long as the mapped values are the same.
|
||||
for (;;) {
|
||||
if (endOfRange >= 0xdbff) {
|
||||
break;
|
||||
}
|
||||
val = getFromU16SingleLead((char) (endOfRange + 1));
|
||||
if (mapper.map(val) != mappedVal) {
|
||||
break;
|
||||
}
|
||||
endOfRange = rangeEndLS((char) (endOfRange + 1));
|
||||
}
|
||||
}
|
||||
returnValue.startCodePoint = nextStart;
|
||||
returnValue.endCodePoint = endOfRange;
|
||||
returnValue.value = mappedVal;
|
||||
returnValue.leadSurrogate = !doingCodePoints;
|
||||
nextStart = endOfRange + 1;
|
||||
return returnValue;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00;
|
||||
}
|
||||
|
||||
private int rangeEndLS(char startingLS) {
|
||||
if (startingLS >= 0xdbff) {
|
||||
return 0xdbff;
|
||||
}
|
||||
|
||||
int c;
|
||||
int val = getFromU16SingleLead(startingLS);
|
||||
for (c = startingLS + 1; c <= 0x0dbff; c++) {
|
||||
if (getFromU16SingleLead((char) c) != val) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return c - 1;
|
||||
}
|
||||
|
||||
//
|
||||
// Iteration State Variables
|
||||
//
|
||||
private ValueMapper mapper;
|
||||
private Range returnValue = new Range();
|
||||
// The starting code point for the next range to be returned.
|
||||
private int nextStart;
|
||||
// The upper limit for the last normal range to be returned. Normally 0x110000,
|
||||
// but
|
||||
// may be lower when iterating over the code points for a single lead surrogate.
|
||||
private int limitCP;
|
||||
|
||||
// True while iterating over the Trie2 values for code points.
|
||||
// False while iterating over the alternate values for lead surrogates.
|
||||
private boolean doingCodePoints = true;
|
||||
|
||||
// True if the iterator should iterate the special values for lead surrogates in
|
||||
// addition to the normal values for code points.
|
||||
private boolean doLeadSurrogates = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the last character in a contiguous range of characters with the same
|
||||
* Trie2 value as the input character.
|
||||
*
|
||||
* @param c The character to begin with.
|
||||
* @return The last contiguous character with the same value.
|
||||
*/
|
||||
int rangeEnd(int start, int limitp, int val) {
|
||||
int c;
|
||||
int limit = Math.min(highStart, limitp);
|
||||
|
||||
for (c = start + 1; c < limit; c++) {
|
||||
if (get(c) != val) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (c >= highStart) {
|
||||
c = limitp;
|
||||
}
|
||||
return c - 1;
|
||||
}
|
||||
|
||||
//
|
||||
// Hashing implementation functions. FNV hash. Respected public domain
|
||||
// algorithm.
|
||||
//
|
||||
private static int initHash() {
|
||||
return 0x811c9DC5; // unsigned 2166136261
|
||||
}
|
||||
|
||||
private static int hashByte(int h, int b) {
|
||||
h = h * 16777619;
|
||||
h = h ^ b;
|
||||
return h;
|
||||
}
|
||||
|
||||
private static int hashUChar32(int h, int c) {
|
||||
h = Trie2.hashByte(h, c & 255);
|
||||
h = Trie2.hashByte(h, (c >> 8) & 255);
|
||||
h = Trie2.hashByte(h, c >> 16);
|
||||
return h;
|
||||
}
|
||||
|
||||
private static int hashInt(int h, int i) {
|
||||
h = Trie2.hashByte(h, i & 255);
|
||||
h = Trie2.hashByte(h, (i >> 8) & 255);
|
||||
h = Trie2.hashByte(h, (i >> 16) & 255);
|
||||
h = Trie2.hashByte(h, (i >> 24) & 255);
|
||||
return h;
|
||||
}
|
||||
|
||||
}
|
170
sources/main/java/jdk_internal/icu/impl/Trie2_16.java
Normal file
170
sources/main/java/jdk_internal/icu/impl/Trie2_16.java
Normal file
@ -0,0 +1,170 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
/**
|
||||
* @author aheninger
|
||||
*
|
||||
* A read-only Trie2, holding 16 bit data values.
|
||||
*
|
||||
* A Trie2 is a highly optimized data structure for mapping from Unicode
|
||||
* code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit
|
||||
* value.
|
||||
*
|
||||
* See class Trie2 for descriptions of the API for accessing the
|
||||
* contents of a trie.
|
||||
*
|
||||
* The fundamental data access methods are declared final in this class,
|
||||
* with the intent that applications might gain a little extra
|
||||
* performance, when compared with calling the same methods via the
|
||||
* abstract Trie2 base class.
|
||||
*/
|
||||
public final class Trie2_16 extends Trie2 {
|
||||
|
||||
/**
 * Package-private constructor, not for general use. Leaves every field at
 * its default; Trie2.createFromSerialized() fills the instance in.
 */
Trie2_16() {
}
|
||||
|
||||
/**
|
||||
* Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). The
|
||||
* serialized format is identical between ICU4C and ICU4J, so this function will
|
||||
* work with serialized Trie2s from either.
|
||||
*
|
||||
* The serialized Trie2 in the bytes may be in either little or big endian byte
|
||||
* order. This allows using serialized Tries from ICU4C without needing to
|
||||
* consider the byte order of the system that created them.
|
||||
*
|
||||
* @param bytes a byte buffer to the serialized form of a UTrie2.
|
||||
* @return An unserialized Trie2_16, ready for use.
|
||||
* @throws IllegalArgumentException if the buffer does not contain a serialized
|
||||
* Trie2.
|
||||
* @throws IOException if a read error occurs in the buffer.
|
||||
* @throws ClassCastException if the bytes contain a serialized Trie2_32
|
||||
*/
|
||||
public static Trie2_16 createFromSerialized(ByteBuffer bytes) throws IOException {
|
||||
return (Trie2_16) Trie2.createFromSerialized(bytes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the value for a code point as stored in the Trie2.
|
||||
*
|
||||
* @param codePoint the code point
|
||||
* @return the value
|
||||
*/
|
||||
@Override
|
||||
public final int get(int codePoint) {
|
||||
int value;
|
||||
int ix;
|
||||
|
||||
if (codePoint >= 0) {
|
||||
if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) {
|
||||
// Ordinary BMP code point, excluding leading surrogates.
|
||||
// BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2
|
||||
// index.
|
||||
// 16 bit data is stored in the index array itself.
|
||||
ix = index[codePoint >> UTRIE2_SHIFT_2];
|
||||
ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
|
||||
value = index[ix];
|
||||
return value;
|
||||
}
|
||||
if (codePoint <= 0xffff) {
|
||||
// Lead Surrogate Code Point. A Separate index section is stored for
|
||||
// lead surrogate code units and code points.
|
||||
// The main index has the code unit data.
|
||||
// For this function, we need the code point data.
|
||||
// Note: this expression could be refactored for slightly improved efficiency,
|
||||
// but
|
||||
// surrogate code points will be so rare in practice that it's not worth it.
|
||||
ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)];
|
||||
ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
|
||||
value = index[ix];
|
||||
return value;
|
||||
}
|
||||
if (codePoint < highStart) {
|
||||
// Supplemental code point, use two-level lookup.
|
||||
ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1);
|
||||
ix = index[ix];
|
||||
ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK;
|
||||
ix = index[ix];
|
||||
ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
|
||||
value = index[ix];
|
||||
return value;
|
||||
}
|
||||
if (codePoint <= 0x10ffff) {
|
||||
value = index[highValueIndex];
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
||||
// Fall through. The code point is outside of the legal range of 0..0x10ffff.
|
||||
return errorValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a Trie2 value for a UTF-16 code unit.
|
||||
*
|
||||
* This function returns the same value as get() if the input character is
|
||||
* outside of the lead surrogate range
|
||||
*
|
||||
* There are two values stored in a Trie2 for inputs in the lead surrogate
|
||||
* range. This function returns the alternate value, while Trie2.get() returns
|
||||
* the main value.
|
||||
*
|
||||
* @param codeUnit a 16 bit code unit or lead surrogate value.
|
||||
* @return the value
|
||||
*/
|
||||
@Override
|
||||
public int getFromU16SingleLead(char codeUnit) {
|
||||
int value;
|
||||
int ix;
|
||||
|
||||
// Because the input is a 16 bit char, we can skip the tests for it being in
|
||||
// the BMP range. It is.
|
||||
ix = index[codeUnit >> UTRIE2_SHIFT_2];
|
||||
ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK);
|
||||
value = index[ix];
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of bytes of the serialized trie
|
||||
*/
|
||||
public int getSerializedLength() {
|
||||
return 16 + (header.indexLength + dataLength) * 2;
|
||||
}
|
||||
}
|
271
sources/main/java/jdk_internal/icu/impl/UBiDiProps.java
Normal file
271
sources/main/java/jdk_internal/icu/impl/UBiDiProps.java
Normal file
@ -0,0 +1,271 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2004-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: UBiDiProps.java
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2005jan16
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Low-level Unicode bidi/shaping properties access.
|
||||
* Java port of ubidi_props.h/.c.
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
import jdk_internal.icu.lang.UCharacter;
|
||||
|
||||
public final class UBiDiProps {
|
||||
// constructors etc. --------------------------------------------------- ***
|
||||
|
||||
// port of ubidi_openProps()
|
||||
private UBiDiProps() throws IOException {
|
||||
ByteBuffer bytes = ICUBinary.getRequiredData(DATA_FILE_NAME);
|
||||
readData(bytes);
|
||||
}
|
||||
|
||||
private void readData(ByteBuffer bytes) throws IOException {
|
||||
// read the header
|
||||
ICUBinary.readHeader(bytes, FMT, new IsAcceptable());
|
||||
|
||||
// read indexes[]
|
||||
int i, count;
|
||||
count = bytes.getInt();
|
||||
if (count < IX_TOP) {
|
||||
throw new IOException("indexes[0] too small in " + DATA_FILE_NAME);
|
||||
}
|
||||
indexes = new int[count];
|
||||
|
||||
indexes[0] = count;
|
||||
for (i = 1; i < count; ++i) {
|
||||
indexes[i] = bytes.getInt();
|
||||
}
|
||||
|
||||
// read the trie
|
||||
trie = Trie2_16.createFromSerialized(bytes);
|
||||
int expectedTrieLength = indexes[IX_TRIE_SIZE];
|
||||
int trieLength = trie.getSerializedLength();
|
||||
if (trieLength > expectedTrieLength) {
|
||||
throw new IOException(DATA_FILE_NAME + ": not enough bytes for the trie");
|
||||
}
|
||||
// skip padding after trie bytes
|
||||
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
|
||||
|
||||
// read mirrors[]
|
||||
count = indexes[IX_MIRROR_LENGTH];
|
||||
if (count > 0) {
|
||||
mirrors = new int[count];
|
||||
for (i = 0; i < count; ++i) {
|
||||
mirrors[i] = bytes.getInt();
|
||||
}
|
||||
}
|
||||
|
||||
// read jgArray[]
|
||||
count = indexes[IX_JG_LIMIT] - indexes[IX_JG_START];
|
||||
jgArray = new byte[count];
|
||||
for (i = 0; i < count; ++i) {
|
||||
jgArray[i] = bytes.get();
|
||||
}
|
||||
|
||||
// read jgArray2[]
|
||||
count = indexes[IX_JG_LIMIT2] - indexes[IX_JG_START2];
|
||||
jgArray2 = new byte[count];
|
||||
for (i = 0; i < count; ++i) {
|
||||
jgArray2[i] = bytes.get();
|
||||
}
|
||||
}
|
||||
|
||||
// implement ICUBinary.Authenticate
|
||||
private static final class IsAcceptable implements ICUBinary.Authenticate {
|
||||
public boolean isDataVersionAcceptable(byte version[]) {
|
||||
return version[0] == 2;
|
||||
}
|
||||
}
|
||||
|
||||
// property access functions ------------------------------------------- ***
|
||||
|
||||
public final int getClass(int c) {
|
||||
return getClassFromProps(trie.get(c));
|
||||
}
|
||||
|
||||
private final int getMirror(int c, int props) {
|
||||
int delta = getMirrorDeltaFromProps(props);
|
||||
if (delta != ESC_MIRROR_DELTA) {
|
||||
return c + delta;
|
||||
} else {
|
||||
/* look for mirror code point in the mirrors[] table */
|
||||
int m;
|
||||
int i, length;
|
||||
int c2;
|
||||
|
||||
length = indexes[IX_MIRROR_LENGTH];
|
||||
|
||||
/* linear search */
|
||||
for (i = 0; i < length; ++i) {
|
||||
m = mirrors[i];
|
||||
c2 = getMirrorCodePoint(m);
|
||||
if (c == c2) {
|
||||
/* found c, return its mirror code point using the index in m */
|
||||
return getMirrorCodePoint(mirrors[getMirrorIndex(m)]);
|
||||
} else if (c < c2) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* c not found, return it itself */
|
||||
return c;
|
||||
}
|
||||
}
|
||||
|
||||
public final int getMirror(int c) {
|
||||
int props = trie.get(c);
|
||||
return getMirror(c, props);
|
||||
}
|
||||
|
||||
public final int getJoiningType(int c) {
|
||||
return (trie.get(c) & JT_MASK) >> JT_SHIFT;
|
||||
}
|
||||
|
||||
public final int getJoiningGroup(int c) {
|
||||
int start, limit;
|
||||
|
||||
start = indexes[IX_JG_START];
|
||||
limit = indexes[IX_JG_LIMIT];
|
||||
if (start <= c && c < limit) {
|
||||
return (int) jgArray[c - start] & 0xff;
|
||||
}
|
||||
start = indexes[IX_JG_START2];
|
||||
limit = indexes[IX_JG_LIMIT2];
|
||||
if (start <= c && c < limit) {
|
||||
return (int) jgArray2[c - start] & 0xff;
|
||||
}
|
||||
return UCharacter.JoiningGroup.NO_JOINING_GROUP;
|
||||
}
|
||||
|
||||
public final int getPairedBracketType(int c) {
|
||||
return (trie.get(c) & BPT_MASK) >> BPT_SHIFT;
|
||||
}
|
||||
|
||||
public final int getPairedBracket(int c) {
|
||||
int props = trie.get(c);
|
||||
if ((props & BPT_MASK) == 0) {
|
||||
return c;
|
||||
} else {
|
||||
return getMirror(c, props);
|
||||
}
|
||||
}
|
||||
|
||||
// data members -------------------------------------------------------- ***
|
||||
private int indexes[];
|
||||
private int mirrors[];
|
||||
private byte jgArray[];
|
||||
private byte jgArray2[];
|
||||
|
||||
private Trie2_16 trie;
|
||||
|
||||
// data format constants ----------------------------------------------- ***
|
||||
@SuppressWarnings("deprecation")
|
||||
private static final String DATA_FILE_NAME = "/assets/eagler/icudt/ubidi.icu";
|
||||
|
||||
/* format "BiDi" */
|
||||
private static final int FMT = 0x42694469;
|
||||
|
||||
/* indexes into indexes[] */
|
||||
private static final int IX_TRIE_SIZE = 2;
|
||||
private static final int IX_MIRROR_LENGTH = 3;
|
||||
|
||||
private static final int IX_JG_START = 4;
|
||||
private static final int IX_JG_LIMIT = 5;
|
||||
private static final int IX_JG_START2 = 6; /* new in format version 2.2, ICU 54 */
|
||||
private static final int IX_JG_LIMIT2 = 7;
|
||||
|
||||
private static final int IX_TOP = 16;
|
||||
|
||||
// definitions for 16-bit bidi/shaping properties word ----------------- ***
|
||||
|
||||
/* CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */
|
||||
private static final int JT_SHIFT = 5; /* joining type: 3 bits (7..5) */
|
||||
|
||||
private static final int BPT_SHIFT = 8; /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */
|
||||
|
||||
private static final int MIRROR_DELTA_SHIFT = 13; /* bidi mirroring delta: 3 bits (15..13) */
|
||||
|
||||
private static final int CLASS_MASK = 0x0000001f;
|
||||
private static final int JT_MASK = 0x000000e0;
|
||||
private static final int BPT_MASK = 0x00000300;
|
||||
|
||||
private static final int getClassFromProps(int props) {
|
||||
return props & CLASS_MASK;
|
||||
}
|
||||
|
||||
private static final boolean getFlagFromProps(int props, int shift) {
|
||||
return ((props >> shift) & 1) != 0;
|
||||
}
|
||||
|
||||
private static final int getMirrorDeltaFromProps(int props) {
|
||||
return (short) props >> MIRROR_DELTA_SHIFT;
|
||||
}
|
||||
|
||||
private static final int ESC_MIRROR_DELTA = -4;
|
||||
|
||||
// definitions for 32-bit mirror table entry --------------------------- ***
|
||||
|
||||
/* the source Unicode code point takes 21 bits (20..0) */
|
||||
private static final int MIRROR_INDEX_SHIFT = 21;
|
||||
|
||||
private static final int getMirrorCodePoint(int m) {
|
||||
return m & 0x1fffff;
|
||||
}
|
||||
|
||||
private static final int getMirrorIndex(int m) {
|
||||
return m >>> MIRROR_INDEX_SHIFT;
|
||||
}
|
||||
|
||||
/*
|
||||
* public singleton instance
|
||||
*/
|
||||
public static final UBiDiProps INSTANCE;
|
||||
|
||||
// This static initializer block must be placed after
|
||||
// other static member initialization
|
||||
static {
|
||||
try {
|
||||
INSTANCE = new UBiDiProps();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Missing resource: \"" + DATA_FILE_NAME + "\"; Reason: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
627
sources/main/java/jdk_internal/icu/impl/UCharacterProperty.java
Normal file
627
sources/main/java/jdk_internal/icu/impl/UCharacterProperty.java
Normal file
@ -0,0 +1,627 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Iterator;
|
||||
|
||||
import jdk_internal.icu.lang.UCharacter.HangulSyllableType;
|
||||
import jdk_internal.icu.lang.UCharacter.NumericType;
|
||||
import jdk_internal.icu.text.UTF16;
|
||||
import jdk_internal.icu.text.UnicodeSet;
|
||||
import jdk_internal.icu.util.VersionInfo;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Internal class used for Unicode character property database.
|
||||
* </p>
|
||||
* <p>
|
||||
* This class stores binary data read from uprops.icu. It does not have the
|
||||
* capability to parse the data into more high-level information. It only
|
||||
* returns bytes of information when required.
|
||||
* </p>
|
||||
* <p>
|
||||
* Due to the form most commonly used for retrieval, array of char is used to
|
||||
* store the binary data.
|
||||
* </p>
|
||||
* <p>
|
||||
* UCharacterPropertyDB also contains information on accessing indexes to
|
||||
* significant points in the binary data.
|
||||
* </p>
|
||||
* <p>
|
||||
* Responsibility for molding the binary data into a more meaningful form lies with
|
||||
* <a href="UCharacter.html">UCharacter</a>.
|
||||
* </p>
|
||||
*
|
||||
* @author Syn Wee Quek
|
||||
* @since release 2.1, february 1st 2002
|
||||
*/
|
||||
|
||||
public final class UCharacterProperty {
|
||||
// public data members -----------------------------------------------
|
||||
|
||||
/*
|
||||
* public singleton instance
|
||||
*/
|
||||
public static final UCharacterProperty INSTANCE;
|
||||
|
||||
/**
|
||||
* Trie data
|
||||
*/
|
||||
public Trie2_16 m_trie_;
|
||||
|
||||
/**
|
||||
* Unicode version
|
||||
*/
|
||||
public VersionInfo m_unicodeVersion_;
|
||||
|
||||
/**
|
||||
* Character type mask
|
||||
*/
|
||||
public static final int TYPE_MASK = 0x1F;
|
||||
|
||||
// uprops.h enum UPropertySource --------------------------------------- ***
|
||||
|
||||
/** From uchar.c/uprops.icu main trie */
|
||||
public static final int SRC_CHAR = 1;
|
||||
/** From uchar.c/uprops.icu properties vectors trie */
|
||||
public static final int SRC_PROPSVEC = 2;
|
||||
/** From ubidi_props.c/ubidi.icu */
|
||||
public static final int SRC_BIDI = 5;
|
||||
/** From normalizer2impl.cpp/nfc.nrm */
|
||||
public static final int SRC_NFC = 8;
|
||||
/** From normalizer2impl.cpp/nfkc.nrm */
|
||||
public static final int SRC_NFKC = 9;
|
||||
|
||||
// public methods ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Gets the main property value for code point ch.
|
||||
*
|
||||
* @param ch code point whose property value is to be retrieved
|
||||
* @return property value of code point
|
||||
*/
|
||||
public final int getProperty(int ch) {
|
||||
return m_trie_.get(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the unicode additional properties. Java version of C
|
||||
* u_getUnicodeProperties().
|
||||
*
|
||||
* @param codepoint codepoint whose additional properties is to be retrieved
|
||||
* @param column The column index.
|
||||
* @return unicode properties
|
||||
*/
|
||||
public int getAdditional(int codepoint, int column) {
|
||||
assert column >= 0;
|
||||
if (column >= m_additionalColumnsCount_) {
|
||||
return 0;
|
||||
}
|
||||
return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Get the "age" of the code point.
|
||||
* </p>
|
||||
* <p>
|
||||
* The "age" is the Unicode version when the code point was first designated (as
|
||||
* a non-character or for Private Use) or assigned a character.
|
||||
* </p>
|
||||
* <p>
|
||||
* This can be useful to avoid emitting code points to receiving processes that
|
||||
* do not accept newer characters.
|
||||
* </p>
|
||||
* <p>
|
||||
* The data is from the UCD file DerivedAge.txt.
|
||||
* </p>
|
||||
* <p>
|
||||
* This API does not check the validity of the codepoint.
|
||||
* </p>
|
||||
*
|
||||
* @param codepoint The code point.
|
||||
* @return the Unicode version number
|
||||
*/
|
||||
public VersionInfo getAge(int codepoint) {
|
||||
int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
|
||||
return VersionInfo.getInstance((version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
|
||||
version & LAST_NIBBLE_MASK_, 0, 0);
|
||||
}
|
||||
|
||||
// int-value and enumerated properties --------------------------------- ***
|
||||
|
||||
public int getType(int c) {
|
||||
return getProperty(c) & TYPE_MASK;
|
||||
}
|
||||
|
||||
/*
|
||||
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
|
||||
* Hangul_Syllable_Type is fully redundant with a subset of
|
||||
* Grapheme_Cluster_Break.
|
||||
*/
|
||||
private static final int /* UHangulSyllableType */ gcbToHst[] = { HangulSyllableType.NOT_APPLICABLE, /*
|
||||
* U_GCB_OTHER
|
||||
*/
|
||||
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */
|
||||
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */
|
||||
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */
|
||||
HangulSyllableType.LEADING_JAMO, /* U_GCB_L */
|
||||
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */
|
||||
HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */
|
||||
HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */
|
||||
HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */
|
||||
HangulSyllableType.VOWEL_JAMO /* U_GCB_V */
|
||||
/*
|
||||
* Omit GCB values beyond what we need for hst. The code below checks for the
|
||||
* array length.
|
||||
*/
|
||||
};
|
||||
|
||||
private class IntProperty {
|
||||
int column; // SRC_PROPSVEC column, or "source" if mask==0
|
||||
int mask;
|
||||
int shift;
|
||||
|
||||
IntProperty(int column, int mask, int shift) {
|
||||
this.column = column;
|
||||
this.mask = mask;
|
||||
this.shift = shift;
|
||||
}
|
||||
|
||||
IntProperty(int source) {
|
||||
this.column = source;
|
||||
this.mask = 0;
|
||||
}
|
||||
|
||||
int getValue(int c) {
|
||||
// systematic, directly stored properties
|
||||
return (getAdditional(c, column) & mask) >>> shift;
|
||||
}
|
||||
}
|
||||
|
||||
private class BiDiIntProperty extends IntProperty {
|
||||
BiDiIntProperty() {
|
||||
super(SRC_BIDI);
|
||||
}
|
||||
}
|
||||
|
||||
private class CombiningClassIntProperty extends IntProperty {
|
||||
CombiningClassIntProperty(int source) {
|
||||
super(source);
|
||||
}
|
||||
}
|
||||
|
||||
private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties
|
||||
int which;
|
||||
int max;
|
||||
|
||||
NormQuickCheckIntProperty(int source, int which, int max) {
|
||||
super(source);
|
||||
this.which = which;
|
||||
this.max = max;
|
||||
}
|
||||
}
|
||||
|
||||
private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE
|
||||
int getValue(int c) {
|
||||
return UBiDiProps.INSTANCE.getPairedBracketType(c);
|
||||
}
|
||||
};
|
||||
|
||||
public int getIntPropertyValue(int c, int which) {
|
||||
if (which == BIDI_PAIRED_BRACKET_TYPE) {
|
||||
return intProp.getValue(c);
|
||||
}
|
||||
return 0; // undefined
|
||||
}
|
||||
|
||||
/**
|
||||
* Forms a supplementary code point from the argument character<br>
|
||||
* Note this is for internal use hence no checks for the validity of the
|
||||
* surrogate characters are done
|
||||
*
|
||||
* @param lead lead surrogate character
|
||||
* @param trail trailing surrogate character
|
||||
* @return code point of the supplementary character
|
||||
*/
|
||||
public static int getRawSupplementary(char lead, char trail) {
|
||||
return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the type mask
|
||||
*
|
||||
* @param type character type
|
||||
* @return mask
|
||||
*/
|
||||
public static final int getMask(int type) {
|
||||
return 1 << type;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the digit values of characters like 'A' - 'Z', normal, half-width and
|
||||
* full-width. This method assumes that the other digit characters are checked
|
||||
* by the calling method.
|
||||
*
|
||||
* @param ch character to test
|
||||
* @return -1 if ch is not a character of the form 'A' - 'Z', otherwise its
|
||||
* corresponding digit will be returned.
|
||||
*/
|
||||
public static int getEuropeanDigit(int ch) {
|
||||
if ((ch > 0x7a && ch < 0xff21) || ch < 0x41 || (ch > 0x5a && ch < 0x61) || ch > 0xff5a
|
||||
|| (ch > 0xff3a && ch < 0xff41)) {
|
||||
return -1;
|
||||
}
|
||||
if (ch <= 0x7a) {
|
||||
// ch >= 0x41 or ch < 0x61
|
||||
return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
|
||||
}
|
||||
// ch >= 0xff21
|
||||
if (ch <= 0xff3a) {
|
||||
return ch + 10 - 0xff21;
|
||||
}
|
||||
// ch >= 0xff41 && ch <= 0xff5a
|
||||
return ch + 10 - 0xff41;
|
||||
}
|
||||
|
||||
public int digit(int c) {
|
||||
int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
|
||||
if (value <= 9) {
|
||||
return value;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// protected variables -----------------------------------------------
|
||||
|
||||
/**
|
||||
* Extra property trie
|
||||
*/
|
||||
Trie2_16 m_additionalTrie_;
|
||||
/**
|
||||
* Extra property vectors, 1st column for age and second for binary properties.
|
||||
*/
|
||||
int m_additionalVectors_[];
|
||||
/**
|
||||
* Number of additional columns
|
||||
*/
|
||||
int m_additionalColumnsCount_;
|
||||
/**
|
||||
* Maximum values for block, bits used as in vector word 0
|
||||
*/
|
||||
int m_maxBlockScriptValue_;
|
||||
/**
|
||||
* Maximum values for script, bits used as in vector word 0
|
||||
*/
|
||||
int m_maxJTGValue_;
|
||||
/**
|
||||
* Script_Extensions data
|
||||
*/
|
||||
public char[] m_scriptExtensions_;
|
||||
|
||||
// private variables -------------------------------------------------
|
||||
|
||||
/**
|
||||
* Default name of the datafile
|
||||
*/
|
||||
@SuppressWarnings("deprecation")
|
||||
private static final String DATA_FILE_NAME_ = "/assets/eagler/icudt/uprops.icu";
|
||||
|
||||
/**
|
||||
* Shift value for lead surrogate to form a supplementary character.
|
||||
*/
|
||||
private static final int LEAD_SURROGATE_SHIFT_ = 10;
|
||||
/**
|
||||
* Offset to add to combined surrogate pair to avoid masking.
|
||||
*/
|
||||
private static final int SURROGATE_OFFSET_ = UTF16.SUPPLEMENTARY_MIN_VALUE
|
||||
- (UTF16.SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_) - UTF16.TRAIL_SURROGATE_MIN_VALUE;
|
||||
|
||||
// property data constants -------------------------------------------------
|
||||
|
||||
/**
|
||||
* Numeric types and values in the main properties words.
|
||||
*/
|
||||
private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
|
||||
|
||||
private static final int getNumericTypeValue(int props) {
|
||||
return props >> NUMERIC_TYPE_VALUE_SHIFT_;
|
||||
}
|
||||
|
||||
/* constants for the storage form of numeric types and values */
|
||||
/** No numeric value. */
|
||||
private static final int NTV_NONE_ = 0;
|
||||
/** Decimal digits: nv=0..9 */
|
||||
private static final int NTV_DECIMAL_START_ = 1;
|
||||
/** Other digits: nv=0..9 */
|
||||
private static final int NTV_DIGIT_START_ = 11;
|
||||
/** Small integers: nv=0..154 */
|
||||
private static final int NTV_NUMERIC_START_ = 21;
|
||||
|
||||
private static final int ntvGetType(int ntv) {
|
||||
return (ntv == NTV_NONE_) ? NumericType.NONE
|
||||
: (ntv < NTV_DIGIT_START_) ? NumericType.DECIMAL
|
||||
: (ntv < NTV_NUMERIC_START_) ? NumericType.DIGIT : NumericType.NUMERIC;
|
||||
}
|
||||
|
||||
/*
|
||||
* Properties in vector word 0 Bits 31..24 DerivedAge version major/minor one
|
||||
* nibble each 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index 3:
|
||||
* Script value from Script_Extensions 2: Script=Inherited 1: Script=Common 0:
|
||||
* Script=bits 21..20 & 7..0 21..20 Bits 9..8 of the UScriptCode, or index to
|
||||
* Script_Extensions 19..17 East Asian Width 16.. 8 UBlockCode 7.. 0
|
||||
* UScriptCode, or index to Script_Extensions
|
||||
*/
|
||||
|
||||
/**
|
||||
* Script_Extensions: mask includes Script
|
||||
*/
|
||||
public static final int SCRIPT_X_MASK = 0x00f000ff;
|
||||
// private static final int SCRIPT_X_SHIFT = 22;
|
||||
|
||||
// The UScriptCode or Script_Extensions index is split across two bit fields.
|
||||
// (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.)
|
||||
// Shift the high bits right by 12 to assemble the full value.
|
||||
public static final int SCRIPT_HIGH_MASK = 0x00300000;
|
||||
public static final int SCRIPT_HIGH_SHIFT = 12;
|
||||
public static final int MAX_SCRIPT = 0x3ff;
|
||||
|
||||
/**
|
||||
* Integer properties mask and shift values for East Asian cell width.
|
||||
* Equivalent to icu4c UPROPS_EA_MASK
|
||||
*/
|
||||
private static final int EAST_ASIAN_MASK_ = 0x000e0000;
|
||||
/**
|
||||
* Integer properties mask and shift values for East Asian cell width.
|
||||
* Equivalent to icu4c UPROPS_EA_SHIFT
|
||||
*/
|
||||
private static final int EAST_ASIAN_SHIFT_ = 17;
|
||||
/**
|
||||
* Integer properties mask and shift values for blocks. Equivalent to icu4c
|
||||
* UPROPS_BLOCK_MASK
|
||||
*/
|
||||
private static final int BLOCK_MASK_ = 0x0001ff00;
|
||||
/**
|
||||
* Integer properties mask and shift values for blocks. Equivalent to icu4c
|
||||
* UPROPS_BLOCK_SHIFT
|
||||
*/
|
||||
private static final int BLOCK_SHIFT_ = 8;
|
||||
/**
|
||||
* Integer properties mask and shift values for scripts. Equivalent to icu4c
|
||||
* UPROPS_SCRIPT_LOW_MASK.
|
||||
*/
|
||||
public static final int SCRIPT_LOW_MASK = 0x000000ff;
|
||||
|
||||
public static final int mergeScriptCodeOrIndex(int scriptX) {
|
||||
return ((scriptX & SCRIPT_HIGH_MASK) >> SCRIPT_HIGH_SHIFT) | (scriptX & SCRIPT_LOW_MASK);
|
||||
}
|
||||
|
||||
/**
|
||||
* Additional properties used in internal trie data
|
||||
*/
|
||||
/*
|
||||
* Properties in vector word 1 Each bit encodes one binary property. The
|
||||
* following constants represent the bit number, use 1<<UPROPS_XYZ.
|
||||
* UPROPS_BINARY_1_TOP<=32!
|
||||
*
|
||||
* Keep this list of property enums in sync with propListNames[] in
|
||||
* icu/source/tools/genprops/props2.c!
|
||||
*
|
||||
* ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
|
||||
*/
|
||||
private static final int WHITE_SPACE_PROPERTY_ = 0;
|
||||
private static final int DASH_PROPERTY_ = 1;
|
||||
private static final int HYPHEN_PROPERTY_ = 2;
|
||||
private static final int QUOTATION_MARK_PROPERTY_ = 3;
|
||||
private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
|
||||
private static final int MATH_PROPERTY_ = 5;
|
||||
private static final int HEX_DIGIT_PROPERTY_ = 6;
|
||||
private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
|
||||
private static final int ALPHABETIC_PROPERTY_ = 8;
|
||||
private static final int IDEOGRAPHIC_PROPERTY_ = 9;
|
||||
private static final int DIACRITIC_PROPERTY_ = 10;
|
||||
private static final int EXTENDER_PROPERTY_ = 11;
|
||||
private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
|
||||
private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
|
||||
private static final int GRAPHEME_LINK_PROPERTY_ = 14;
|
||||
private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
|
||||
private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
|
||||
private static final int RADICAL_PROPERTY_ = 17;
|
||||
private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
|
||||
private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
|
||||
private static final int DEPRECATED_PROPERTY_ = 20;
|
||||
private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
|
||||
private static final int XID_START_PROPERTY_ = 22;
|
||||
private static final int XID_CONTINUE_PROPERTY_ = 23;
|
||||
private static final int ID_START_PROPERTY_ = 24;
|
||||
private static final int ID_CONTINUE_PROPERTY_ = 25;
|
||||
private static final int GRAPHEME_BASE_PROPERTY_ = 26;
|
||||
private static final int S_TERM_PROPERTY_ = 27;
|
||||
private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
|
||||
private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */
|
||||
private static final int PATTERN_WHITE_SPACE = 30;
|
||||
|
||||
/*
|
||||
* Properties in vector word 2 Bits 31..26 reserved 25..20 Line Break 19..15
|
||||
* Sentence Break 14..10 Word Break 9.. 5 Grapheme Cluster Break 4.. 0
|
||||
* Decomposition Type
|
||||
*/
|
||||
private static final int LB_MASK = 0x03f00000;
|
||||
private static final int LB_SHIFT = 20;
|
||||
|
||||
private static final int SB_MASK = 0x000f8000;
|
||||
private static final int SB_SHIFT = 15;
|
||||
|
||||
private static final int WB_MASK = 0x00007c00;
|
||||
private static final int WB_SHIFT = 10;
|
||||
|
||||
private static final int GCB_MASK = 0x000003e0;
|
||||
private static final int GCB_SHIFT = 5;
|
||||
|
||||
/**
|
||||
* Integer properties mask for decomposition type. Equivalent to icu4c
|
||||
* UPROPS_DT_MASK.
|
||||
*/
|
||||
private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
|
||||
|
||||
/**
|
||||
* First nibble shift
|
||||
*/
|
||||
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
|
||||
/**
|
||||
* Second nibble mask
|
||||
*/
|
||||
private static final int LAST_NIBBLE_MASK_ = 0xF;
|
||||
/**
|
||||
* Age value shift
|
||||
*/
|
||||
private static final int AGE_SHIFT_ = 24;
|
||||
|
||||
// private constructors --------------------------------------------------
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @exception IOException thrown when data reading fails or data corrupted
|
||||
*/
|
||||
private UCharacterProperty() throws IOException {
|
||||
// jar access
|
||||
ByteBuffer bytes = ICUBinary.getRequiredData(DATA_FILE_NAME_);
|
||||
m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
|
||||
// Read or skip the 16 indexes.
|
||||
int propertyOffset = bytes.getInt();
|
||||
/* exceptionOffset = */ bytes.getInt();
|
||||
/* caseOffset = */ bytes.getInt();
|
||||
int additionalOffset = bytes.getInt();
|
||||
int additionalVectorsOffset = bytes.getInt();
|
||||
m_additionalColumnsCount_ = bytes.getInt();
|
||||
int scriptExtensionsOffset = bytes.getInt();
|
||||
int reservedOffset7 = bytes.getInt();
|
||||
/* reservedOffset8 = */ bytes.getInt();
|
||||
/* dataTopOffset = */ bytes.getInt();
|
||||
m_maxBlockScriptValue_ = bytes.getInt();
|
||||
m_maxJTGValue_ = bytes.getInt();
|
||||
ICUBinary.skipBytes(bytes, (16 - 12) << 2);
|
||||
|
||||
// read the main properties trie
|
||||
m_trie_ = Trie2_16.createFromSerialized(bytes);
|
||||
int expectedTrieLength = (propertyOffset - 16) * 4;
|
||||
int trieLength = m_trie_.getSerializedLength();
|
||||
if (trieLength > expectedTrieLength) {
|
||||
throw new IOException("uprops.icu: not enough bytes for main trie");
|
||||
}
|
||||
// skip padding after trie bytes
|
||||
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
|
||||
|
||||
// skip unused intervening data structures
|
||||
ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
|
||||
|
||||
if (m_additionalColumnsCount_ > 0) {
|
||||
// reads the additional property block
|
||||
m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
|
||||
expectedTrieLength = (additionalVectorsOffset - additionalOffset) * 4;
|
||||
trieLength = m_additionalTrie_.getSerializedLength();
|
||||
if (trieLength > expectedTrieLength) {
|
||||
throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
|
||||
}
|
||||
// skip padding after trie bytes
|
||||
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
|
||||
|
||||
// additional properties
|
||||
int size = scriptExtensionsOffset - additionalVectorsOffset;
|
||||
m_additionalVectors_ = new int[size];
|
||||
for (int i = 0; i < size; i++) {
|
||||
m_additionalVectors_[i] = bytes.getInt();
|
||||
}
|
||||
}
|
||||
|
||||
// Script_Extensions
|
||||
int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
|
||||
if (numChars > 0) {
|
||||
m_scriptExtensions_ = new char[numChars];
|
||||
for (int i = 0; i < numChars; ++i) {
|
||||
m_scriptExtensions_[i] = bytes.getChar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static final class IsAcceptable implements ICUBinary.Authenticate {
|
||||
// @Override when we switch to Java 6
|
||||
public boolean isDataVersionAcceptable(byte version[]) {
|
||||
return version[0] == 7;
|
||||
}
|
||||
}
|
||||
|
||||
// Data format signature passed to ICUBinary.readHeaderAndDataVersion() by the
// constructor (presumably matched against the file header - confirm in
// ICUBinary). The four bytes 0x55 0x50 0x72 0x6F are the ASCII letters below.
private static final int DATA_FORMAT = 0x5550726F; // "UPro"
|
||||
|
||||
public void upropsvec_addPropertyStarts(UnicodeSet set) {
|
||||
/*
|
||||
* add the start code point of each same-value range of the properties vectors
|
||||
* trie
|
||||
*/
|
||||
if (m_additionalColumnsCount_ > 0) {
|
||||
/*
|
||||
* if m_additionalColumnsCount_==0 then the properties vectors trie may not be
|
||||
* there at all
|
||||
*/
|
||||
Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
|
||||
Trie2.Range range;
|
||||
while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
|
||||
set.add(range.startCodePoint);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This static initializer block must be placed after
// other static member initialization
static {
    try {
        INSTANCE = new UCharacterProperty();
    } catch (IOException e) {
        // Chain the IOException as the cause so the underlying failure and
        // its stack trace stay visible; the message text is unchanged.
        throw new RuntimeException("Missing resource: \"" + DATA_FILE_NAME_ + "\"; Reason: " + e.getMessage(), e);
    }
}
|
||||
|
||||
// Moved from UProperty.java
|
||||
/**
|
||||
* Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). Used in
|
||||
* UAX #9: Unicode Bidirectional Algorithm (http://www.unicode.org/reports/tr9/)
|
||||
* Returns UCharacter.BidiPairedBracketType values.
|
||||
*
|
||||
* @stable ICU 52
|
||||
*/
|
||||
public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
|
||||
|
||||
}
|
1179
sources/main/java/jdk_internal/icu/impl/UnicodeSetStringSpan.java
Normal file
1179
sources/main/java/jdk_internal/icu/impl/UnicodeSetStringSpan.java
Normal file
File diff suppressed because it is too large
Load Diff
266
sources/main/java/jdk_internal/icu/impl/Utility.java
Normal file
266
sources/main/java/jdk_internal/icu/impl/Utility.java
Normal file
@ -0,0 +1,266 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2011, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Locale;
|
||||
|
||||
import jdk_internal.icu.lang.UCharacter;
|
||||
import jdk_internal.icu.text.UTF16;
|
||||
|
||||
public final class Utility {
|
||||
|
||||
/**
|
||||
* Convert characters outside the range U+0020 to U+007F to Unicode escapes, and
|
||||
* convert backslash to a double backslash.
|
||||
*/
|
||||
public static final String escape(String s) {
|
||||
StringBuilder buf = new StringBuilder();
|
||||
for (int i = 0; i < s.length();) {
|
||||
int c = Character.codePointAt(s, i);
|
||||
i += UTF16.getCharCount(c);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
if (c == '\\') {
|
||||
buf.append("\\\\"); // That is, "\\"
|
||||
} else {
|
||||
buf.append((char) c);
|
||||
}
|
||||
} else {
|
||||
boolean four = c <= 0xFFFF;
|
||||
buf.append(four ? "\\u" : "\\U");
|
||||
buf.append(hex(c, four ? 4 : 8));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
|
||||
private static final char[] UNESCAPE_MAP = {
|
||||
/* " 0x22, 0x22 */
|
||||
/* ' 0x27, 0x27 */
|
||||
/* ? 0x3F, 0x3F */
|
||||
/* \ 0x5C, 0x5C */
|
||||
/* a */ 0x61, 0x07, /* b */ 0x62, 0x08, /* e */ 0x65, 0x1b, /* f */ 0x66, 0x0c, /* n */ 0x6E, 0x0a,
|
||||
/* r */ 0x72, 0x0d, /* t */ 0x74, 0x09, /* v */ 0x76, 0x0b };
|
||||
|
||||
/**
|
||||
* Convert an escape to a 32-bit code point value. We attempt to parallel the
|
||||
* icu4c unescapeAt() function.
|
||||
*
|
||||
* @param offset16 an array containing offset to the character <em>after</em>
|
||||
* the backslash. Upon return offset16[0] will be updated to
|
||||
* point after the escape sequence.
|
||||
* @return character value from 0 to 10FFFF, or -1 on error.
|
||||
*/
|
||||
public static int unescapeAt(String s, int[] offset16) {
|
||||
int c;
|
||||
int result = 0;
|
||||
int n = 0;
|
||||
int minDig = 0;
|
||||
int maxDig = 0;
|
||||
int bitsPerDigit = 4;
|
||||
int dig;
|
||||
int i;
|
||||
boolean braces = false;
|
||||
|
||||
/* Check that offset is in range */
|
||||
int offset = offset16[0];
|
||||
int length = s.length();
|
||||
if (offset < 0 || offset >= length) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Fetch first UChar after '\\' */
|
||||
c = Character.codePointAt(s, offset);
|
||||
offset += UTF16.getCharCount(c);
|
||||
|
||||
/* Convert hexadecimal and octal escapes */
|
||||
switch (c) {
|
||||
case 'u':
|
||||
minDig = maxDig = 4;
|
||||
break;
|
||||
case 'U':
|
||||
minDig = maxDig = 8;
|
||||
break;
|
||||
case 'x':
|
||||
minDig = 1;
|
||||
if (offset < length && UTF16.charAt(s, offset) == 0x7B /* { */) {
|
||||
++offset;
|
||||
braces = true;
|
||||
maxDig = 8;
|
||||
} else {
|
||||
maxDig = 2;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
dig = UCharacter.digit(c, 8);
|
||||
if (dig >= 0) {
|
||||
minDig = 1;
|
||||
maxDig = 3;
|
||||
n = 1; /* Already have first octal digit */
|
||||
bitsPerDigit = 3;
|
||||
result = dig;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (minDig != 0) {
|
||||
while (offset < length && n < maxDig) {
|
||||
c = UTF16.charAt(s, offset);
|
||||
dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
|
||||
if (dig < 0) {
|
||||
break;
|
||||
}
|
||||
result = (result << bitsPerDigit) | dig;
|
||||
offset += UTF16.getCharCount(c);
|
||||
++n;
|
||||
}
|
||||
if (n < minDig) {
|
||||
return -1;
|
||||
}
|
||||
if (braces) {
|
||||
if (c != 0x7D /* } */) {
|
||||
return -1;
|
||||
}
|
||||
++offset;
|
||||
}
|
||||
if (result < 0 || result >= 0x110000) {
|
||||
return -1;
|
||||
}
|
||||
// If an escape sequence specifies a lead surrogate, see
|
||||
// if there is a trail surrogate after it, either as an
|
||||
// escape or as a literal. If so, join them up into a
|
||||
// supplementary.
|
||||
if (offset < length && UTF16.isLeadSurrogate((char) result)) {
|
||||
int ahead = offset + 1;
|
||||
c = s.charAt(offset); // [sic] get 16-bit code unit
|
||||
if (c == '\\' && ahead < length) {
|
||||
int o[] = new int[] { ahead };
|
||||
c = unescapeAt(s, o);
|
||||
ahead = o[0];
|
||||
}
|
||||
if (UTF16.isTrailSurrogate((char) c)) {
|
||||
offset = ahead;
|
||||
result = UCharacterProperty.getRawSupplementary((char) result, (char) c);
|
||||
}
|
||||
}
|
||||
offset16[0] = offset;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Convert C-style escapes in table */
|
||||
for (i = 0; i < UNESCAPE_MAP.length; i += 2) {
|
||||
if (c == UNESCAPE_MAP[i]) {
|
||||
offset16[0] = offset;
|
||||
return UNESCAPE_MAP[i + 1];
|
||||
} else if (c < UNESCAPE_MAP[i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Map \cX to control-X: X & 0x1F */
|
||||
if (c == 'c' && offset < length) {
|
||||
c = UTF16.charAt(s, offset);
|
||||
offset16[0] = offset + UTF16.getCharCount(c);
|
||||
return 0x1F & c;
|
||||
}
|
||||
|
||||
/*
|
||||
* If no special forms are recognized, then consider the backslash to
|
||||
* generically escape the next character.
|
||||
*/
|
||||
offset16[0] = offset;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplies a zero-padded hex representation of an integer (without 0x)
|
||||
*/
|
||||
public static String hex(long i, int places) {
|
||||
if (i == Long.MIN_VALUE)
|
||||
return "-8000000000000000";
|
||||
boolean negative = i < 0;
|
||||
if (negative) {
|
||||
i = -i;
|
||||
}
|
||||
String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
|
||||
if (result.length() < places) {
|
||||
result = "0000000000000000".substring(result.length(), places) + result;
|
||||
}
|
||||
if (negative) {
|
||||
return '-' + result;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Digit characters indexed by value: '0'-'9' for 0-9, then 'A'-'Z' for 10-35.
static final char DIGITS[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
|
||||
|
||||
/**
|
||||
* Return true if the character is NOT printable ASCII. The tab, newline and
|
||||
* linefeed characters are considered unprintable.
|
||||
*/
|
||||
public static boolean isUnprintable(int c) {
|
||||
// 0x20 = 32 and 0x7E = 126
|
||||
return !(c >= 0x20 && c <= 0x7E);
|
||||
}
|
||||
|
||||
/**
|
||||
* Escape unprintable characters using <backslash>uxxxx notation for U+0000 to
|
||||
* U+FFFF and <backslash>Uxxxxxxxx for U+10000 and above. If the character is
|
||||
* printable ASCII, then do nothing and return FALSE. Otherwise, append the
|
||||
* escaped notation and return TRUE.
|
||||
*/
|
||||
public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
|
||||
try {
|
||||
if (isUnprintable(c)) {
|
||||
result.append('\\');
|
||||
if ((c & ~0xFFFF) != 0) {
|
||||
result.append('U');
|
||||
result.append(DIGITS[0xF & (c >> 28)]);
|
||||
result.append(DIGITS[0xF & (c >> 24)]);
|
||||
result.append(DIGITS[0xF & (c >> 20)]);
|
||||
result.append(DIGITS[0xF & (c >> 16)]);
|
||||
} else {
|
||||
result.append('u');
|
||||
}
|
||||
result.append(DIGITS[0xF & (c >> 12)]);
|
||||
result.append(DIGITS[0xF & (c >> 8)]);
|
||||
result.append(DIGITS[0xF & (c >> 4)]);
|
||||
result.append(DIGITS[0xF & c]);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
}
|
562
sources/main/java/jdk_internal/icu/lang/UCharacter.java
Normal file
562
sources/main/java/jdk_internal/icu/lang/UCharacter.java
Normal file
@ -0,0 +1,562 @@
|
||||
/*
|
||||
* Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.lang;
|
||||
|
||||
import jdk_internal.icu.impl.UBiDiProps;
|
||||
import jdk_internal.icu.impl.UCharacterProperty;
|
||||
import jdk_internal.icu.text.Normalizer2;
|
||||
import jdk_internal.icu.text.UTF16;
|
||||
import jdk_internal.icu.util.VersionInfo;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* The UCharacter class provides extensions to the
|
||||
* <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
|
||||
* java.lang.Character</a> class. These extensions provide support for more
|
||||
* Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
|
||||
* class, provide support for supplementary characters (those with code points
|
||||
* above U+FFFF). Each ICU release supports the latest version of Unicode
|
||||
* available at that time.
|
||||
*
|
||||
* <p>
|
||||
* Code points are represented in these API using ints. While it would be more
|
||||
* convenient in Java to have a separate primitive datatype for them, ints
|
||||
* suffice in the meantime.
|
||||
*
|
||||
* <p>
|
||||
* To use this class please add the jar file name icu4j.jar to the class path,
|
||||
* since it contains data files which supply the information used by this
|
||||
* file.<br>
|
||||
* E.g. In Windows <br>
|
||||
* <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
|
||||
* Otherwise, another method would be to copy the files uprops.dat and
|
||||
* unames.icu from the icu4j source subdirectory
|
||||
* <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
|
||||
* <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
|
||||
*
|
||||
* <p>
|
||||
* Aside from the additions for UTF-16 support, and the updated Unicode
|
||||
* properties, the main differences between UCharacter and Character are:
|
||||
* <ul>
|
||||
* <li>UCharacter is not designed to be a char wrapper and does not have APIs to
|
||||
* which involves management of that single char.<br>
|
||||
* These include:
|
||||
* <ul>
|
||||
* <li>char charValue(),
|
||||
* <li>int compareTo(java.lang.Character, java.lang.Character), etc.
|
||||
* </ul>
|
||||
* <li>UCharacter does not include Character APIs that are deprecated, nor does
|
||||
* it include the Java-specific character information, such as boolean
|
||||
* isJavaIdentifierPart(char ch).
|
||||
* <li>Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric values
|
||||
* '10' - '35'. UCharacter also does this in digit and getNumericValue, to
|
||||
* adhere to the java semantics of these methods. New methods unicodeDigit, and
|
||||
* getUnicodeNumericValue do not treat the above code points as having numeric
|
||||
* values. This is a semantic change from ICU4J 1.3.1.
|
||||
* </ul>
|
||||
* <p>
|
||||
* Further detail on differences can be determined using the program <a href=
|
||||
* "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
|
||||
* com.ibm.icu.dev.test.lang.UCharacterCompare</a>
|
||||
* </p>
|
||||
* <p>
|
||||
* In addition to Java compatibility functions, which calculate derived
|
||||
* properties, this API provides low-level access to the Unicode Character
|
||||
* Database.
|
||||
* </p>
|
||||
* <p>
|
||||
* Unicode assigns each code point (not just assigned character) values for many
|
||||
* properties. Most of them are simple boolean flags, or constants from a small
|
||||
* enumerated list. For some properties, values are strings or other relatively
|
||||
* more complex types.
|
||||
* </p>
|
||||
* <p>
|
||||
* For more information see <a href="http://www.unicode.org/ucd/">"About the
|
||||
* Unicode Character Database"</a> (http://www.unicode.org/ucd/) and the
|
||||
* <a href="http://www.icu-project.org/userguide/properties.html">ICU User Guide
|
||||
* chapter on Properties</a>
|
||||
* (http://www.icu-project.org/userguide/properties.html).
|
||||
* </p>
|
||||
* <p>
|
||||
* There are also functions that provide easy migration from C/POSIX functions
|
||||
* like isblank(). Their use is generally discouraged because the C/POSIX
|
||||
* standards do not define their semantics beyond the ASCII range, which means
|
||||
* that different implementations exhibit very different behavior. Instead,
|
||||
* Unicode properties should be used directly.
|
||||
* </p>
|
||||
* <p>
|
||||
* There are also only a few, broad C/POSIX character classes, and they tend to
|
||||
* be used for conflicting purposes. For example, the "isalpha()" class is
|
||||
* sometimes used to determine word boundaries, while a more sophisticated
|
||||
* approach would at least distinguish initial letters from continuation
|
||||
* characters (the latter including combining marks). (In ICU, BreakIterator is
|
||||
* the most sophisticated API for word boundaries.) Another example: There is no
|
||||
* "istitle()" class for titlecase characters.
|
||||
* </p>
|
||||
* <p>
|
||||
* ICU 3.4 and later provides API access for all twelve C/POSIX character
|
||||
* classes. ICU implements them according to the Standard Recommendations in
|
||||
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
|
||||
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
|
||||
* </p>
|
||||
* <p>
|
||||
* API access for C/POSIX character classes is as follows:
|
||||
*
|
||||
* <pre>{@code
|
||||
* - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
|
||||
* - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
|
||||
* - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
|
||||
* - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
|
||||
* (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
|
||||
* (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
|
||||
* - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
|
||||
* - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
|
||||
* - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
|
||||
* - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
|
||||
* - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
|
||||
* - cntrl: getType(c)==CONTROL
|
||||
* - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
|
||||
* - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
|
||||
* }</pre>
|
||||
* </p>
|
||||
* <p>
|
||||
* The C/POSIX character classes are also available in UnicodeSet patterns,
|
||||
* using patterns like [:graph:] or \p{graph}.
|
||||
* </p>
|
||||
*
|
||||
* There are several ICU (and Java) whitespace functions. Comparison:
|
||||
* <ul>
|
||||
* <li>isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; most of
|
||||
* general categories "Z" (separators) + most whitespace ISO controls (including
|
||||
* no-break spaces, but excluding IS1..IS4 and ZWSP)
|
||||
* <li>isWhitespace: Java isWhitespace; Z + whitespace ISO controls but
|
||||
* excluding no-break spaces
|
||||
* <li>isSpaceChar: just Z (including no-break spaces)
|
||||
* </ul>
|
||||
* </p>
|
||||
* <p>
|
||||
* This class is not subclassable.
|
||||
* </p>
|
||||
*
|
||||
* @author Syn Wee Quek
|
||||
* @stable ICU 2.1
|
||||
* @see com.ibm.icu.lang.UCharacterEnums
|
||||
*/
|
||||
|
||||
public final class UCharacter {
|
||||
|
||||
/**
|
||||
* Joining Group constants.
|
||||
*
|
||||
* @see UProperty#JOINING_GROUP
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static interface JoiningGroup {
|
||||
/**
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int NO_JOINING_GROUP = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Numeric Type constants.
|
||||
*
|
||||
* @see UProperty#NUMERIC_TYPE
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static interface NumericType {
|
||||
/**
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int NONE = 0;
|
||||
/**
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int DECIMAL = 1;
|
||||
/**
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int DIGIT = 2;
|
||||
/**
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int NUMERIC = 3;
|
||||
/**
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int COUNT = 4;
|
||||
}
|
||||
|
||||
/**
|
||||
* Hangul Syllable Type constants.
|
||||
*
|
||||
* @see UProperty#HANGUL_SYLLABLE_TYPE
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static interface HangulSyllableType {
|
||||
/**
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final int NOT_APPLICABLE = 0; /* [NA] */ /* See note !! */
|
||||
/**
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final int LEADING_JAMO = 1; /* [L] */
|
||||
/**
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final int VOWEL_JAMO = 2; /* [V] */
|
||||
/**
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final int TRAILING_JAMO = 3; /* [T] */
|
||||
/**
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final int LV_SYLLABLE = 4; /* [LV] */
|
||||
/**
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final int LVT_SYLLABLE = 5; /* [LVT] */
|
||||
/**
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final int COUNT = 6;
|
||||
}
|
||||
|
||||
// public data members -----------------------------------------------
|
||||
|
||||
/**
|
||||
* The lowest Unicode code point value.
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
|
||||
|
||||
/**
|
||||
* The highest Unicode code point value (scalar value) according to the Unicode
|
||||
* Standard. This is a 21-bit value (21 bits, rounded up).<br>
|
||||
* Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
|
||||
|
||||
// public methods ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the numeric value of a decimal digit code point. <br>
|
||||
* This method observes the semantics of
|
||||
* <code>java.lang.Character.digit()</code>. Note that this will return positive
|
||||
* values for code points for which isDigit returns false, just like
|
||||
* java.lang.Character. <br>
|
||||
* <em>Semantic Change:</em> In release 1.3.1 and prior, this did not treat the
|
||||
* European letters as having a digit value, and also treated numeric letters
|
||||
* and other numbers as digits. This has been changed to conform to the java
|
||||
* semantics. <br>
|
||||
* A code point is a valid digit if and only if:
|
||||
* <ul>
|
||||
* <li>ch is a decimal digit or one of the european letters, and
|
||||
* <li>the value of ch is less than the specified radix.
|
||||
* </ul>
|
||||
*
|
||||
* @param ch the code point to query
|
||||
* @param radix the radix
|
||||
* @return the numeric value represented by the code point in the specified
|
||||
* radix, or -1 if the code point is not a decimal digit or if its value
|
||||
* is too large for the radix
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int digit(int ch, int radix) {
|
||||
if (2 <= radix && radix <= 36) {
|
||||
int value = digit(ch);
|
||||
if (value < 0) {
|
||||
// ch is not a decimal digit, try latin letters
|
||||
value = UCharacterProperty.getEuropeanDigit(ch);
|
||||
}
|
||||
return (value < radix) ? value : -1;
|
||||
} else {
|
||||
return -1; // invalid radix
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the numeric value of a decimal digit code point. <br>
|
||||
* This is a convenience overload of <code>digit(int, int)</code> that provides
|
||||
* a decimal radix. <br>
|
||||
* <em>Semantic Change:</em> In release 1.3.1 and prior, this treated numeric
|
||||
* letters and other numbers as digits. This has been changed to conform to the
|
||||
* java semantics.
|
||||
*
|
||||
* @param ch the code point to query
|
||||
* @return the numeric value represented by the code point, or -1 if the code
|
||||
* point is not a decimal digit or if its value is too large for a
|
||||
* decimal radix
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int digit(int ch) {
|
||||
return UCharacterProperty.INSTANCE.digit(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a value indicating a code point's Unicode category. Up-to-date
|
||||
* Unicode implementation of java.lang.Character.getType() except for the above
|
||||
* mentioned code points that had their category changed.<br>
|
||||
* Return results are constants from the interface
|
||||
* <a href=UCharacterCategory.html>UCharacterCategory</a><br>
|
||||
* <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
|
||||
* those returned by java.lang.Character.getType. UCharacterCategory values
|
||||
* match the ones used in ICU4C, while java.lang.Character type values, though
|
||||
* similar, skip the value 17.
|
||||
* </p>
|
||||
*
|
||||
* @param ch code point whose type is to be determined
|
||||
* @return category which is a value of UCharacterCategory
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int getType(int ch) {
|
||||
return UCharacterProperty.INSTANCE.getType(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the Bidirection property of a code point. For example, 0x0041 (letter
|
||||
* A) has the LEFT_TO_RIGHT directional property.<br>
|
||||
* Result returned belongs to the interface
|
||||
* <a href=UCharacterDirection.html>UCharacterDirection</a>
|
||||
*
|
||||
* @param ch the code point to be determined its direction
|
||||
* @return direction constant from UCharacterDirection.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int getDirection(int ch) {
|
||||
return UBiDiProps.INSTANCE.getClass(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Maps the specified code point to a "mirror-image" code point. For code points
|
||||
* with the "mirrored" property, implementations sometimes need a "poor man's"
|
||||
* mapping to another code point such that the default glyph may serve as the
|
||||
* mirror-image of the default glyph of the specified code point.<br>
|
||||
* This is useful for text conversion to and from codepages with visual order,
|
||||
* and for displays without glyph selection capabilities.
|
||||
*
|
||||
* @param ch code point whose mirror is to be retrieved
|
||||
* @return another code point that may serve as a mirror-image substitute, or ch
|
||||
* itself if there is no such mapping or ch does not have the "mirrored"
|
||||
* property
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int getMirror(int ch) {
|
||||
return UBiDiProps.INSTANCE.getMirror(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Maps the specified character to its paired bracket character. For
|
||||
* Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int). Otherwise
|
||||
* c itself is returned. See http://www.unicode.org/reports/tr9/
|
||||
*
|
||||
* @param c the code point to be mapped
|
||||
* @return the paired bracket code point, or c itself if there is no such
|
||||
* mapping (Bidi_Paired_Bracket_Type=None)
|
||||
*
|
||||
* @see UProperty#BIDI_PAIRED_BRACKET
|
||||
* @see UProperty#BIDI_PAIRED_BRACKET_TYPE
|
||||
* @see #getMirror(int)
|
||||
* @stable ICU 52
|
||||
*/
|
||||
public static int getBidiPairedBracket(int c) {
|
||||
return UBiDiProps.INSTANCE.getPairedBracket(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the combining class of the argument codepoint
|
||||
*
|
||||
* @param ch code point whose combining is to be retrieved
|
||||
* @return the combining class of the codepoint
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int getCombiningClass(int ch) {
|
||||
return Normalizer2.getNFDInstance().getCombiningClass(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the version of Unicode data used.
|
||||
*
|
||||
* @return the unicode version number used
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static VersionInfo getUnicodeVersion() {
|
||||
return UCharacterProperty.INSTANCE.m_unicodeVersion_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a code point corresponding to the two UTF16 characters.
|
||||
*
|
||||
* @param lead the lead char
|
||||
* @param trail the trail char
|
||||
* @return code point if surrogate characters are valid.
|
||||
* @exception IllegalArgumentException thrown when argument characters do not
|
||||
* form a valid codepoint
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int getCodePoint(char lead, char trail) {
|
||||
if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
|
||||
return UCharacterProperty.getRawSupplementary(lead, trail);
|
||||
}
|
||||
throw new IllegalArgumentException("Illegal surrogate characters");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the "age" of the code point.
|
||||
* </p>
|
||||
* <p>
|
||||
* The "age" is the Unicode version when the code point was first designated (as
|
||||
* a non-character or for Private Use) or assigned a character.
|
||||
* <p>
|
||||
* This can be useful to avoid emitting code points to receiving processes that
|
||||
* do not accept newer characters.
|
||||
* </p>
|
||||
* <p>
|
||||
* The data is from the UCD file DerivedAge.txt.
|
||||
* </p>
|
||||
*
|
||||
* @param ch The code point.
|
||||
* @return the Unicode version number
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static VersionInfo getAge(int ch) {
|
||||
if (ch < MIN_VALUE || ch > MAX_VALUE) {
|
||||
throw new IllegalArgumentException("Codepoint out of bounds");
|
||||
}
|
||||
return UCharacterProperty.INSTANCE.getAge(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the property value for an Unicode property type of a code point. Also
|
||||
* returns binary and mask property values.
|
||||
* </p>
|
||||
* <p>
|
||||
* Unicode, especially in version 3.2, defines many more properties than the
|
||||
* original set in UnicodeData.txt.
|
||||
* </p>
|
||||
* <p>
|
||||
* The properties APIs are intended to reflect Unicode properties as defined in
|
||||
* the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). For
|
||||
* details about the properties see http://www.unicode.org/.
|
||||
* </p>
|
||||
* <p>
|
||||
* For names of Unicode properties see the UCD file PropertyAliases.txt.
|
||||
* </p>
|
||||
*
|
||||
* <pre>
|
||||
* Sample usage:
|
||||
* int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
|
||||
* int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
|
||||
* boolean b = (ideo == 1) ? true : false;
|
||||
* </pre>
|
||||
*
|
||||
* @param ch code point to test.
|
||||
* @param type UProperty selector constant, identifies which binary property to
|
||||
* check. Must be UProperty.BINARY_START <= type <
|
||||
* UProperty.BINARY_LIMIT or UProperty.INT_START <= type <
|
||||
* UProperty.INT_LIMIT or UProperty.MASK_START <= type <
|
||||
* UProperty.MASK_LIMIT.
|
||||
* @return numeric value that is directly the property value or, for enumerated
|
||||
* properties, corresponds to the numeric value of the enumerated
|
||||
* constant of the respective property value enumeration type (cast to
|
||||
* enum type if necessary). Returns 0 or 1 (for false / true) for binary
|
||||
* Unicode properties. Returns a bit-mask for mask properties. Returns 0
|
||||
* if 'type' is out of bounds or if the Unicode version does not have
|
||||
* data for the property at all, or not for this code point.
|
||||
* @see UProperty
|
||||
* @see #hasBinaryProperty
|
||||
* @see #getIntPropertyMinValue
|
||||
* @see #getIntPropertyMaxValue
|
||||
* @see #getUnicodeVersion
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
// for BiDiBase.java
|
||||
public static int getIntPropertyValue(int ch, int type) {
|
||||
return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type);
|
||||
}
|
||||
|
||||
// private constructor -----------------------------------------------
|
||||
|
||||
/**
 * Hidden constructor; it is declared private so that no instance of this class
 * can be created from outside.
 */
private UCharacter() {
    // intentionally empty
}
|
||||
|
||||
/*
|
||||
* Copied from UCharacterEnums.java
|
||||
*/
|
||||
|
||||
/**
|
||||
* Character type Mn
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final byte NON_SPACING_MARK = 6;
|
||||
/**
|
||||
* Character type Me
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final byte ENCLOSING_MARK = 7;
|
||||
/**
|
||||
* Character type Mc
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final byte COMBINING_SPACING_MARK = 8;
|
||||
/**
|
||||
* Character type count
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final byte CHAR_CATEGORY_COUNT = 30;
|
||||
|
||||
/**
|
||||
* Directional type R
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int RIGHT_TO_LEFT = 1;
|
||||
/**
|
||||
* Directional type AL
|
||||
*
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int RIGHT_TO_LEFT_ARABIC = 13;
|
||||
}
|
113
sources/main/java/jdk_internal/icu/lang/UCharacterDirection.java
Normal file
113
sources/main/java/jdk_internal/icu/lang/UCharacterDirection.java
Normal file
@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2004, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
// CHANGELOG
|
||||
// 2005-05-19 Edward Wang
|
||||
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/lang/UCharacterDirection.java
|
||||
// - move from package com.ibm.icu.lang to package sun.net.idn
|
||||
//
|
||||
|
||||
package jdk_internal.icu.lang;
|
||||
|
||||
/**
|
||||
* Enumerated Unicode character linguistic direction constants. Used as return
|
||||
* results from <a href=UCharacter.html>UCharacter</a>
|
||||
* <p>
|
||||
* This class is not subclassable
|
||||
* </p>
|
||||
*
|
||||
* @author Syn Wee Quek
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
|
||||
@SuppressWarnings("deprecation")
|
||||
public final class UCharacterDirection implements UCharacterEnums.ECharacterDirection {
|
||||
|
||||
// private constructor =========================================
|
||||
/// CLOVER:OFF
|
||||
/**
|
||||
* Private constructor to prevent initialisation
|
||||
*/
|
||||
private UCharacterDirection() {
|
||||
}
|
||||
/// CLOVER:ON
|
||||
|
||||
/**
|
||||
* Gets the name of the argument direction
|
||||
*
|
||||
* @param dir direction type to retrieve name
|
||||
* @return directional name
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static String toString(int dir) {
|
||||
switch (dir) {
|
||||
case LEFT_TO_RIGHT:
|
||||
return "Left-to-Right";
|
||||
case RIGHT_TO_LEFT:
|
||||
return "Right-to-Left";
|
||||
case EUROPEAN_NUMBER:
|
||||
return "European Number";
|
||||
case EUROPEAN_NUMBER_SEPARATOR:
|
||||
return "European Number Separator";
|
||||
case EUROPEAN_NUMBER_TERMINATOR:
|
||||
return "European Number Terminator";
|
||||
case ARABIC_NUMBER:
|
||||
return "Arabic Number";
|
||||
case COMMON_NUMBER_SEPARATOR:
|
||||
return "Common Number Separator";
|
||||
case BLOCK_SEPARATOR:
|
||||
return "Paragraph Separator";
|
||||
case SEGMENT_SEPARATOR:
|
||||
return "Segment Separator";
|
||||
case WHITE_SPACE_NEUTRAL:
|
||||
return "Whitespace";
|
||||
case OTHER_NEUTRAL:
|
||||
return "Other Neutrals";
|
||||
case LEFT_TO_RIGHT_EMBEDDING:
|
||||
return "Left-to-Right Embedding";
|
||||
case LEFT_TO_RIGHT_OVERRIDE:
|
||||
return "Left-to-Right Override";
|
||||
case RIGHT_TO_LEFT_ARABIC:
|
||||
return "Right-to-Left Arabic";
|
||||
case RIGHT_TO_LEFT_EMBEDDING:
|
||||
return "Right-to-Left Embedding";
|
||||
case RIGHT_TO_LEFT_OVERRIDE:
|
||||
return "Right-to-Left Override";
|
||||
case POP_DIRECTIONAL_FORMAT:
|
||||
return "Pop Directional Format";
|
||||
case DIR_NON_SPACING_MARK:
|
||||
return "Non-Spacing Mark";
|
||||
case BOUNDARY_NEUTRAL:
|
||||
return "Boundary Neutral";
|
||||
}
|
||||
return "Unassigned";
|
||||
}
|
||||
}
|
666
sources/main/java/jdk_internal/icu/lang/UCharacterEnums.java
Normal file
666
sources/main/java/jdk_internal/icu/lang/UCharacterEnums.java
Normal file
@ -0,0 +1,666 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2004, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
// CHANGELOG
|
||||
// 2005-05-19 Edward Wang
|
||||
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/lang/UCharacterEnums.java
|
||||
// - move from package com.ibm.icu.lang to package sun.net.idn
|
||||
//
|
||||
// 2011-09-06 Kurchi Subhra Hazra
|
||||
// - Added @Deprecated tag to the following:
|
||||
// - class UCharacterEnums
|
||||
// - interfaces ECharacterCategory, ECharacterDirection
|
||||
// - fields INITIAL_QUOTE_PUNCTUATION, FINAL_QUOTE_PUNCTUATION,
|
||||
// DIRECTIONALITY_LEFT_TO_RIGHT, DIRECTIONALITY_RIGHT_TO_LEFT,
|
||||
// DIRECTIONALITY_EUROPEAN_NUMBER, DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
|
||||
// DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR, DIRECTIONALITY_ARABIC_NUMBER,
|
||||
// DIRECTIONALITY_COMMON_NUMBER_SEPARATOR, DIRECTIONALITY_PARAGRAPH_SEPARATOR,
|
||||
// DIRECTIONALITY_SEGMENT_SEPARATOR, DIRECTIONALITY_WHITESPACE,
|
||||
// DIRECTIONALITY_OTHER_NEUTRALS, DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING,
|
||||
// DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE, DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC,
|
||||
// DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING, DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE,
|
||||
// DIRECTIONALITY_POP_DIRECTIONAL_FORMAT, DIRECTIONALITY_NON_SPACING_MARK,
|
||||
// DIRECTIONALITY_BOUNDARY_NEUTRAL, DIRECTIONALITY_UNDEFINED
|
||||
//
|
||||
|
||||
package jdk_internal.icu.lang;
|
||||
|
||||
/**
 * Namespace class that groups the 'enumerated types' (sets of integer
 * constants) used by UCharacter.
 *
 * @draft ICU 3.0
 * @deprecated This is a draft API and might change in a future release of ICU.
 */
@Deprecated
class UCharacterEnums {

    /** Pure namespace: this class is never instantiated. */
    private UCharacterEnums() {
    }

    /**
     * 'Enum' of the CharacterCategory constants. The names are compatible with
     * those defined in <code>java.lang.Character</code>, <b>but the values are
     * not</b>.
     * <p>
     * Fields of an interface are implicitly {@code public static final}, so the
     * modifiers are not spelled out.
     * </p>
     *
     * @see UCharacterCategory
     * @draft ICU 3.0
     * @deprecated This is a draft API and might change in a future release of ICU.
     */
    @Deprecated
    public interface ECharacterCategory {
        /**
         * Unassigned character type.
         * @stable ICU 2.1
         */
        int UNASSIGNED = 0;

        /**
         * Character type Cn, Not Assigned (no characters in [UnicodeData.txt] have
         * this property).
         * @stable ICU 2.6
         */
        int GENERAL_OTHER_TYPES = 0;

        /**
         * Character type Lu (uppercase letter).
         * @stable ICU 2.1
         */
        int UPPERCASE_LETTER = 1;

        /**
         * Character type Ll (lowercase letter).
         * @stable ICU 2.1
         */
        int LOWERCASE_LETTER = 2;

        /**
         * Character type Lt (titlecase letter).
         * @stable ICU 2.1
         */
        int TITLECASE_LETTER = 3;

        /**
         * Character type Lm (modifier letter).
         * @stable ICU 2.1
         */
        int MODIFIER_LETTER = 4;

        /**
         * Character type Lo (other letter).
         * @stable ICU 2.1
         */
        int OTHER_LETTER = 5;

        /**
         * Character type Mn (non-spacing mark).
         * @stable ICU 2.1
         */
        int NON_SPACING_MARK = 6;

        /**
         * Character type Me (enclosing mark).
         * @stable ICU 2.1
         */
        int ENCLOSING_MARK = 7;

        /**
         * Character type Mc (combining spacing mark).
         * @stable ICU 2.1
         */
        int COMBINING_SPACING_MARK = 8;

        /**
         * Character type Nd (decimal digit number).
         * @stable ICU 2.1
         */
        int DECIMAL_DIGIT_NUMBER = 9;

        /**
         * Character type Nl (letter number).
         * @stable ICU 2.1
         */
        int LETTER_NUMBER = 10;

        /**
         * Character type No (other number).
         * @stable ICU 2.1
         */
        int OTHER_NUMBER = 11;

        /**
         * Character type Zs (space separator).
         * @stable ICU 2.1
         */
        int SPACE_SEPARATOR = 12;

        /**
         * Character type Zl (line separator).
         * @stable ICU 2.1
         */
        int LINE_SEPARATOR = 13;

        /**
         * Character type Zp (paragraph separator).
         * @stable ICU 2.1
         */
        int PARAGRAPH_SEPARATOR = 14;

        /**
         * Character type Cc (control).
         * @stable ICU 2.1
         */
        int CONTROL = 15;

        /**
         * Character type Cf (format).
         * @stable ICU 2.1
         */
        int FORMAT = 16;

        /**
         * Character type Co (private use).
         * @stable ICU 2.1
         */
        int PRIVATE_USE = 17;

        /**
         * Character type Cs (surrogate).
         * @stable ICU 2.1
         */
        int SURROGATE = 18;

        /**
         * Character type Pd (dash punctuation).
         * @stable ICU 2.1
         */
        int DASH_PUNCTUATION = 19;

        /**
         * Character type Ps (start punctuation).
         * @stable ICU 2.1
         */
        int START_PUNCTUATION = 20;

        /**
         * Character type Pe (end punctuation).
         * @stable ICU 2.1
         */
        int END_PUNCTUATION = 21;

        /**
         * Character type Pc (connector punctuation).
         * @stable ICU 2.1
         */
        int CONNECTOR_PUNCTUATION = 22;

        /**
         * Character type Po (other punctuation).
         * @stable ICU 2.1
         */
        int OTHER_PUNCTUATION = 23;

        /**
         * Character type Sm (math symbol).
         * @stable ICU 2.1
         */
        int MATH_SYMBOL = 24;

        /**
         * Character type Sc (currency symbol).
         * @stable ICU 2.1
         */
        int CURRENCY_SYMBOL = 25;

        /**
         * Character type Sk (modifier symbol).
         * @stable ICU 2.1
         */
        int MODIFIER_SYMBOL = 26;

        /**
         * Character type So (other symbol).
         * @stable ICU 2.1
         */
        int OTHER_SYMBOL = 27;

        /**
         * Character type Pi (initial punctuation).
         * @see #INITIAL_QUOTE_PUNCTUATION
         * @stable ICU 2.1
         */
        int INITIAL_PUNCTUATION = 28;

        /**
         * Character type Pi. This name is compatible with java.lang.Character's
         * name for this type.
         * @see #INITIAL_PUNCTUATION
         * @draft ICU 2.8
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        int INITIAL_QUOTE_PUNCTUATION = 28;

        /**
         * Character type Pf (final punctuation).
         * @see #FINAL_QUOTE_PUNCTUATION
         * @stable ICU 2.1
         */
        int FINAL_PUNCTUATION = 29;

        /**
         * Character type Pf. This name is compatible with java.lang.Character's
         * name for this type.
         * @see #FINAL_PUNCTUATION
         * @draft ICU 2.8
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        int FINAL_QUOTE_PUNCTUATION = 29;

        /**
         * Character type count.
         * @stable ICU 2.1
         */
        int CHAR_CATEGORY_COUNT = 30;
    }

    /**
     * 'Enum' of the CharacterDirection constants. Two sets of names are
     * provided: those used in ICU and those used in the JDK. The JDK names are
     * compatible with those defined in <code>java.lang.Character</code>, <b>but
     * the values are not</b>.
     * <p>
     * Fields of an interface are implicitly {@code public static final}, so the
     * modifiers are not spelled out.
     * </p>
     *
     * @see UCharacterDirection
     * @draft ICU 3.0
     * @deprecated This is a draft API and might change in a future release of ICU.
     */
    @Deprecated
    public interface ECharacterDirection {
        /**
         * Directional type L (left-to-right).
         * @stable ICU 2.1
         */
        int LEFT_TO_RIGHT = 0;

        /**
         * JDK-compatible synonym for LEFT_TO_RIGHT.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_LEFT_TO_RIGHT = (byte) LEFT_TO_RIGHT;

        /**
         * Directional type R (right-to-left).
         * @stable ICU 2.1
         */
        int RIGHT_TO_LEFT = 1;

        /**
         * JDK-compatible synonym for RIGHT_TO_LEFT.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_RIGHT_TO_LEFT = (byte) RIGHT_TO_LEFT;

        /**
         * Directional type EN (European number).
         * @stable ICU 2.1
         */
        int EUROPEAN_NUMBER = 2;

        /**
         * JDK-compatible synonym for EUROPEAN_NUMBER.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_EUROPEAN_NUMBER = (byte) EUROPEAN_NUMBER;

        /**
         * Directional type ES (European number separator).
         * @stable ICU 2.1
         */
        int EUROPEAN_NUMBER_SEPARATOR = 3;

        /**
         * JDK-compatible synonym for EUROPEAN_NUMBER_SEPARATOR.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = (byte) EUROPEAN_NUMBER_SEPARATOR;

        /**
         * Directional type ET (European number terminator).
         * @stable ICU 2.1
         */
        int EUROPEAN_NUMBER_TERMINATOR = 4;

        /**
         * JDK-compatible synonym for EUROPEAN_NUMBER_TERMINATOR.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = (byte) EUROPEAN_NUMBER_TERMINATOR;

        /**
         * Directional type AN (Arabic number).
         * @stable ICU 2.1
         */
        int ARABIC_NUMBER = 5;

        /**
         * JDK-compatible synonym for ARABIC_NUMBER.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_ARABIC_NUMBER = (byte) ARABIC_NUMBER;

        /**
         * Directional type CS (common number separator).
         * @stable ICU 2.1
         */
        int COMMON_NUMBER_SEPARATOR = 6;

        /**
         * JDK-compatible synonym for COMMON_NUMBER_SEPARATOR.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = (byte) COMMON_NUMBER_SEPARATOR;

        /**
         * Directional type B (block separator).
         * @stable ICU 2.1
         */
        int BLOCK_SEPARATOR = 7;

        /**
         * JDK-compatible synonym for BLOCK_SEPARATOR.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = (byte) BLOCK_SEPARATOR;

        /**
         * Directional type S (segment separator).
         * @stable ICU 2.1
         */
        int SEGMENT_SEPARATOR = 8;

        /**
         * JDK-compatible synonym for SEGMENT_SEPARATOR.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_SEGMENT_SEPARATOR = (byte) SEGMENT_SEPARATOR;

        /**
         * Directional type WS (white space, neutral).
         * @stable ICU 2.1
         */
        int WHITE_SPACE_NEUTRAL = 9;

        /**
         * JDK-compatible synonym for WHITE_SPACE_NEUTRAL.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_WHITESPACE = (byte) WHITE_SPACE_NEUTRAL;

        /**
         * Directional type ON (other neutral).
         * @stable ICU 2.1
         */
        int OTHER_NEUTRAL = 10;

        /**
         * JDK-compatible synonym for OTHER_NEUTRAL.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_OTHER_NEUTRALS = (byte) OTHER_NEUTRAL;

        /**
         * Directional type LRE (left-to-right embedding).
         * @stable ICU 2.1
         */
        int LEFT_TO_RIGHT_EMBEDDING = 11;

        /**
         * JDK-compatible synonym for LEFT_TO_RIGHT_EMBEDDING.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = (byte) LEFT_TO_RIGHT_EMBEDDING;

        /**
         * Directional type LRO (left-to-right override).
         * @stable ICU 2.1
         */
        int LEFT_TO_RIGHT_OVERRIDE = 12;

        /**
         * JDK-compatible synonym for LEFT_TO_RIGHT_OVERRIDE.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = (byte) LEFT_TO_RIGHT_OVERRIDE;

        /**
         * Directional type AL (right-to-left Arabic).
         * @stable ICU 2.1
         */
        int RIGHT_TO_LEFT_ARABIC = 13;

        /**
         * JDK-compatible synonym for RIGHT_TO_LEFT_ARABIC.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = (byte) RIGHT_TO_LEFT_ARABIC;

        /**
         * Directional type RLE (right-to-left embedding).
         * @stable ICU 2.1
         */
        int RIGHT_TO_LEFT_EMBEDDING = 14;

        /**
         * JDK-compatible synonym for RIGHT_TO_LEFT_EMBEDDING.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = (byte) RIGHT_TO_LEFT_EMBEDDING;

        /**
         * Directional type RLO (right-to-left override).
         * @stable ICU 2.1
         */
        int RIGHT_TO_LEFT_OVERRIDE = 15;

        /**
         * JDK-compatible synonym for RIGHT_TO_LEFT_OVERRIDE.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = (byte) RIGHT_TO_LEFT_OVERRIDE;

        /**
         * Directional type PDF (pop directional format).
         * @stable ICU 2.1
         */
        int POP_DIRECTIONAL_FORMAT = 16;

        /**
         * JDK-compatible synonym for POP_DIRECTIONAL_FORMAT.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = (byte) POP_DIRECTIONAL_FORMAT;

        /**
         * Directional type NSM (non-spacing mark).
         * @stable ICU 2.1
         */
        int DIR_NON_SPACING_MARK = 17;

        /**
         * JDK-compatible synonym for DIR_NON_SPACING_MARK.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_NON_SPACING_MARK = (byte) DIR_NON_SPACING_MARK;

        /**
         * Directional type BN (boundary neutral).
         * @stable ICU 2.1
         */
        int BOUNDARY_NEUTRAL = 18;

        /**
         * JDK-compatible synonym for BOUNDARY_NEUTRAL.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_BOUNDARY_NEUTRAL = (byte) BOUNDARY_NEUTRAL;

        /**
         * Number of directional types.
         * @stable ICU 2.1
         */
        int CHAR_DIRECTION_COUNT = 19;

        /**
         * Undefined bidirectional character type. Undefined <code>char</code>
         * values have undefined directionality in the Unicode specification.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_UNDEFINED = -1;
    }
}
|
4729
sources/main/java/jdk_internal/icu/text/BidiBase.java
Normal file
4729
sources/main/java/jdk_internal/icu/text/BidiBase.java
Normal file
File diff suppressed because it is too large
Load Diff
821
sources/main/java/jdk_internal/icu/text/BidiLine.java
Normal file
821
sources/main/java/jdk_internal/icu/text/BidiLine.java
Normal file
@ -0,0 +1,821 @@
|
||||
/*
|
||||
* Copyright (c) 2009, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2001-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
/* Written by Simon Montagu, Matitiahu Allouche
|
||||
* (ported from C code written by Markus W. Scherer)
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import jdk_internal.bidi.Bidi;
|
||||
|
||||
final class BidiLine {
|
||||
|
||||
/*
|
||||
* General remarks about the functions in this file:
|
||||
*
|
||||
* These functions deal with the aspects of potentially mixed-directional text
|
||||
* in a single paragraph or in a line of a single paragraph which has already
|
||||
* been processed according to the Unicode 3.0 Bidi algorithm as defined in <a
|
||||
* href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9: Unicode
|
||||
* Bidirectional Algorithm</a>, version 13, also described in The Unicode
|
||||
* Standard, Version 4.0.1 .
|
||||
*
|
||||
* This means that there is a Bidi object with a levels and a dirProps array.
|
||||
* paraLevel and direction are also set. Only if the length of the text is zero,
|
||||
* then levels==dirProps==NULL.
|
||||
*
|
||||
* The overall directionality of the paragraph or line is used to bypass the
|
||||
* reordering steps if possible. Even purely RTL text does not need reordering
|
||||
* there because the getLogical/VisualIndex() methods can compute the index on
|
||||
* the fly in such a case.
|
||||
*
|
||||
* The implementation of the access to same-level-runs and of the reordering do
|
||||
* attempt to provide better performance and less memory usage compared to a
|
||||
* direct implementation of especially rule (L2) with an array of one (32-bit)
|
||||
* integer per text character.
|
||||
*
|
||||
* Here, the levels array is scanned as soon as necessary, and a vector of
|
||||
* same-level-runs is created. Reordering then is done on this vector. For each
|
||||
* run of text positions that were resolved to the same level, only 8 bytes are
|
||||
* stored: the first text position of the run and the visual position behind the
|
||||
* run after reordering. One sign bit is used to hold the directionality of the
|
||||
* run. This is inefficient if there are many very short runs. If the average
|
||||
* run length is <2, then this uses more memory.
|
||||
*
|
||||
* In a further attempt to save memory, the levels array is never changed after
|
||||
* all the resolution rules (Xn, Wn, Nn, In). Many methods have to consider the
|
||||
* field trailingWSStart: if it is less than length, then there is an implicit
|
||||
* trailing run at the paraLevel, which is not reflected in the levels array.
|
||||
* This allows a line Bidi object to use the same levels array as its paragraph
|
||||
* parent object.
|
||||
*
|
||||
* When a Bidi object is created for a line of a paragraph, then the paragraph's
|
||||
* levels and dirProps arrays are reused by way of setting a pointer into them,
|
||||
* not by copying. This again saves memory and forbids to change the now shared
|
||||
* levels for (L1).
|
||||
*/
|
||||
|
||||
/* handle trailing WS (L1) -------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* setTrailingWSStart() sets the start index for a trailing run of WS in the
|
||||
* line. This is necessary because we do not modify the paragraph's levels array
|
||||
* that we just point into. Using trailingWSStart is another form of performing
|
||||
* (L1).
|
||||
*
|
||||
* To make subsequent operations easier, we also include the run before the WS
|
||||
* if it is at the paraLevel - we merge the two here.
|
||||
*
|
||||
* This method is called only from setLine(), so paraLevel is set correctly for
|
||||
* the line even when contextual multiple paragraphs.
|
||||
*/
|
||||
|
||||
static void setTrailingWSStart(BidiBase bidiBase) {
|
||||
byte[] dirProps = bidiBase.dirProps;
|
||||
byte[] levels = bidiBase.levels;
|
||||
int start = bidiBase.length;
|
||||
byte paraLevel = bidiBase.paraLevel;
|
||||
|
||||
/*
|
||||
* If the line is terminated by a block separator, all preceding WS etc... are
|
||||
* already set to paragraph level. Setting trailingWSStart to pBidi->length will
|
||||
* avoid changing the level of B chars from 0 to paraLevel in getLevels when
|
||||
* orderParagraphsLTR==TRUE
|
||||
*/
|
||||
if (dirProps[start - 1] == BidiBase.B) {
|
||||
bidiBase.trailingWSStart = start; /* currently == bidiBase.length */
|
||||
return;
|
||||
}
|
||||
/* go backwards across all WS, BN, explicit codes */
|
||||
while (start > 0 && (BidiBase.DirPropFlag(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) {
|
||||
--start;
|
||||
}
|
||||
|
||||
/* if the WS run can be merged with the previous run then do so here */
|
||||
while (start > 0 && levels[start - 1] == paraLevel) {
|
||||
--start;
|
||||
}
|
||||
|
||||
bidiBase.trailingWSStart = start;
|
||||
}
|
||||
|
||||
static Bidi setLine(BidiBase paraBidi, Bidi newBidi, BidiBase lineBidi, int start, int limit) {
|
||||
int length;
|
||||
|
||||
/* set the values in lineBidi from its paraBidi parent */
|
||||
/* class members are already initialized to 0 */
|
||||
// lineBidi.paraBidi = null; /* mark unfinished setLine */
|
||||
// lineBidi.flags = 0;
|
||||
// lineBidi.controlCount = 0;
|
||||
|
||||
length = lineBidi.length = lineBidi.originalLength = lineBidi.resultLength = limit - start;
|
||||
|
||||
lineBidi.text = new char[length];
|
||||
System.arraycopy(paraBidi.text, start, lineBidi.text, 0, length);
|
||||
lineBidi.paraLevel = paraBidi.GetParaLevelAt(start);
|
||||
lineBidi.paraCount = paraBidi.paraCount;
|
||||
lineBidi.runs = new BidiRun[0];
|
||||
lineBidi.reorderingMode = paraBidi.reorderingMode;
|
||||
lineBidi.reorderingOptions = paraBidi.reorderingOptions;
|
||||
if (paraBidi.controlCount > 0) {
|
||||
int j;
|
||||
for (j = start; j < limit; j++) {
|
||||
if (BidiBase.IsBidiControlChar(paraBidi.text[j])) {
|
||||
lineBidi.controlCount++;
|
||||
}
|
||||
}
|
||||
lineBidi.resultLength -= lineBidi.controlCount;
|
||||
}
|
||||
/* copy proper subset of DirProps */
|
||||
lineBidi.getDirPropsMemory(length);
|
||||
lineBidi.dirProps = lineBidi.dirPropsMemory;
|
||||
System.arraycopy(paraBidi.dirProps, start, lineBidi.dirProps, 0, length);
|
||||
/* copy proper subset of Levels */
|
||||
lineBidi.getLevelsMemory(length);
|
||||
lineBidi.levels = lineBidi.levelsMemory;
|
||||
System.arraycopy(paraBidi.levels, start, lineBidi.levels, 0, length);
|
||||
lineBidi.runCount = -1;
|
||||
|
||||
if (paraBidi.direction != BidiBase.MIXED) {
|
||||
/* the parent is already trivial */
|
||||
lineBidi.direction = paraBidi.direction;
|
||||
|
||||
/*
|
||||
* The parent's levels are all either implicitly or explicitly ==paraLevel; do
|
||||
* the same here.
|
||||
*/
|
||||
if (paraBidi.trailingWSStart <= start) {
|
||||
lineBidi.trailingWSStart = 0;
|
||||
} else if (paraBidi.trailingWSStart < limit) {
|
||||
lineBidi.trailingWSStart = paraBidi.trailingWSStart - start;
|
||||
} else {
|
||||
lineBidi.trailingWSStart = length;
|
||||
}
|
||||
} else {
|
||||
byte[] levels = lineBidi.levels;
|
||||
int i, trailingWSStart;
|
||||
byte level;
|
||||
|
||||
setTrailingWSStart(lineBidi);
|
||||
trailingWSStart = lineBidi.trailingWSStart;
|
||||
|
||||
/* recalculate lineBidiBase.direction */
|
||||
if (trailingWSStart == 0) {
|
||||
/* all levels are at paraLevel */
|
||||
lineBidi.direction = (byte) (lineBidi.paraLevel & 1);
|
||||
} else {
|
||||
/* get the level of the first character */
|
||||
level = (byte) (levels[0] & 1);
|
||||
|
||||
/*
|
||||
* if there is anything of a different level, then the line is mixed
|
||||
*/
|
||||
if (trailingWSStart < length && (lineBidi.paraLevel & 1) != level) {
|
||||
/*
|
||||
* the trailing WS is at paraLevel, which differs from levels[0]
|
||||
*/
|
||||
lineBidi.direction = BidiBase.MIXED;
|
||||
} else {
|
||||
/*
|
||||
* see if levels[1..trailingWSStart-1] have the same direction as levels[0] and
|
||||
* paraLevel
|
||||
*/
|
||||
for (i = 1;; i++) {
|
||||
if (i == trailingWSStart) {
|
||||
/* the direction values match those in level */
|
||||
lineBidi.direction = level;
|
||||
break;
|
||||
} else if ((levels[i] & 1) != level) {
|
||||
lineBidi.direction = BidiBase.MIXED;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (lineBidi.direction) {
|
||||
case Bidi.DIRECTION_LEFT_TO_RIGHT:
|
||||
/* make sure paraLevel is even */
|
||||
lineBidi.paraLevel = (byte) ((lineBidi.paraLevel + 1) & ~1);
|
||||
|
||||
/*
|
||||
* all levels are implicitly at paraLevel (important for getLevels())
|
||||
*/
|
||||
lineBidi.trailingWSStart = 0;
|
||||
break;
|
||||
case Bidi.DIRECTION_RIGHT_TO_LEFT:
|
||||
/* make sure paraLevel is odd */
|
||||
lineBidi.paraLevel |= 1;
|
||||
|
||||
/*
|
||||
* all levels are implicitly at paraLevel (important for getLevels())
|
||||
*/
|
||||
lineBidi.trailingWSStart = 0;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
lineBidi.paraBidi = paraBidi; /* mark successful setLine */
|
||||
|
||||
return newBidi;
|
||||
}
|
||||
|
||||
static byte getLevelAt(BidiBase bidiBase, int charIndex) {
|
||||
/* return paraLevel if in the trailing WS run, otherwise the real level */
|
||||
if (bidiBase.direction != BidiBase.MIXED || charIndex >= bidiBase.trailingWSStart) {
|
||||
return bidiBase.GetParaLevelAt(charIndex);
|
||||
} else {
|
||||
return bidiBase.levels[charIndex];
|
||||
}
|
||||
}
|
||||
|
||||
static byte[] getLevels(BidiBase bidiBase) {
|
||||
int start = bidiBase.trailingWSStart;
|
||||
int length = bidiBase.length;
|
||||
|
||||
if (start != length) {
|
||||
/* the current levels array does not reflect the WS run */
|
||||
/*
|
||||
* After the previous if(), we know that the levels array has an implicit
|
||||
* trailing WS run and therefore does not fully reflect itself all the levels.
|
||||
* This must be a Bidi object for a line, and we need to create a new levels
|
||||
* array.
|
||||
*/
|
||||
/*
|
||||
* bidiBase.paraLevel is ok even if contextual multiple paragraphs, since
|
||||
* bidiBase is a line object
|
||||
*/
|
||||
Arrays.fill(bidiBase.levels, start, length, bidiBase.paraLevel);
|
||||
|
||||
/* this new levels array is set for the line and reflects the WS run */
|
||||
bidiBase.trailingWSStart = length;
|
||||
}
|
||||
if (length < bidiBase.levels.length) {
|
||||
byte[] levels = new byte[length];
|
||||
System.arraycopy(bidiBase.levels, 0, levels, 0, length);
|
||||
return levels;
|
||||
}
|
||||
return bidiBase.levels;
|
||||
}
|
||||
|
||||
static BidiRun getVisualRun(BidiBase bidiBase, int runIndex) {
|
||||
int start = bidiBase.runs[runIndex].start;
|
||||
int limit;
|
||||
byte level = bidiBase.runs[runIndex].level;
|
||||
|
||||
if (runIndex > 0) {
|
||||
limit = start + bidiBase.runs[runIndex].limit - bidiBase.runs[runIndex - 1].limit;
|
||||
} else {
|
||||
limit = start + bidiBase.runs[0].limit;
|
||||
}
|
||||
return new BidiRun(start, limit, level);
|
||||
}
|
||||
|
||||
/* in trivial cases there is only one trivial run; called by getRuns() */
|
||||
private static void getSingleRun(BidiBase bidiBase, byte level) {
|
||||
/* simple, single-run case */
|
||||
bidiBase.runs = bidiBase.simpleRuns;
|
||||
bidiBase.runCount = 1;
|
||||
|
||||
/* fill and reorder the single run */
|
||||
bidiBase.runs[0] = new BidiRun(0, bidiBase.length, level);
|
||||
}
|
||||
|
||||
/* reorder the runs array (L2) ---------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Reorder the same-level runs in the runs array. Here, runCount>1 and
|
||||
* maxLevel>=minLevel>=paraLevel. All the visualStart fields=logical start
|
||||
* before reordering. The "odd" bits are not set yet.
|
||||
*
|
||||
* Reordering with this data structure lends itself to some handy shortcuts:
|
||||
*
|
||||
* Since each run is moved but not modified, and since at the initial maxLevel
|
||||
* each sequence of same-level runs consists of only one run each, we don't need
|
||||
* to do anything there and can predecrement maxLevel. In many simple cases, the
|
||||
* reordering is thus done entirely in the index mapping. Also, reordering
|
||||
* occurs only down to the lowest odd level that occurs, which is minLevel|1.
|
||||
* However, if the lowest level itself is odd, then in the last reordering the
|
||||
* sequence of the runs at this level or higher will be all runs, and we don't
|
||||
* need the elaborate loop to search for them. This is covered by ++minLevel
|
||||
* instead of minLevel|=1 followed by an extra reorder-all after the
|
||||
* reorder-some loop. About a trailing WS run: Such a run would need special
|
||||
* treatment because its level is not reflected in levels[] if this is not a
|
||||
* paragraph object. Instead, all characters from trailingWSStart on are
|
||||
* implicitly at paraLevel. However, for all maxLevel>paraLevel, this run will
|
||||
* never be reordered and does not need to be taken into account.
|
||||
* maxLevel==paraLevel is only reordered if minLevel==paraLevel is odd, which is
|
||||
* done in the extra segment. This means that for the main reordering loop we
|
||||
* don't need to consider this run and can --runCount. If it is later part of
|
||||
* the all-runs reordering, then runCount is adjusted accordingly.
|
||||
*/
|
||||
private static void reorderLine(BidiBase bidiBase, byte minLevel, byte maxLevel) {
|
||||
|
||||
/* nothing to do? */
|
||||
if (maxLevel <= (minLevel | 1)) {
|
||||
return;
|
||||
}
|
||||
|
||||
BidiRun[] runs;
|
||||
BidiRun tempRun;
|
||||
byte[] levels;
|
||||
int firstRun, endRun, limitRun, runCount;
|
||||
|
||||
/*
|
||||
* Reorder only down to the lowest odd level and reorder at an odd minLevel in a
|
||||
* separate, simpler loop. See comments above for why minLevel is always
|
||||
* incremented.
|
||||
*/
|
||||
++minLevel;
|
||||
|
||||
runs = bidiBase.runs;
|
||||
levels = bidiBase.levels;
|
||||
runCount = bidiBase.runCount;
|
||||
|
||||
/*
|
||||
* do not include the WS run at paraLevel<=old minLevel except in the simple
|
||||
* loop
|
||||
*/
|
||||
if (bidiBase.trailingWSStart < bidiBase.length) {
|
||||
--runCount;
|
||||
}
|
||||
|
||||
while (--maxLevel >= minLevel) {
|
||||
firstRun = 0;
|
||||
|
||||
/* loop for all sequences of runs */
|
||||
for (;;) {
|
||||
/* look for a sequence of runs that are all at >=maxLevel */
|
||||
/* look for the first run of such a sequence */
|
||||
while (firstRun < runCount && levels[runs[firstRun].start] < maxLevel) {
|
||||
++firstRun;
|
||||
}
|
||||
if (firstRun >= runCount) {
|
||||
break; /* no more such runs */
|
||||
}
|
||||
|
||||
/* look for the limit run of such a sequence (the run behind it) */
|
||||
for (limitRun = firstRun; ++limitRun < runCount && levels[runs[limitRun].start] >= maxLevel;) {
|
||||
}
|
||||
|
||||
/* Swap the entire sequence of runs from firstRun to limitRun-1. */
|
||||
endRun = limitRun - 1;
|
||||
while (firstRun < endRun) {
|
||||
tempRun = runs[firstRun];
|
||||
runs[firstRun] = runs[endRun];
|
||||
runs[endRun] = tempRun;
|
||||
++firstRun;
|
||||
--endRun;
|
||||
}
|
||||
|
||||
if (limitRun == runCount) {
|
||||
break; /* no more such runs */
|
||||
} else {
|
||||
firstRun = limitRun + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* now do maxLevel==old minLevel (==odd!), see above */
|
||||
if ((minLevel & 1) == 0) {
|
||||
firstRun = 0;
|
||||
|
||||
/* include the trailing WS run in this complete reordering */
|
||||
if (bidiBase.trailingWSStart == bidiBase.length) {
|
||||
--runCount;
|
||||
}
|
||||
|
||||
/* Swap the entire sequence of all runs. (endRun==runCount) */
|
||||
while (firstRun < runCount) {
|
||||
tempRun = runs[firstRun];
|
||||
runs[firstRun] = runs[runCount];
|
||||
runs[runCount] = tempRun;
|
||||
++firstRun;
|
||||
--runCount;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* compute the runs array --------------------------------------------------- */
|
||||
|
||||
static int getRunFromLogicalIndex(BidiBase bidiBase, int logicalIndex) {
|
||||
BidiRun[] runs = bidiBase.runs;
|
||||
int runCount = bidiBase.runCount, visualStart = 0, i, length, logicalStart;
|
||||
|
||||
for (i = 0; i < runCount; i++) {
|
||||
length = runs[i].limit - visualStart;
|
||||
logicalStart = runs[i].start;
|
||||
if ((logicalIndex >= logicalStart) && (logicalIndex < (logicalStart + length))) {
|
||||
return i;
|
||||
}
|
||||
visualStart += length;
|
||||
}
|
||||
/* we should never get here */
|
||||
throw new IllegalStateException("Internal ICU error in getRunFromLogicalIndex");
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute the runs array from the levels array. After getRuns() returns,
|
||||
* runCount is guaranteed to be >0 and the runs are reordered. Odd-level runs
|
||||
* have visualStart on their visual right edge and they progress visually to the
|
||||
* left. If option OPTION_INSERT_MARKS is set, insertRemove will contain the sum
|
||||
* of appropriate LRM/RLM_BEFORE/AFTER flags. If option OPTION_REMOVE_CONTROLS
|
||||
* is set, insertRemove will contain the negative number of BiDi control
|
||||
* characters within this run.
|
||||
*/
|
||||
static void getRuns(BidiBase bidiBase) {
|
||||
/*
|
||||
* This method returns immediately if the runs are already set. This includes
|
||||
* the case of length==0 (handled in setPara)..
|
||||
*/
|
||||
if (bidiBase.runCount >= 0) {
|
||||
return;
|
||||
}
|
||||
if (bidiBase.direction != BidiBase.MIXED) {
|
||||
/* simple, single-run case - this covers length==0 */
|
||||
/* bidiBase.paraLevel is ok even for contextual multiple paragraphs */
|
||||
getSingleRun(bidiBase, bidiBase.paraLevel);
|
||||
} else /* BidiBase.MIXED, length>0 */ {
|
||||
/* mixed directionality */
|
||||
int length = bidiBase.length, limit;
|
||||
byte[] levels = bidiBase.levels;
|
||||
int i, runCount;
|
||||
byte level = -1; /* initialize with no valid level */
|
||||
/*
|
||||
* If there are WS characters at the end of the line and the run preceding them
|
||||
* has a level different from paraLevel, then they will form their own run at
|
||||
* paraLevel (L1). Count them separately. We need some special treatment for
|
||||
* this in order to not modify the levels array which a line Bidi object shares
|
||||
* with its paragraph parent and its other line siblings. In other words, for
|
||||
* the trailing WS, it may be levels[]!=paraLevel but we have to treat it like
|
||||
* it were so.
|
||||
*/
|
||||
limit = bidiBase.trailingWSStart;
|
||||
/* count the runs, there is at least one non-WS run, and limit>0 */
|
||||
runCount = 0;
|
||||
for (i = 0; i < limit; ++i) {
|
||||
/* increment runCount at the start of each run */
|
||||
if (levels[i] != level) {
|
||||
++runCount;
|
||||
level = levels[i];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We don't need to see if the last run can be merged with a trailing WS run
|
||||
* because setTrailingWSStart() would have done that.
|
||||
*/
|
||||
if (runCount == 1 && limit == length) {
|
||||
/* There is only one non-WS run and no trailing WS-run. */
|
||||
getSingleRun(bidiBase, levels[0]);
|
||||
} else /* runCount>1 || limit<length */ {
|
||||
/* allocate and set the runs */
|
||||
BidiRun[] runs;
|
||||
int runIndex, start;
|
||||
byte minLevel = BidiBase.MAX_EXPLICIT_LEVEL + 1;
|
||||
byte maxLevel = 0;
|
||||
|
||||
/* now, count a (non-mergeable) WS run */
|
||||
if (limit < length) {
|
||||
++runCount;
|
||||
}
|
||||
|
||||
/* runCount > 1 */
|
||||
bidiBase.getRunsMemory(runCount);
|
||||
runs = bidiBase.runsMemory;
|
||||
|
||||
/* set the runs */
|
||||
/*
|
||||
* FOOD FOR THOUGHT: this could be optimized, e.g.: 464->444, 484->444,
|
||||
* 575->555, 595->555 However, that would take longer. Check also how it would
|
||||
* interact with BiDi control removal and inserting Marks.
|
||||
*/
|
||||
runIndex = 0;
|
||||
|
||||
/*
|
||||
* search for the run limits and initialize visualLimit values with the run
|
||||
* lengths
|
||||
*/
|
||||
i = 0;
|
||||
do {
|
||||
/* prepare this run */
|
||||
start = i;
|
||||
level = levels[i];
|
||||
if (level < minLevel) {
|
||||
minLevel = level;
|
||||
}
|
||||
if (level > maxLevel) {
|
||||
maxLevel = level;
|
||||
}
|
||||
|
||||
/* look for the run limit */
|
||||
while (++i < limit && levels[i] == level) {
|
||||
}
|
||||
|
||||
/* i is another run limit */
|
||||
runs[runIndex] = new BidiRun(start, i - start, level);
|
||||
++runIndex;
|
||||
} while (i < limit);
|
||||
|
||||
if (limit < length) {
|
||||
/* there is a separate WS run */
|
||||
runs[runIndex] = new BidiRun(limit, length - limit, bidiBase.paraLevel);
|
||||
/*
|
||||
* For the trailing WS run, bidiBase.paraLevel is ok even if contextual multiple
|
||||
* paragraphs.
|
||||
*/
|
||||
if (bidiBase.paraLevel < minLevel) {
|
||||
minLevel = bidiBase.paraLevel;
|
||||
}
|
||||
}
|
||||
|
||||
/* set the object fields */
|
||||
bidiBase.runs = runs;
|
||||
bidiBase.runCount = runCount;
|
||||
|
||||
reorderLine(bidiBase, minLevel, maxLevel);
|
||||
|
||||
/* now add the direction flags and adjust the visualLimit's to be just that */
|
||||
/* this loop will also handle the trailing WS run */
|
||||
limit = 0;
|
||||
for (i = 0; i < runCount; ++i) {
|
||||
runs[i].level = levels[runs[i].start];
|
||||
limit = (runs[i].limit += limit);
|
||||
}
|
||||
|
||||
/* Set the embedding level for the trailing WS run. */
|
||||
/* For a RTL paragraph, it will be the *first* run in visual order. */
|
||||
/*
|
||||
* For the trailing WS run, bidiBase.paraLevel is ok even if contextual multiple
|
||||
* paragraphs.
|
||||
*/
|
||||
if (runIndex < runCount) {
|
||||
int trailingRun = ((bidiBase.paraLevel & 1) != 0) ? 0 : runIndex;
|
||||
runs[trailingRun].level = bidiBase.paraLevel;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* handle insert LRM/RLM BEFORE/AFTER run */
|
||||
if (bidiBase.insertPoints.size > 0) {
|
||||
BidiBase.Point point;
|
||||
int runIndex, ip;
|
||||
for (ip = 0; ip < bidiBase.insertPoints.size; ip++) {
|
||||
point = bidiBase.insertPoints.points[ip];
|
||||
runIndex = getRunFromLogicalIndex(bidiBase, point.pos);
|
||||
bidiBase.runs[runIndex].insertRemove |= point.flag;
|
||||
}
|
||||
}
|
||||
|
||||
/* handle remove BiDi control characters */
|
||||
if (bidiBase.controlCount > 0) {
|
||||
int runIndex, ic;
|
||||
char c;
|
||||
for (ic = 0; ic < bidiBase.length; ic++) {
|
||||
c = bidiBase.text[ic];
|
||||
if (BidiBase.IsBidiControlChar(c)) {
|
||||
runIndex = getRunFromLogicalIndex(bidiBase, ic);
|
||||
bidiBase.runs[runIndex].insertRemove--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int[] prepareReorder(byte[] levels, byte[] pMinLevel, byte[] pMaxLevel) {
|
||||
int start;
|
||||
byte level, minLevel, maxLevel;
|
||||
|
||||
if (levels == null || levels.length <= 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
/* determine minLevel and maxLevel */
|
||||
minLevel = BidiBase.MAX_EXPLICIT_LEVEL + 1;
|
||||
maxLevel = 0;
|
||||
for (start = levels.length; start > 0;) {
|
||||
level = levels[--start];
|
||||
if (level < 0 || level > (BidiBase.MAX_EXPLICIT_LEVEL + 1)) {
|
||||
return null;
|
||||
}
|
||||
if (level < minLevel) {
|
||||
minLevel = level;
|
||||
}
|
||||
if (level > maxLevel) {
|
||||
maxLevel = level;
|
||||
}
|
||||
}
|
||||
pMinLevel[0] = minLevel;
|
||||
pMaxLevel[0] = maxLevel;
|
||||
|
||||
/* initialize the index map */
|
||||
int[] indexMap = new int[levels.length];
|
||||
for (start = levels.length; start > 0;) {
|
||||
--start;
|
||||
indexMap[start] = start;
|
||||
}
|
||||
|
||||
return indexMap;
|
||||
}
|
||||
|
||||
static int[] reorderVisual(byte[] levels) {
|
||||
byte[] aMinLevel = new byte[1];
|
||||
byte[] aMaxLevel = new byte[1];
|
||||
int start, end, limit, temp;
|
||||
byte minLevel, maxLevel;
|
||||
|
||||
int[] indexMap = prepareReorder(levels, aMinLevel, aMaxLevel);
|
||||
if (indexMap == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
minLevel = aMinLevel[0];
|
||||
maxLevel = aMaxLevel[0];
|
||||
|
||||
/* nothing to do? */
|
||||
if (minLevel == maxLevel && (minLevel & 1) == 0) {
|
||||
return indexMap;
|
||||
}
|
||||
|
||||
/* reorder only down to the lowest odd level */
|
||||
minLevel |= 1;
|
||||
|
||||
/* loop maxLevel..minLevel */
|
||||
do {
|
||||
start = 0;
|
||||
|
||||
/* loop for all sequences of levels to reorder at the current maxLevel */
|
||||
for (;;) {
|
||||
/* look for a sequence of levels that are all at >=maxLevel */
|
||||
/* look for the first index of such a sequence */
|
||||
while (start < levels.length && levels[start] < maxLevel) {
|
||||
++start;
|
||||
}
|
||||
if (start >= levels.length) {
|
||||
break; /* no more such runs */
|
||||
}
|
||||
|
||||
/* look for the limit of such a sequence (the index behind it) */
|
||||
for (limit = start; ++limit < levels.length && levels[limit] >= maxLevel;) {
|
||||
}
|
||||
|
||||
/*
|
||||
* Swap the entire interval of indexes from start to limit-1. We don't need to
|
||||
* swap the levels for the purpose of this algorithm: the sequence of levels
|
||||
* that we look at does not move anyway.
|
||||
*/
|
||||
end = limit - 1;
|
||||
while (start < end) {
|
||||
temp = indexMap[start];
|
||||
indexMap[start] = indexMap[end];
|
||||
indexMap[end] = temp;
|
||||
|
||||
++start;
|
||||
--end;
|
||||
}
|
||||
|
||||
if (limit == levels.length) {
|
||||
break; /* no more such sequences */
|
||||
} else {
|
||||
start = limit + 1;
|
||||
}
|
||||
}
|
||||
} while (--maxLevel >= minLevel);
|
||||
|
||||
return indexMap;
|
||||
}
|
||||
|
||||
static int[] getVisualMap(BidiBase bidiBase) {
|
||||
/* fill a visual-to-logical index map using the runs[] */
|
||||
BidiRun[] runs = bidiBase.runs;
|
||||
int logicalStart, visualStart, visualLimit;
|
||||
int allocLength = bidiBase.length > bidiBase.resultLength ? bidiBase.length : bidiBase.resultLength;
|
||||
int[] indexMap = new int[allocLength];
|
||||
|
||||
visualStart = 0;
|
||||
int idx = 0;
|
||||
for (int j = 0; j < bidiBase.runCount; ++j) {
|
||||
logicalStart = runs[j].start;
|
||||
visualLimit = runs[j].limit;
|
||||
if (runs[j].isEvenRun()) {
|
||||
do { /* LTR */
|
||||
indexMap[idx++] = logicalStart++;
|
||||
} while (++visualStart < visualLimit);
|
||||
} else {
|
||||
logicalStart += visualLimit - visualStart; /* logicalLimit */
|
||||
do { /* RTL */
|
||||
indexMap[idx++] = --logicalStart;
|
||||
} while (++visualStart < visualLimit);
|
||||
}
|
||||
/* visualStart==visualLimit; */
|
||||
}
|
||||
|
||||
if (bidiBase.insertPoints.size > 0) {
|
||||
int markFound = 0, runCount = bidiBase.runCount;
|
||||
int insertRemove, i, j, k;
|
||||
runs = bidiBase.runs;
|
||||
/* count all inserted marks */
|
||||
for (i = 0; i < runCount; i++) {
|
||||
insertRemove = runs[i].insertRemove;
|
||||
if ((insertRemove & (BidiBase.LRM_BEFORE | BidiBase.RLM_BEFORE)) > 0) {
|
||||
markFound++;
|
||||
}
|
||||
if ((insertRemove & (BidiBase.LRM_AFTER | BidiBase.RLM_AFTER)) > 0) {
|
||||
markFound++;
|
||||
}
|
||||
}
|
||||
/* move back indexes by number of preceding marks */
|
||||
k = bidiBase.resultLength;
|
||||
for (i = runCount - 1; i >= 0 && markFound > 0; i--) {
|
||||
insertRemove = runs[i].insertRemove;
|
||||
if ((insertRemove & (BidiBase.LRM_AFTER | BidiBase.RLM_AFTER)) > 0) {
|
||||
indexMap[--k] = BidiBase.MAP_NOWHERE;
|
||||
markFound--;
|
||||
}
|
||||
visualStart = i > 0 ? runs[i - 1].limit : 0;
|
||||
for (j = runs[i].limit - 1; j >= visualStart && markFound > 0; j--) {
|
||||
indexMap[--k] = indexMap[j];
|
||||
}
|
||||
if ((insertRemove & (BidiBase.LRM_BEFORE | BidiBase.RLM_BEFORE)) > 0) {
|
||||
indexMap[--k] = BidiBase.MAP_NOWHERE;
|
||||
markFound--;
|
||||
}
|
||||
}
|
||||
} else if (bidiBase.controlCount > 0) {
|
||||
int runCount = bidiBase.runCount, logicalEnd;
|
||||
int insertRemove, length, i, j, k, m;
|
||||
char uchar;
|
||||
boolean evenRun;
|
||||
runs = bidiBase.runs;
|
||||
visualStart = 0;
|
||||
/* move forward indexes by number of preceding controls */
|
||||
k = 0;
|
||||
for (i = 0; i < runCount; i++, visualStart += length) {
|
||||
length = runs[i].limit - visualStart;
|
||||
insertRemove = runs[i].insertRemove;
|
||||
/* if no control found yet, nothing to do in this run */
|
||||
if ((insertRemove == 0) && (k == visualStart)) {
|
||||
k += length;
|
||||
continue;
|
||||
}
|
||||
/* if no control in this run */
|
||||
if (insertRemove == 0) {
|
||||
visualLimit = runs[i].limit;
|
||||
for (j = visualStart; j < visualLimit; j++) {
|
||||
indexMap[k++] = indexMap[j];
|
||||
}
|
||||
continue;
|
||||
}
|
||||
logicalStart = runs[i].start;
|
||||
evenRun = runs[i].isEvenRun();
|
||||
logicalEnd = logicalStart + length - 1;
|
||||
for (j = 0; j < length; j++) {
|
||||
m = evenRun ? logicalStart + j : logicalEnd - j;
|
||||
uchar = bidiBase.text[m];
|
||||
if (!BidiBase.IsBidiControlChar(uchar)) {
|
||||
indexMap[k++] = m;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (allocLength == bidiBase.resultLength) {
|
||||
return indexMap;
|
||||
}
|
||||
int[] newMap = new int[bidiBase.resultLength];
|
||||
System.arraycopy(indexMap, 0, newMap, 0, bidiBase.resultLength);
|
||||
return newMap;
|
||||
}
|
||||
|
||||
}
|
123
sources/main/java/jdk_internal/icu/text/BidiRun.java
Normal file
123
sources/main/java/jdk_internal/icu/text/BidiRun.java
Normal file
@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
/* Written by Simon Montagu, Matitiahu Allouche
|
||||
* (ported from C code written by Markus W. Scherer)
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
/**
|
||||
* A BidiRun represents a sequence of characters at the same embedding level.
|
||||
* The Bidi algorithm decomposes a piece of text into sequences of characters at
|
||||
* the same embedding level, each such sequence is called a "run".
|
||||
*
|
||||
* <p>
|
||||
* A BidiRun represents such a run by storing its essential properties, but does
|
||||
* not duplicate the characters which form the run.
|
||||
*
|
||||
* <p>
|
||||
* The "limit" of the run is the position just after the last
|
||||
* character, i.e., one more than that position.
|
||||
*
|
||||
* <p>
|
||||
* This class has no public constructor, and its members cannot be modified by
|
||||
* users.
|
||||
*
|
||||
* @see com.ibm.icu.text.Bidi
|
||||
*/
|
||||
class BidiRun {

    /* first logical position of the run */
    int start;

    /* last visual position of the run +1 */
    int limit;

    /*
     * if >0, flags for inserting LRM/RLM before/after run; if <0, count of bidi
     * controls within run
     */
    int insertRemove;

    /* embedding level of the run */
    byte level;

    /*
     * Creates an empty run: start, limit, insertRemove and level are all 0.
     *
     * Note that start and limit mean different things depending on where the run
     * lives. In the runs array of a Bidi object, start is the first logical
     * position of the run in the source text and limit is one after its last
     * visual position. In a run returned by getLogicalRun() or getVisualRun(),
     * start is again the first logical position, but limit is one after the last
     * logical position of the run.
     */
    BidiRun() {
        this.start = 0;
        this.limit = 0;
        this.level = (byte) 0;
    }

    /*
     * Creates a run with the given start, limit and embedding level.
     */
    BidiRun(int start, int limit, byte embeddingLevel) {
        this.level = embeddingLevel;
        this.limit = limit;
        this.start = start;
    }

    /*
     * Overwrites all four fields of this run with those of the given run.
     */
    void copyFrom(BidiRun run) {
        start = run.start;
        limit = run.limit;
        level = run.level;
        insertRemove = run.insertRemove;
    }

    /**
     * Get level of run
     */
    byte getEmbeddingLevel() {
        return this.level;
    }

    /**
     * Check if run level is even
     *
     * @return true if the embedding level of this run is even, i.e. it is a
     *         left-to-right run.
     */
    boolean isEvenRun() {
        final int parity = level & 1;
        return parity == 0;
    }

}
|
425
sources/main/java/jdk_internal/icu/text/BidiWriter.java
Normal file
425
sources/main/java/jdk_internal/icu/text/BidiWriter.java
Normal file
@ -0,0 +1,425 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
/* Written by Simon Montagu, Matitiahu Allouche
|
||||
* (ported from C code written by Markus W. Scherer)
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.icu.lang.UCharacter;
|
||||
|
||||
final class BidiWriter {
|
||||
|
||||
/** Bidi control code points */
|
||||
static final char LRM_CHAR = 0x200e;
|
||||
static final char RLM_CHAR = 0x200f;
|
||||
static final int MASK_R_AL = (1 << UCharacter.RIGHT_TO_LEFT | 1 << UCharacter.RIGHT_TO_LEFT_ARABIC);
|
||||
|
||||
private static boolean IsCombining(int type) {
|
||||
return ((1 << type & (1 << UCharacter.NON_SPACING_MARK | 1 << UCharacter.COMBINING_SPACING_MARK
|
||||
| 1 << UCharacter.ENCLOSING_MARK)) != 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* When we have OUTPUT_REVERSE set on writeReordered(), then we semantically
|
||||
* write RTL runs in reverse and later reverse them again. Instead, we actually
|
||||
* write them in forward order to begin with. However, if the RTL run was to be
|
||||
* mirrored, we need to mirror here now since the implicit second reversal must
|
||||
* not do it. It looks strange to do mirroring in LTR output, but it is only
|
||||
* because we are writing RTL output in reverse.
|
||||
*/
|
||||
private static String doWriteForward(String src, int options) {
|
||||
/* optimize for several combinations of options */
|
||||
switch (options & (BidiBase.REMOVE_BIDI_CONTROLS | BidiBase.DO_MIRRORING)) {
|
||||
case 0: {
|
||||
/* simply return the LTR run */
|
||||
return src;
|
||||
}
|
||||
case BidiBase.DO_MIRRORING: {
|
||||
StringBuffer dest = new StringBuffer(src.length());
|
||||
|
||||
/* do mirroring */
|
||||
int i = 0;
|
||||
int c;
|
||||
|
||||
do {
|
||||
c = UTF16.charAt(src, i);
|
||||
i += UTF16.getCharCount(c);
|
||||
UTF16.append(dest, UCharacter.getMirror(c));
|
||||
} while (i < src.length());
|
||||
return dest.toString();
|
||||
}
|
||||
case BidiBase.REMOVE_BIDI_CONTROLS: {
|
||||
StringBuilder dest = new StringBuilder(src.length());
|
||||
|
||||
/* copy the LTR run and remove any Bidi control characters */
|
||||
int i = 0;
|
||||
char c;
|
||||
do {
|
||||
c = src.charAt(i++);
|
||||
if (!BidiBase.IsBidiControlChar(c)) {
|
||||
dest.append(c);
|
||||
}
|
||||
} while (i < src.length());
|
||||
return dest.toString();
|
||||
}
|
||||
default: {
|
||||
StringBuffer dest = new StringBuffer(src.length());
|
||||
|
||||
/* remove Bidi control characters and do mirroring */
|
||||
int i = 0;
|
||||
int c;
|
||||
do {
|
||||
c = UTF16.charAt(src, i);
|
||||
i += UTF16.getCharCount(c);
|
||||
if (!BidiBase.IsBidiControlChar(c)) {
|
||||
UTF16.append(dest, UCharacter.getMirror(c));
|
||||
}
|
||||
} while (i < src.length());
|
||||
return dest.toString();
|
||||
}
|
||||
} /* end of switch */
|
||||
}
|
||||
|
||||
private static String doWriteForward(char[] text, int start, int limit, int options) {
|
||||
return doWriteForward(new String(text, start, limit - start), options);
|
||||
}
|
||||
|
||||
static String writeReverse(String src, int options) {
|
||||
/*
|
||||
* RTL run -
|
||||
*
|
||||
* RTL runs need to be copied to the destination in reverse order of code
|
||||
* points, not code units, to keep Unicode characters intact.
|
||||
*
|
||||
* The general strategy for this is to read the source text in backward order,
|
||||
* collect all code units for a code point (and optionally following combining
|
||||
* characters, see below), and copy all these code units in ascending order to
|
||||
* the destination for this run.
|
||||
*
|
||||
* Several options request whether combining characters should be kept after
|
||||
* their base characters, whether Bidi control characters should be removed, and
|
||||
* whether characters should be replaced by their mirror-image equivalent
|
||||
* Unicode characters.
|
||||
*/
|
||||
StringBuffer dest = new StringBuffer(src.length());
|
||||
|
||||
/* optimize for several combinations of options */
|
||||
switch (options & (BidiBase.REMOVE_BIDI_CONTROLS | BidiBase.DO_MIRRORING | BidiBase.KEEP_BASE_COMBINING)) {
|
||||
|
||||
case 0:
|
||||
/*
|
||||
* With none of the "complicated" options set, the destination run will have the
|
||||
* same length as the source run, and there is no mirroring and no keeping
|
||||
* combining characters with their base characters.
|
||||
*
|
||||
* XXX: or dest = UTF16.reverse(new StringBuffer(src));
|
||||
*/
|
||||
|
||||
int srcLength = src.length();
|
||||
|
||||
/* preserve character integrity */
|
||||
do {
|
||||
/*
|
||||
* i is always after the last code unit known to need to be kept in this segment
|
||||
*/
|
||||
int i = srcLength;
|
||||
|
||||
/* collect code units for one base character */
|
||||
srcLength -= UTF16.getCharCount(UTF16.charAt(src, srcLength - 1));
|
||||
|
||||
/* copy this base character */
|
||||
dest.append(src.substring(srcLength, i));
|
||||
} while (srcLength > 0);
|
||||
break;
|
||||
|
||||
case BidiBase.KEEP_BASE_COMBINING:
|
||||
/*
|
||||
* Here, too, the destination run will have the same length as the source run,
|
||||
* and there is no mirroring. We do need to keep combining characters with their
|
||||
* base characters.
|
||||
*/
|
||||
srcLength = src.length();
|
||||
|
||||
/* preserve character integrity */
|
||||
do {
|
||||
/*
|
||||
* i is always after the last code unit known to need to be kept in this segment
|
||||
*/
|
||||
int c;
|
||||
int i = srcLength;
|
||||
|
||||
/*
|
||||
* collect code units and modifier letters for one base character
|
||||
*/
|
||||
do {
|
||||
c = UTF16.charAt(src, srcLength - 1);
|
||||
srcLength -= UTF16.getCharCount(c);
|
||||
} while (srcLength > 0 && IsCombining(UCharacter.getType(c)));
|
||||
|
||||
/* copy this "user character" */
|
||||
dest.append(src.substring(srcLength, i));
|
||||
} while (srcLength > 0);
|
||||
break;
|
||||
|
||||
default:
|
||||
/*
|
||||
* With several "complicated" options set, this is the most general and the
|
||||
* slowest copying of an RTL run. We will do mirroring, remove Bidi controls,
|
||||
* and keep combining characters with their base characters as requested.
|
||||
*/
|
||||
srcLength = src.length();
|
||||
|
||||
/* preserve character integrity */
|
||||
do {
|
||||
/*
|
||||
* i is always after the last code unit known to need to be kept in this segment
|
||||
*/
|
||||
int i = srcLength;
|
||||
|
||||
/* collect code units for one base character */
|
||||
int c = UTF16.charAt(src, srcLength - 1);
|
||||
srcLength -= UTF16.getCharCount(c);
|
||||
if ((options & BidiBase.KEEP_BASE_COMBINING) != 0) {
|
||||
/* collect modifier letters for this base character */
|
||||
while (srcLength > 0 && IsCombining(UCharacter.getType(c))) {
|
||||
c = UTF16.charAt(src, srcLength - 1);
|
||||
srcLength -= UTF16.getCharCount(c);
|
||||
}
|
||||
}
|
||||
|
||||
if ((options & BidiBase.REMOVE_BIDI_CONTROLS) != 0 && BidiBase.IsBidiControlChar(c)) {
|
||||
/* do not copy this Bidi control character */
|
||||
continue;
|
||||
}
|
||||
|
||||
/* copy this "user character" */
|
||||
int j = srcLength;
|
||||
if ((options & BidiBase.DO_MIRRORING) != 0) {
|
||||
/* mirror only the base character */
|
||||
c = UCharacter.getMirror(c);
|
||||
UTF16.append(dest, c);
|
||||
j += UTF16.getCharCount(c);
|
||||
}
|
||||
dest.append(src.substring(j, i));
|
||||
} while (srcLength > 0);
|
||||
break;
|
||||
} /* end of switch */
|
||||
|
||||
return dest.toString();
|
||||
}
|
||||
|
||||
static String doWriteReverse(char[] text, int start, int limit, int options) {
|
||||
return writeReverse(new String(text, start, limit - start), options);
|
||||
}
|
||||
|
||||
static String writeReordered(BidiBase bidi, int options) {
|
||||
int run, runCount;
|
||||
StringBuilder dest;
|
||||
char[] text = bidi.text;
|
||||
runCount = bidi.countRuns();
|
||||
|
||||
/*
|
||||
* Option "insert marks" implies BidiBase.INSERT_LRM_FOR_NUMERIC if the
|
||||
* reordering mode (checked below) is appropriate.
|
||||
*/
|
||||
if ((bidi.reorderingOptions & BidiBase.OPTION_INSERT_MARKS) != 0) {
|
||||
options |= BidiBase.INSERT_LRM_FOR_NUMERIC;
|
||||
options &= ~BidiBase.REMOVE_BIDI_CONTROLS;
|
||||
}
|
||||
/*
|
||||
* Option "remove controls" implies BidiBase.REMOVE_BIDI_CONTROLS and cancels
|
||||
* BidiBase.INSERT_LRM_FOR_NUMERIC.
|
||||
*/
|
||||
if ((bidi.reorderingOptions & BidiBase.OPTION_REMOVE_CONTROLS) != 0) {
|
||||
options |= BidiBase.REMOVE_BIDI_CONTROLS;
|
||||
options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
|
||||
}
|
||||
/*
|
||||
* If we do not perform the "inverse Bidi" algorithm, then we don't need to
|
||||
* insert any LRMs, and don't need to test for it.
|
||||
*/
|
||||
if ((bidi.reorderingMode != BidiBase.REORDER_INVERSE_NUMBERS_AS_L)
|
||||
&& (bidi.reorderingMode != BidiBase.REORDER_INVERSE_LIKE_DIRECT)
|
||||
&& (bidi.reorderingMode != BidiBase.REORDER_INVERSE_FOR_NUMBERS_SPECIAL)
|
||||
&& (bidi.reorderingMode != BidiBase.REORDER_RUNS_ONLY)) {
|
||||
options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
|
||||
}
|
||||
dest = new StringBuilder((options & BidiBase.INSERT_LRM_FOR_NUMERIC) != 0 ? bidi.length * 2 : bidi.length);
|
||||
/*
|
||||
* Iterate through all visual runs and copy the run text segments to the
|
||||
* destination, according to the options.
|
||||
*
|
||||
* The tests for where to insert LRMs ignore the fact that there may be BN codes
|
||||
* or non-BMP code points at the beginning and end of a run; they may insert
|
||||
* LRMs unnecessarily but the tests are faster this way (this would have to be
|
||||
* improved for UTF-8).
|
||||
*/
|
||||
if ((options & BidiBase.OUTPUT_REVERSE) == 0) {
|
||||
/* forward output */
|
||||
if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) {
|
||||
/* do not insert Bidi controls */
|
||||
for (run = 0; run < runCount; ++run) {
|
||||
BidiRun bidiRun = bidi.getVisualRun(run);
|
||||
if (bidiRun.isEvenRun()) {
|
||||
dest.append(
|
||||
doWriteForward(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
|
||||
} else {
|
||||
dest.append(doWriteReverse(text, bidiRun.start, bidiRun.limit, options));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* insert Bidi controls for "inverse Bidi" */
|
||||
byte[] dirProps = bidi.dirProps;
|
||||
char uc;
|
||||
int markFlag;
|
||||
|
||||
for (run = 0; run < runCount; ++run) {
|
||||
BidiRun bidiRun = bidi.getVisualRun(run);
|
||||
markFlag = 0;
|
||||
/* check if something relevant in insertPoints */
|
||||
markFlag = bidi.runs[run].insertRemove;
|
||||
if (markFlag < 0) { /* bidi controls count */
|
||||
markFlag = 0;
|
||||
}
|
||||
if (bidiRun.isEvenRun()) {
|
||||
if (bidi.isInverse() && dirProps[bidiRun.start] != BidiBase.L) {
|
||||
markFlag |= BidiBase.LRM_BEFORE;
|
||||
}
|
||||
if ((markFlag & BidiBase.LRM_BEFORE) != 0) {
|
||||
uc = LRM_CHAR;
|
||||
} else if ((markFlag & BidiBase.RLM_BEFORE) != 0) {
|
||||
uc = RLM_CHAR;
|
||||
} else {
|
||||
uc = 0;
|
||||
}
|
||||
if (uc != 0) {
|
||||
dest.append(uc);
|
||||
}
|
||||
dest.append(
|
||||
doWriteForward(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
|
||||
|
||||
if (bidi.isInverse() && dirProps[bidiRun.limit - 1] != BidiBase.L) {
|
||||
markFlag |= BidiBase.LRM_AFTER;
|
||||
}
|
||||
if ((markFlag & BidiBase.LRM_AFTER) != 0) {
|
||||
uc = LRM_CHAR;
|
||||
} else if ((markFlag & BidiBase.RLM_AFTER) != 0) {
|
||||
uc = RLM_CHAR;
|
||||
} else {
|
||||
uc = 0;
|
||||
}
|
||||
if (uc != 0) {
|
||||
dest.append(uc);
|
||||
}
|
||||
} else { /* RTL run */
|
||||
if (bidi.isInverse() && !bidi.testDirPropFlagAt(MASK_R_AL, bidiRun.limit - 1)) {
|
||||
markFlag |= BidiBase.RLM_BEFORE;
|
||||
}
|
||||
if ((markFlag & BidiBase.LRM_BEFORE) != 0) {
|
||||
uc = LRM_CHAR;
|
||||
} else if ((markFlag & BidiBase.RLM_BEFORE) != 0) {
|
||||
uc = RLM_CHAR;
|
||||
} else {
|
||||
uc = 0;
|
||||
}
|
||||
if (uc != 0) {
|
||||
dest.append(uc);
|
||||
}
|
||||
dest.append(doWriteReverse(text, bidiRun.start, bidiRun.limit, options));
|
||||
|
||||
if (bidi.isInverse() && (MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
|
||||
markFlag |= BidiBase.RLM_AFTER;
|
||||
}
|
||||
if ((markFlag & BidiBase.LRM_AFTER) != 0) {
|
||||
uc = LRM_CHAR;
|
||||
} else if ((markFlag & BidiBase.RLM_AFTER) != 0) {
|
||||
uc = RLM_CHAR;
|
||||
} else {
|
||||
uc = 0;
|
||||
}
|
||||
if (uc != 0) {
|
||||
dest.append(uc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* reverse output */
|
||||
if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) {
|
||||
/* do not insert Bidi controls */
|
||||
for (run = runCount; --run >= 0;) {
|
||||
BidiRun bidiRun = bidi.getVisualRun(run);
|
||||
if (bidiRun.isEvenRun()) {
|
||||
dest.append(
|
||||
doWriteReverse(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
|
||||
} else {
|
||||
dest.append(doWriteForward(text, bidiRun.start, bidiRun.limit, options));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* insert Bidi controls for "inverse Bidi" */
|
||||
|
||||
byte[] dirProps = bidi.dirProps;
|
||||
|
||||
for (run = runCount; --run >= 0;) {
|
||||
/* reverse output */
|
||||
BidiRun bidiRun = bidi.getVisualRun(run);
|
||||
if (bidiRun.isEvenRun()) {
|
||||
if (dirProps[bidiRun.limit - 1] != BidiBase.L) {
|
||||
dest.append(LRM_CHAR);
|
||||
}
|
||||
|
||||
dest.append(
|
||||
doWriteReverse(text, bidiRun.start, bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
|
||||
|
||||
if (dirProps[bidiRun.start] != BidiBase.L) {
|
||||
dest.append(LRM_CHAR);
|
||||
}
|
||||
} else {
|
||||
if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
|
||||
dest.append(RLM_CHAR);
|
||||
}
|
||||
|
||||
dest.append(doWriteForward(text, bidiRun.start, bidiRun.limit, options));
|
||||
|
||||
if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.limit - 1])) == 0) {
|
||||
dest.append(RLM_CHAR);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return dest.toString();
|
||||
}
|
||||
}
|
271
sources/main/java/jdk_internal/icu/text/FilteredNormalizer2.java
Normal file
271
sources/main/java/jdk_internal/icu/text/FilteredNormalizer2.java
Normal file
@ -0,0 +1,271 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Normalization filtered by a UnicodeSet. Normalizes portions of the text
|
||||
* contained in the filter set and leaves portions not contained in the filter
|
||||
* set unchanged. Filtering is done via UnicodeSet.span(...,
|
||||
* UnicodeSet.SpanCondition.SIMPLE). Not-in-the-filter text is treated as "is
|
||||
* normalized" and "quick check yes". This class implements all of (and only)
|
||||
* the Normalizer2 API. An instance of this class is unmodifiable/immutable.
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
* @author Markus W. Scherer
|
||||
*/
|
||||
class FilteredNormalizer2 extends Normalizer2 {
|
||||
|
||||
/**
|
||||
* Constructs a filtered normalizer wrapping any Normalizer2 instance and a
|
||||
* filter set. Both are aliased and must not be modified or deleted while this
|
||||
* object is used. The filter set should be frozen; otherwise the performance
|
||||
* will suffer greatly.
|
||||
*
|
||||
* @param n2 wrapped Normalizer2 instance
|
||||
* @param filterSet UnicodeSet which determines the characters to be normalized
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
|
||||
norm2 = n2;
|
||||
set = filterSet;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public StringBuilder normalize(CharSequence src, StringBuilder dest) {
|
||||
if (dest == src) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
dest.setLength(0);
|
||||
normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
|
||||
return dest;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
@Override
|
||||
public Appendable normalize(CharSequence src, Appendable dest) {
|
||||
if (dest == src) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
|
||||
return normalizeSecondAndAppend(first, second, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public StringBuilder append(StringBuilder first, CharSequence second) {
|
||||
return normalizeSecondAndAppend(first, second, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
@Override
|
||||
public String getDecomposition(int c) {
|
||||
return set.contains(c) ? norm2.getDecomposition(c) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 49
|
||||
*/
|
||||
@Override
|
||||
public int getCombiningClass(int c) {
|
||||
return set.contains(c) ? norm2.getCombiningClass(c) : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public boolean isNormalized(CharSequence s) {
|
||||
UnicodeSet.SpanCondition spanCondition = UnicodeSet.SpanCondition.SIMPLE;
|
||||
for (int prevSpanLimit = 0; prevSpanLimit < s.length();) {
|
||||
int spanLimit = set.span(s, prevSpanLimit, spanCondition);
|
||||
if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
|
||||
spanCondition = UnicodeSet.SpanCondition.SIMPLE;
|
||||
} else {
|
||||
if (!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
|
||||
return false;
|
||||
}
|
||||
spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
|
||||
}
|
||||
prevSpanLimit = spanLimit;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public int spanQuickCheckYes(CharSequence s) {
|
||||
UnicodeSet.SpanCondition spanCondition = UnicodeSet.SpanCondition.SIMPLE;
|
||||
for (int prevSpanLimit = 0; prevSpanLimit < s.length();) {
|
||||
int spanLimit = set.span(s, prevSpanLimit, spanCondition);
|
||||
if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
|
||||
spanCondition = UnicodeSet.SpanCondition.SIMPLE;
|
||||
} else {
|
||||
int yesLimit = prevSpanLimit + norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
|
||||
if (yesLimit < spanLimit) {
|
||||
return yesLimit;
|
||||
}
|
||||
spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
|
||||
}
|
||||
prevSpanLimit = spanLimit;
|
||||
}
|
||||
return s.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
@Override
|
||||
public boolean hasBoundaryBefore(int c) {
|
||||
return !set.contains(c) || norm2.hasBoundaryBefore(c);
|
||||
}
|
||||
|
||||
// Internal: No argument checking, and appends to dest.
|
||||
// Pass as input spanCondition the one that is likely to yield a non-zero
|
||||
// span length at the start of src.
|
||||
// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
|
||||
// UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
|
||||
// and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue
|
||||
// after
|
||||
// an in-filter prefix.
|
||||
private Appendable normalize(CharSequence src, Appendable dest, UnicodeSet.SpanCondition spanCondition) {
|
||||
// Don't throw away destination buffer between iterations.
|
||||
StringBuilder tempDest = new StringBuilder();
|
||||
try {
|
||||
for (int prevSpanLimit = 0; prevSpanLimit < src.length();) {
|
||||
int spanLimit = set.span(src, prevSpanLimit, spanCondition);
|
||||
int spanLength = spanLimit - prevSpanLimit;
|
||||
if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
|
||||
if (spanLength != 0) {
|
||||
dest.append(src, prevSpanLimit, spanLimit);
|
||||
}
|
||||
spanCondition = UnicodeSet.SpanCondition.SIMPLE;
|
||||
} else {
|
||||
if (spanLength != 0) {
|
||||
// Not norm2.normalizeSecondAndAppend() because we do not want
|
||||
// to modify the non-filter part of dest.
|
||||
dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
|
||||
}
|
||||
spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
|
||||
}
|
||||
prevSpanLimit = spanLimit;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new InternalError(e.toString(), e);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second, boolean doNormalize) {
|
||||
if (first == second) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
if (first.length() == 0) {
|
||||
if (doNormalize) {
|
||||
return normalize(second, first);
|
||||
} else {
|
||||
return first.append(second);
|
||||
}
|
||||
}
|
||||
// merge the in-filter suffix of the first string with the in-filter prefix of
|
||||
// the second
|
||||
int prefixLimit = set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
|
||||
if (prefixLimit != 0) {
|
||||
CharSequence prefix = second.subSequence(0, prefixLimit);
|
||||
int suffixStart = set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);
|
||||
if (suffixStart == 0) {
|
||||
if (doNormalize) {
|
||||
norm2.normalizeSecondAndAppend(first, prefix);
|
||||
} else {
|
||||
norm2.append(first, prefix);
|
||||
}
|
||||
} else {
|
||||
StringBuilder middle = new StringBuilder(first.subSequence(suffixStart, first.length()));
|
||||
if (doNormalize) {
|
||||
norm2.normalizeSecondAndAppend(middle, prefix);
|
||||
} else {
|
||||
norm2.append(middle, prefix);
|
||||
}
|
||||
first.delete(suffixStart, 0x7fffffff).append(middle);
|
||||
}
|
||||
}
|
||||
if (prefixLimit < second.length()) {
|
||||
CharSequence rest = second.subSequence(prefixLimit, second.length());
|
||||
if (doNormalize) {
|
||||
normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
|
||||
} else {
|
||||
first.append(rest);
|
||||
}
|
||||
}
|
||||
return first;
|
||||
}
|
||||
|
||||
private Normalizer2 norm2;
|
||||
private UnicodeSet set;
|
||||
};
|
288
sources/main/java/jdk_internal/icu/text/Normalizer2.java
Normal file
288
sources/main/java/jdk_internal/icu/text/Normalizer2.java
Normal file
@ -0,0 +1,288 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.icu.impl.Norm2AllModes;
|
||||
|
||||
/**
|
||||
* Unicode normalization functionality for standard Unicode normalization or for
|
||||
* using custom mapping tables. All instances of this class are
|
||||
* unmodifiable/immutable. The Normalizer2 class is not intended for public
|
||||
* subclassing.
|
||||
* <p>
|
||||
* The primary functions are to produce a normalized string and to detect
|
||||
* whether a string is already normalized. The most commonly used normalization
|
||||
* forms are those defined in
|
||||
* <a href="http://www.unicode.org/reports/tr15/">Unicode Standard Annex #15:
|
||||
* Unicode Normalization Forms</a>. However, this API supports additional
|
||||
* normalization forms for specialized purposes. For example, NFKC_Casefold is
|
||||
* provided via getInstance("nfkc_cf", COMPOSE) and can be used in
|
||||
* implementations of UTS #46.
|
||||
* <p>
|
||||
* Not only are the standard compose and decompose modes supplied, but
|
||||
* additional modes are provided as documented in the Mode enum.
|
||||
* <p>
|
||||
* Some of the functions in this class identify normalization boundaries. At a
|
||||
* normalization boundary, the portions of the string before it and starting
|
||||
* from it do not interact and can be handled independently.
|
||||
* <p>
|
||||
* The spanQuickCheckYes() stops at a normalization boundary. When the goal is a
|
||||
* normalized string, then the text before the boundary can be copied, and the
|
||||
* remainder can be processed with normalizeSecondAndAppend().
|
||||
* <p>
|
||||
* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test
|
||||
* whether a character is guaranteed to be at a normalization boundary,
|
||||
* regardless of context. This is used for moving from one normalization
|
||||
* boundary to the next or preceding boundary, and for performing iterative
|
||||
* normalization.
|
||||
* <p>
|
||||
* Iterative normalization is useful when only a small portion of a longer
|
||||
* string needs to be processed. For example, in ICU, iterative normalization is
|
||||
* used by the NormalizationTransliterator (to avoid replacing
|
||||
* already-normalized text) and ucol_nextSortKeyPart() (to process only the
|
||||
* substring for which sort key bytes are computed).
|
||||
* <p>
|
||||
* The set of normalization boundaries returned by these functions may not be
|
||||
* complete: There may be more boundaries that could be returned. Different
|
||||
* functions may return different boundaries.
|
||||
*
|
||||
* @stable ICU 4.4
|
||||
* @author Markus W. Scherer
|
||||
*/
|
||||
public abstract class Normalizer2 {
|
||||
|
||||
/**
|
||||
* Returns a Normalizer2 instance for Unicode NFC normalization. Same as
|
||||
* getInstance(null, "nfc", Mode.COMPOSE). Returns an unmodifiable singleton
|
||||
* instance.
|
||||
*
|
||||
* @return the requested Normalizer2, if successful
|
||||
* @stable ICU 49
|
||||
*/
|
||||
public static Normalizer2 getNFCInstance() {
|
||||
return Norm2AllModes.getNFCInstance().comp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a Normalizer2 instance for Unicode NFD normalization. Same as
|
||||
* getInstance(null, "nfc", Mode.DECOMPOSE). Returns an unmodifiable singleton
|
||||
* instance.
|
||||
*
|
||||
* @return the requested Normalizer2, if successful
|
||||
* @stable ICU 49
|
||||
*/
|
||||
public static Normalizer2 getNFDInstance() {
|
||||
return Norm2AllModes.getNFCInstance().decomp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a Normalizer2 instance for Unicode NFKC normalization. Same as
|
||||
* getInstance(null, "nfkc", Mode.COMPOSE). Returns an unmodifiable singleton
|
||||
* instance.
|
||||
*
|
||||
* @return the requested Normalizer2, if successful
|
||||
* @stable ICU 49
|
||||
*/
|
||||
public static Normalizer2 getNFKCInstance() {
|
||||
return Norm2AllModes.getNFKCInstance().comp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a Normalizer2 instance for Unicode NFKD normalization. Same as
|
||||
* getInstance(null, "nfkc", Mode.DECOMPOSE). Returns an unmodifiable singleton
|
||||
* instance.
|
||||
*
|
||||
* @return the requested Normalizer2, if successful
|
||||
* @stable ICU 49
|
||||
*/
|
||||
public static Normalizer2 getNFKDInstance() {
|
||||
return Norm2AllModes.getNFKCInstance().decomp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the normalized form of the source string.
|
||||
*
|
||||
* @param src source string
|
||||
* @return normalized src
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public String normalize(CharSequence src) {
|
||||
if (src instanceof String) {
|
||||
// Fastpath: Do not construct a new String if the src is a String
|
||||
// and is already normalized.
|
||||
int spanLength = spanQuickCheckYes(src);
|
||||
if (spanLength == src.length()) {
|
||||
return (String) src;
|
||||
}
|
||||
if (spanLength != 0) {
|
||||
StringBuilder sb = new StringBuilder(src.length()).append(src, 0, spanLength);
|
||||
return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
|
||||
}
|
||||
}
|
||||
return normalize(src, new StringBuilder(src.length())).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes the normalized form of the source string to the destination string
|
||||
* (replacing its contents) and returns the destination string. The source and
|
||||
* destination strings must be different objects.
|
||||
*
|
||||
* @param src source string
|
||||
* @param dest destination string; its contents is replaced with normalized src
|
||||
* @return dest
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
|
||||
|
||||
/**
|
||||
* Writes the normalized form of the source string to the destination Appendable
|
||||
* and returns the destination Appendable. The source and destination strings
|
||||
* must be different objects.
|
||||
*
|
||||
* <p>
|
||||
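* Any {@link java.io.IOException} is wrapped into an unchecked throwable. In this
* port, {@code FilteredNormalizer2} rethrows it as {@link InternalError}; the
* upstream ICU4J type {@code com.ibm.icu.util.ICUUncheckedIOException} named by
* the original comment is presumably not available here (TODO confirm).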
|
||||
*
|
||||
* @param src source string
|
||||
* @param dest destination Appendable; gets normalized src appended
|
||||
* @return dest
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
public abstract Appendable normalize(CharSequence src, Appendable dest);
|
||||
|
||||
/**
|
||||
* Appends the normalized form of the second string to the first string (merging
|
||||
* them at the boundary) and returns the first string. The result is normalized
|
||||
* if the first string was normalized. The first and second strings must be
|
||||
* different objects.
|
||||
*
|
||||
* @param first string, should be normalized
|
||||
* @param second string, will be normalized
|
||||
* @return first
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second);
|
||||
|
||||
/**
|
||||
* Appends the second string to the first string (merging them at the boundary)
|
||||
* and returns the first string. The result is normalized if both the strings
|
||||
* were normalized. The first and second strings must be different objects.
|
||||
*
|
||||
* @param first string, should be normalized
|
||||
* @param second string, should be normalized
|
||||
* @return first
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract StringBuilder append(StringBuilder first, CharSequence second);
|
||||
|
||||
/**
|
||||
* Gets the decomposition mapping of c. Roughly equivalent to normalizing the
|
||||
* String form of c on a DECOMPOSE Normalizer2 instance, but much faster, and
|
||||
* except that this function returns null if c does not have a decomposition
|
||||
* mapping in this instance's data. This function is independent of the mode of
|
||||
* the Normalizer2.
|
||||
*
|
||||
* @param c code point
|
||||
* @return c's decomposition mapping, if any; otherwise null
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
public abstract String getDecomposition(int c);
|
||||
|
||||
/**
|
||||
* Gets the combining class of c. The default implementation returns 0 but all
|
||||
* standard implementations return the Unicode Canonical_Combining_Class value.
|
||||
*
|
||||
* @param c code point
|
||||
* @return c's combining class
|
||||
* @stable ICU 49
|
||||
*/
|
||||
public int getCombiningClass(int c) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized. Internally, in cases where the
|
||||
* quickCheck() method would return "maybe" (which is only possible for the two
|
||||
* COMPOSE modes) this method resolves to "yes" or "no" to provide a definitive
|
||||
* result, at the cost of doing more work in those cases.
|
||||
*
|
||||
* @param s input string
|
||||
* @return true if s is normalized
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract boolean isNormalized(CharSequence s);
|
||||
|
||||
/**
|
||||
* Returns the end of the normalized substring of the input string. In other
|
||||
* words, with <code>end=spanQuickCheckYes(s);</code> the substring
|
||||
* <code>s.subSequence(0, end)</code> will pass the quick check with a "yes"
|
||||
* result.
|
||||
* <p>
|
||||
* The returned end index is usually one or more characters before the "no" or
|
||||
* "maybe" character: The end index is at a normalization boundary. (See the
|
||||
* class documentation for more about normalization boundaries.)
|
||||
* <p>
|
||||
* When the goal is a normalized string and most input strings are expected to
|
||||
* be normalized already, then call this method, and if it returns a prefix
|
||||
* shorter than the input string, copy that prefix and use
|
||||
* normalizeSecondAndAppend() for the remainder.
|
||||
*
|
||||
* @param s input string
|
||||
* @return "yes" span end index
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract int spanQuickCheckYes(CharSequence s);
|
||||
|
||||
/**
|
||||
* Tests if the character always has a normalization boundary before it,
|
||||
* regardless of context. If true, then the character does not
|
||||
* normalization-interact with preceding characters. In other words, a string
|
||||
* containing this character can be normalized by processing portions before
|
||||
* this character and starting from this character independently. This is used
|
||||
* for iterative normalization. See the class documentation for details.
|
||||
*
|
||||
* @param c character to test
|
||||
* @return true if c has a normalization boundary before it
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public abstract boolean hasBoundaryBefore(int c);
|
||||
|
||||
/**
 * The only constructor of this class. It exists so that subclass constructors
 * can invoke it, which usually happens implicitly.
 *
 * @internal deprecated This API is ICU internal only.
 */
protected Normalizer2() {
    // Intentionally empty: there is nothing to initialize here.
}
|
||||
}
|
791
sources/main/java/jdk_internal/icu/text/NormalizerBase.java
Normal file
791
sources/main/java/jdk_internal/icu/text/NormalizerBase.java
Normal file
@ -0,0 +1,791 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2000-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.bidi.CharacterIterator;
|
||||
import jdk_internal.bidi.Normalizer;
|
||||
import jdk_internal.icu.impl.Norm2AllModes;
|
||||
|
||||
/**
|
||||
* Unicode Normalization
|
||||
*
|
||||
* <h2>Unicode normalization API</h2>
|
||||
*
|
||||
* <code>normalize</code> transforms Unicode text into an equivalent composed or
|
||||
* decomposed form, allowing for easier sorting and searching of text.
|
||||
* <code>normalize</code> supports the standard normalization forms described in
|
||||
* <a href="http://www.unicode.org/reports/tr15/" target="unicode"> Unicode
|
||||
* Standard Annex #15 — Unicode Normalization Forms</a>.
|
||||
*
|
||||
* Characters with accents or other adornments can be encoded in several
|
||||
* different ways in Unicode. For example, take the character A-acute. In
|
||||
* Unicode, this can be encoded as a single character (the "composed" form):
|
||||
*
|
||||
* <pre>
|
||||
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
|
||||
* </pre>
|
||||
*
|
||||
* or as two separate characters (the "decomposed" form):
|
||||
*
|
||||
* <pre>
|
||||
* 0041 LATIN CAPITAL LETTER A
|
||||
* 0301 COMBINING ACUTE ACCENT
|
||||
* </pre>
|
||||
*
|
||||
* To a user of your program, however, both of these sequences should be treated
|
||||
* as the same "user-level" character "A with acute accent". When you are
|
||||
* searching or comparing text, you must ensure that these two sequences are
|
||||
* treated equivalently. In addition, you must handle characters with more than
|
||||
* one accent. Sometimes the order of a character's combining accents is
|
||||
* significant, while in other cases accent sequences in different orders are
|
||||
* really equivalent.
|
||||
*
|
||||
* Similarly, the string "ffi" can be encoded as three separate letters:
|
||||
*
|
||||
* <pre>
|
||||
* 0066 LATIN SMALL LETTER F
|
||||
* 0066 LATIN SMALL LETTER F
|
||||
* 0069 LATIN SMALL LETTER I
|
||||
* </pre>
|
||||
*
|
||||
* or as the single character
|
||||
*
|
||||
* <pre>
|
||||
* FB03 LATIN SMALL LIGATURE FFI
|
||||
* </pre>
|
||||
*
|
||||
* The ffi ligature is not a distinct semantic character, and strictly speaking
|
||||
* it shouldn't be in Unicode at all, but it was included for compatibility with
|
||||
* existing character sets that already provided it. The Unicode standard
|
||||
* identifies such characters by giving them "compatibility" decompositions into
|
||||
* the corresponding semantic characters. When sorting and searching, you will
|
||||
* often want to use these mappings.
|
||||
*
|
||||
* <code>normalize</code> helps solve these problems by transforming text into
|
||||
* the canonical composed and decomposed forms as shown in the first example
|
||||
* above. In addition, you can have it perform compatibility decompositions so
|
||||
* that you can treat compatibility characters the same as their equivalents.
|
||||
* Finally, <code>normalize</code> rearranges accents into the proper canonical
|
||||
* order, so that you do not have to worry about accent rearrangement on your
|
||||
* own.
|
||||
*
|
||||
* Form FCD, "Fast C or D", is also designed for collation. It allows to work on
|
||||
* strings that are not necessarily normalized with an algorithm (like in
|
||||
* collation) that works under "canonical closure", i.e., it treats precomposed
|
||||
* characters and their decomposed equivalents the same.
|
||||
*
|
||||
* It is not a normalization form because it does not provide for uniqueness of
|
||||
* representation. Multiple strings may be canonically equivalent (their NFDs
|
||||
* are identical) and may all conform to FCD without being identical themselves.
|
||||
*
|
||||
* The form is defined such that the "raw decomposition", the recursive
|
||||
* canonical decomposition of each character, results in a string that is
|
||||
* canonically ordered. This means that precomposed characters are allowed for
|
||||
* as long as their decompositions do not need canonical reordering.
|
||||
*
|
||||
* Its advantage for a process like collation is that all NFD and most NFC texts
|
||||
* - and many unnormalized texts - already conform to FCD and do not need to be
|
||||
* normalized (NFD) for such a process. The FCD quick check will return YES for
|
||||
* most strings in practice.
|
||||
*
|
||||
* normalize(FCD) may be implemented with NFD.
|
||||
*
|
||||
* For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence
|
||||
* in Applications): http://www.unicode.org/notes/tn5/#FCD
|
||||
*
|
||||
* ICU collation performs either NFD or FCD normalization automatically if
|
||||
* normalization is turned on for the collator object. Beyond collation and
|
||||
* string search, normalized strings may be useful for string equivalence
|
||||
* comparisons, transliteration/transcription, unique representations, etc.
|
||||
*
|
||||
* The W3C generally recommends to exchange texts in NFC. Note also that most
|
||||
* legacy character encodings use only precomposed forms and often do not encode
|
||||
* any combining marks by themselves. For conversion to such character encodings
|
||||
* the Unicode text needs to be normalized to NFC. For more usage examples, see
|
||||
* the Unicode Standard Annex.
|
||||
*
|
||||
* Note: The Normalizer class also provides API for iterative normalization.
|
||||
* While the setIndex() and getIndex() refer to indices in the underlying
|
||||
* Unicode input text, the next() and previous() methods iterate through
|
||||
* characters in the normalized output. This means that there is not necessarily
|
||||
* a one-to-one correspondence between characters returned by next() and
|
||||
* previous() and the indices passed to and returned from setIndex() and
|
||||
* getIndex(). It is for this reason that Normalizer does not implement the
|
||||
* CharacterIterator interface.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
// Original filename in ICU4J: Normalizer.java
|
||||
public final class NormalizerBase implements Cloneable {
|
||||
|
||||
// The input text and our position in it
|
||||
private UCharacterIterator text;
|
||||
private Normalizer2 norm2;
|
||||
private Mode mode;
|
||||
private int options;
|
||||
|
||||
// The normalization buffer is the result of normalization
|
||||
// of the source in [currentIndex..nextIndex] .
|
||||
private int currentIndex;
|
||||
private int nextIndex;
|
||||
|
||||
// A buffer for holding intermediate results
|
||||
private StringBuilder buffer;
|
||||
private int bufferPos;
|
||||
|
||||
// Helper classes to defer loading of normalization data.
|
||||
private static final class ModeImpl {
|
||||
private ModeImpl(Normalizer2 n2) {
|
||||
normalizer2 = n2;
|
||||
}
|
||||
|
||||
private final Normalizer2 normalizer2;
|
||||
}
|
||||
|
||||
private static final class NFDModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
|
||||
}
|
||||
|
||||
private static final class NFKDModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
|
||||
}
|
||||
|
||||
private static final class NFCModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
|
||||
}
|
||||
|
||||
private static final class NFKCModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
|
||||
}
|
||||
|
||||
private static final class Unicode32 {
|
||||
private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
|
||||
}
|
||||
|
||||
private static final class NFD32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFDInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
private static final class NFKD32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFKDInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
private static final class NFC32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFCInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
private static final class NFKC32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFKCInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Options bit set value to select Unicode 3.2 normalization (except
|
||||
* NormalizationCorrections). At most one Unicode version can be selected at a
|
||||
* time.
|
||||
*
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final int UNICODE_3_2 = 0x20;
|
||||
|
||||
public static final int UNICODE_3_2_0_ORIGINAL = UNICODE_3_2;
|
||||
|
||||
/*
|
||||
* Default option for the latest Unicode normalization. This option is provided
|
||||
* mainly for testing. The value zero means that normalization is done with the
|
||||
* fixes for - Corrigendum 4 (Five CJK Canonical Mapping Errors) - Corrigendum 5
|
||||
* (Normalization Idempotency)
|
||||
*/
|
||||
public static final int UNICODE_LATEST = 0x00;
|
||||
|
||||
/**
|
||||
* Constant indicating that the end of the iteration has been reached. This is
|
||||
* guaranteed to have the same value as {@link UCharacterIterator#DONE}.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final int DONE = UCharacterIterator.DONE;
|
||||
|
||||
/**
|
||||
* Constants for normalization modes.
|
||||
* <p>
|
||||
* The Mode class is not intended for public subclassing. Only the Mode
|
||||
* constants provided by the Normalizer class should be used, and any fields or
|
||||
* methods should not be called or overridden by users.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public abstract static class Mode {
|
||||
|
||||
/**
|
||||
* Sole constructor
|
||||
*
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected Mode() {
|
||||
}
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected abstract Normalizer2 getNormalizer2(int options);
|
||||
}
|
||||
|
||||
private static Mode toMode(Normalizer.Form form) {
|
||||
switch (form) {
|
||||
case NFC:
|
||||
return NFC;
|
||||
case NFD:
|
||||
return NFD;
|
||||
case NFKC:
|
||||
return NFKC;
|
||||
case NFKD:
|
||||
return NFKD;
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("Unexpected normalization form: " + form);
|
||||
}
|
||||
|
||||
private static final class NONEMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return Norm2AllModes.NOOP_NORMALIZER2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFDMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFKDMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFKD32ModeImpl.INSTANCE.normalizer2
|
||||
: NFKDModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFCMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFKCMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFKC32ModeImpl.INSTANCE.normalizer2
|
||||
: NFKCModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* No decomposition/composition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NONE = new NONEMode();
|
||||
|
||||
/**
|
||||
* Canonical decomposition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NFD = new NFDMode();
|
||||
|
||||
/**
|
||||
* Compatibility decomposition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NFKD = new NFKDMode();
|
||||
|
||||
/**
|
||||
* Canonical decomposition followed by canonical composition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NFC = new NFCMode();
|
||||
|
||||
public static final Mode NFKC = new NFKCMode();
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Iterator constructors
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
 * Builds a {@code NormalizerBase} that iterates over the normalized form of a
 * string.
 * <p>
 * The {@code opt} argument selects which optional {@code NormalizerBase}
 * features are enabled for this object.
 *
 * @param str  The string to be normalized. The normalization will start at the
 *             beginning of the string.
 * @param mode The normalization mode.
 * @param opt  Any optional features to be enabled. Currently the only available
 *             option is {@link #UNICODE_3_2}. Pass 0 for the default behavior
 *             corresponding to one of the standard Unicode Normalization Forms.
 * @stable ICU 2.6
 */
public NormalizerBase(String str, Mode mode, int opt) {
    text = UCharacterIterator.getInstance(str);
    this.mode = mode;
    options = opt;
    // The normalizer is resolved once here from the mode and the option bits.
    norm2 = mode.getNormalizer2(opt);
    buffer = new StringBuilder();
}
|
||||
|
||||
/**
 * Builds a {@code NormalizerBase} over a string with no option bits set.
 *
 * @param str  The string to be normalized.
 * @param mode The normalization mode.
 */
public NormalizerBase(String str, Mode mode) {
    this(str, mode, UNICODE_LATEST);
}
|
||||
|
||||
/**
|
||||
* Creates a new {@code NormalizerBase} object for iterating over the normalized
|
||||
* form of the given text.
|
||||
* <p>
|
||||
*
|
||||
* @param iter The input text to be normalized. The normalization will start at
|
||||
* the beginning of the string.
|
||||
*
|
||||
* @param mode The normalization mode.
|
||||
*
|
||||
* @param opt Any optional features to be enabled. Currently the only available
|
||||
* option is {@link #UNICODE_3_2}. If you want the default behavior
|
||||
* corresponding to one of the standard Unicode Normalization Forms,
|
||||
* use 0 for this argument.
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
|
||||
this.text = UCharacterIterator.getInstance((CharacterIterator) iter.clone());
|
||||
this.mode = mode;
|
||||
this.options = opt;
|
||||
norm2 = mode.getNormalizer2(opt);
|
||||
buffer = new StringBuilder();
|
||||
}
|
||||
|
||||
/**
 * Builds a {@code NormalizerBase} over the given text with no option bits set.
 *
 * @param iter The input text to be normalized.
 * @param mode The normalization mode.
 */
public NormalizerBase(CharacterIterator iter, Mode mode) {
    this(iter, mode, UNICODE_LATEST);
}
|
||||
|
||||
/**
|
||||
* Clones this {@code NormalizerBase} object. All properties of this object are
|
||||
* duplicated in the new object, including the cloning of any
|
||||
* {@link CharacterIterator} that was passed in to the constructor or to
|
||||
* {@link #setText(CharacterIterator) setText}. However, the text storage
|
||||
* underlying the {@code CharacterIterator} is not duplicated unless the
|
||||
* iterator's {@code clone} method does so.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public Object clone() {
|
||||
try {
|
||||
NormalizerBase copy = (NormalizerBase) super.clone();
|
||||
copy.text = (UCharacterIterator) text.clone();
|
||||
copy.mode = mode;
|
||||
copy.options = options;
|
||||
copy.norm2 = norm2;
|
||||
copy.buffer = new StringBuilder(buffer);
|
||||
copy.bufferPos = bufferPos;
|
||||
copy.currentIndex = currentIndex;
|
||||
copy.nextIndex = nextIndex;
|
||||
return copy;
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new InternalError(e.toString(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes a {@code String} using the given normalization operation.
|
||||
* <p>
|
||||
* The {@code options} parameter specifies which optional {@code NormalizerBase}
|
||||
* features are to be enabled for this operation. Currently the only available
|
||||
* option is {@link #UNICODE_3_2}. If you want the default behavior
|
||||
* corresponding to one of the standard Unicode Normalization Forms, use 0 for
|
||||
* this argument.
|
||||
* <p>
|
||||
*
|
||||
* @param str the input string to be normalized.
|
||||
* @param mode the normalization mode
|
||||
* @param options the optional features to be enabled.
|
||||
* @return String the normalized string
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static String normalize(String str, Mode mode, int options) {
|
||||
return mode.getNormalizer2(options).normalize(str);
|
||||
}
|
||||
|
||||
public static String normalize(String str, Normalizer.Form form) {
|
||||
return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST);
|
||||
}
|
||||
|
||||
public static String normalize(String str, Normalizer.Form form, int options) {
|
||||
return NormalizerBase.normalize(str, toMode(form), options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test if a string is in a given normalization form. This is semantically
|
||||
* equivalent to source.equals(normalize(source, mode)).
|
||||
*
|
||||
* Unlike quickCheck(), this function returns a definitive result, never a
|
||||
* "maybe". For NFD, NFKD, and FCD, both functions work exactly the same. For
|
||||
* NFC and NFKC where quickCheck may return "maybe", this function will perform
|
||||
* further tests to arrive at a true/false result.
|
||||
*
|
||||
* @param str the input string to be checked to see if it is normalized
|
||||
* @param mode the normalization mode
|
||||
* @param options Options for use with exclusion set and tailored Normalization
|
||||
* The only option that is currently recognized is UNICODE_3_2
|
||||
* @see #isNormalized
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static boolean isNormalized(String str, Mode mode, int options) {
|
||||
return mode.getNormalizer2(options).isNormalized(str);
|
||||
}
|
||||
|
||||
public static boolean isNormalized(String str, Normalizer.Form form) {
|
||||
return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST);
|
||||
}
|
||||
|
||||
public static boolean isNormalized(String str, Normalizer.Form form, int options) {
|
||||
return NormalizerBase.isNormalized(str, toMode(form), options);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Iteration API
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return the current character in the normalized text.
|
||||
*
|
||||
* @return The codepoint as an int
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int current() {
|
||||
if (bufferPos < buffer.length() || nextNormalize()) {
|
||||
return buffer.codePointAt(bufferPos);
|
||||
} else {
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the next character in the normalized text and advance the iteration
|
||||
* position by one. If the end of the text has already been reached,
|
||||
* {@link #DONE} is returned.
|
||||
*
|
||||
* @return The codepoint as an int
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int next() {
|
||||
if (bufferPos < buffer.length() || nextNormalize()) {
|
||||
int c = buffer.codePointAt(bufferPos);
|
||||
bufferPos += Character.charCount(c);
|
||||
return c;
|
||||
} else {
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the previous character in the normalized text and decrement the
|
||||
* iteration position by one. If the beginning of the text has already been
|
||||
* reached, {@link #DONE} is returned.
|
||||
*
|
||||
* @return The codepoint as an int
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int previous() {
|
||||
if (bufferPos > 0 || previousNormalize()) {
|
||||
int c = buffer.codePointBefore(bufferPos);
|
||||
bufferPos -= Character.charCount(c);
|
||||
return c;
|
||||
} else {
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset the index to the beginning of the text. This is equivalent to
|
||||
* setIndexOnly(startIndex)).
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void reset() {
|
||||
text.setIndex(0);
|
||||
currentIndex = nextIndex = 0;
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the iteration position in the input text that is being normalized,
|
||||
* without any immediate normalization. After setIndexOnly(), getIndex() will
|
||||
* return the same index that is specified here.
|
||||
*
|
||||
* @param index the desired index in the input text.
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setIndexOnly(int index) {
|
||||
text.setIndex(index); // validates index
|
||||
currentIndex = nextIndex = index;
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the iteration position in the input text that is being normalized and
|
||||
* return the first normalized character at that position.
|
||||
* <p>
|
||||
* <b>Note:</b> This method sets the position in the <em>input</em> text, while
|
||||
* {@link #next} and {@link #previous} iterate through characters in the
|
||||
* normalized <em>output</em>. This means that there is not necessarily a
|
||||
* one-to-one correspondence between characters returned by {@code next} and
|
||||
* {@code previous} and the indices passed to and returned from {@code setIndex}
|
||||
* and {@link #getIndex}.
|
||||
* <p>
|
||||
*
|
||||
* @param index the desired index in the input text.
|
||||
*
|
||||
* @return the first normalized character that is the result of iterating
|
||||
* forward starting at the given index.
|
||||
*
|
||||
* @throws IllegalArgumentException if the given index is less than
|
||||
* {@link #getBeginIndex} or greater than
|
||||
* {@link #getEndIndex}. deprecated ICU 3.2
|
||||
* @obsolete ICU 3.2
|
||||
*/
|
||||
public int setIndex(int index) {
|
||||
setIndexOnly(index);
|
||||
return current();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the start of the input text. This is the begin index of
|
||||
* the {@code CharacterIterator} or the start (i.e. 0) of the {@code String}
|
||||
* over which this {@code NormalizerBase} is iterating
|
||||
*
|
||||
* @deprecated ICU 2.2. Use startIndex() instead.
|
||||
* @return The codepoint as an int
|
||||
* @see #startIndex
|
||||
*/
|
||||
@Deprecated
|
||||
public int getBeginIndex() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the end of the input text. This is the end index of the
|
||||
* {@code CharacterIterator} or the length of the {@code String} over which this
|
||||
* {@code NormalizerBase} is iterating
|
||||
*
|
||||
* @deprecated ICU 2.2. Use endIndex() instead.
|
||||
* @return The codepoint as an int
|
||||
* @see #endIndex
|
||||
*/
|
||||
@Deprecated
|
||||
public int getEndIndex() {
|
||||
return endIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the current iteration position in the input text that is being
|
||||
* normalized. This method is useful in applications such as searching, where
|
||||
* you need to be able to determine the position in the input text that
|
||||
* corresponds to a given normalized output character.
|
||||
* <p>
|
||||
* <b>Note:</b> This method sets the position in the <em>input</em>, while
|
||||
* {@link #next} and {@link #previous} iterate through characters in the
|
||||
* <em>output</em>. This means that there is not necessarily a one-to-one
|
||||
* correspondence between characters returned by {@code next} and
|
||||
* {@code previous} and the indices passed to and returned from {@code setIndex}
|
||||
* and {@link #getIndex}.
|
||||
*
|
||||
* @return The current iteration position
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int getIndex() {
|
||||
if (bufferPos < buffer.length()) {
|
||||
return currentIndex;
|
||||
} else {
|
||||
return nextIndex;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the end of the input text. This is the end index of the
|
||||
* {@code CharacterIterator} or the length of the {@code String} over which this
|
||||
* {@code NormalizerBase} is iterating
|
||||
*
|
||||
* @return The current iteration position
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int endIndex() {
|
||||
return text.getLength();
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Iterator attributes
|
||||
// -------------------------------------------------------------------------
|
||||
/**
|
||||
* Set the normalization mode for this object.
|
||||
* <p>
|
||||
* <b>Note:</b>If the normalization mode is changed while iterating over a
|
||||
* string, calls to {@link #next} and {@link #previous} may return previously
|
||||
* buffers characters in the old normalization mode until the iteration is able
|
||||
* to re-sync at the next base character. It is safest to call {@link #setText
|
||||
* setText()}, {@link #first}, {@link #last}, etc. after calling
|
||||
* {@code setMode}.
|
||||
* <p>
|
||||
*
|
||||
* @param newMode the new mode for this {@code NormalizerBase}. The supported
|
||||
* modes are:
|
||||
* <ul>
|
||||
* <li>{@link #NFC} - Unicode canonical decompositiion followed
|
||||
* by canonical composition.
|
||||
* <li>{@link #NFKC} - Unicode compatibility decompositiion
|
||||
* follwed by canonical composition.
|
||||
* <li>{@link #NFD} - Unicode canonical decomposition
|
||||
* <li>{@link #NFKD} - Unicode compatibility decomposition.
|
||||
* <li>{@link #NONE} - Do nothing but return characters from the
|
||||
* underlying input text.
|
||||
* </ul>
|
||||
*
|
||||
* @see #getMode
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setMode(Mode newMode) {
|
||||
mode = newMode;
|
||||
norm2 = mode.getNormalizer2(options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the basic operation performed by this {@code NormalizerBase}
|
||||
*
|
||||
* @see #setMode
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public Mode getMode() {
|
||||
return mode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the input text over which this {@code NormalizerBase} will iterate. The
|
||||
* iteration position is set to the beginning of the input text.
|
||||
*
|
||||
* @param newText The new string to be normalized.
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setText(String newText) {
|
||||
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
|
||||
if (newIter == null) {
|
||||
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
||||
}
|
||||
text = newIter;
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the input text over which this {@code NormalizerBase} will iterate. The
|
||||
* iteration position is set to the beginning of the input text.
|
||||
*
|
||||
* @param newText The new string to be normalized.
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setText(CharacterIterator newText) {
|
||||
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
|
||||
if (newIter == null) {
|
||||
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
||||
}
|
||||
text = newIter;
|
||||
currentIndex = nextIndex = 0;
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
private void clearBuffer() {
|
||||
buffer.setLength(0);
|
||||
bufferPos = 0;
|
||||
}
|
||||
|
||||
private boolean nextNormalize() {
|
||||
clearBuffer();
|
||||
currentIndex = nextIndex;
|
||||
text.setIndex(nextIndex);
|
||||
// Skip at least one character so we make progress.
|
||||
int c = text.nextCodePoint();
|
||||
if (c < 0) {
|
||||
return false;
|
||||
}
|
||||
StringBuilder segment = new StringBuilder().appendCodePoint(c);
|
||||
while ((c = text.nextCodePoint()) >= 0) {
|
||||
if (norm2.hasBoundaryBefore(c)) {
|
||||
text.moveCodePointIndex(-1);
|
||||
break;
|
||||
}
|
||||
segment.appendCodePoint(c);
|
||||
}
|
||||
nextIndex = text.getIndex();
|
||||
norm2.normalize(segment, buffer);
|
||||
return buffer.length() != 0;
|
||||
}
|
||||
|
||||
private boolean previousNormalize() {
|
||||
clearBuffer();
|
||||
nextIndex = currentIndex;
|
||||
text.setIndex(currentIndex);
|
||||
StringBuilder segment = new StringBuilder();
|
||||
int c;
|
||||
while ((c = text.previousCodePoint()) >= 0) {
|
||||
if (c <= 0xffff) {
|
||||
segment.insert(0, (char) c);
|
||||
} else {
|
||||
segment.insert(0, Character.toChars(c));
|
||||
}
|
||||
if (norm2.hasBoundaryBefore(c)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
currentIndex = text.getIndex();
|
||||
norm2.normalize(segment, buffer);
|
||||
bufferPos = buffer.length();
|
||||
return buffer.length() != 0;
|
||||
}
|
||||
|
||||
}
|
124
sources/main/java/jdk_internal/icu/text/Replaceable.java
Normal file
124
sources/main/java/jdk_internal/icu/text/Replaceable.java
Normal file
@ -0,0 +1,124 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
/**
|
||||
* <code>Replaceable</code> is an interface representing a string of characters
|
||||
* that supports the replacement of a range of itself with a new string of
|
||||
* characters. It is used by APIs that change a piece of text while retaining
|
||||
* metadata. Metadata is data other than the Unicode characters returned by
|
||||
* char32At(). One example of metadata is style attributes; another is an edit
|
||||
* history, marking each character with an author and revision number.
|
||||
*
|
||||
* <p>
|
||||
* An implicit aspect of the <code>Replaceable</code> API is that during a
|
||||
* replace operation, new characters take on the metadata of the old characters.
|
||||
* For example, if the string "the <b>bold</b> font" has range (4, 8) replaced
|
||||
* with "strong", then it becomes "the <b>strong</b> font".
|
||||
*
|
||||
* <p>
|
||||
* <code>Replaceable</code> specifies ranges using a start offset and a limit
|
||||
* offset. The range of characters thus specified includes the characters at
|
||||
* offset start..limit-1. That is, the start offset is inclusive, and the limit
|
||||
* offset is exclusive.
|
||||
*
|
||||
* <p>
|
||||
* <code>Replaceable</code> also includes API to access characters in the
|
||||
* string: <code>length()</code>, <code>charAt()</code>,
|
||||
* <code>char32At()</code>, and <code>extractBetween()</code>.
|
||||
*
|
||||
* <p>
|
||||
* For a subclass to support metadata, typical behavior of
|
||||
* <code>replace()</code> is the following:
|
||||
* <ul>
|
||||
* <li>Set the metadata of the new text to the metadata of the first character
|
||||
* replaced</li>
|
||||
* <li>If no characters are replaced, use the metadata of the previous
|
||||
* character</li>
|
||||
* <li>If there is no previous character (i.e. start == 0), use the following
|
||||
* character</li>
|
||||
* <li>If there is no following character (i.e. the replaceable was empty), use
|
||||
* default metadata</li>
|
||||
* <li>If the code point U+FFFF is seen, it should be interpreted as a special
|
||||
* marker having no metadata</li>
|
||||
* </ul>
|
||||
* If this is not the behavior, the subclass should document any differences.
|
||||
*
|
||||
* <p>
|
||||
* Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public interface Replaceable {

    /**
     * Reports how many 16-bit code units the text holds.
     *
     * @return the count of 16-bit code units in the text
     * @stable ICU 2.0
     */
    int length();

    /**
     * Fetches a single 16-bit code unit from the text.
     *
     * @param offset position of the code unit, from 0 up to and including
     *               <code>length()</code>-1
     * @return the 16-bit code unit found at {@code offset}
     * @stable ICU 2.0
     */
    char charAt(int offset);

    /**
     * Copies a range of this object's characters into a character array.
     * Copying starts with the character at index <code>srcStart</code> and ends
     * with the one at index <code>srcLimit-1</code>, so
     * <code>srcLimit-srcStart</code> characters are transferred. They are
     * written into <code>dst</code> beginning at index <code>dstStart</code>
     * and ending at index <code>dstStart + (srcLimit-srcStart) - 1</code>.
     *
     * @param srcStart index of the first character to copy (inclusive);
     *                 {@code 0 <= srcStart <= srcLimit}.
     * @param srcLimit index just past the last character to copy (exclusive);
     *                 {@code srcStart <= srcLimit <= length()}.
     * @param dst      the array that receives the characters.
     * @param dstStart index in {@code dst} at which writing begins.
     * @stable ICU 2.0
     */
    void getChars(int srcStart, int srcLimit, char[] dst, int dstStart);
}
|
121
sources/main/java/jdk_internal/icu/text/ReplaceableString.java
Normal file
121
sources/main/java/jdk_internal/icu/text/ReplaceableString.java
Normal file
@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2009, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
/**
|
||||
* <code>ReplaceableString</code> is an adapter class that implements the
|
||||
* <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
|
||||
*
|
||||
* <p>
|
||||
* <em>Note:</em> This class does not support attributes and is not intended for
|
||||
* general use. Most clients will need to implement {@link Replaceable} in their
|
||||
* text representation class.
|
||||
*
|
||||
* <p>
|
||||
* Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @see Replaceable
|
||||
* @author Alan Liu
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public class ReplaceableString implements Replaceable {
|
||||
|
||||
private StringBuffer buf;
|
||||
|
||||
/**
|
||||
* Construct a new object with the given initial contents.
|
||||
*
|
||||
* @param str initial contents
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public ReplaceableString(String str) {
|
||||
buf = new StringBuffer(str);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new object using <code>buf</code> for internal storage. The
|
||||
* contents of <code>buf</code> at the time of construction are used as the
|
||||
* initial contents. <em>Note! Modifications to <code>buf</code> will modify
|
||||
* this object, and vice versa.</em>
|
||||
*
|
||||
* @param buf object to be used as internal storage
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public ReplaceableString(StringBuffer buf) {
|
||||
this.buf = buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of characters contained in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
*
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int length() {
|
||||
return buf.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the character at the given position in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
*
|
||||
* @param offset offset into the contents, from 0 to <code>length()</code> - 1
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public char charAt(int offset) {
|
||||
return buf.charAt(offset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies characters from this object into the destination character array. The
|
||||
* first character to be copied is at index <code>srcStart</code>; the last
|
||||
* character to be copied is at index <code>srcLimit-1</code> (thus the total
|
||||
* number of characters to be copied is <code>srcLimit-srcStart</code>). The
|
||||
* characters are copied into the subarray of <code>dst</code> starting at index
|
||||
* <code>dstStart</code> and ending at index
|
||||
* <code>dstStart + (srcLimit-srcStart) - 1</code>.
|
||||
*
|
||||
* @param srcStart the beginning index to copy, inclusive;
|
||||
* {@code 0 <= start <= limit}.
|
||||
* @param srcLimit the ending index to copy, exclusive;
|
||||
* {@code start <= limit <= length()}.
|
||||
* @param dst the destination array.
|
||||
* @param dstStart the start offset in the destination array.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
|
||||
if (srcStart != srcLimit) {
|
||||
buf.getChars(srcStart, srcLimit, dst, dstStart);
|
||||
}
|
||||
}
|
||||
}
|
493
sources/main/java/jdk_internal/icu/text/StringPrep.java
Normal file
493
sources/main/java/jdk_internal/icu/text/StringPrep.java
Normal file
@ -0,0 +1,493 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2004, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
//
|
||||
// CHANGELOG
|
||||
// 2005-05-19 Edward Wang
|
||||
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
|
||||
// - move from package com.ibm.icu.text to package sun.net.idn
|
||||
// - use ParseException instead of StringPrepParseException
|
||||
// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
|
||||
// - remove all @deprecated tag to make compiler happy
|
||||
// 2007-08-14 Martin Buchholz
|
||||
// - remove redundant casts
|
||||
//
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import jdk_internal.bidi.Normalizer;
|
||||
import jdk_internal.bidi.ParseException;
|
||||
import jdk_internal.bidi.SunNormalizer;
|
||||
import jdk_internal.icu.impl.CharTrie;
|
||||
import jdk_internal.icu.impl.StringPrepDataReader;
|
||||
import jdk_internal.icu.impl.Trie;
|
||||
import jdk_internal.icu.lang.UCharacter;
|
||||
import jdk_internal.icu.lang.UCharacterDirection;
|
||||
import jdk_internal.icu.util.VersionInfo;
|
||||
|
||||
/**
|
||||
* StringPrep API implements the StringPrep framework as described by
|
||||
* <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>. StringPrep
|
||||
* prepares Unicode strings for use in network protocols. Profiles of StringPrep
|
||||
* are set of rules and data according to which the Unicode Strings are
|
||||
* prepared. Each profile contains tables which describe how a code point
|
||||
* should be treated. The tables are broadly classified into
|
||||
* <ul>
|
||||
* <li>Unassigned Table: Contains code points that are unassigned in the Unicode
|
||||
* Version supported by StringPrep. Currently RFC 3454 supports Unicode 3.2.
|
||||
* </li>
|
||||
* <li>Prohibited Table: Contains code points that are prohibited from the output
|
||||
* of the StringPrep processing function.</li>
|
||||
* <li>Mapping Table: Contains code points that are deleted from the output or
|
||||
* case mapped.</li>
|
||||
* </ul>
|
||||
*
|
||||
* The procedure for preparing Unicode strings:
|
||||
* <ol>
|
||||
* <li>Map: For each character in the input, check if it has a mapping and, if
|
||||
* so, replace it with its mapping.</li>
|
||||
* <li>Normalize: Possibly normalize the result of step 1 using Unicode
|
||||
* normalization.</li>
|
||||
* <li>Prohibit: Check for any characters that are not allowed in the output. If
|
||||
* any are found, return an error.</li>
|
||||
* <li>Check bidi: Possibly check for right-to-left characters, and if any are
|
||||
* found, make sure that the whole string satisfies the requirements for
|
||||
* bidirectional strings. If the string does not satisfy the requirements for
|
||||
* bidirectional strings, return an error.</li>
|
||||
* </ol>
|
||||
*
|
||||
* @author Ram Viswanadha
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public final class StringPrep {
|
||||
/**
|
||||
* Option to prohibit processing of unassigned code points in the input
|
||||
*
|
||||
* @see #prepare
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public static final int DEFAULT = 0x0000;
|
||||
|
||||
/**
|
||||
* Option to allow processing of unassigned code points in the input
|
||||
*
|
||||
* @see #prepare
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public static final int ALLOW_UNASSIGNED = 0x0001;
|
||||
|
||||
private static final int UNASSIGNED = 0x0000;
|
||||
private static final int MAP = 0x0001;
|
||||
private static final int PROHIBITED = 0x0002;
|
||||
private static final int DELETE = 0x0003;
|
||||
private static final int TYPE_LIMIT = 0x0004;
|
||||
|
||||
private static final int NORMALIZATION_ON = 0x0001;
|
||||
private static final int CHECK_BIDI_ON = 0x0002;
|
||||
|
||||
private static final int TYPE_THRESHOLD = 0xFFF0;
|
||||
private static final int MAX_INDEX_VALUE = 0x3FBF; /* 16139 */
|
||||
private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
|
||||
|
||||
/* indexes[] value names */
|
||||
private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
|
||||
private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */
|
||||
private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /*
|
||||
* The index of Unicode version of last entry in
|
||||
* NormalizationCorrections.txt
|
||||
*/
|
||||
private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /*
|
||||
* The starting index of 1 UChar mapping index in the
|
||||
* mapping data array
|
||||
*/
|
||||
private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /*
|
||||
* The starting index of 2 UChars mapping index in
|
||||
* the mapping data array
|
||||
*/
|
||||
private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;
|
||||
private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;
|
||||
private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
|
||||
private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
|
||||
|
||||
/**
|
||||
* Default buffer size of datafile
|
||||
*/
|
||||
private static final int DATA_BUFFER_SIZE = 25000;
|
||||
|
||||
/* Wrappers for Trie implementations */
|
||||
private static final class StringPrepTrieImpl implements Trie.DataManipulate {
|
||||
private CharTrie sprepTrie = null;
|
||||
|
||||
/**
|
||||
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's data the
|
||||
* index array offset of the indexes for that lead surrogate.
|
||||
*
|
||||
* @param property data value for a surrogate from the trie, including the
|
||||
* folding offset
|
||||
* @return data offset or 0 if there is no data for the lead surrogate
|
||||
*/
|
||||
public int getFoldingOffset(int value) {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
||||
// CharTrie implementation for reading the trie data
|
||||
private StringPrepTrieImpl sprepTrieImpl;
|
||||
// Indexes read from the data file
|
||||
private int[] indexes;
|
||||
// mapping data read from the data file
|
||||
private char[] mappingData;
|
||||
// format version of the data file
|
||||
private byte[] formatVersion;
|
||||
// the version of Unicode supported by the data file
|
||||
private VersionInfo sprepUniVer;
|
||||
// the Unicode version of last entry in the
|
||||
// NormalizationCorrections.txt file if normalization
|
||||
// is turned on
|
||||
private VersionInfo normCorrVer;
|
||||
// Option to turn on Normalization
|
||||
private boolean doNFKC;
|
||||
// Option to turn on checking for BiDi rules
|
||||
private boolean checkBiDi;
|
||||
|
||||
private char getCodePointValue(int ch) {
|
||||
return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
|
||||
}
|
||||
|
||||
private static VersionInfo getVersionInfo(int comp) {
|
||||
int micro = comp & 0xFF;
|
||||
int milli = (comp >> 8) & 0xFF;
|
||||
int minor = (comp >> 16) & 0xFF;
|
||||
int major = (comp >> 24) & 0xFF;
|
||||
return VersionInfo.getInstance(major, minor, milli, micro);
|
||||
}
|
||||
|
||||
private static VersionInfo getVersionInfo(byte[] version) {
|
||||
if (version.length != 4) {
|
||||
return null;
|
||||
}
|
||||
return VersionInfo.getInstance((int) version[0], (int) version[1], (int) version[2], (int) version[3]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an StringPrep object after reading the input stream. The object does
|
||||
* not hold a reference to the input steam, so the stream can be closed after
|
||||
* the method returns.
|
||||
*
|
||||
* @param inputStream The stream for reading the StringPrep profile binarySun
|
||||
* @throws IOException
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public StringPrep(InputStream inputStream) throws IOException {
|
||||
|
||||
BufferedInputStream b = new BufferedInputStream(inputStream, DATA_BUFFER_SIZE);
|
||||
|
||||
StringPrepDataReader reader = new StringPrepDataReader(b);
|
||||
|
||||
// read the indexes
|
||||
indexes = reader.readIndexes(INDEX_TOP);
|
||||
|
||||
byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
|
||||
|
||||
// indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
|
||||
mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE] / 2];
|
||||
// load the rest of the data and initialize the data members
|
||||
reader.read(sprepBytes, mappingData);
|
||||
|
||||
sprepTrieImpl = new StringPrepTrieImpl();
|
||||
sprepTrieImpl.sprepTrie = new CharTrie(new ByteArrayInputStream(sprepBytes), sprepTrieImpl);
|
||||
|
||||
// get the data format version
|
||||
formatVersion = reader.getDataFormatVersion();
|
||||
|
||||
// get the options
|
||||
doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
|
||||
checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
|
||||
sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
|
||||
normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
|
||||
VersionInfo normUniVer = UCharacter.getUnicodeVersion();
|
||||
if (normUniVer.compareTo(sprepUniVer) < 0 && /*
|
||||
* the Unicode version of SPREP file must be less than the
|
||||
* Unicode Vesion of the normalization data
|
||||
*/
|
||||
normUniVer.compareTo(normCorrVer) < 0
|
||||
&& /*
|
||||
* the Unicode version of the NormalizationCorrections.txt file should be less
|
||||
* than the Unicode Vesion of the normalization data
|
||||
*/
|
||||
((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on */
|
||||
) {
|
||||
throw new IOException("Normalization Correction version not supported");
|
||||
}
|
||||
b.close();
|
||||
}
|
||||
|
||||
private static final class Values {
|
||||
boolean isIndex;
|
||||
int value;
|
||||
int type;
|
||||
|
||||
public void reset() {
|
||||
isIndex = false;
|
||||
value = 0;
|
||||
type = -1;
|
||||
}
|
||||
}
|
||||
|
||||
private static final void getValues(char trieWord, Values values) {
|
||||
values.reset();
|
||||
if (trieWord == 0) {
|
||||
/*
|
||||
* Initial value stored in the mapping table just return TYPE_LIMIT .. so that
|
||||
* the source codepoint is copied to the destination
|
||||
*/
|
||||
values.type = TYPE_LIMIT;
|
||||
} else if (trieWord >= TYPE_THRESHOLD) {
|
||||
values.type = (trieWord - TYPE_THRESHOLD);
|
||||
} else {
|
||||
/* get the type */
|
||||
values.type = MAP;
|
||||
/* ascertain if the value is index or delta */
|
||||
if ((trieWord & 0x02) > 0) {
|
||||
values.isIndex = true;
|
||||
values.value = trieWord >> 2; // mask off the lower 2 bits and shift
|
||||
|
||||
} else {
|
||||
values.isIndex = false;
|
||||
values.value = (trieWord << 16) >> 16;
|
||||
values.value = (values.value >> 2);
|
||||
|
||||
}
|
||||
|
||||
if ((trieWord >> 2) == MAX_INDEX_VALUE) {
|
||||
values.type = DELETE;
|
||||
values.isIndex = false;
|
||||
values.value = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private StringBuffer map(UCharacterIterator iter, int options) throws ParseException {
|
||||
|
||||
Values val = new Values();
|
||||
char result = 0;
|
||||
int ch = UCharacterIterator.DONE;
|
||||
StringBuffer dest = new StringBuffer();
|
||||
boolean allowUnassigned = ((options & ALLOW_UNASSIGNED) > 0);
|
||||
|
||||
while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
|
||||
|
||||
result = getCodePointValue(ch);
|
||||
getValues(result, val);
|
||||
|
||||
// check if the source codepoint is unassigned
|
||||
if (val.type == UNASSIGNED && allowUnassigned == false) {
|
||||
throw new ParseException("An unassigned code point was found in the input " + iter.getText(),
|
||||
iter.getIndex());
|
||||
} else if ((val.type == MAP)) {
|
||||
int index, length;
|
||||
|
||||
if (val.isIndex) {
|
||||
index = val.value;
|
||||
if (index >= indexes[ONE_UCHAR_MAPPING_INDEX_START]
|
||||
&& index < indexes[TWO_UCHARS_MAPPING_INDEX_START]) {
|
||||
length = 1;
|
||||
} else if (index >= indexes[TWO_UCHARS_MAPPING_INDEX_START]
|
||||
&& index < indexes[THREE_UCHARS_MAPPING_INDEX_START]) {
|
||||
length = 2;
|
||||
} else if (index >= indexes[THREE_UCHARS_MAPPING_INDEX_START]
|
||||
&& index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]) {
|
||||
length = 3;
|
||||
} else {
|
||||
length = mappingData[index++];
|
||||
}
|
||||
/* copy mapping to destination */
|
||||
dest.append(mappingData, index, length);
|
||||
continue;
|
||||
|
||||
} else {
|
||||
ch -= val.value;
|
||||
}
|
||||
} else if (val.type == DELETE) {
|
||||
// just consume the codepoint and contine
|
||||
continue;
|
||||
}
|
||||
// copy the source into destination
|
||||
UTF16.append(dest, ch);
|
||||
}
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
private StringBuffer normalize(StringBuffer src) {
|
||||
/*
|
||||
* Option UNORM_BEFORE_PRI_29:
|
||||
*
|
||||
* IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
|
||||
* requires strict adherence to Unicode 3.2 normalization, including buggy
|
||||
* composition from before fixing Public Review Issue #29. Note that this
|
||||
* results in some valid but nonsensical text to be either corrupted or
|
||||
* rejected, depending on the text. See
|
||||
* http://www.unicode.org/review/resolved-pri.html#pri29 See unorm.cpp and
|
||||
* cnormtst.c
|
||||
*/
|
||||
return new StringBuffer(
|
||||
SunNormalizer.normalize(src.toString(), Normalizer.Form.NFKC, SunNormalizer.UNICODE_3_2));
|
||||
}
|
||||
|
||||
/*
|
||||
* boolean isLabelSeparator(int ch){ int result = getCodePointValue(ch); if(
|
||||
* (result & 0x07) == LABEL_SEPARATOR){ return true; } return false; }
|
||||
*/
|
||||
/*
|
||||
* 1) Map -- For each character in the input, check if it has a mapping and, if
|
||||
* so, replace it with its mapping.
|
||||
*
|
||||
* 2) Normalize -- Possibly normalize the result of step 1 using Unicode
|
||||
* normalization.
|
||||
*
|
||||
* 3) Prohibit -- Check for any characters that are not allowed in the output.
|
||||
* If any are found, return an error.
|
||||
*
|
||||
* 4) Check bidi -- Possibly check for right-to-left characters, and if any are
|
||||
* found, make sure that the whole string satisfies the requirements for
|
||||
* bidirectional strings. If the string does not satisfy the requirements for
|
||||
* bidirectional strings, return an error. [Unicode3.2] defines several
|
||||
* bidirectional categories; each character has one bidirectional category
|
||||
* assigned to it. For the purposes of the requirements below, an
|
||||
* "RandALCat character" is a character that has Unicode bidirectional
|
||||
* categories "R" or "AL"; an "LCat character" is a character that has Unicode
|
||||
* bidirectional category "L". Note
|
||||
*
|
||||
*
|
||||
* that there are many characters which fall in neither of the above
|
||||
* definitions; Latin digits (<U+0030> through <U+0039>) are examples of this
|
||||
* because they have bidirectional category "EN".
|
||||
*
|
||||
* In any profile that specifies bidirectional character handling, all three of
|
||||
* the following requirements MUST be met:
|
||||
*
|
||||
* 1) The characters in section 5.8 MUST be prohibited.
|
||||
*
|
||||
* 2) If a string contains any RandALCat character, the string MUST NOT contain
|
||||
* any LCat character.
|
||||
*
|
||||
* 3) If a string contains any RandALCat character, a RandALCat character MUST
|
||||
* be the first character of the string, and a RandALCat character MUST be the
|
||||
* last character of the string.
|
||||
*/
|
||||
/**
|
||||
* Prepare the input buffer for use in applications with the given profile. This
|
||||
* operation maps, normalizes(NFKC), checks for prohited and BiDi characters in
|
||||
* the order defined by RFC 3454 depending on the options specified in the
|
||||
* profile.
|
||||
*
|
||||
* @param src A UCharacterIterator object containing the source string
|
||||
* @param options A bit set of options:
|
||||
*
|
||||
* - StringPrep.NONE Prohibit processing of unassigned code
|
||||
* points in the input
|
||||
*
|
||||
* - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points
|
||||
* are in the input as normal Unicode code points.
|
||||
*
|
||||
* @return StringBuffer A StringBuffer containing the output
|
||||
* @throws ParseException
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public StringBuffer prepare(UCharacterIterator src, int options) throws ParseException {
|
||||
|
||||
// map
|
||||
StringBuffer mapOut = map(src, options);
|
||||
StringBuffer normOut = mapOut;// initialize
|
||||
|
||||
if (doNFKC) {
|
||||
// normalize
|
||||
normOut = normalize(mapOut);
|
||||
}
|
||||
|
||||
int ch;
|
||||
char result;
|
||||
UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
|
||||
Values val = new Values();
|
||||
int direction = UCharacterDirection.CHAR_DIRECTION_COUNT,
|
||||
firstCharDir = UCharacterDirection.CHAR_DIRECTION_COUNT;
|
||||
int rtlPos = -1, ltrPos = -1;
|
||||
boolean rightToLeft = false, leftToRight = false;
|
||||
|
||||
while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
|
||||
result = getCodePointValue(ch);
|
||||
getValues(result, val);
|
||||
|
||||
if (val.type == PROHIBITED) {
|
||||
throw new ParseException("A prohibited code point was found in the input" + iter.getText(), val.value);
|
||||
}
|
||||
|
||||
direction = UCharacter.getDirection(ch);
|
||||
if (firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT) {
|
||||
firstCharDir = direction;
|
||||
}
|
||||
if (direction == UCharacterDirection.LEFT_TO_RIGHT) {
|
||||
leftToRight = true;
|
||||
ltrPos = iter.getIndex() - 1;
|
||||
}
|
||||
if (direction == UCharacterDirection.RIGHT_TO_LEFT
|
||||
|| direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) {
|
||||
rightToLeft = true;
|
||||
rtlPos = iter.getIndex() - 1;
|
||||
}
|
||||
}
|
||||
if (checkBiDi == true) {
|
||||
// satisfy 2
|
||||
if (leftToRight == true && rightToLeft == true) {
|
||||
throw new ParseException(
|
||||
"The input does not conform to the rules for BiDi code points." + iter.getText(),
|
||||
(rtlPos > ltrPos) ? rtlPos : ltrPos);
|
||||
}
|
||||
|
||||
// satisfy 3
|
||||
if (rightToLeft == true && !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT
|
||||
|| firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)
|
||||
&& (direction == UCharacterDirection.RIGHT_TO_LEFT
|
||||
|| direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))) {
|
||||
throw new ParseException(
|
||||
"The input does not conform to the rules for BiDi code points." + iter.getText(),
|
||||
(rtlPos > ltrPos) ? rtlPos : ltrPos);
|
||||
}
|
||||
}
|
||||
return normOut;
|
||||
|
||||
}
|
||||
}
|
326
sources/main/java/jdk_internal/icu/text/UCharacterIterator.java
Normal file
326
sources/main/java/jdk_internal/icu/text/UCharacterIterator.java
Normal file
@ -0,0 +1,326 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.bidi.CharacterIterator;
|
||||
import jdk_internal.icu.impl.CharacterIteratorWrapper;
|
||||
import jdk_internal.icu.impl.ReplaceableUCharacterIterator;
|
||||
import jdk_internal.icu.impl.UCharacterProperty;
|
||||
|
||||
/**
|
||||
* Abstract class that defines an API for iteration on text objects. This is an
|
||||
* interface for forward and backward iteration and random access into a text
|
||||
* object. Forward iteration is done with post-increment and backward iteration
|
||||
* is done with pre-decrement semantics, while the
|
||||
* <code>java.text.CharacterIterator</code> interface methods provided forward
|
||||
* iteration with "pre-increment" and backward iteration with pre-decrement
|
||||
* semantics. This API is more efficient for forward iteration over code points.
|
||||
* The other major difference is that this API can do both code unit and code
|
||||
* point iteration, <code>java.text.CharacterIterator</code> can only iterate
|
||||
* over code units and is limited to BMP (0 - 0xFFFF)
|
||||
*
|
||||
* @author Ram
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract class UCharacterIterator implements Cloneable {
|
||||
|
||||
/**
|
||||
* Protected default constructor for the subclasses
|
||||
*
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
protected UCharacterIterator() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicator that we have reached the ends of the UTF16 text. Moved from
|
||||
* UForwardCharacterIterator.java
|
||||
*
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int DONE = -1;
|
||||
|
||||
// static final methods ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns a <code>UCharacterIterator</code> object given a source string.
|
||||
*
|
||||
* @param source a string
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(String source) {
|
||||
return new ReplaceableUCharacterIterator(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <code>UCharacterIterator</code> object given a source StringBuffer.
|
||||
*
|
||||
* @param source an string buffer of UTF-16 code units
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(StringBuffer source) {
|
||||
return new ReplaceableUCharacterIterator(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <code>UCharacterIterator</code> object given a CharacterIterator.
|
||||
*
|
||||
* @param source a valid CharacterIterator object.
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(CharacterIterator source) {
|
||||
return new CharacterIteratorWrapper(source);
|
||||
}
|
||||
|
||||
// public methods ----------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the length of the text
|
||||
*
|
||||
* @return length of the text
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int getLength();
|
||||
|
||||
/**
|
||||
* Gets the current index in text.
|
||||
*
|
||||
* @return current index in text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int getIndex();
|
||||
|
||||
/**
|
||||
* Returns the UTF16 code unit at index, and increments to the next code unit
|
||||
* (post-increment semantics). If index is out of range, DONE is returned, and
|
||||
* the iterator is reset to the limit of the text.
|
||||
*
|
||||
* @return the next UTF16 code unit, or DONE if the index is at the limit of the
|
||||
* text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int next();
|
||||
|
||||
/**
|
||||
* Returns the code point at index, and increments to the next code point
|
||||
* (post-increment semantics). If index does not point to a valid surrogate
|
||||
* pair, the behavior is the same as <code>next()</code>. Otherwise the iterator
|
||||
* is incremented past the surrogate pair, and the code point represented by the
|
||||
* pair is returned.
|
||||
*
|
||||
* @return the next codepoint in text, or DONE if the index is at the limit of
|
||||
* the text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public int nextCodePoint() {
|
||||
int ch1 = next();
|
||||
if (UTF16.isLeadSurrogate((char) ch1)) {
|
||||
int ch2 = next();
|
||||
if (UTF16.isTrailSurrogate((char) ch2)) {
|
||||
return UCharacterProperty.getRawSupplementary((char) ch1, (char) ch2);
|
||||
} else if (ch2 != DONE) {
|
||||
// unmatched surrogate so back out
|
||||
previous();
|
||||
}
|
||||
}
|
||||
return ch1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decrement to the position of the previous code unit in the text, and return
|
||||
* it (pre-decrement semantics). If the resulting index is less than 0, the
|
||||
* index is reset to 0 and DONE is returned.
|
||||
*
|
||||
* @return the previous code unit in the text, or DONE if the new index is
|
||||
* before the start of the text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int previous();
|
||||
|
||||
/**
|
||||
* Retreat to the start of the previous code point in the text, and return it
|
||||
* (pre-decrement semantics). If the index is not preceeded by a valid surrogate
|
||||
* pair, the behavior is the same as <code>previous()</code>. Otherwise the
|
||||
* iterator is decremented to the start of the surrogate pair, and the code
|
||||
* point represented by the pair is returned.
|
||||
*
|
||||
* @return the previous code point in the text, or DONE if the new index is
|
||||
* before the start of the text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public int previousCodePoint() {
|
||||
int ch1 = previous();
|
||||
if (UTF16.isTrailSurrogate((char) ch1)) {
|
||||
int ch2 = previous();
|
||||
if (UTF16.isLeadSurrogate((char) ch2)) {
|
||||
return UCharacterProperty.getRawSupplementary((char) ch2, (char) ch1);
|
||||
} else if (ch2 != DONE) {
|
||||
// unmatched trail surrogate so back out
|
||||
next();
|
||||
}
|
||||
}
|
||||
return ch1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the index to the specified index in the text.
|
||||
*
|
||||
* @param index the index within the text.
|
||||
* @exception IndexOutOfBoundsException is thrown if an invalid index is
|
||||
* supplied
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract void setIndex(int index);
|
||||
|
||||
/**
|
||||
* Sets the current index to the start.
|
||||
*
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public void setToStart() {
|
||||
setIndex(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills the buffer with the underlying text storage of the iterator If the
|
||||
* buffer capacity is not enough a exception is thrown. The capacity of the fill
|
||||
* in buffer should at least be equal to length of text in the iterator obtained
|
||||
* by calling <code>getLength()</code>. <b>Usage:</b>
|
||||
*
|
||||
* <pre>{@code
|
||||
* UChacterIterator iter = new UCharacterIterator.getInstance(text);
|
||||
* char[] buf = new char[iter.getLength()];
|
||||
* iter.getText(buf);
|
||||
*
|
||||
* OR
|
||||
* char[] buf= new char[1];
|
||||
* int len = 0;
|
||||
* for(;;){
|
||||
* try{
|
||||
* len = iter.getText(buf);
|
||||
* break;
|
||||
* }catch(IndexOutOfBoundsException e){
|
||||
* buf = new char[iter.getLength()];
|
||||
* }
|
||||
* }
|
||||
* }</pre>
|
||||
*
|
||||
* @param fillIn an array of chars to fill with the underlying UTF-16 code
|
||||
* units.
|
||||
* @param offset the position within the array to start putting the data.
|
||||
* @return the number of code units added to fillIn, as a convenience
|
||||
* @exception IndexOutOfBoundsException exception if there is not enough room
|
||||
* after offset in the array, or if offset
|
||||
* < 0.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int getText(char[] fillIn, int offset);
|
||||
|
||||
/**
|
||||
* Convenience override for <code>getText(char[], int)</code> that provides an
|
||||
* offset of 0.
|
||||
*
|
||||
* @param fillIn an array of chars to fill with the underlying UTF-16 code
|
||||
* units.
|
||||
* @return the number of code units added to fillIn, as a convenience
|
||||
* @exception IndexOutOfBoundsException exception if there is not enough room in
|
||||
* the array.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public final int getText(char[] fillIn) {
|
||||
return getText(fillIn, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method for returning the underlying text storage as a string
|
||||
*
|
||||
* @return the underlying text storage in the iterator as a string
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public String getText() {
|
||||
char[] text = new char[getLength()];
|
||||
getText(text);
|
||||
return new String(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves the current position by the number of code points specified, either
|
||||
* forward or backward depending on the sign of delta (positive or negative
|
||||
* respectively). If the current index is at a trail surrogate then the first
|
||||
* adjustment is by code unit, and the remaining adjustments are by code points.
|
||||
* If the resulting index would be less than zero, the index is set to zero, and
|
||||
* if the resulting index would be greater than limit, the index is set to
|
||||
* limit.
|
||||
*
|
||||
* @param delta the number of code units to move the current index.
|
||||
* @return the new index
|
||||
* @exception IndexOutOfBoundsException is thrown if an invalid delta is
|
||||
* supplied
|
||||
* @stable ICU 2.4
|
||||
*
|
||||
*/
|
||||
public int moveCodePointIndex(int delta) {
|
||||
if (delta > 0) {
|
||||
while (delta > 0 && nextCodePoint() != DONE) {
|
||||
delta--;
|
||||
}
|
||||
} else {
|
||||
while (delta < 0 && previousCodePoint() != DONE) {
|
||||
delta++;
|
||||
}
|
||||
}
|
||||
if (delta != 0) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
|
||||
return getIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a copy of this iterator, independent from other iterators. If it is
|
||||
* not possible to clone the iterator, returns null.
|
||||
*
|
||||
* @return copy of this iterator
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
}
|
609
sources/main/java/jdk_internal/icu/text/UTF16.java
Normal file
609
sources/main/java/jdk_internal/icu/text/UTF16.java
Normal file
@ -0,0 +1,609 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.icu.impl.UCharacterProperty;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Standalone utility class providing UTF16 character conversions and indexing
|
||||
* conversions.
|
||||
* <p>
|
||||
* Code that uses strings alone rarely needs modification. By design, UTF-16 does
|
||||
* not allow overlap, so searching for strings is a safe operation. Similarly,
|
||||
* concatenation is always safe. Substringing is safe if the start and end are
|
||||
* both on UTF-32 boundaries. In normal code, the values for start and end are
|
||||
* on those boundaries, since they arose from operations like searching. If not,
|
||||
* the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
|
||||
* <strong>Examples:</strong>
|
||||
* <p>
|
||||
* The following examples illustrate use of some of these methods.
|
||||
*
|
||||
* <pre>{@code
|
||||
* // iteration forwards: Original
|
||||
* for (int i = 0; i < s.length(); ++i) {
|
||||
* char ch = s.charAt(i);
|
||||
* doSomethingWith(ch);
|
||||
* }
|
||||
*
|
||||
* // iteration forwards: Changes for UTF-32
|
||||
* int ch;
|
||||
* for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
|
||||
* ch = UTF16.charAt(s, i);
|
||||
* doSomethingWith(ch);
|
||||
* }
|
||||
*
|
||||
* // iteration backwards: Original
|
||||
* for (int i = s.length() - 1; i >= 0; --i) {
|
||||
* char ch = s.charAt(i);
|
||||
* doSomethingWith(ch);
|
||||
* }
|
||||
*
|
||||
* // iteration backwards: Changes for UTF-32
|
||||
* int ch;
|
||||
* for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
|
||||
* ch = UTF16.charAt(s, i);
|
||||
* doSomethingWith(ch);
|
||||
* }
|
||||
* }</pre>
|
||||
*
|
||||
* <strong>Notes:</strong>
|
||||
* <ul>
|
||||
* <li><strong>Naming:</strong> For clarity, High and Low surrogates are called
|
||||
* <code>Lead</code> and <code>Trail</code> in the API, which gives a better
|
||||
* sense of their ordering in a string. <code>offset16</code> and
|
||||
* <code>offset32</code> are used to distinguish offsets to UTF-16 boundaries vs
|
||||
* offsets to UTF-32 boundaries. <code>int char32</code> is used to contain
|
||||
* UTF-32 characters, as opposed to <code>char16</code>, which is a UTF-16 code
|
||||
* unit.</li>
|
||||
* <li><strong>Roundtripping Offsets:</strong> You can always roundtrip from a
|
||||
* UTF-32 offset to a UTF-16 offset and back. Because of the difference in
|
||||
* structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and back
|
||||
* if and only if <code>bounds(string, offset16) != TRAIL</code>.</li>
|
||||
* <li><strong>Exceptions:</strong> The error checking will throw an exception
|
||||
* if indices are out of bounds. Other than that, all methods will behave
|
||||
* reasonably, even if unmatched surrogates or out-of-bounds UTF-32 values are
|
||||
* present. <code>UCharacter.isLegal()</code> can be used to check for validity
|
||||
* if desired.</li>
|
||||
* <li><strong>Unmatched Surrogates:</strong> If the string contains unmatched
|
||||
* surrogates, then these are counted as one UTF-32 value. This matches their
|
||||
* iteration behavior, which is vital. It also matches common display practice
|
||||
* as missing glyphs (see the Unicode Standard Section 5.4, 5.5).</li>
|
||||
* <li><strong>Optimization:</strong> The method implementations may need
|
||||
* optimization if the compiler doesn't fold static final methods. Since
|
||||
* surrogate pairs will form an exceedingly small percentage of all the text in
|
||||
* the world, the singleton case should always be optimized for.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @author Mark Davis, with help from Markus Scherer
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
|
||||
public final class UTF16 {
|
||||
// public variables ---------------------------------------------------
|
||||
|
||||
/**
 * Smallest Unicode code point value.
 *
 * @stable ICU 2.1
 */
public static final int CODEPOINT_MIN_VALUE = 0;

/**
 * Largest Unicode code point value (scalar value) according to the Unicode
 * Standard.
 *
 * @stable ICU 2.1
 */
public static final int CODEPOINT_MAX_VALUE = 0x10ffff;

/**
 * First code point of the supplementary range.
 *
 * @stable ICU 2.1
 */
public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

/**
 * Smallest lead surrogate.
 *
 * @stable ICU 2.1
 */
public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

/**
 * Smallest trail surrogate.
 *
 * @stable ICU 2.1
 */
public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

/**
 * Largest lead surrogate.
 *
 * @stable ICU 2.1
 */
public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

/**
 * Largest trail surrogate.
 *
 * @stable ICU 2.1
 */
public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

/**
 * Smallest surrogate of either kind; identical to the smallest lead surrogate.
 *
 * @stable ICU 2.1
 */
public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

/**
 * Mask that drops the low 10 bits of a code unit (0xFFFFFC00), leaving the
 * part compared against {@link #LEAD_SURROGATE_BITS}.
 */
private static final int LEAD_SURROGATE_BITMASK = ~0x3FF;

/**
 * Mask that drops the low 10 bits of a code unit (0xFFFFFC00), leaving the
 * part compared against {@link #TRAIL_SURROGATE_BITS}.
 */
private static final int TRAIL_SURROGATE_BITMASK = ~0x3FF;

/**
 * Mask that drops the low 11 bits of a code unit (0xFFFFF800), leaving the
 * part compared against {@link #SURROGATE_BITS}.
 */
private static final int SURROGATE_BITMASK = ~0x7FF;

/**
 * Masked bit pattern of a lead surrogate (0xD800).
 */
private static final int LEAD_SURROGATE_BITS = LEAD_SURROGATE_MIN_VALUE;

/**
 * Masked bit pattern of a trail surrogate (0xDC00).
 */
private static final int TRAIL_SURROGATE_BITS = TRAIL_SURROGATE_MIN_VALUE;

/**
 * Masked bit pattern of any surrogate (0xD800).
 */
private static final int SURROGATE_BITS = SURROGATE_MIN_VALUE;

// constructor --------------------------------------------------------

// /CLOVER:OFF
/**
 * Private so that this utility class cannot be instantiated.
 */
private UTF16() {
}
|
||||
|
||||
// /CLOVER:ON
|
||||
// public method ------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Extract a single UTF-32 value from a string. Used when iterating forwards or
|
||||
* backwards (with <code>UTF16.getCharCount()</code>), as well as random access.
|
||||
* If a validity check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">
|
||||
* UCharacter.isLegal()</a></code> on the return value. If the char retrieved is
|
||||
* part of a surrogate pair, its supplementary character will be returned. If a
|
||||
* complete supplementary character is not found the incomplete character will
|
||||
* be returned
|
||||
*
|
||||
* @param source array of UTF-16 chars
|
||||
* @param offset16 UTF-16 offset to the start of the character.
|
||||
* @return UTF-32 value for the UTF-32 value that contains the char at offset16.
|
||||
* The boundaries of that codepoint are the same as in
|
||||
* <code>bounds32()</code>.
|
||||
* @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int charAt(String source, int offset16) {
|
||||
char single = source.charAt(offset16);
|
||||
if (single < LEAD_SURROGATE_MIN_VALUE) {
|
||||
return single;
|
||||
}
|
||||
return _charAt(source, offset16, single);
|
||||
}
|
||||
|
||||
private static int _charAt(String source, int offset16, char single) {
|
||||
if (single > TRAIL_SURROGATE_MAX_VALUE) {
|
||||
return single;
|
||||
}
|
||||
|
||||
// Convert the UTF-16 surrogate pair if necessary.
|
||||
// For simplicity in usage, and because the frequency of pairs is
|
||||
// low, look both directions.
|
||||
|
||||
if (single <= LEAD_SURROGATE_MAX_VALUE) {
|
||||
++offset16;
|
||||
if (source.length() != offset16) {
|
||||
char trail = source.charAt(offset16);
|
||||
if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
|
||||
return UCharacterProperty.getRawSupplementary(single, trail);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
--offset16;
|
||||
if (offset16 >= 0) {
|
||||
// single is a trail surrogate so
|
||||
char lead = source.charAt(offset16);
|
||||
if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
|
||||
return UCharacterProperty.getRawSupplementary(lead, single);
|
||||
}
|
||||
}
|
||||
}
|
||||
return single; // return unmatched surrogate
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract a single UTF-32 value from a string. Used when iterating forwards or
|
||||
* backwards (with <code>UTF16.getCharCount()</code>), as well as random access.
|
||||
* If a validity check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
|
||||
* </a></code> on the return value. If the char retrieved is part of a surrogate
|
||||
* pair, its supplementary character will be returned. If a complete
|
||||
* supplementary character is not found the incomplete character will be
|
||||
* returned
|
||||
*
|
||||
* @param source array of UTF-16 chars
|
||||
* @param offset16 UTF-16 offset to the start of the character.
|
||||
* @return UTF-32 value for the UTF-32 value that contains the char at offset16.
|
||||
* The boundaries of that codepoint are the same as in
|
||||
* <code>bounds32()</code>.
|
||||
* @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int charAt(CharSequence source, int offset16) {
|
||||
char single = source.charAt(offset16);
|
||||
if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
|
||||
return single;
|
||||
}
|
||||
return _charAt(source, offset16, single);
|
||||
}
|
||||
|
||||
private static int _charAt(CharSequence source, int offset16, char single) {
|
||||
if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
|
||||
return single;
|
||||
}
|
||||
|
||||
// Convert the UTF-16 surrogate pair if necessary.
|
||||
// For simplicity in usage, and because the frequency of pairs is
|
||||
// low, look both directions.
|
||||
|
||||
if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
|
||||
++offset16;
|
||||
if (source.length() != offset16) {
|
||||
char trail = source.charAt(offset16);
|
||||
if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
|
||||
return UCharacterProperty.getRawSupplementary(single, trail);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
--offset16;
|
||||
if (offset16 >= 0) {
|
||||
// single is a trail surrogate so
|
||||
char lead = source.charAt(offset16);
|
||||
if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
|
||||
return UCharacterProperty.getRawSupplementary(lead, single);
|
||||
}
|
||||
}
|
||||
}
|
||||
return single; // return unmatched surrogate
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract a single UTF-32 value from a substring. Used when iterating forwards
|
||||
* or backwards (with <code>UTF16.getCharCount()</code>), as well as random
|
||||
* access. If a validity check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
|
||||
* </a></code> on the return value. If the char retrieved is part of a surrogate
|
||||
* pair, its supplementary character will be returned. If a complete
|
||||
* supplementary character is not found the incomplete character will be
|
||||
* returned
|
||||
*
|
||||
* @param source Array of UTF-16 chars
|
||||
* @param start Offset to substring in the source array for analyzing
|
||||
* @param limit Offset to substring in the source array for analyzing
|
||||
* @param offset16 UTF-16 offset relative to start
|
||||
* @return UTF-32 value for the UTF-32 value that contains the char at offset16.
|
||||
* The boundaries of that codepoint are the same as in
|
||||
* <code>bounds32()</code>.
|
||||
* @exception IndexOutOfBoundsException Thrown if offset16 is not within the
|
||||
* range of start and limit.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int charAt(char source[], int start, int limit, int offset16) {
|
||||
offset16 += start;
|
||||
if (offset16 < start || offset16 >= limit) {
|
||||
throw new ArrayIndexOutOfBoundsException(offset16);
|
||||
}
|
||||
|
||||
char single = source[offset16];
|
||||
if (!isSurrogate(single)) {
|
||||
return single;
|
||||
}
|
||||
|
||||
// Convert the UTF-16 surrogate pair if necessary.
|
||||
// For simplicity in usage, and because the frequency of pairs is
|
||||
// low, look both directions.
|
||||
if (single <= LEAD_SURROGATE_MAX_VALUE) {
|
||||
offset16++;
|
||||
if (offset16 >= limit) {
|
||||
return single;
|
||||
}
|
||||
char trail = source[offset16];
|
||||
if (isTrailSurrogate(trail)) {
|
||||
return UCharacterProperty.getRawSupplementary(single, trail);
|
||||
}
|
||||
} else { // isTrailSurrogate(single), so
|
||||
if (offset16 == start) {
|
||||
return single;
|
||||
}
|
||||
offset16--;
|
||||
char lead = source[offset16];
|
||||
if (isLeadSurrogate(lead))
|
||||
return UCharacterProperty.getRawSupplementary(lead, single);
|
||||
}
|
||||
return single; // return unmatched surrogate
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines how many chars this char32 requires. If a validity check is
|
||||
* required, use <code>
|
||||
* <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
|
||||
* char32 before calling.
|
||||
*
|
||||
* @param char32 the input codepoint.
|
||||
* @return 2 if is in supplementary space, otherwise 1.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int getCharCount(int char32) {
|
||||
if (char32 < SUPPLEMENTARY_MIN_VALUE) {
|
||||
return 1;
|
||||
}
|
||||
return 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the code value is a surrogate.
|
||||
*
|
||||
* @param char16 the input character.
|
||||
* @return true if the input character is a surrogate.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static boolean isSurrogate(char char16) {
|
||||
return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the character is a trail surrogate.
|
||||
*
|
||||
* @param char16 the input character.
|
||||
* @return true if the input character is a trail surrogate.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static boolean isTrailSurrogate(char char16) {
|
||||
return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the character is a lead surrogate.
|
||||
*
|
||||
* @param char16 the input character.
|
||||
* @return true if the input character is a lead surrogate
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static boolean isLeadSurrogate(char char16) {
|
||||
return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the lead surrogate. If a validity check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
|
||||
* char32 before calling.
|
||||
*
|
||||
* @param char32 the input character.
|
||||
* @return lead surrogate if the getCharCount(ch) is 2; <br>
|
||||
* and 0 otherwise (note: 0 is not a valid lead surrogate).
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static char getLeadSurrogate(int char32) {
|
||||
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
|
||||
return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the trail surrogate. If a validity check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
|
||||
* char32 before calling.
|
||||
*
|
||||
* @param char32 the input character.
|
||||
* @return the trail surrogate if the getCharCount(ch) is 2; <br>
|
||||
* otherwise the character itself
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static char getTrailSurrogate(int char32) {
|
||||
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
|
||||
return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
|
||||
}
|
||||
|
||||
return (char) char32;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method corresponding to String.valueOf(char). Returns a one or
|
||||
* two char string containing the UTF-32 value in UTF16 format. If a validity
|
||||
* check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
|
||||
* char32 before calling.
|
||||
*
|
||||
* @param char32 the input character.
|
||||
* @return string value of char32 in UTF16 format
|
||||
* @exception IllegalArgumentException thrown if char32 is a invalid codepoint.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static String valueOf(int char32) {
|
||||
if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
|
||||
throw new IllegalArgumentException("Illegal codepoint");
|
||||
}
|
||||
return toString(char32);
|
||||
}
|
||||
|
||||
/**
|
||||
* Append a single UTF-32 value to the end of a StringBuffer. If a validity
|
||||
* check is required, use
|
||||
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
|
||||
* char32 before calling.
|
||||
*
|
||||
* @param target the buffer to append to
|
||||
* @param char32 value to append.
|
||||
* @return the updated StringBuffer
|
||||
* @exception IllegalArgumentException thrown when char32 does not lie within
|
||||
* the range of the Unicode codepoints
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static StringBuffer append(StringBuffer target, int char32) {
|
||||
// Check for irregular values
|
||||
if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
|
||||
throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
|
||||
}
|
||||
|
||||
// Write the UTF-16 values
|
||||
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
|
||||
target.append(getLeadSurrogate(char32));
|
||||
target.append(getTrailSurrogate(char32));
|
||||
} else {
|
||||
target.append((char) char32);
|
||||
}
|
||||
return target;
|
||||
}
|
||||
|
||||
/**
|
||||
* Shifts offset16 by the argument number of codepoints within a subarray.
|
||||
*
|
||||
* @param source char array
|
||||
* @param start position of the subarray to be performed on
|
||||
* @param limit position of the subarray to be performed on
|
||||
* @param offset16 UTF16 position to shift relative to start
|
||||
* @param shift32 number of codepoints to shift
|
||||
* @return new shifted offset16 relative to start
|
||||
* @exception IndexOutOfBoundsException if the new offset16 is out of bounds
|
||||
* with respect to the subarray or the
|
||||
* subarray bounds are out of range.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32) {
|
||||
int size = source.length;
|
||||
int count;
|
||||
char ch;
|
||||
int result = offset16 + start;
|
||||
if (start < 0 || limit < start) {
|
||||
throw new StringIndexOutOfBoundsException(start);
|
||||
}
|
||||
if (limit > size) {
|
||||
throw new StringIndexOutOfBoundsException(limit);
|
||||
}
|
||||
if (offset16 < 0 || result > limit) {
|
||||
throw new StringIndexOutOfBoundsException(offset16);
|
||||
}
|
||||
if (shift32 > 0) {
|
||||
if (shift32 + result > size) {
|
||||
throw new StringIndexOutOfBoundsException(result);
|
||||
}
|
||||
count = shift32;
|
||||
while (result < limit && count > 0) {
|
||||
ch = source[result];
|
||||
if (isLeadSurrogate(ch) && (result + 1 < limit) && isTrailSurrogate(source[result + 1])) {
|
||||
result++;
|
||||
}
|
||||
count--;
|
||||
result++;
|
||||
}
|
||||
} else {
|
||||
if (result + shift32 < start) {
|
||||
throw new StringIndexOutOfBoundsException(result);
|
||||
}
|
||||
for (count = -shift32; count > 0; count--) {
|
||||
result--;
|
||||
if (result < start) {
|
||||
break;
|
||||
}
|
||||
ch = source[result];
|
||||
if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
|
||||
result--;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (count != 0) {
|
||||
throw new StringIndexOutOfBoundsException(shift32);
|
||||
}
|
||||
result -= start;
|
||||
return result;
|
||||
}
|
||||
|
||||
// private data members -------------------------------------------------
|
||||
|
||||
/**
|
||||
* Shift value for lead surrogate to form a supplementary character.
|
||||
*/
|
||||
private static final int LEAD_SURROGATE_SHIFT_ = 10;
|
||||
|
||||
/**
|
||||
* Mask to retrieve the significant value from a trail surrogate.
|
||||
*/
|
||||
private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
|
||||
|
||||
/**
|
||||
* Offset added to (code point >> LEAD_SURROGATE_SHIFT_) to form a lead surrogate.
|
||||
*/
|
||||
private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
|
||||
- (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
|
||||
|
||||
// private methods ------------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Converts argument code point and returns a String object representing the
|
||||
* code point's value in UTF16 format.
|
||||
* <p>
|
||||
* This method does not check for the validity of the codepoint, the results are
|
||||
* not guaranteed if a invalid codepoint is passed as argument.
|
||||
* <p>
|
||||
* The result is a string whose length is 1 for non-supplementary code points, 2
|
||||
* otherwise.
|
||||
*
|
||||
* @param ch code point
|
||||
* @return string representation of the code point
|
||||
*/
|
||||
private static String toString(int ch) {
|
||||
if (ch < SUPPLEMENTARY_MIN_VALUE) {
|
||||
return String.valueOf((char) ch);
|
||||
}
|
||||
|
||||
StringBuilder result = new StringBuilder();
|
||||
result.append(getLeadSurrogate(ch));
|
||||
result.append(getTrailSurrogate(ch));
|
||||
return result.toString();
|
||||
}
|
||||
}
|
1515
sources/main/java/jdk_internal/icu/text/UnicodeSet.java
Normal file
1515
sources/main/java/jdk_internal/icu/text/UnicodeSet.java
Normal file
File diff suppressed because it is too large
Load Diff
498
sources/main/java/jdk_internal/icu/util/CodePointMap.java
Normal file
498
sources/main/java/jdk_internal/icu/util/CodePointMap.java
Normal file
@ -0,0 +1,498 @@
|
||||
/*
|
||||
* Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
// (c) 2018 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
|
||||
// created: 2018may10 Markus W. Scherer
|
||||
|
||||
package jdk_internal.icu.util;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
 * Base class for maps from Unicode code points (U+0000..U+10FFFF) to int
 * values. This is not a java.util.Map; iterating a CodePointMap yields
 * {@link Range} objects describing runs of code points with the same value.
 *
 * @stable ICU 63
 */
public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
    /**
     * Controls how
     * {@link #getRange(int, RangeOption, int, ValueFilter, Range)} reports ranges
     * that overlap surrogate code points. NORMAL is the right choice for most
     * callers.
     *
     * @see #getRange
     * @stable ICU 63
     */
    public enum RangeOption {
        /**
         * Ranges are reported exactly as the map stores them. Most callers should
         * use this option.
         *
         * @stable ICU 63
         */
        NORMAL,
        /**
         * Ranges are reported as stored, except that the lead surrogates
         * (U+D800..U+DBFF) are reported with the surrogateValue passed to
         * getRange(). That surrogateValue is not run through the filter. See
         * {@link Character#isHighSurrogate}.
         *
         * <p>
         * Most callers should use NORMAL instead.
         *
         * <p>
         * Intended for maps that store special values for surrogate code *units*
         * (for UTF-16 string processing or for error handling of unpaired
         * surrogates) which should not be attributed to the lead surrogate code
         * *points*.
         *
         * @stable ICU 63
         */
        FIXED_LEAD_SURROGATES,
        /**
         * Ranges are reported as stored, except that all surrogates
         * (U+D800..U+DFFF) are reported with the surrogateValue passed to
         * getRange(). That surrogateValue is not run through the filter. See
         * {@link Character#isSurrogate}.
         *
         * <p>
         * Most callers should use NORMAL instead.
         *
         * <p>
         * Intended for maps that store special values for surrogate code *units*
         * (for UTF-16 string processing or for error handling of unpaired
         * surrogates) which should not be attributed to the surrogate code
         * *points*.
         *
         * @stable ICU 63
         */
        FIXED_ALL_SURROGATES
    }

    /**
     * Callback that rewrites a map value. getRange() applies it, when supplied,
     * and reports the rewritten value.
     *
     * <p>
     * Typical uses: masking off value bits, collapsing values into a yes/no
     * filter, or translating a map value into an index.
     *
     * @see #getRange
     * @see #iterator
     * @stable ICU 63
     */
    public interface ValueFilter {
        /**
         * Rewrites one map value.
         *
         * @param value map value
         * @return the rewritten value
         * @stable ICU 63
         */
        public int apply(int value);
    }

    /**
     * Result holder for range iteration: all code points from start to end
     * (inclusive) share one value. That value may have passed through
     * {@link ValueFilter#apply(int)}, or may be the surrogateValue when a
     * RangeOption other than NORMAL was requested.
     *
     * @see #getRange
     * @see #iterator
     * @stable ICU 63
     */
    public static final class Range {
        private int start;
        private int end;
        private int value;

        /**
         * Creates an empty range: start and end are -1, value is 0.
         *
         * @stable ICU 63
         */
        public Range() {
            start = -1;
            end = -1;
            value = 0;
        }

        /**
         * @return the first code point of the range
         * @stable ICU 63
         */
        public int getStart() {
            return start;
        }

        /**
         * @return the last code point of the range (inclusive)
         * @stable ICU 63
         */
        public int getEnd() {
            return end;
        }

        /**
         * @return the value shared by the range
         * @stable ICU 63
         */
        public int getValue() {
            return value;
        }

        /**
         * Overwrites all three fields. When the range object comes from
         * {@link #iterator()}, the iteration continues after the end set here.
         *
         * @param start new first code point
         * @param end   new last code point
         * @param value new value
         * @stable ICU 63
         */
        public void set(int start, int end, int value) {
            this.value = value;
            this.end = end;
            this.start = start;
        }
    }

    /**
     * Iterator behind {@link #iterator()}. It reuses a single Range object and
     * resumes each step right after that object's current end.
     */
    private final class RangeIterator implements Iterator<Range> {
        private Range range = new Range();

        @Override
        public boolean hasNext() {
            // end == -1 means "nothing fetched yet"; 0x10ffff is the last code point.
            final int lastEnd = range.end;
            return lastEnd >= -1 && lastEnd < 0x10ffff;
        }

        @Override
        public Range next() {
            if (!getRange(range.end + 1, null, range)) {
                throw new NoSuchElementException();
            }
            return range;
        }

        @Override
        public final void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Walks the code points of a string and looks up the map value of each one.
     * This is not a java.util.Iterator.
     *
     * <pre>
     * void onString(CodePointMap map, CharSequence s, int start) {
     *     CodePointMap.StringIterator iter = map.stringIterator(s, start);
     *     while (iter.next()) {
     *         int end = iter.getIndex(); // code point from between start and end
     *         useValue(s, start, end, iter.getCodePoint(), iter.getValue());
     *         start = end;
     *     }
     * }
     * </pre>
     *
     * <p>
     * This class is not intended for public subclassing.
     *
     * @stable ICU 63
     */
    public class StringIterator {
        /**
         * The string being iterated.
         *
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected CharSequence s;
        /**
         * Current index into the string.
         *
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int sIndex;
        /**
         * Most recently read code point, or -1 before the first read.
         *
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int c;
        /**
         * Map value of the most recently read code point.
         *
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int value;

        /**
         * Creates an iterator positioned at sIndex with no code point read yet.
         *
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected StringIterator(CharSequence s, int sIndex) {
            this.value = 0;
            this.c = -1;
            this.s = s;
            this.sIndex = sIndex;
        }

        /**
         * Points the iterator at a new string and/or a new index and forgets the
         * last code point and value.
         *
         * @param s      string to iterate over
         * @param sIndex string index at which iteration starts
         * @stable ICU 63
         */
        public void reset(CharSequence s, int sIndex) {
            this.value = 0;
            this.c = -1;
            this.s = s;
            this.sIndex = sIndex;
        }

        /**
         * Reads the code point at the string index, advances the index past it,
         * and fetches its map value. For an unpaired surrogate the value is
         * whatever the map returns for that surrogate (implementation-defined).
         *
         * @return true if a code point was read; false if the index was already
         *         at the end of the string, in which case nothing changes
         * @stable ICU 63
         */
        public boolean next() {
            if (sIndex < s.length()) {
                c = Character.codePointAt(s, sIndex);
                sIndex += Character.charCount(c);
                value = get(c);
                return true;
            }
            return false;
        }

        /**
         * Reads the code point before the string index, moves the index back over
         * it, and fetches its map value. For an unpaired surrogate the value is
         * whatever the map returns for that surrogate (implementation-defined).
         *
         * @return true if a code point was read; false if the index was already
         *         at the start of the string, in which case nothing changes
         * @stable ICU 63
         */
        public boolean previous() {
            if (sIndex > 0) {
                c = Character.codePointBefore(s, sIndex);
                sIndex -= Character.charCount(c);
                value = get(c);
                return true;
            }
            return false;
        }

        /**
         * @return the current string index
         * @stable ICU 63
         */
        public final int getIndex() {
            return sIndex;
        }

        /**
         * @return the most recently read code point
         * @stable ICU 63
         */
        public final int getCodePoint() {
            return c;
        }

        /**
         * @return the map value of the most recently read code point, or an
         *         implementation-defined error value if that code point is an
         *         unpaired surrogate
         * @stable ICU 63
         */
        public final int getValue() {
            return value;
        }
    }

    /**
     * No-argument constructor for subclasses.
     *
     * @stable ICU 63
     */
    protected CodePointMap() {
    }

    /**
     * Looks up the value stored for a code point, with range checking. An
     * implementation-defined error value is returned when c is outside
     * 0..U+10FFFF.
     *
     * @param c the code point
     * @return the map value, or an implementation-defined error value if the code
     *         point is not in the range 0..U+10FFFF
     * @stable ICU 63
     */
    public abstract int get(int c);

    /**
     * Fills the range object with the run of same-valued code points that begins
     * at start. The reported start is always the start argument, even when
     * earlier code points carry the same value; the reported end is the last code
     * point such that everything from start up to it has one value. Returns false
     * when start is outside 0..U+10FFFF. This allows efficient enumeration of all
     * same-value ranges of a map. (Usually faster than calling get() per code
     * point, but possibly much slower than a data structure that stores ranges
     * directly.)
     *
     * <p>
     * When a {@link ValueFilter} is supplied, values are passed through it first
     * and the range extends as far as the filtered values stay equal. With a null
     * filter the stored values are used unchanged.
     *
     * <p>
     * Example:
     *
     * <pre>
     * int start = 0;
     * CodePointMap.Range range = new CodePointMap.Range();
     * while (map.getRange(start, null, range)) {
     *     int end = range.getEnd();
     *     int value = range.getValue();
     *     // Work with the range start..end and its value.
     *     start = end + 1;
     * }
     * </pre>
     *
     * @param start  range start
     * @param filter an object that may modify the map data value, or null if the
     *               values from the map are to be used unmodified
     * @param range  the range object that receives the code point range and value
     * @return true if start is 0..U+10FFFF; otherwise no new range is fetched
     * @stable ICU 63
     */
    public abstract boolean getRange(int start, ValueFilter filter, Range range);

    /**
     * Surrogate-aware variant of {@link #getRange(int, ValueFilter, Range)}. It
     * fetches the range beginning at start in the same way, then, unless option
     * is {@link RangeOption#NORMAL}, adjusts it so that the surrogate code points
     * selected by option are reported with surrogateValue. Returns false when
     * start is outside 0..U+10FFFF.
     *
     * @param start          range start
     * @param option         defines whether surrogates are treated normally, or as
     *                       having the surrogateValue; usually
     *                       {@link RangeOption#NORMAL}
     * @param surrogateValue value for surrogates; ignored if
     *                       option=={@link RangeOption#NORMAL}
     * @param filter         an object that may modify the map data value, or null
     *                       if the values from the map are to be used unmodified
     * @param range          the range object that receives the code point range
     *                       and value
     * @return true if start is 0..U+10FFFF; otherwise no new range is fetched
     * @stable ICU 63
     */
    public boolean getRange(int start, RangeOption option, int surrogateValue, ValueFilter filter, Range range) {
        assert option != null;
        final boolean fetched = getRange(start, filter, range);
        if (!fetched || option == RangeOption.NORMAL) {
            return fetched;
        }

        // Last surrogate code point that must carry surrogateValue.
        final int lastSurrogate;
        if (option == RangeOption.FIXED_ALL_SURROGATES) {
            lastSurrogate = 0xdfff;
        } else {
            lastSurrogate = 0xdbff;
        }
        final int fetchedEnd = range.end;
        if (fetchedEnd < 0xd7ff || start > lastSurrogate) {
            // Entirely before or entirely after the affected surrogates.
            return true;
        }

        // From here on the range overlaps the surrogates, or stops right before
        // the first one.
        if (range.value != surrogateValue) {
            if (start <= 0xd7ff) {
                // Ordinary range: cut it off just before the surrogates.
                range.end = 0xd7ff;
                return true;
            }
            // start is itself a surrogate whose stored code *unit* value differs:
            // report it as a surrogateValue code *point* range.
            range.value = surrogateValue;
            if (fetchedEnd > lastSurrogate) {
                // The stored range runs past the surrogates; stop at their end.
                range.end = lastSurrogate;
                return true;
            }
        } else if (fetchedEnd >= lastSurrogate) {
            // The surrogates already sit inside a surrogateValue range, or are
            // followed by a range with a different value.
            return true;
        }

        // Try to merge the surrogateValue surrogate range with the range that
        // immediately follows the surrogates.
        final boolean merged = getRange(lastSurrogate + 1, filter, range) && range.value == surrogateValue;
        range.start = start;
        if (!merged) {
            range.end = lastSurrogate;
            range.value = surrogateValue;
        }
        return true;
    }

    /**
     * Convenience iterator over the same-value code point ranges of this map,
     * equivalent to looping with {@link #getRange(int, ValueFilter, Range)} and no
     * filter. Adjacent ranges have different map values.
     *
     * <p>
     * Every call to next() returns the same, updated Range object.
     *
     * @return a Range iterator
     * @stable ICU 63
     */
    @Override
    public Iterator<Range> iterator() {
        return new RangeIterator();
    }

    /**
     * Creates a {@link StringIterator} (not a java.util.Iterator) that walks the
     * code points of a string and fetches their map values.
     *
     * @param s      string to iterate over
     * @param sIndex string index at which iteration starts
     * @return the iterator
     * @stable ICU 63
     */
    public StringIterator stringIterator(CharSequence s, int sIndex) {
        return new StringIterator(s, sIndex);
    }
}
|
1357
sources/main/java/jdk_internal/icu/util/CodePointTrie.java
Normal file
1357
sources/main/java/jdk_internal/icu/util/CodePointTrie.java
Normal file
File diff suppressed because it is too large
Load Diff
48
sources/main/java/jdk_internal/icu/util/OutputInt.java
Normal file
48
sources/main/java/jdk_internal/icu/util/OutputInt.java
Normal file
@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package jdk_internal.icu.util;
|
||||
|
||||
/**
 * Minimal mutable holder used to pass an int back through a method parameter.
 * Comparable to {@code Output<Integer>}, but stores a primitive so no
 * auto-boxing takes place.
 *
 * @internal but could become public deprecated This API is ICU internal only.
 */
public class OutputInt {

    /**
     * The held value; starts at 0.
     *
     * @internal deprecated This API is ICU internal only.
     */
    public int value;

    /**
     * Creates a holder whose value is 0.
     */
    public OutputInt() {
    }
}
|
191
sources/main/java/jdk_internal/icu/util/VersionInfo.java
Normal file
191
sources/main/java/jdk_internal/icu/util/VersionInfo.java
Normal file
@ -0,0 +1,191 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package jdk_internal.icu.util;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
 * Class to store version numbers of the form major.minor.milli.micro. Each of
 * the four numbers lies in 0..255 and the four are packed into a single int.
 * Instances are cached, so equal versions obtained through getInstance() are
 * the same object.
 *
 * @author synwee
 * @stable ICU 2.6
 */
public final class VersionInfo {
    // public data members -------------------------------------------------

    /**
     * Data version string for ICU's internal data. Used for appending to data path
     * (e.g. icudt43b)
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static final String ICU_DATA_VERSION_PATH = "67b";

    // public methods ------------------------------------------------------

    /**
     * Returns an instance of VersionInfo with the argument version.
     *
     * @param version version String in the format of "major.minor.milli.micro" or
     *                "major.minor.milli" or "major.minor" or "major", where major,
     *                minor, milli, micro are non-negative numbers {@literal <=}
     *                255. If the trailing version numbers are not specified they
     *                are taken as 0s. E.g. Version "3.1" is equivalent to
     *                "3.1.0.0".
     * @return an instance of VersionInfo with the argument version.
     * @throws IllegalArgumentException when the argument version is not in the
     *                                  right format
     * @stable ICU 2.6
     */
    public static VersionInfo getInstance(String version) {
        int length = version.length();
        int array[] = { 0, 0, 0, 0 };
        int count = 0;
        int index = 0;

        while (count < 4 && index < length) {
            char c = version.charAt(index);
            if (c == '.') {
                count++;
            } else {
                // char arithmetic wraps, so anything below '0' becomes a large
                // value and is caught by the > 9 test as well.
                c -= '0';
                if (c < 0 || c > 9) {
                    throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
                }
                array[count] *= 10;
                array[count] += c;
                // Reject as soon as the component leaves 0..255. Checking only
                // after the loop would let a long digit run overflow the int
                // and wrap back into the valid range (e.g. "4294967297" -> 1).
                if (array[count] > 255) {
                    throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
                }
            }
            index++;
        }
        if (index != length) {
            throw new IllegalArgumentException(
                    "Invalid version number: String '" + version + "' exceeds version format");
        }

        return getInstance(array[0], array[1], array[2], array[3]);
    }

    /**
     * Returns an instance of VersionInfo with the argument version.
     *
     * @param major major version, non-negative number {@literal <=} 255.
     * @param minor minor version, non-negative number {@literal <=} 255.
     * @param milli milli version, non-negative number {@literal <=} 255.
     * @param micro micro version, non-negative number {@literal <=} 255.
     * @return the cached instance for this version, created on first request
     * @throws IllegalArgumentException when any argument is negative or
     *                                  {@literal >} 255
     * @stable ICU 2.6
     */
    public static VersionInfo getInstance(int major, int minor, int milli, int micro) {
        if (major < 0 || major > 255 || minor < 0 || minor > 255 || milli < 0 || milli > 255 || micro < 0
                || micro > 255) {
            throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
        }
        int version = getInt(major, minor, milli, micro);
        Integer key = Integer.valueOf(version);
        // Reuse the cached singleton for this packed version if there is one.
        VersionInfo result = MAP_.get(key);
        if (result == null) {
            result = new VersionInfo(version);
            MAP_.put(key, result);
        }
        return result;
    }

    /**
     * Compares other with this VersionInfo.
     *
     * @param other VersionInfo to be compared
     * @return 0 if the argument is a VersionInfo object that has version
     *         information equal to this object. Less than 0 if the argument is a
     *         VersionInfo object that has version information greater than this
     *         object. Greater than 0 if the argument is a VersionInfo object that
     *         has version information less than this object.
     * @stable ICU 2.6
     */
    public int compareTo(VersionInfo other) {
        // The major number occupies the top byte, so packed values with
        // major >= 128 are negative as signed ints and a plain subtraction can
        // overflow (255.0.0.0 would compare as smaller than 1.0.0.0). Widen both
        // sides to unsigned 32-bit values before comparing.
        final long mine = m_version_ & 0xFFFFFFFFL;
        final long theirs = other.m_version_ & 0xFFFFFFFFL;
        if (mine == theirs) {
            return 0;
        }
        return mine < theirs ? -1 : 1;
    }

    // private data members ----------------------------------------------

    /**
     * Version number stored as a byte for each of the major, minor, milli and micro
     * numbers in the 32 bit int. Most significant for the major and the least
     * significant contains the micro numbers.
     */
    private final int m_version_;
    /**
     * Map of singletons, keyed by packed version number.
     */
    private static final HashMap<Integer, VersionInfo> MAP_ = new HashMap<>();
    /**
     * Error statement string
     */
    private static final String INVALID_VERSION_NUMBER_ = "Invalid version number: Version number may be negative or greater than 255";

    // private constructor -----------------------------------------------

    /**
     * Constructor with int
     *
     * @param compactversion a 32 bit int with each byte representing a number
     */
    private VersionInfo(int compactversion) {
        m_version_ = compactversion;
    }

    /**
     * Packs the four version numbers into one int, major in the most significant
     * byte and micro in the least significant byte.
     *
     * @param major non-negative version number
     * @param minor non-negative version number
     * @param milli non-negative version number
     * @param micro non-negative version number
     * @return the packed version number
     */
    private static int getInt(int major, int minor, int milli, int micro) {
        return (major << 24) | (minor << 16) | (milli << 8) | micro;
    }
}
|
Reference in New Issue
Block a user