mirror of
https://github.com/Eaglercraft-Archive/Eaglercraftx-1.8.8-src.git
synced 2025-06-28 02:48:14 -05:00
Update #48 - Added some features from OptiFine
This commit is contained in:
791
sources/main/java/jdk_internal/icu/text/NormalizerBase.java
Normal file
791
sources/main/java/jdk_internal/icu/text/NormalizerBase.java
Normal file
@ -0,0 +1,791 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2000-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package jdk_internal.icu.text;
|
||||
|
||||
import jdk_internal.bidi.CharacterIterator;
|
||||
import jdk_internal.bidi.Normalizer;
|
||||
import jdk_internal.icu.impl.Norm2AllModes;
|
||||
|
||||
/**
|
||||
* Unicode Normalization
|
||||
*
|
||||
* <h2>Unicode normalization API</h2>
|
||||
*
|
||||
* <code>normalize</code> transforms Unicode text into an equivalent composed or
|
||||
* decomposed form, allowing for easier sorting and searching of text.
|
||||
* <code>normalize</code> supports the standard normalization forms described in
|
||||
* <a href="http://www.unicode.org/reports/tr15/" target="unicode"> Unicode
|
||||
* Standard Annex #15 — Unicode Normalization Forms</a>.
|
||||
*
|
||||
* Characters with accents or other adornments can be encoded in several
|
||||
* different ways in Unicode. For example, take the character A-acute. In
|
||||
* Unicode, this can be encoded as a single character (the "composed" form):
|
||||
*
|
||||
* <pre>
|
||||
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
|
||||
* </pre>
|
||||
*
|
||||
* or as two separate characters (the "decomposed" form):
|
||||
*
|
||||
* <pre>
|
||||
* 0041 LATIN CAPITAL LETTER A
|
||||
* 0301 COMBINING ACUTE ACCENT
|
||||
* </pre>
|
||||
*
|
||||
* To a user of your program, however, both of these sequences should be treated
|
||||
* as the same "user-level" character "A with acute accent". When you are
|
||||
* searching or comparing text, you must ensure that these two sequences are
|
||||
* treated equivalently. In addition, you must handle characters with more than
|
||||
* one accent. Sometimes the order of a character's combining accents is
|
||||
* significant, while in other cases accent sequences in different orders are
|
||||
* really equivalent.
|
||||
*
|
||||
* Similarly, the string "ffi" can be encoded as three separate letters:
|
||||
*
|
||||
* <pre>
|
||||
* 0066 LATIN SMALL LETTER F
|
||||
* 0066 LATIN SMALL LETTER F
|
||||
* 0069 LATIN SMALL LETTER I
|
||||
* </pre>
|
||||
*
|
||||
* or as the single character
|
||||
*
|
||||
* <pre>
|
||||
* FB03 LATIN SMALL LIGATURE FFI
|
||||
* </pre>
|
||||
*
|
||||
* The ffi ligature is not a distinct semantic character, and strictly speaking
|
||||
* it shouldn't be in Unicode at all, but it was included for compatibility with
|
||||
* existing character sets that already provided it. The Unicode standard
|
||||
* identifies such characters by giving them "compatibility" decompositions into
|
||||
* the corresponding semantic characters. When sorting and searching, you will
|
||||
* often want to use these mappings.
|
||||
*
|
||||
* <code>normalize</code> helps solve these problems by transforming text into
|
||||
* the canonical composed and decomposed forms as shown in the first example
|
||||
* above. In addition, you can have it perform compatibility decompositions so
|
||||
* that you can treat compatibility characters the same as their equivalents.
|
||||
* Finally, <code>normalize</code> rearranges accents into the proper canonical
|
||||
* order, so that you do not have to worry about accent rearrangement on your
|
||||
* own.
|
||||
*
|
||||
* Form FCD, "Fast C or D", is also designed for collation. It lets an
* algorithm that works under "canonical closure" (such as collation), i.e.,
* one that treats precomposed characters and their decomposed equivalents the
* same, operate on strings that are not necessarily normalized.
|
||||
*
|
||||
* It is not a normalization form because it does not provide for uniqueness of
|
||||
* representation. Multiple strings may be canonically equivalent (their NFDs
|
||||
* are identical) and may all conform to FCD without being identical themselves.
|
||||
*
|
||||
* The form is defined such that the "raw decomposition", the recursive
|
||||
* canonical decomposition of each character, results in a string that is
|
||||
* canonically ordered. This means that precomposed characters are allowed for
|
||||
* as long as their decompositions do not need canonical reordering.
|
||||
*
|
||||
* Its advantage for a process like collation is that all NFD and most NFC texts
|
||||
* - and many unnormalized texts - already conform to FCD and do not need to be
|
||||
* normalized (NFD) for such a process. The FCD quick check will return YES for
|
||||
* most strings in practice.
|
||||
*
|
||||
* normalize(FCD) may be implemented with NFD.
|
||||
*
|
||||
* For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence
|
||||
* in Applications): http://www.unicode.org/notes/tn5/#FCD
|
||||
*
|
||||
* ICU collation performs either NFD or FCD normalization automatically if
|
||||
* normalization is turned on for the collator object. Beyond collation and
|
||||
* string search, normalized strings may be useful for string equivalence
|
||||
* comparisons, transliteration/transcription, unique representations, etc.
|
||||
*
|
||||
* The W3C generally recommends exchanging text in NFC. Note also that most
|
||||
* legacy character encodings use only precomposed forms and often do not encode
|
||||
* any combining marks by themselves. For conversion to such character encodings
|
||||
* the Unicode text needs to be normalized to NFC. For more usage examples, see
|
||||
* the Unicode Standard Annex.
|
||||
*
|
||||
* Note: The Normalizer class also provides API for iterative normalization.
|
||||
* While the setIndex() and getIndex() refer to indices in the underlying
|
||||
* Unicode input text, the next() and previous() methods iterate through
|
||||
* characters in the normalized output. This means that there is not necessarily
|
||||
* a one-to-one correspondence between characters returned by next() and
|
||||
* previous() and the indices passed to and returned from setIndex() and
|
||||
* getIndex(). It is for this reason that Normalizer does not implement the
|
||||
* CharacterIterator interface.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
// Original filename in ICU4J: Normalizer.java
|
||||
public final class NormalizerBase implements Cloneable {
|
||||
|
||||
// The input text and our position in it
|
||||
private UCharacterIterator text;
|
||||
private Normalizer2 norm2;
|
||||
private Mode mode;
|
||||
private int options;
|
||||
|
||||
// The normalization buffer is the result of normalization
|
||||
// of the source in [currentIndex..nextIndex] .
|
||||
private int currentIndex;
|
||||
private int nextIndex;
|
||||
|
||||
// A buffer for holding intermediate results
|
||||
private StringBuilder buffer;
|
||||
private int bufferPos;
|
||||
|
||||
// Helper classes to defer loading of normalization data.
|
||||
private static final class ModeImpl {
|
||||
private ModeImpl(Normalizer2 n2) {
|
||||
normalizer2 = n2;
|
||||
}
|
||||
|
||||
private final Normalizer2 normalizer2;
|
||||
}
|
||||
|
||||
private static final class NFDModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
|
||||
}
|
||||
|
||||
private static final class NFKDModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
|
||||
}
|
||||
|
||||
private static final class NFCModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
|
||||
}
|
||||
|
||||
private static final class NFKCModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
|
||||
}
|
||||
|
||||
private static final class Unicode32 {
|
||||
private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
|
||||
}
|
||||
|
||||
private static final class NFD32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFDInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
private static final class NFKD32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFKDInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
private static final class NFC32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFCInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
private static final class NFKC32ModeImpl {
|
||||
private static final ModeImpl INSTANCE = new ModeImpl(
|
||||
new FilteredNormalizer2(Normalizer2.getNFKCInstance(), Unicode32.INSTANCE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Options bit set value to select Unicode 3.2 normalization (except
|
||||
* NormalizationCorrections). At most one Unicode version can be selected at a
|
||||
* time.
|
||||
*
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final int UNICODE_3_2 = 0x20;
|
||||
|
||||
public static final int UNICODE_3_2_0_ORIGINAL = UNICODE_3_2;
|
||||
|
||||
/*
|
||||
* Default option for the latest Unicode normalization. This option is provided
|
||||
* mainly for testing. The value zero means that normalization is done with the
|
||||
* fixes for - Corrigendum 4 (Five CJK Canonical Mapping Errors) - Corrigendum 5
|
||||
* (Normalization Idempotency)
|
||||
*/
|
||||
public static final int UNICODE_LATEST = 0x00;
|
||||
|
||||
/**
|
||||
* Constant indicating that the end of the iteration has been reached. This is
|
||||
* guaranteed to have the same value as {@link UCharacterIterator#DONE}.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final int DONE = UCharacterIterator.DONE;
|
||||
|
||||
/**
|
||||
* Constants for normalization modes.
|
||||
* <p>
|
||||
* The Mode class is not intended for public subclassing. Only the Mode
|
||||
* constants provided by the Normalizer class should be used, and any fields or
|
||||
* methods should not be called or overridden by users.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public abstract static class Mode {
|
||||
|
||||
/**
|
||||
* Sole constructor
|
||||
*
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected Mode() {
|
||||
}
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected abstract Normalizer2 getNormalizer2(int options);
|
||||
}
|
||||
|
||||
private static Mode toMode(Normalizer.Form form) {
|
||||
switch (form) {
|
||||
case NFC:
|
||||
return NFC;
|
||||
case NFD:
|
||||
return NFD;
|
||||
case NFKC:
|
||||
return NFKC;
|
||||
case NFKD:
|
||||
return NFKD;
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("Unexpected normalization form: " + form);
|
||||
}
|
||||
|
||||
private static final class NONEMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return Norm2AllModes.NOOP_NORMALIZER2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFDMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFKDMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFKD32ModeImpl.INSTANCE.normalizer2
|
||||
: NFKDModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFCMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class NFKCMode extends Mode {
|
||||
protected Normalizer2 getNormalizer2(int options) {
|
||||
return (options & UNICODE_3_2) != 0 ? NFKC32ModeImpl.INSTANCE.normalizer2
|
||||
: NFKCModeImpl.INSTANCE.normalizer2;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* No decomposition/composition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NONE = new NONEMode();
|
||||
|
||||
/**
|
||||
* Canonical decomposition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NFD = new NFDMode();
|
||||
|
||||
/**
|
||||
* Compatibility decomposition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NFKD = new NFKDMode();
|
||||
|
||||
/**
|
||||
* Canonical decomposition followed by canonical composition.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final Mode NFC = new NFCMode();
|
||||
|
||||
public static final Mode NFKC = new NFKCMode();
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Iterator constructors
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
 * Builds a {@code NormalizerBase} that iterates over the normalized form of a
 * string.
 * <p>
 * {@code opt} chooses which optional {@code NormalizerBase} features are
 * switched on for the new object.
 *
 * @param str  the string to normalize; iteration starts at its beginning
 * @param mode the normalization mode
 * @param opt  optional features to enable. {@link #UNICODE_3_2} is currently
 *             the only option; pass 0 for the default behavior of the standard
 *             Unicode Normalization Forms.
 * @stable ICU 2.6
 */
public NormalizerBase(String str, Mode mode, int opt) {
    // Create the iterator first so a failure here surfaces before anything else.
    text = UCharacterIterator.getInstance(str);
    buffer = new StringBuilder();
    options = opt;
    this.mode = mode;
    norm2 = mode.getNormalizer2(opt);
}
|
||||
|
||||
/**
 * Builds a {@code NormalizerBase} over a string with no options set; this is
 * the three-argument constructor called with {@code opt} equal to 0.
 *
 * @param str  the string to normalize
 * @param mode the normalization mode
 */
public NormalizerBase(String str, Mode mode) {
    this(str, mode, 0);
}
|
||||
|
||||
/**
 * Builds a {@code NormalizerBase} that iterates over the normalized form of
 * the given text. The iterator passed in is cloned, and the clone is what this
 * object works on.
 *
 * @param iter the input text to normalize; iteration starts at its beginning
 * @param mode the normalization mode
 * @param opt  optional features to enable. {@link #UNICODE_3_2} is currently
 *             the only option; pass 0 for the default behavior of the standard
 *             Unicode Normalization Forms.
 * @stable ICU 2.6
 */
public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
    CharacterIterator privateCopy = (CharacterIterator) iter.clone();
    text = UCharacterIterator.getInstance(privateCopy);
    buffer = new StringBuilder();
    options = opt;
    this.mode = mode;
    norm2 = mode.getNormalizer2(opt);
}
|
||||
|
||||
/**
 * Builds a {@code NormalizerBase} over the given text with no options set;
 * this is the three-argument constructor called with {@code opt} equal to 0.
 *
 * @param iter the input text to normalize
 * @param mode the normalization mode
 */
public NormalizerBase(CharacterIterator iter, Mode mode) {
    this(iter, mode, 0);
}
|
||||
|
||||
/**
|
||||
* Clones this {@code NormalizerBase} object. All properties of this object are
|
||||
* duplicated in the new object, including the cloning of any
|
||||
* {@link CharacterIterator} that was passed in to the constructor or to
|
||||
* {@link #setText(CharacterIterator) setText}. However, the text storage
|
||||
* underlying the {@code CharacterIterator} is not duplicated unless the
|
||||
* iterator's {@code clone} method does so.
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public Object clone() {
|
||||
try {
|
||||
NormalizerBase copy = (NormalizerBase) super.clone();
|
||||
copy.text = (UCharacterIterator) text.clone();
|
||||
copy.mode = mode;
|
||||
copy.options = options;
|
||||
copy.norm2 = norm2;
|
||||
copy.buffer = new StringBuilder(buffer);
|
||||
copy.bufferPos = bufferPos;
|
||||
copy.currentIndex = currentIndex;
|
||||
copy.nextIndex = nextIndex;
|
||||
return copy;
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new InternalError(e.toString(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes a {@code String} using the given normalization operation.
|
||||
* <p>
|
||||
* The {@code options} parameter specifies which optional {@code NormalizerBase}
|
||||
* features are to be enabled for this operation. Currently the only available
|
||||
* option is {@link #UNICODE_3_2}. If you want the default behavior
|
||||
* corresponding to one of the standard Unicode Normalization Forms, use 0 for
|
||||
* this argument.
|
||||
* <p>
|
||||
*
|
||||
* @param str the input string to be normalized.
|
||||
* @param mode the normalization mode
|
||||
* @param options the optional features to be enabled.
|
||||
* @return String the normalized string
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static String normalize(String str, Mode mode, int options) {
|
||||
return mode.getNormalizer2(options).normalize(str);
|
||||
}
|
||||
|
||||
public static String normalize(String str, Normalizer.Form form) {
|
||||
return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST);
|
||||
}
|
||||
|
||||
public static String normalize(String str, Normalizer.Form form, int options) {
|
||||
return NormalizerBase.normalize(str, toMode(form), options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test if a string is in a given normalization form. This is semantically
|
||||
* equivalent to source.equals(normalize(source, mode)).
|
||||
*
|
||||
* Unlike quickCheck(), this function returns a definitive result, never a
|
||||
* "maybe". For NFD, NFKD, and FCD, both functions work exactly the same. For
|
||||
* NFC and NFKC where quickCheck may return "maybe", this function will perform
|
||||
* further tests to arrive at a true/false result.
|
||||
*
|
||||
* @param str the input string to be checked to see if it is normalized
|
||||
* @param mode the normalization mode
|
||||
* @param options Options for use with exclusion set and tailored Normalization
|
||||
* The only option that is currently recognized is UNICODE_3_2
|
||||
* @see #isNormalized
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static boolean isNormalized(String str, Mode mode, int options) {
|
||||
return mode.getNormalizer2(options).isNormalized(str);
|
||||
}
|
||||
|
||||
public static boolean isNormalized(String str, Normalizer.Form form) {
|
||||
return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST);
|
||||
}
|
||||
|
||||
public static boolean isNormalized(String str, Normalizer.Form form, int options) {
|
||||
return NormalizerBase.isNormalized(str, toMode(form), options);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Iteration API
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return the current character in the normalized text.
|
||||
*
|
||||
* @return The codepoint as an int
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int current() {
|
||||
if (bufferPos < buffer.length() || nextNormalize()) {
|
||||
return buffer.codePointAt(bufferPos);
|
||||
} else {
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the next character in the normalized text and advance the iteration
|
||||
* position by one. If the end of the text has already been reached,
|
||||
* {@link #DONE} is returned.
|
||||
*
|
||||
* @return The codepoint as an int
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int next() {
|
||||
if (bufferPos < buffer.length() || nextNormalize()) {
|
||||
int c = buffer.codePointAt(bufferPos);
|
||||
bufferPos += Character.charCount(c);
|
||||
return c;
|
||||
} else {
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the previous character in the normalized text and decrement the
|
||||
* iteration position by one. If the beginning of the text has already been
|
||||
* reached, {@link #DONE} is returned.
|
||||
*
|
||||
* @return The codepoint as an int
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int previous() {
|
||||
if (bufferPos > 0 || previousNormalize()) {
|
||||
int c = buffer.codePointBefore(bufferPos);
|
||||
bufferPos -= Character.charCount(c);
|
||||
return c;
|
||||
} else {
|
||||
return DONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset the index to the beginning of the text. This is equivalent to
|
||||
* setIndexOnly(startIndex)).
|
||||
*
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void reset() {
|
||||
text.setIndex(0);
|
||||
currentIndex = nextIndex = 0;
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the iteration position in the input text that is being normalized,
|
||||
* without any immediate normalization. After setIndexOnly(), getIndex() will
|
||||
* return the same index that is specified here.
|
||||
*
|
||||
* @param index the desired index in the input text.
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setIndexOnly(int index) {
|
||||
text.setIndex(index); // validates index
|
||||
currentIndex = nextIndex = index;
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the iteration position in the input text that is being normalized and
|
||||
* return the first normalized character at that position.
|
||||
* <p>
|
||||
* <b>Note:</b> This method sets the position in the <em>input</em> text, while
|
||||
* {@link #next} and {@link #previous} iterate through characters in the
|
||||
* normalized <em>output</em>. This means that there is not necessarily a
|
||||
* one-to-one correspondence between characters returned by {@code next} and
|
||||
* {@code previous} and the indices passed to and returned from {@code setIndex}
|
||||
* and {@link #getIndex}.
|
||||
* <p>
|
||||
*
|
||||
* @param index the desired index in the input text.
|
||||
*
|
||||
* @return the first normalized character that is the result of iterating
|
||||
* forward starting at the given index.
|
||||
*
|
||||
* @throws IllegalArgumentException if the given index is less than
|
||||
* {@link #getBeginIndex} or greater than
|
||||
* {@link #getEndIndex}. deprecated ICU 3.2
|
||||
* @obsolete ICU 3.2
|
||||
*/
|
||||
public int setIndex(int index) {
|
||||
setIndexOnly(index);
|
||||
return current();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the start of the input text. This is the begin index of
|
||||
* the {@code CharacterIterator} or the start (i.e. 0) of the {@code String}
|
||||
* over which this {@code NormalizerBase} is iterating
|
||||
*
|
||||
* @deprecated ICU 2.2. Use startIndex() instead.
|
||||
* @return The codepoint as an int
|
||||
* @see #startIndex
|
||||
*/
|
||||
@Deprecated
|
||||
public int getBeginIndex() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the end of the input text. This is the end index of the
|
||||
* {@code CharacterIterator} or the length of the {@code String} over which this
|
||||
* {@code NormalizerBase} is iterating
|
||||
*
|
||||
* @deprecated ICU 2.2. Use endIndex() instead.
|
||||
* @return The codepoint as an int
|
||||
* @see #endIndex
|
||||
*/
|
||||
@Deprecated
|
||||
public int getEndIndex() {
|
||||
return endIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the current iteration position in the input text that is being
|
||||
* normalized. This method is useful in applications such as searching, where
|
||||
* you need to be able to determine the position in the input text that
|
||||
* corresponds to a given normalized output character.
|
||||
* <p>
|
||||
* <b>Note:</b> This method sets the position in the <em>input</em>, while
|
||||
* {@link #next} and {@link #previous} iterate through characters in the
|
||||
* <em>output</em>. This means that there is not necessarily a one-to-one
|
||||
* correspondence between characters returned by {@code next} and
|
||||
* {@code previous} and the indices passed to and returned from {@code setIndex}
|
||||
* and {@link #getIndex}.
|
||||
*
|
||||
* @return The current iteration position
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int getIndex() {
|
||||
if (bufferPos < buffer.length()) {
|
||||
return currentIndex;
|
||||
} else {
|
||||
return nextIndex;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the end of the input text. This is the end index of the
|
||||
* {@code CharacterIterator} or the length of the {@code String} over which this
|
||||
* {@code NormalizerBase} is iterating
|
||||
*
|
||||
* @return The current iteration position
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public int endIndex() {
|
||||
return text.getLength();
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Iterator attributes
|
||||
// -------------------------------------------------------------------------
|
||||
/**
|
||||
* Set the normalization mode for this object.
|
||||
* <p>
|
||||
* <b>Note:</b>If the normalization mode is changed while iterating over a
|
||||
* string, calls to {@link #next} and {@link #previous} may return previously
|
||||
* buffers characters in the old normalization mode until the iteration is able
|
||||
* to re-sync at the next base character. It is safest to call {@link #setText
|
||||
* setText()}, {@link #first}, {@link #last}, etc. after calling
|
||||
* {@code setMode}.
|
||||
* <p>
|
||||
*
|
||||
* @param newMode the new mode for this {@code NormalizerBase}. The supported
|
||||
* modes are:
|
||||
* <ul>
|
||||
* <li>{@link #NFC} - Unicode canonical decompositiion followed
|
||||
* by canonical composition.
|
||||
* <li>{@link #NFKC} - Unicode compatibility decompositiion
|
||||
* follwed by canonical composition.
|
||||
* <li>{@link #NFD} - Unicode canonical decomposition
|
||||
* <li>{@link #NFKD} - Unicode compatibility decomposition.
|
||||
* <li>{@link #NONE} - Do nothing but return characters from the
|
||||
* underlying input text.
|
||||
* </ul>
|
||||
*
|
||||
* @see #getMode
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setMode(Mode newMode) {
|
||||
mode = newMode;
|
||||
norm2 = mode.getNormalizer2(options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the basic operation performed by this {@code NormalizerBase}
|
||||
*
|
||||
* @see #setMode
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public Mode getMode() {
|
||||
return mode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the input text over which this {@code NormalizerBase} will iterate. The
|
||||
* iteration position is set to the beginning of the input text.
|
||||
*
|
||||
* @param newText The new string to be normalized.
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setText(String newText) {
|
||||
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
|
||||
if (newIter == null) {
|
||||
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
||||
}
|
||||
text = newIter;
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the input text over which this {@code NormalizerBase} will iterate. The
|
||||
* iteration position is set to the beginning of the input text.
|
||||
*
|
||||
* @param newText The new string to be normalized.
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public void setText(CharacterIterator newText) {
|
||||
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
|
||||
if (newIter == null) {
|
||||
throw new IllegalStateException("Could not create a new UCharacterIterator");
|
||||
}
|
||||
text = newIter;
|
||||
currentIndex = nextIndex = 0;
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
private void clearBuffer() {
|
||||
buffer.setLength(0);
|
||||
bufferPos = 0;
|
||||
}
|
||||
|
||||
private boolean nextNormalize() {
|
||||
clearBuffer();
|
||||
currentIndex = nextIndex;
|
||||
text.setIndex(nextIndex);
|
||||
// Skip at least one character so we make progress.
|
||||
int c = text.nextCodePoint();
|
||||
if (c < 0) {
|
||||
return false;
|
||||
}
|
||||
StringBuilder segment = new StringBuilder().appendCodePoint(c);
|
||||
while ((c = text.nextCodePoint()) >= 0) {
|
||||
if (norm2.hasBoundaryBefore(c)) {
|
||||
text.moveCodePointIndex(-1);
|
||||
break;
|
||||
}
|
||||
segment.appendCodePoint(c);
|
||||
}
|
||||
nextIndex = text.getIndex();
|
||||
norm2.normalize(segment, buffer);
|
||||
return buffer.length() != 0;
|
||||
}
|
||||
|
||||
private boolean previousNormalize() {
|
||||
clearBuffer();
|
||||
nextIndex = currentIndex;
|
||||
text.setIndex(currentIndex);
|
||||
StringBuilder segment = new StringBuilder();
|
||||
int c;
|
||||
while ((c = text.previousCodePoint()) >= 0) {
|
||||
if (c <= 0xffff) {
|
||||
segment.insert(0, (char) c);
|
||||
} else {
|
||||
segment.insert(0, Character.toChars(c));
|
||||
}
|
||||
if (norm2.hasBoundaryBefore(c)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
currentIndex = text.getIndex();
|
||||
norm2.normalize(segment, buffer);
|
||||
bufferPos = buffer.length();
|
||||
return buffer.length() != 0;
|
||||
}
|
||||
|
||||
}
|
Reference in New Issue
Block a user