/* * Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* ******************************************************************************* * Copyright (C) 2000-2014, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ package jdk_internal.icu.text; import jdk_internal.bidi.CharacterIterator; import jdk_internal.bidi.Normalizer; import jdk_internal.icu.impl.Norm2AllModes; /** * Unicode Normalization * *

Unicode normalization API

 *
 * {@code normalize} transforms Unicode text into an equivalent composed or
 * decomposed form, allowing for easier sorting and searching of text.
 * {@code normalize} supports the standard normalization forms described in
 * <a href="https://www.unicode.org/reports/tr15/">Unicode Standard Annex #15
 * — Unicode Normalization Forms</a>.
 *
 * <p>Characters with accents or other adornments can be encoded in several
 * different ways in Unicode. For example, take the character A-acute. In
 * Unicode, this can be encoded as a single character (the "composed" form):
 *
 *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
 * 
* * or as two separate characters (the "decomposed" form): * *
 *      0041    LATIN CAPITAL LETTER A
 *      0301    COMBINING ACUTE ACCENT
 * 
* * To a user of your program, however, both of these sequences should be treated * as the same "user-level" character "A with acute accent". When you are * searching or comparing text, you must ensure that these two sequences are * treated equivalently. In addition, you must handle characters with more than * one accent. Sometimes the order of a character's combining accents is * significant, while in other cases accent sequences in different orders are * really equivalent. * * Similarly, the string "ffi" can be encoded as three separate letters: * *
 *      0066    LATIN SMALL LETTER F
 *      0066    LATIN SMALL LETTER F
 *      0069    LATIN SMALL LETTER I
 * 
* * or as the single character * *
 *      FB03    LATIN SMALL LIGATURE FFI
 * 
 *
 * The ffi ligature is not a distinct semantic character, and strictly speaking
 * it shouldn't be in Unicode at all, but it was included for compatibility with
 * existing character sets that already provided it. The Unicode standard
 * identifies such characters by giving them "compatibility" decompositions into
 * the corresponding semantic characters. When sorting and searching, you will
 * often want to use these mappings.
 *
 * <p>{@code normalize} helps solve these problems by transforming text into
 * the canonical composed and decomposed forms as shown in the first example
 * above. In addition, you can have it perform compatibility decompositions so
 * that you can treat compatibility characters the same as their equivalents.
 * Finally, {@code normalize} rearranges accents into the proper canonical
 * order, so that you do not have to worry about accent rearrangement on your
 * own.
 *
 * <p>Form FCD, "Fast C or D", is also designed for collation. It makes it
 * possible to work on strings that are not necessarily normalized, using an
 * algorithm (as in collation) that works under "canonical closure", i.e., one
 * that treats precomposed characters and their decomposed equivalents the same.
 *
 * <p>It is not a normalization form because it does not provide for uniqueness of
 * representation. Multiple strings may be canonically equivalent (their NFDs
 * are identical) and may all conform to FCD without being identical themselves.
 *
 * <p>The form is defined such that the "raw decomposition", the recursive
 * canonical decomposition of each character, results in a string that is
 * canonically ordered. This means that precomposed characters are allowed
 * as long as their decompositions do not need canonical reordering.
 *
 * <p>Its advantage for a process like collation is that all NFD and most NFC texts
 * - and many unnormalized texts - already conform to FCD and do not need to be
 * normalized (NFD) for such a process. The FCD quick check will return YES for
 * most strings in practice.
* * normalize(FCD) may be implemented with NFD. * * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence * in Applications): http://www.unicode.org/notes/tn5/#FCD * * ICU collation performs either NFD or FCD normalization automatically if * normalization is turned on for the collator object. Beyond collation and * string search, normalized strings may be useful for string equivalence * comparisons, transliteration/transcription, unique representations, etc. * * The W3C generally recommends to exchange texts in NFC. Note also that most * legacy character encodings use only precomposed forms and often do not encode * any combining marks by themselves. For conversion to such character encodings * the Unicode text needs to be normalized to NFC. For more usage examples, see * the Unicode Standard Annex. * * Note: The Normalizer class also provides API for iterative normalization. * While the setIndex() and getIndex() refer to indices in the underlying * Unicode input text, the next() and previous() methods iterate through * characters in the normalized output. This means that there is not necessarily * a one-to-one correspondence between characters returned by next() and * previous() and the indices passed to and returned from setIndex() and * getIndex(). It is for this reason that Normalizer does not implement the * CharacterIterator interface. * * @stable ICU 2.8 */ // Original filename in ICU4J: Normalizer.java public final class NormalizerBase implements Cloneable { // The input text and our position in it private UCharacterIterator text; private Normalizer2 norm2; private Mode mode; private int options; // The normalization buffer is the result of normalization // of the source in [currentIndex..nextIndex] . private int currentIndex; private int nextIndex; // A buffer for holding intermediate results private StringBuilder buffer; private int bufferPos; // Helper classes to defer loading of normalization data. 
private static final class ModeImpl { private ModeImpl(Normalizer2 n2) { normalizer2 = n2; } private final Normalizer2 normalizer2; } private static final class NFDModeImpl { private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); } private static final class NFKDModeImpl { private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); } private static final class NFCModeImpl { private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); } private static final class NFKCModeImpl { private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); } private static final class Unicode32 { private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); } private static final class NFD32ModeImpl { private static final ModeImpl INSTANCE = new ModeImpl( new FilteredNormalizer2(Normalizer2.getNFDInstance(), Unicode32.INSTANCE)); } private static final class NFKD32ModeImpl { private static final ModeImpl INSTANCE = new ModeImpl( new FilteredNormalizer2(Normalizer2.getNFKDInstance(), Unicode32.INSTANCE)); } private static final class NFC32ModeImpl { private static final ModeImpl INSTANCE = new ModeImpl( new FilteredNormalizer2(Normalizer2.getNFCInstance(), Unicode32.INSTANCE)); } private static final class NFKC32ModeImpl { private static final ModeImpl INSTANCE = new ModeImpl( new FilteredNormalizer2(Normalizer2.getNFKCInstance(), Unicode32.INSTANCE)); } /** * Options bit set value to select Unicode 3.2 normalization (except * NormalizationCorrections). At most one Unicode version can be selected at a * time. * * @stable ICU 2.6 */ public static final int UNICODE_3_2 = 0x20; public static final int UNICODE_3_2_0_ORIGINAL = UNICODE_3_2; /* * Default option for the latest Unicode normalization. This option is provided * mainly for testing. 
The value zero means that normalization is done with the * fixes for - Corrigendum 4 (Five CJK Canonical Mapping Errors) - Corrigendum 5 * (Normalization Idempotency) */ public static final int UNICODE_LATEST = 0x00; /** * Constant indicating that the end of the iteration has been reached. This is * guaranteed to have the same value as {@link UCharacterIterator#DONE}. * * @stable ICU 2.8 */ public static final int DONE = UCharacterIterator.DONE; /** * Constants for normalization modes. *

* The Mode class is not intended for public subclassing. Only the Mode * constants provided by the Normalizer class should be used, and any fields or * methods should not be called or overridden by users. * * @stable ICU 2.8 */ public abstract static class Mode { /** * Sole constructor * * @internal * @deprecated This API is ICU internal only. */ @Deprecated protected Mode() { } /** * @internal * @deprecated This API is ICU internal only. */ @Deprecated protected abstract Normalizer2 getNormalizer2(int options); } private static Mode toMode(Normalizer.Form form) { switch (form) { case NFC: return NFC; case NFD: return NFD; case NFKC: return NFKC; case NFKD: return NFKD; } throw new IllegalArgumentException("Unexpected normalization form: " + form); } private static final class NONEMode extends Mode { protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } } private static final class NFDMode extends Mode { protected Normalizer2 getNormalizer2(int options) { return (options & UNICODE_3_2) != 0 ? NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2; } } private static final class NFKDMode extends Mode { protected Normalizer2 getNormalizer2(int options) { return (options & UNICODE_3_2) != 0 ? NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2; } } private static final class NFCMode extends Mode { protected Normalizer2 getNormalizer2(int options) { return (options & UNICODE_3_2) != 0 ? NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2; } } private static final class NFKCMode extends Mode { protected Normalizer2 getNormalizer2(int options) { return (options & UNICODE_3_2) != 0 ? NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2; } } /** * No decomposition/composition. * * @stable ICU 2.8 */ public static final Mode NONE = new NONEMode(); /** * Canonical decomposition. 
* * @stable ICU 2.8 */ public static final Mode NFD = new NFDMode(); /** * Compatibility decomposition. * * @stable ICU 2.8 */ public static final Mode NFKD = new NFKDMode(); /** * Canonical decomposition followed by canonical composition. * * @stable ICU 2.8 */ public static final Mode NFC = new NFCMode(); public static final Mode NFKC = new NFKCMode(); // ------------------------------------------------------------------------- // Iterator constructors // ------------------------------------------------------------------------- /** * Creates a new {@code NormalizerBase} object for iterating over the normalized * form of a given string. *

* The {@code options} parameter specifies which optional {@code NormalizerBase} * features are to be enabled for this object. *

* * @param str The string to be normalized. The normalization will start at the * beginning of the string. * * @param mode The normalization mode. * * @param opt Any optional features to be enabled. Currently the only available * option is {@link #UNICODE_3_2}. If you want the default behavior * corresponding to one of the standard Unicode Normalization Forms, * use 0 for this argument. * @stable ICU 2.6 */ public NormalizerBase(String str, Mode mode, int opt) { this.text = UCharacterIterator.getInstance(str); this.mode = mode; this.options = opt; norm2 = mode.getNormalizer2(opt); buffer = new StringBuilder(); } public NormalizerBase(String str, Mode mode) { this(str, mode, 0); } /** * Creates a new {@code NormalizerBase} object for iterating over the normalized * form of the given text. *

* * @param iter The input text to be normalized. The normalization will start at * the beginning of the string. * * @param mode The normalization mode. * * @param opt Any optional features to be enabled. Currently the only available * option is {@link #UNICODE_3_2}. If you want the default behavior * corresponding to one of the standard Unicode Normalization Forms, * use 0 for this argument. * @stable ICU 2.6 */ public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { this.text = UCharacterIterator.getInstance((CharacterIterator) iter.clone()); this.mode = mode; this.options = opt; norm2 = mode.getNormalizer2(opt); buffer = new StringBuilder(); } public NormalizerBase(CharacterIterator iter, Mode mode) { this(iter, mode, 0); } /** * Clones this {@code NormalizerBase} object. All properties of this object are * duplicated in the new object, including the cloning of any * {@link CharacterIterator} that was passed in to the constructor or to * {@link #setText(CharacterIterator) setText}. However, the text storage * underlying the {@code CharacterIterator} is not duplicated unless the * iterator's {@code clone} method does so. * * @stable ICU 2.8 */ public Object clone() { try { NormalizerBase copy = (NormalizerBase) super.clone(); copy.text = (UCharacterIterator) text.clone(); copy.mode = mode; copy.options = options; copy.norm2 = norm2; copy.buffer = new StringBuilder(buffer); copy.bufferPos = bufferPos; copy.currentIndex = currentIndex; copy.nextIndex = nextIndex; return copy; } catch (CloneNotSupportedException e) { throw new InternalError(e.toString(), e); } } /** * Normalizes a {@code String} using the given normalization operation. *

* The {@code options} parameter specifies which optional {@code NormalizerBase} * features are to be enabled for this operation. Currently the only available * option is {@link #UNICODE_3_2}. If you want the default behavior * corresponding to one of the standard Unicode Normalization Forms, use 0 for * this argument. *

* * @param str the input string to be normalized. * @param mode the normalization mode * @param options the optional features to be enabled. * @return String the normalized string * @stable ICU 2.6 */ public static String normalize(String str, Mode mode, int options) { return mode.getNormalizer2(options).normalize(str); } public static String normalize(String str, Normalizer.Form form) { return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST); } public static String normalize(String str, Normalizer.Form form, int options) { return NormalizerBase.normalize(str, toMode(form), options); } /** * Test if a string is in a given normalization form. This is semantically * equivalent to source.equals(normalize(source, mode)). * * Unlike quickCheck(), this function returns a definitive result, never a * "maybe". For NFD, NFKD, and FCD, both functions work exactly the same. For * NFC and NFKC where quickCheck may return "maybe", this function will perform * further tests to arrive at a true/false result. * * @param str the input string to be checked to see if it is normalized * @param mode the normalization mode * @param options Options for use with exclusion set and tailored Normalization * The only option that is currently recognized is UNICODE_3_2 * @see #isNormalized * @stable ICU 2.6 */ public static boolean isNormalized(String str, Mode mode, int options) { return mode.getNormalizer2(options).isNormalized(str); } public static boolean isNormalized(String str, Normalizer.Form form) { return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST); } public static boolean isNormalized(String str, Normalizer.Form form, int options) { return NormalizerBase.isNormalized(str, toMode(form), options); } // ------------------------------------------------------------------------- // Iteration API // ------------------------------------------------------------------------- /** * Return the current character in the normalized text. 
* * @return The codepoint as an int * @stable ICU 2.8 */ public int current() { if (bufferPos < buffer.length() || nextNormalize()) { return buffer.codePointAt(bufferPos); } else { return DONE; } } /** * Return the next character in the normalized text and advance the iteration * position by one. If the end of the text has already been reached, * {@link #DONE} is returned. * * @return The codepoint as an int * @stable ICU 2.8 */ public int next() { if (bufferPos < buffer.length() || nextNormalize()) { int c = buffer.codePointAt(bufferPos); bufferPos += Character.charCount(c); return c; } else { return DONE; } } /** * Return the previous character in the normalized text and decrement the * iteration position by one. If the beginning of the text has already been * reached, {@link #DONE} is returned. * * @return The codepoint as an int * @stable ICU 2.8 */ public int previous() { if (bufferPos > 0 || previousNormalize()) { int c = buffer.codePointBefore(bufferPos); bufferPos -= Character.charCount(c); return c; } else { return DONE; } } /** * Reset the index to the beginning of the text. This is equivalent to * setIndexOnly(startIndex)). * * @stable ICU 2.8 */ public void reset() { text.setIndex(0); currentIndex = nextIndex = 0; clearBuffer(); } /** * Set the iteration position in the input text that is being normalized, * without any immediate normalization. After setIndexOnly(), getIndex() will * return the same index that is specified here. * * @param index the desired index in the input text. * @stable ICU 2.8 */ public void setIndexOnly(int index) { text.setIndex(index); // validates index currentIndex = nextIndex = index; clearBuffer(); } /** * Set the iteration position in the input text that is being normalized and * return the first normalized character at that position. *

* Note: This method sets the position in the input text, while * {@link #next} and {@link #previous} iterate through characters in the * normalized output. This means that there is not necessarily a * one-to-one correspondence between characters returned by {@code next} and * {@code previous} and the indices passed to and returned from {@code setIndex} * and {@link #getIndex}. *

* * @param index the desired index in the input text. * * @return the first normalized character that is the result of iterating * forward starting at the given index. * * @throws IllegalArgumentException if the given index is less than * {@link #getBeginIndex} or greater than * {@link #getEndIndex}. deprecated ICU 3.2 * @obsolete ICU 3.2 */ public int setIndex(int index) { setIndexOnly(index); return current(); } /** * Retrieve the index of the start of the input text. This is the begin index of * the {@code CharacterIterator} or the start (i.e. 0) of the {@code String} * over which this {@code NormalizerBase} is iterating * * @deprecated ICU 2.2. Use startIndex() instead. * @return The codepoint as an int * @see #startIndex */ @Deprecated public int getBeginIndex() { return 0; } /** * Retrieve the index of the end of the input text. This is the end index of the * {@code CharacterIterator} or the length of the {@code String} over which this * {@code NormalizerBase} is iterating * * @deprecated ICU 2.2. Use endIndex() instead. * @return The codepoint as an int * @see #endIndex */ @Deprecated public int getEndIndex() { return endIndex(); } /** * Retrieve the current iteration position in the input text that is being * normalized. This method is useful in applications such as searching, where * you need to be able to determine the position in the input text that * corresponds to a given normalized output character. *

* Note: This method sets the position in the input, while * {@link #next} and {@link #previous} iterate through characters in the * output. This means that there is not necessarily a one-to-one * correspondence between characters returned by {@code next} and * {@code previous} and the indices passed to and returned from {@code setIndex} * and {@link #getIndex}. * * @return The current iteration position * @stable ICU 2.8 */ public int getIndex() { if (bufferPos < buffer.length()) { return currentIndex; } else { return nextIndex; } } /** * Retrieve the index of the end of the input text. This is the end index of the * {@code CharacterIterator} or the length of the {@code String} over which this * {@code NormalizerBase} is iterating * * @return The current iteration position * @stable ICU 2.8 */ public int endIndex() { return text.getLength(); } // ------------------------------------------------------------------------- // Iterator attributes // ------------------------------------------------------------------------- /** * Set the normalization mode for this object. *

* Note:If the normalization mode is changed while iterating over a * string, calls to {@link #next} and {@link #previous} may return previously * buffers characters in the old normalization mode until the iteration is able * to re-sync at the next base character. It is safest to call {@link #setText * setText()}, {@link #first}, {@link #last}, etc. after calling * {@code setMode}. *

* * @param newMode the new mode for this {@code NormalizerBase}. The supported * modes are: *

* * @see #getMode * @stable ICU 2.8 */ public void setMode(Mode newMode) { mode = newMode; norm2 = mode.getNormalizer2(options); } /** * Return the basic operation performed by this {@code NormalizerBase} * * @see #setMode * @stable ICU 2.8 */ public Mode getMode() { return mode; } /** * Set the input text over which this {@code NormalizerBase} will iterate. The * iteration position is set to the beginning of the input text. * * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(String newText) { UCharacterIterator newIter = UCharacterIterator.getInstance(newText); if (newIter == null) { throw new IllegalStateException("Could not create a new UCharacterIterator"); } text = newIter; reset(); } /** * Set the input text over which this {@code NormalizerBase} will iterate. The * iteration position is set to the beginning of the input text. * * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(CharacterIterator newText) { UCharacterIterator newIter = UCharacterIterator.getInstance(newText); if (newIter == null) { throw new IllegalStateException("Could not create a new UCharacterIterator"); } text = newIter; currentIndex = nextIndex = 0; clearBuffer(); } private void clearBuffer() { buffer.setLength(0); bufferPos = 0; } private boolean nextNormalize() { clearBuffer(); currentIndex = nextIndex; text.setIndex(nextIndex); // Skip at least one character so we make progress. 
int c = text.nextCodePoint(); if (c < 0) { return false; } StringBuilder segment = new StringBuilder().appendCodePoint(c); while ((c = text.nextCodePoint()) >= 0) { if (norm2.hasBoundaryBefore(c)) { text.moveCodePointIndex(-1); break; } segment.appendCodePoint(c); } nextIndex = text.getIndex(); norm2.normalize(segment, buffer); return buffer.length() != 0; } private boolean previousNormalize() { clearBuffer(); nextIndex = currentIndex; text.setIndex(currentIndex); StringBuilder segment = new StringBuilder(); int c; while ((c = text.previousCodePoint()) >= 0) { if (c <= 0xffff) { segment.insert(0, (char) c); } else { segment.insert(0, Character.toChars(c)); } if (norm2.hasBoundaryBefore(c)) { break; } } currentIndex = text.getIndex(); norm2.normalize(segment, buffer); bufferPos = buffer.length(); return buffer.length() != 0; } }