Update #48 - Added some features from OptiFine

2025-06-28 02:48:14 -05:00 · 2025-01-24 18:39:36 -08:00
parent 1f0d593a8c
commit e83a912e38
1056 changed files with 17706 additions and 898 deletions
--- a/sources/main/java/jdk_internal/icu/text/StringPrep.java
+++ b/sources/main/java/jdk_internal/icu/text/StringPrep.java
@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/*
+/*
+ *******************************************************************************
+ * Copyright (C) 2003-2004, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+//
+// CHANGELOG
+//      2005-05-19 Edward Wang
+//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
+//          - move from package com.ibm.icu.text to package sun.net.idn
+//          - use ParseException instead of StringPrepParseException
+//          - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
+//          - remove all @deprecated tag to make compiler happy
+//      2007-08-14 Martin Buchholz
+//          - remove redundant casts
+//
+package jdk_internal.icu.text;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import jdk_internal.bidi.Normalizer;
+import jdk_internal.bidi.ParseException;
+import jdk_internal.bidi.SunNormalizer;
+import jdk_internal.icu.impl.CharTrie;
+import jdk_internal.icu.impl.StringPrepDataReader;
+import jdk_internal.icu.impl.Trie;
+import jdk_internal.icu.lang.UCharacter;
+import jdk_internal.icu.lang.UCharacterDirection;
+import jdk_internal.icu.util.VersionInfo;
+
+/**
+ * StringPrep API implements the StringPrep framework as described by
+ * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>. StringPrep
+ * prepares Unicode strings for use in network protocols. Profiles of StringPrep
+ * are sets of rules and data according to which the Unicode strings are
+ * prepared. Each profile contains tables which describe how a code point
+ * should be treated. The tables are broadly classified into
+ * <ul>
+ * <li>Unassigned Table: Contains code points that are unassigned in the Unicode
+ * Version supported by StringPrep. Currently RFC 3454 supports Unicode 3.2.
+ * </li>
+ * <li>Prohibited Table: Contains code points that are prohibited from the output
+ * of the StringPrep processing function.</li>
+ * <li>Mapping Table: Contains code points that are deleted from the output or
+ * case mapped.</li>
+ * </ul>
+ *
+ * The procedure for preparing Unicode strings:
+ * <ol>
+ * <li>Map: For each character in the input, check if it has a mapping and, if
+ * so, replace it with its mapping.</li>
+ * <li>Normalize: Possibly normalize the result of step 1 using Unicode
+ * normalization.</li>
+ * <li>Prohibit: Check for any characters that are not allowed in the output. If
+ * any are found, return an error.</li>
+ * <li>Check bidi: Possibly check for right-to-left characters, and if any are
+ * found, make sure that the whole string satisfies the requirements for
+ * bidirectional strings. If the string does not satisfy the requirements for
+ * bidirectional strings, return an error.</li>
+ * </ol>
+ * 
+ * @author Ram Viswanadha
+ * @draft ICU 2.8
+ */
+public final class StringPrep {
+	/**
+	 * Option to prohibit processing of unassigned code points in the input
+	 *
+	 * With this value the ALLOW_UNASSIGNED bit is clear, so the mapping step
+	 * throws a ParseException when it meets a code point whose type is
+	 * UNASSIGNED.
+	 *
+	 * @see #prepare
+	 * @draft ICU 2.8
+	 */
+	public static final int DEFAULT = 0x0000;
+
+	/**
+	 * Option to allow processing of unassigned code points in the input
+	 *
+	 * When this bit is set the mapping step does not reject unassigned code
+	 * points; they are copied to the output like any other unmapped code point.
+	 *
+	 * @see #prepare
+	 * @draft ICU 2.8
+	 */
+	public static final int ALLOW_UNASSIGNED = 0x0001;
+
+	private static final int UNASSIGNED = 0x0000; // Values.type: rejected by map() unless ALLOW_UNASSIGNED is set
+	private static final int MAP = 0x0001; // Values.type: code point has a mapping (index or delta)
+	private static final int PROHIBITED = 0x0002; // Values.type: prepare() throws when it finds one
+	private static final int DELETE = 0x0003; // Values.type: code point is dropped by map()
+	private static final int TYPE_LIMIT = 0x0004; // Values.type for trie word 0: copied unchanged
+
+	private static final int NORMALIZATION_ON = 0x0001; // bit tested in indexes[OPTIONS] to set doNFKC
+	private static final int CHECK_BIDI_ON = 0x0002; // bit tested in indexes[OPTIONS] to set checkBiDi
+
+	private static final int TYPE_THRESHOLD = 0xFFF0; // trie words >= this encode (type + TYPE_THRESHOLD)
+	private static final int MAX_INDEX_VALUE = 0x3FBF; /* 16139; (trieWord >> 2) equal to this means DELETE */
+	private static final int MAX_INDEX_TOP_LENGTH = 0x0003; // not referenced elsewhere in this class
+
+	/* indexes[] value names */
+	private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
+	private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping; size is in bytes */
+	private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /*
+	 * The index of Unicode version of last entry in
+	 * NormalizationCorrections.txt
+	 */
+	private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /*
+	 * The starting index of 1 UChar mapping index in the
+	 * mapping data array
+	 */
+	private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /*
+	 * The starting index of 2 UChars mapping index in
+	 * the mapping data array
+	 */
+	private static final int THREE_UCHARS_MAPPING_INDEX_START = 5; // start of the 3 UChars mappings, as used by map()
+	private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6; // upper bound of the 3 UChars range in map()
+	private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
+	private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
+
+	/**
+	 * Default buffer size of datafile; passed to the BufferedInputStream that
+	 * the constructor wraps around the profile stream.
+	 */
+	private static final int DATA_BUFFER_SIZE = 25000;
+
+	/* Wrappers for Trie implementations */
+	private static final class StringPrepTrieImpl implements Trie.DataManipulate {
+		/** Trie holding the profile data; assigned by the enclosing constructor. */
+		private CharTrie sprepTrie;
+
+		/**
+		 * Called by the Trie to extract from a lead surrogate's data the index array
+		 * offset of the indexes for that lead surrogate. This implementation hands
+		 * the data value back unchanged.
+		 *
+		 * @param value data value for a surrogate from the trie, including the
+		 *              folding offset
+		 * @return the same {@code value} that was passed in
+		 */
+		@Override
+		public int getFoldingOffset(int value) {
+			return value;
+		}
+	}
+
+	// Wrapper around the CharTrie built from the data file; queried by getCodePointValue
+	private StringPrepTrieImpl sprepTrieImpl;
+	// Indexes read from the data file (see the "indexes[] value names" constants)
+	private int[] indexes;
+	// mapping data read from the data file; map() copies replacement chars from it
+	private char[] mappingData;
+	// format version of the data file (assigned in the constructor, not read elsewhere in this class)
+	private byte[] formatVersion;
+	// the version of Unicode supported by the data file
+	private VersionInfo sprepUniVer;
+	// the Unicode version of last entry in the
+	// NormalizationCorrections.txt file if normalization
+	// is turned on
+	private VersionInfo normCorrVer;
+	// Option to turn on Normalization (NORMALIZATION_ON bit of indexes[OPTIONS])
+	private boolean doNFKC;
+	// Option to turn on checking for BiDi rules (CHECK_BIDI_ON bit of indexes[OPTIONS])
+	private boolean checkBiDi;
+
+	private char getCodePointValue(int ch) {
+		return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
+	}
+
+	/**
+	 * Unpacks a version stored as four bytes in one int, most significant byte
+	 * first (major, minor, milli, micro).
+	 *
+	 * @param comp the packed version number
+	 * @return the VersionInfo for the four unpacked components
+	 */
+	private static VersionInfo getVersionInfo(int comp) {
+		// The 0xFF mask discards any sign bits, so each field is 0..255.
+		return VersionInfo.getInstance(
+				(comp >>> 24) & 0xFF,
+				(comp >>> 16) & 0xFF,
+				(comp >>> 8) & 0xFF,
+				comp & 0xFF);
+	}
+
+	private static VersionInfo getVersionInfo(byte[] version) {
+		if (version.length != 4) {
+			return null;
+		}
+		return VersionInfo.getInstance((int) version[0], (int) version[1], (int) version[2], (int) version[3]);
+	}
+
+	/**
+	 * Creates a StringPrep object after reading the input stream. The object does
+	 * not hold a reference to the input stream. The stream is wrapped in a
+	 * BufferedInputStream which this constructor closes before returning, whether
+	 * it completes normally or throws, so the caller's stream is closed as well.
+	 *
+	 * @param inputStream The stream for reading the StringPrep profile binary data
+	 * @throws IOException if the profile cannot be read, or if normalization is
+	 *                     turned on and the version check against the runtime
+	 *                     Unicode version fails
+	 * @draft ICU 2.8
+	 */
+	public StringPrep(InputStream inputStream) throws IOException {
+
+		// try-with-resources: the buffered stream is released even when reading the
+		// profile or the version check below throws.
+		try (BufferedInputStream b = new BufferedInputStream(inputStream, DATA_BUFFER_SIZE)) {
+
+			StringPrepDataReader reader = new StringPrepDataReader(b);
+
+			// read the indexes
+			indexes = reader.readIndexes(INDEX_TOP);
+
+			byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
+
+			// indexes[INDEX_MAPPING_DATA_SIZE] stores the size of mappingData in bytes;
+			// each char is two bytes
+			mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE] / 2];
+			// load the rest of the data and initialize the data members
+			reader.read(sprepBytes, mappingData);
+
+			sprepTrieImpl = new StringPrepTrieImpl();
+			sprepTrieImpl.sprepTrie = new CharTrie(new ByteArrayInputStream(sprepBytes), sprepTrieImpl);
+
+			// get the data format version
+			formatVersion = reader.getDataFormatVersion();
+
+			// get the options
+			doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
+			checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
+			sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
+			normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
+			VersionInfo normUniVer = UCharacter.getUnicodeVersion();
+
+			// Refuse the profile only when all three hold: the runtime Unicode version
+			// compares below the Unicode version of the SPREP file, it also compares
+			// below the version of the last NormalizationCorrections.txt entry, and
+			// normalization is turned on in the profile.
+			if (normUniVer.compareTo(sprepUniVer) < 0
+					&& normUniVer.compareTo(normCorrVer) < 0
+					&& doNFKC) {
+				throw new IOException("Normalization Correction version not supported");
+			}
+		}
+	}
+
+	/** Mutable holder for the fields that getValues decodes from one trie word. */
+	private static final class Values {
+		/** true when {@code value} is an index into the mapping data, false for a delta */
+		boolean isIndex;
+		/** decoded index or delta; 0 when not applicable */
+		int value;
+		/** one of the type codes (UNASSIGNED, MAP, ...), or -1 after reset() */
+		int type;
+
+		/** Puts every field back to its "nothing decoded yet" state. */
+		public void reset() {
+			type = -1;
+			value = 0;
+			isIndex = false;
+		}
+	}
+
+	/**
+	 * Decodes a 16-bit trie word into {@code values}.
+	 * <ul>
+	 * <li>0: type TYPE_LIMIT, meaning the source code point is copied as is.</li>
+	 * <li>TYPE_THRESHOLD and above: the type is the distance from
+	 * TYPE_THRESHOLD.</li>
+	 * <li>Upper 14 bits equal to MAX_INDEX_VALUE: type DELETE.</li>
+	 * <li>Anything else: type MAP; bit 1 selects between an index (unsigned upper
+	 * 14 bits) and a delta (signed upper 14 bits).</li>
+	 * </ul>
+	 *
+	 * @param trieWord the word read from the profile trie
+	 * @param values   receives the decoded fields; it is reset first
+	 */
+	private static final void getValues(char trieWord, Values values) {
+		values.reset();
+
+		if (trieWord == 0) {
+			// Initial value stored in the mapping table: report TYPE_LIMIT so that
+			// the source code point is copied to the destination.
+			values.type = TYPE_LIMIT;
+			return;
+		}
+
+		if (trieWord >= TYPE_THRESHOLD) {
+			values.type = trieWord - TYPE_THRESHOLD;
+			return;
+		}
+
+		// char promotes to a non-negative int, so this is the unsigned upper 14 bits
+		final int upperBits = trieWord >> 2;
+
+		if (upperBits == MAX_INDEX_VALUE) {
+			values.type = DELETE;
+			values.isIndex = false;
+			values.value = 0;
+			return;
+		}
+
+		values.type = MAP;
+		if ((trieWord & 0x02) != 0) {
+			// index into the mapping data
+			values.isIndex = true;
+			values.value = upperBits;
+		} else {
+			// delta: reinterpret the word as a signed 16-bit value, then drop the
+			// two flag bits with an arithmetic shift so the sign is kept
+			values.isIndex = false;
+			values.value = ((short) trieWord) >> 2;
+		}
+	}
+
+	private StringBuffer map(UCharacterIterator iter, int options) throws ParseException {
+
+		Values val = new Values();
+		char result = 0;
+		int ch = UCharacterIterator.DONE;
+		StringBuffer dest = new StringBuffer();
+		boolean allowUnassigned = ((options & ALLOW_UNASSIGNED) > 0);
+
+		while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
+
+			result = getCodePointValue(ch);
+			getValues(result, val);
+
+			// check if the source codepoint is unassigned
+			if (val.type == UNASSIGNED && allowUnassigned == false) {
+				throw new ParseException("An unassigned code point was found in the input " + iter.getText(),
+						iter.getIndex());
+			} else if ((val.type == MAP)) {
+				int index, length;
+
+				if (val.isIndex) {
+					index = val.value;
+					if (index >= indexes[ONE_UCHAR_MAPPING_INDEX_START]
+							&& index < indexes[TWO_UCHARS_MAPPING_INDEX_START]) {
+						length = 1;
+					} else if (index >= indexes[TWO_UCHARS_MAPPING_INDEX_START]
+							&& index < indexes[THREE_UCHARS_MAPPING_INDEX_START]) {
+						length = 2;
+					} else if (index >= indexes[THREE_UCHARS_MAPPING_INDEX_START]
+							&& index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]) {
+						length = 3;
+					} else {
+						length = mappingData[index++];
+					}
+					/* copy mapping to destination */
+					dest.append(mappingData, index, length);
+					continue;
+
+				} else {
+					ch -= val.value;
+				}
+			} else if (val.type == DELETE) {
+				// just consume the codepoint and contine
+				continue;
+			}
+			// copy the source into destination
+			UTF16.append(dest, ch);
+		}
+
+		return dest;
+	}
+
+	private StringBuffer normalize(StringBuffer src) {
+		/*
+		 * Option UNORM_BEFORE_PRI_29:
+		 *
+		 * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
+		 * requires strict adherence to Unicode 3.2 normalization, including buggy
+		 * composition from before fixing Public Review Issue #29. Note that this
+		 * results in some valid but nonsensical text to be either corrupted or
+		 * rejected, depending on the text. See
+		 * http://www.unicode.org/review/resolved-pri.html#pri29 See unorm.cpp and
+		 * cnormtst.c
+		 */
+		return new StringBuffer(
+				SunNormalizer.normalize(src.toString(), Normalizer.Form.NFKC, SunNormalizer.UNICODE_3_2));
+	}
+
+	/*
+	 * boolean isLabelSeparator(int ch){ int result = getCodePointValue(ch); if(
+	 * (result & 0x07) == LABEL_SEPARATOR){ return true; } return false; }
+	 */
+	/*
+	 * 1) Map -- For each character in the input, check if it has a mapping and, if
+	 * so, replace it with its mapping.
+	 * 
+	 * 2) Normalize -- Possibly normalize the result of step 1 using Unicode
+	 * normalization.
+	 * 
+	 * 3) Prohibit -- Check for any characters that are not allowed in the output.
+	 * If any are found, return an error.
+	 * 
+	 * 4) Check bidi -- Possibly check for right-to-left characters, and if any are
+	 * found, make sure that the whole string satisfies the requirements for
+	 * bidirectional strings. If the string does not satisfy the requirements for
+	 * bidirectional strings, return an error. [Unicode3.2] defines several
+	 * bidirectional categories; each character has one bidirectional category
+	 * assigned to it. For the purposes of the requirements below, an
+	 * "RandALCat character" is a character that has Unicode bidirectional
+	 * categories "R" or "AL"; an "LCat character" is a character that has Unicode
+	 * bidirectional category "L". Note that there are many characters which fall
+	 * in neither of the above definitions; Latin digits (<U+0030> through
+	 * <U+0039>) are examples of this because they have bidirectional category
+	 * "EN".
+	 * 
+	 * In any profile that specifies bidirectional character handling, all three of
+	 * the following requirements MUST be met:
+	 * 
+	 * 1) The characters in section 5.8 MUST be prohibited.
+	 * 
+	 * 2) If a string contains any RandALCat character, the string MUST NOT contain
+	 * any LCat character.
+	 * 
+	 * 3) If a string contains any RandALCat character, a RandALCat character MUST
+	 * be the first character of the string, and a RandALCat character MUST be the
+	 * last character of the string.
+	 */
+	/**
+	 * Prepare the input buffer for use in applications with the given profile. This
+	 * operation maps, normalizes(NFKC), checks for prohited and BiDi characters in
+	 * the order defined by RFC 3454 depending on the options specified in the
+	 * profile.
+	 *
+	 * @param src     A UCharacterIterator object containing the source string
+	 * @param options A bit set of options:
+	 *
+	 *                - StringPrep.NONE Prohibit processing of unassigned code
+	 *                points in the input
+	 *
+	 *                - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points
+	 *                are in the input as normal Unicode code points.
+	 *
+	 * @return StringBuffer A StringBuffer containing the output
+	 * @throws ParseException
+	 * @draft ICU 2.8
+	 */
+	public StringBuffer prepare(UCharacterIterator src, int options) throws ParseException {
+
+		// map
+		StringBuffer mapOut = map(src, options);
+		StringBuffer normOut = mapOut;// initialize
+
+		if (doNFKC) {
+			// normalize
+			normOut = normalize(mapOut);
+		}
+
+		int ch;
+		char result;
+		UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
+		Values val = new Values();
+		int direction = UCharacterDirection.CHAR_DIRECTION_COUNT,
+				firstCharDir = UCharacterDirection.CHAR_DIRECTION_COUNT;
+		int rtlPos = -1, ltrPos = -1;
+		boolean rightToLeft = false, leftToRight = false;
+
+		while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
+			result = getCodePointValue(ch);
+			getValues(result, val);
+
+			if (val.type == PROHIBITED) {
+				throw new ParseException("A prohibited code point was found in the input" + iter.getText(), val.value);
+			}
+
+			direction = UCharacter.getDirection(ch);
+			if (firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT) {
+				firstCharDir = direction;
+			}
+			if (direction == UCharacterDirection.LEFT_TO_RIGHT) {
+				leftToRight = true;
+				ltrPos = iter.getIndex() - 1;
+			}
+			if (direction == UCharacterDirection.RIGHT_TO_LEFT
+					|| direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) {
+				rightToLeft = true;
+				rtlPos = iter.getIndex() - 1;
+			}
+		}
+		if (checkBiDi == true) {
+			// satisfy 2
+			if (leftToRight == true && rightToLeft == true) {
+				throw new ParseException(
+						"The input does not conform to the rules for BiDi code points." + iter.getText(),
+						(rtlPos > ltrPos) ? rtlPos : ltrPos);
+			}
+
+			// satisfy 3
+			if (rightToLeft == true && !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT
+					|| firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)
+					&& (direction == UCharacterDirection.RIGHT_TO_LEFT
+							|| direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))) {
+				throw new ParseException(
+						"The input does not conform to the rules for BiDi code points." + iter.getText(),
+						(rtlPos > ltrPos) ? rtlPos : ltrPos);
+			}
+		}
+		return normOut;
+
+	}
+}