Update #48 - Added some features from OptiFine

2025-06-28 02:48:14 -05:00 · 2025-01-24 18:39:36 -08:00
parent 1f0d593a8c
commit e83a912e38
1056 changed files with 17706 additions and 898 deletions
--- a/sources/main/java/jdk_internal/icu/text/UTF16.java
+++ b/sources/main/java/jdk_internal/icu/text/UTF16.java
@ -0,0 +1,609 @@
+/*
+ * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/**
+ *******************************************************************************
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+package jdk_internal.icu.text;
+
+import jdk_internal.icu.impl.UCharacterProperty;
+
+/**
+ * <p>
+ * Standalone utility class providing UTF16 character conversions and indexing
+ * conversions.
+ * <p>
+ * Code that uses strings alone rarely needs modification. By design, UTF-16 does
+ * not allow overlap, so searching for strings is a safe operation. Similarly,
+ * concatenation is always safe. Substringing is safe if the start and end are
+ * both on UTF-32 boundaries. In normal code, the values for start and end are
+ * on those boundaries, since they arose from operations like searching. If not,
+ * the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
+ * <strong>Examples:</strong>
+ * <p>
+ * The following examples illustrate use of some of these methods.
+ * 
+ * <pre>{@code
+ * // iteration forwards: Original
+ * for (int i = 0; i < s.length(); ++i) {
+ * 	char ch = s.charAt(i);
+ * 	doSomethingWith(ch);
+ * }
+ *
+ * // iteration forwards: Changes for UTF-32
+ * int ch;
+ * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
+ * 	ch = UTF16.charAt(s, i);
+ * 	doSomethingWith(ch);
+ * }
+ *
+ * // iteration backwards: Original
+ * for (int i = s.length() - 1; i >= 0; --i) {
+ * 	char ch = s.charAt(i);
+ * 	doSomethingWith(ch);
+ * }
+ *
+ * // iteration backwards: Changes for UTF-32
+ * int ch;
+ * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
+ * 	ch = UTF16.charAt(s, i);
+ * 	doSomethingWith(ch);
+ * }
+ * }</pre>
+ * 
+ * <strong>Notes:</strong>
+ * <ul>
+ * <li><strong>Naming:</strong> For clarity, High and Low surrogates are called
+ * <code>Lead</code> and <code>Trail</code> in the API, which gives a better
+ * sense of their ordering in a string. <code>offset16</code> and
+ * <code>offset32</code> are used to distinguish offsets to UTF-16 boundaries vs
+ * offsets to UTF-32 boundaries. <code>int char32</code> is used to contain
+ * UTF-32 characters, as opposed to <code>char16</code>, which is a UTF-16 code
+ * unit.</li>
+ * <li><strong>Roundtripping Offsets:</strong> You can always roundtrip from a
+ * UTF-32 offset to a UTF-16 offset and back. Because of the difference in
+ * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and back
+ * if and only if <code>bounds(string, offset16) != TRAIL</code>.</li>
+ * <li><strong>Exceptions:</strong> The error checking will throw an exception
+ * if indices are out of bounds. Other than that, all methods will behave
+ * reasonably, even if unmatched surrogates or out-of-bounds UTF-32 values are
+ * present. <code>UCharacter.isLegal()</code> can be used to check for validity
+ * if desired.</li>
+ * <li><strong>Unmatched Surrogates:</strong> If the string contains unmatched
+ * surrogates, then these are counted as one UTF-32 value. This matches their
+ * iteration behavior, which is vital. It also matches common display practice
+ * as missing glyphs (see the Unicode Standard Section 5.4, 5.5).</li>
+ * <li><strong>Optimization:</strong> The method implementations may need
+ * optimization if the compiler doesn't fold static final methods. Since
+ * surrogate pairs will form an exceedingly small percentage of all the text in
+ * the world, the singleton case should always be optimized for.</li>
+ * </ul>
+ * 
+ * @author Mark Davis, with help from Markus Scherer
+ * @stable ICU 2.1
+ */
+
+public final class UTF16 {
+	// public variables ---------------------------------------------------
+
+	/**
+	 * Smallest Unicode code point value.
+	 *
+	 * @stable ICU 2.1
+	 */
+	public static final int CODEPOINT_MIN_VALUE = 0;
+
+	/**
+	 * Largest Unicode code point value (scalar value) defined by the Unicode
+	 * Standard.
+	 *
+	 * @stable ICU 2.1
+	 */
+	public static final int CODEPOINT_MAX_VALUE = 0x10FFFF;
+
+	/**
+	 * First supplementary code point, i.e. the first value that needs two UTF-16
+	 * code units.
+	 *
+	 * @stable ICU 2.1
+	 */
+	public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
+
+	/**
+	 * First code unit of the lead (high) surrogate range.
+	 *
+	 * @stable ICU 2.1
+	 */
+	public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
+
+	/**
+	 * First code unit of the trail (low) surrogate range.
+	 *
+	 * @stable ICU 2.1
+	 */
+	public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
+
+	/**
+	 * Last code unit of the lead (high) surrogate range.
+	 *
+	 * @stable ICU 2.1
+	 */
+	public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
+
+	/**
+	 * Last code unit of the trail (low) surrogate range.
+	 *
+	 * @stable ICU 2.1
+	 */
+	public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
+
+	/**
+	 * First code unit of the whole surrogate range; identical to
+	 * {@link #LEAD_SURROGATE_MIN_VALUE}.
+	 *
+	 * @stable ICU 2.1
+	 */
+	public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
+
+	/**
+	 * Mask that clears the low 10 bits (0xFFFFFC00); what remains identifies a
+	 * lead surrogate when it equals {@link #LEAD_SURROGATE_BITS}.
+	 */
+	private static final int LEAD_SURROGATE_BITMASK = ~0x3FF;
+
+	/**
+	 * Mask that clears the low 10 bits (0xFFFFFC00); what remains identifies a
+	 * trail surrogate when it equals {@link #TRAIL_SURROGATE_BITS}.
+	 */
+	private static final int TRAIL_SURROGATE_BITMASK = ~0x3FF;
+
+	/**
+	 * Mask that clears the low 11 bits (0xFFFFF800); what remains identifies any
+	 * surrogate when it equals {@link #SURROGATE_BITS}.
+	 */
+	private static final int SURROGATE_BITMASK = ~0x7FF;
+
+	/**
+	 * Bit pattern shared by every lead surrogate after masking (0xD800).
+	 */
+	private static final int LEAD_SURROGATE_BITS = LEAD_SURROGATE_MIN_VALUE;
+
+	/**
+	 * Bit pattern shared by every trail surrogate after masking (0xDC00).
+	 */
+	private static final int TRAIL_SURROGATE_BITS = TRAIL_SURROGATE_MIN_VALUE;
+
+	/**
+	 * Bit pattern shared by every surrogate after masking (0xD800).
+	 */
+	private static final int SURROGATE_BITS = SURROGATE_MIN_VALUE;
+
+	// constructor --------------------------------------------------------
+
+	// /CLOVER:OFF
+	/**
+	 * Private constructor whose only purpose is to stop the class from being
+	 * instantiated from outside.
+	 */
+	private UTF16() {
+		// intentionally empty
+	}
+
+	// /CLOVER:ON
+	// public method ------------------------------------------------------
+
+	/**
+	 * Returns the UTF-32 value that contains the UTF-16 code unit at
+	 * <code>offset16</code>. Usable for forward or backward iteration (together
+	 * with <code>UTF16.getCharCount()</code>) as well as for random access.
+	 * <p>
+	 * Code units below the surrogate range are returned directly. For anything
+	 * else the neighbouring code unit is inspected: when the unit at
+	 * <code>offset16</code> belongs to a surrogate pair the combined supplementary
+	 * value is returned, and when the pair is incomplete the lone code unit is
+	 * returned as is. If a validity check is required, apply
+	 * <code>UCharacter.isLegal()</code> to the result.
+	 *
+	 * @param source   string of UTF-16 code units
+	 * @param offset16 index of the code unit to inspect
+	 * @return the UTF-32 value containing the code unit at offset16
+	 * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
+	 * @stable ICU 2.1
+	 */
+	public static int charAt(String source, int offset16) {
+		final char unit = source.charAt(offset16);
+		// Fast path: anything below the surrogate block is a complete code point.
+		return unit < LEAD_SURROGATE_MIN_VALUE ? unit : _charAt(source, offset16, unit);
+	}
+
+	private static int _charAt(String source, int offset16, char single) {
+		if (single > TRAIL_SURROGATE_MAX_VALUE) {
+			return single;
+		}
+
+		// Convert the UTF-16 surrogate pair if necessary.
+		// For simplicity in usage, and because the frequency of pairs is
+		// low, look both directions.
+
+		if (single <= LEAD_SURROGATE_MAX_VALUE) {
+			++offset16;
+			if (source.length() != offset16) {
+				char trail = source.charAt(offset16);
+				if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
+					return UCharacterProperty.getRawSupplementary(single, trail);
+				}
+			}
+		} else {
+			--offset16;
+			if (offset16 >= 0) {
+				// single is a trail surrogate so
+				char lead = source.charAt(offset16);
+				if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
+					return UCharacterProperty.getRawSupplementary(lead, single);
+				}
+			}
+		}
+		return single; // return unmatched surrogate
+	}
+
+	/**
+	 * Returns the UTF-32 value that contains the UTF-16 code unit at
+	 * <code>offset16</code> of a character sequence. Usable for forward or
+	 * backward iteration (together with <code>UTF16.getCharCount()</code>) as well
+	 * as for random access.
+	 * <p>
+	 * A code unit below the surrogate range is returned unchanged. Otherwise, if
+	 * the unit is half of a surrogate pair, the supplementary value of that pair
+	 * is returned; an unpaired surrogate is returned as the lone code unit. If a
+	 * validity check is required, apply <code>UCharacter.isLegal()</code> to the
+	 * result.
+	 *
+	 * @param source   sequence of UTF-16 code units
+	 * @param offset16 index of the code unit to inspect
+	 * @return the UTF-32 value containing the code unit at offset16
+	 * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
+	 * @stable ICU 2.1
+	 */
+	public static int charAt(CharSequence source, int offset16) {
+		final char unit = source.charAt(offset16);
+		if (unit >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
+			// Might be a surrogate: let the slow path look at the neighbour.
+			return _charAt(source, offset16, unit);
+		}
+		return unit;
+	}
+
+	private static int _charAt(CharSequence source, int offset16, char single) {
+		if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
+			return single;
+		}
+
+		// Convert the UTF-16 surrogate pair if necessary.
+		// For simplicity in usage, and because the frequency of pairs is
+		// low, look both directions.
+
+		if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
+			++offset16;
+			if (source.length() != offset16) {
+				char trail = source.charAt(offset16);
+				if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
+					return UCharacterProperty.getRawSupplementary(single, trail);
+				}
+			}
+		} else {
+			--offset16;
+			if (offset16 >= 0) {
+				// single is a trail surrogate so
+				char lead = source.charAt(offset16);
+				if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
+					return UCharacterProperty.getRawSupplementary(lead, single);
+				}
+			}
+		}
+		return single; // return unmatched surrogate
+	}
+
+	/**
+	 * Returns the UTF-32 value that contains the UTF-16 code unit at
+	 * <code>offset16</code> within the sub-array <code>[start, limit)</code>.
+	 * Usable for forward or backward iteration (together with
+	 * <code>UTF16.getCharCount()</code>) as well as for random access.
+	 * <p>
+	 * The other half of a surrogate pair is only looked for inside the sub-array.
+	 * A surrogate whose partner is missing, or lies outside the sub-array, is
+	 * returned as the lone code unit. If a validity check is required, apply
+	 * <code>UCharacter.isLegal()</code> to the result.
+	 *
+	 * @param source   array of UTF-16 code units
+	 * @param start    index of the first code unit of the sub-array
+	 * @param limit    index just past the last code unit of the sub-array
+	 * @param offset16 UTF-16 offset relative to start
+	 * @return the UTF-32 value containing the code unit at offset16
+	 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the
+	 *                                      range of start and limit.
+	 * @stable ICU 2.1
+	 */
+	public static int charAt(char source[], int start, int limit, int offset16) {
+		final int index = offset16 + start; // absolute position in source
+		if (index < start || index >= limit) {
+			throw new ArrayIndexOutOfBoundsException(index);
+		}
+
+		final char single = source[index];
+		if (!isSurrogate(single)) {
+			return single;
+		}
+
+		// Pairs are rare, so for ease of use the partner is searched for in the
+		// direction implied by the kind of surrogate found.
+		if (single <= LEAD_SURROGATE_MAX_VALUE) {
+			final int next = index + 1;
+			if (next < limit) {
+				final char trail = source[next];
+				if (isTrailSurrogate(trail)) {
+					return UCharacterProperty.getRawSupplementary(single, trail);
+				}
+			}
+			return single; // lead surrogate without a trail inside the sub-array
+		}
+
+		// single is a trail surrogate; index >= start is already guaranteed.
+		if (index != start) {
+			final char lead = source[index - 1];
+			if (isLeadSurrogate(lead)) {
+				return UCharacterProperty.getRawSupplementary(lead, single);
+			}
+		}
+		return single; // trail surrogate without a lead inside the sub-array
+	}
+
+	/**
+	 * Tells how many UTF-16 code units are needed for a code point. No validity
+	 * check is made; if one is required, apply <code>isLegal()</code> to char32
+	 * before calling.
+	 *
+	 * @param char32 the input codepoint.
+	 * @return 1 if char32 is below {@link #SUPPLEMENTARY_MIN_VALUE}, otherwise 2.
+	 * @stable ICU 2.1
+	 */
+	public static int getCharCount(int char32) {
+		return char32 < SUPPLEMENTARY_MIN_VALUE ? 1 : 2;
+	}
+
+	/**
+	 * Tests whether a code unit is a surrogate of either kind, by checking that
+	 * its bits above the low 11 match the surrogate pattern.
+	 *
+	 * @param char16 the input character.
+	 * @return true if the input character is a surrogate.
+	 * @stable ICU 2.1
+	 */
+	public static boolean isSurrogate(char char16) {
+		final int highBits = char16 & SURROGATE_BITMASK;
+		return highBits == SURROGATE_BITS;
+	}
+
+	/**
+	 * Tests whether a code unit is a trail surrogate, by checking that its bits
+	 * above the low 10 match the trail surrogate pattern.
+	 *
+	 * @param char16 the input character.
+	 * @return true if the input character is a trail surrogate.
+	 * @stable ICU 2.1
+	 */
+	public static boolean isTrailSurrogate(char char16) {
+		final int highBits = char16 & TRAIL_SURROGATE_BITMASK;
+		return highBits == TRAIL_SURROGATE_BITS;
+	}
+
+	/**
+	 * Tests whether a code unit is a lead surrogate, by checking that its bits
+	 * above the low 10 match the lead surrogate pattern.
+	 *
+	 * @param char16 the input character.
+	 * @return true if the input character is a lead surrogate
+	 * @stable ICU 2.1
+	 */
+	public static boolean isLeadSurrogate(char char16) {
+		final int highBits = char16 & LEAD_SURROGATE_BITMASK;
+		return highBits == LEAD_SURROGATE_BITS;
+	}
+
+	/**
+	 * Computes the lead surrogate of a code point. No validity check is made; if
+	 * one is required, apply <code>isLegal()</code> to char32 before calling.
+	 *
+	 * @param char32 the input character.
+	 * @return the lead surrogate if char32 is at least
+	 *         {@link #SUPPLEMENTARY_MIN_VALUE}; <br>
+	 *         and 0 otherwise (note: 0 is not a valid lead surrogate).
+	 * @stable ICU 2.1
+	 */
+	public static char getLeadSurrogate(int char32) {
+		if (char32 < SUPPLEMENTARY_MIN_VALUE) {
+			return 0;
+		}
+		// Drop the bits carried by the trail surrogate, then rebase.
+		final int upperBits = char32 >> LEAD_SURROGATE_SHIFT_;
+		return (char) (LEAD_SURROGATE_OFFSET_ + upperBits);
+	}
+
+	/**
+	 * Computes the trail surrogate of a code point. No validity check is made; if
+	 * one is required, apply <code>isLegal()</code> to char32 before calling.
+	 *
+	 * @param char32 the input character.
+	 * @return the trail surrogate if char32 is at least
+	 *         {@link #SUPPLEMENTARY_MIN_VALUE}; <br>
+	 *         otherwise char32 itself, narrowed to a char
+	 * @stable ICU 2.1
+	 */
+	public static char getTrailSurrogate(int char32) {
+		if (char32 < SUPPLEMENTARY_MIN_VALUE) {
+			return (char) char32;
+		}
+		// The low 10 bits of the code point are the payload of the trail surrogate.
+		final int lowerBits = char32 & TRAIL_SURROGATE_MASK_;
+		return (char) (TRAIL_SURROGATE_MIN_VALUE + lowerBits);
+	}
+
+	/**
+	 * Convenience method corresponding to String.valueOf(char). Returns a one or
+	 * two char string containing the UTF-32 value in UTF16 format.
+	 * <p>
+	 * Only the range {@link #CODEPOINT_MIN_VALUE}..{@link #CODEPOINT_MAX_VALUE} is
+	 * enforced here. If a stricter validity check is required, apply
+	 * <code>isLegal()</code> to char32 before calling.
+	 *
+	 * @param char32 the input character.
+	 * @return string value of char32 in UTF16 format
+	 * @exception IllegalArgumentException thrown if char32 is an invalid
+	 *                                     codepoint; the message carries the
+	 *                                     offending value in hexadecimal.
+	 * @stable ICU 2.1
+	 */
+	public static String valueOf(int char32) {
+		if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
+			// Same message format as append(), so the bad value is visible to the caller.
+			throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
+		}
+		return toString(char32);
+	}
+
+	/**
+	 * Appends one UTF-32 value to a StringBuffer as one or two UTF-16 code units.
+	 * Only the code point range is checked; if a stricter validity check is
+	 * required, apply <code>isLegal()</code> to char32 before calling.
+	 *
+	 * @param target the buffer to append to
+	 * @param char32 value to append.
+	 * @return the updated StringBuffer
+	 * @exception IllegalArgumentException thrown when char32 does not lie within
+	 *                                     the range of the Unicode codepoints
+	 * @stable ICU 2.1
+	 */
+	public static StringBuffer append(StringBuffer target, int char32) {
+		// Reject anything outside the Unicode code point range.
+		final boolean inRange = char32 >= CODEPOINT_MIN_VALUE && char32 <= CODEPOINT_MAX_VALUE;
+		if (!inRange) {
+			throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
+		}
+
+		// A BMP value is a single code unit.
+		if (char32 < SUPPLEMENTARY_MIN_VALUE) {
+			target.append((char) char32);
+			return target;
+		}
+
+		// A supplementary value is written as lead followed by trail.
+		final char lead = getLeadSurrogate(char32);
+		target.append(lead);
+		final char trail = getTrailSurrogate(char32);
+		target.append(trail);
+		return target;
+	}
+
+	/**
+	 * Shifts offset16 by the argument number of codepoints within a subarray. A
+	 * lead surrogate immediately followed by a trail surrogate, both inside the
+	 * subarray, counts as a single codepoint; every other code unit counts as one
+	 * codepoint on its own.
+	 * <p>
+	 * All range checks are written so that they cannot overflow
+	 * <code>int</code>, so extreme values of offset16 or shift32 are rejected by
+	 * the checks themselves.
+	 *
+	 * @param source   char array
+	 * @param start    position of the subarray to be performed on
+	 * @param limit    position of the subarray to be performed on
+	 * @param offset16 UTF16 position to shift relative to start
+	 * @param shift32  number of codepoints to shift; positive moves forwards,
+	 *                 negative moves backwards
+	 * @return new shifted offset16 relative to start
+	 * @exception IndexOutOfBoundsException if the new offset16 is out of bounds
+	 *                                      with respect to the subarray or the
+	 *                                      subarray bounds are out of range.
+	 * @stable ICU 2.1
+	 */
+	public static int moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32) {
+		final int size = source.length;
+		if (start < 0 || limit < start) {
+			throw new StringIndexOutOfBoundsException(start);
+		}
+		if (limit > size) {
+			throw new StringIndexOutOfBoundsException(limit);
+		}
+		// 0 <= start <= limit holds here, so limit - start cannot overflow, whereas
+		// offset16 + start could wrap to a negative value and slip past the check.
+		if (offset16 < 0 || offset16 > limit - start) {
+			throw new StringIndexOutOfBoundsException(offset16);
+		}
+
+		int result = offset16 + start; // absolute index, start <= result <= limit
+		int count;
+		if (shift32 > 0) {
+			// result <= limit <= size, so size - result is non-negative and the
+			// comparison is safe even for shift32 close to Integer.MAX_VALUE.
+			if (shift32 > size - result) {
+				throw new StringIndexOutOfBoundsException(result);
+			}
+			count = shift32;
+			while (result < limit && count > 0) {
+				char ch = source[result];
+				// Step over both halves when a complete pair lies inside the subarray.
+				if (isLeadSurrogate(ch) && (result + 1 < limit) && isTrailSurrogate(source[result + 1])) {
+					result++;
+				}
+				count--;
+				result++;
+			}
+		} else {
+			// result >= 0 and shift32 <= 0, so this sum cannot overflow; once it
+			// passes, -shift32 is at most result - start and is safe to negate.
+			if (result + shift32 < start) {
+				throw new StringIndexOutOfBoundsException(result);
+			}
+			for (count = -shift32; count > 0; count--) {
+				result--;
+				if (result < start) {
+					// Ran off the front of the subarray; reported by the check below.
+					break;
+				}
+				char ch = source[result];
+				// Step back over both halves when a complete pair lies inside the subarray.
+				if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
+					result--;
+				}
+			}
+		}
+		// A non-zero remainder means the subarray ended before shift32 codepoints
+		// could be consumed.
+		if (count != 0) {
+			throw new StringIndexOutOfBoundsException(shift32);
+		}
+		return result - start;
+	}
+
+	// private data members -------------------------------------------------
+
+	/**
+	 * Number of bits a supplementary code point is shifted right by when its lead
+	 * surrogate is computed.
+	 */
+	private static final int LEAD_SURROGATE_SHIFT_ = 10;
+
+	/**
+	 * Mask selecting the low 10 bits of a code point (0x3FF), i.e. the part that
+	 * is stored in the trail surrogate.
+	 */
+	private static final int TRAIL_SURROGATE_MASK_ = (1 << 10) - 1;
+
+	/**
+	 * Base value added to the shifted code point to obtain its lead surrogate;
+	 * chosen so that {@link #SUPPLEMENTARY_MIN_VALUE} maps to
+	 * {@link #LEAD_SURROGATE_MIN_VALUE}.
+	 */
+	private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
+			- (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
+
+	// private methods ------------------------------------------------------
+
+	/**
+	 * <p>
+	 * Converts argument code point and returns a String object representing the
+	 * code point's value in UTF16 format.
+	 * <p>
+	 * This method does not check for the validity of the codepoint, the results are
+	 * not guaranteed if a invalid codepoint is passed as argument.
+	 * <p>
+	 * The result is a string whose length is 1 for non-supplementary code points, 2
+	 * otherwise.
+	 * 
+	 * @param ch code point
+	 * @return string representation of the code point
+	 */
+	private static String toString(int ch) {
+		if (ch < SUPPLEMENTARY_MIN_VALUE) {
+			return String.valueOf((char) ch);
+		}
+
+		StringBuilder result = new StringBuilder();
+		result.append(getLeadSurrogate(ch));
+		result.append(getTrailSurrogate(ch));
+		return result.toString();
+	}
+}