diff --git a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java b/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java index 9abc2059b6a94..ff077912d2c28 100644 --- a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java +++ b/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java @@ -22,15 +22,14 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ - package build.tools.generatecharacter; -import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; +import java.util.Arrays; import java.util.stream.Collectors; -import java.util.stream.Stream; +import java.util.stream.IntStream; public class CaseFolding { @@ -42,32 +41,54 @@ public static void main(String[] args) throws Throwable { var templateFile = Paths.get(args[0]); var caseFoldingTxt = Paths.get(args[1]); var genSrcFile = Paths.get(args[2]); - var supportedTypes = "^.*; [CTS]; .*$"; + + // java.lang + var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding var caseFoldingEntries = Files.lines(caseFoldingTxt) - .filter(line -> !line.startsWith("#") && line.matches(supportedTypes)) - .map(line -> { - String[] cols = line.split("; "); - return new String[] {cols[0], cols[1], cols[2]}; - }) - .filter(cols -> { - // the folding case doesn't map back to the original char. 
- var cp1 = Integer.parseInt(cols[0], 16); - var cp2 = Integer.parseInt(cols[2], 16); - return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1; - }) - .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2])) - .collect(Collectors.joining(",\n", "", "")); + .filter(line -> !line.startsWith("#") && line.matches(supportedTypes)) + .map(line -> { + var fields = line.split("; "); + var cp = Integer.parseInt(fields[0], 16); + fields = fields[2].trim().split(" "); + var folding = new int[fields.length]; + for (int i = 0; i < folding.length; i++) { + folding[i] = Integer.parseInt(fields[i], 16); + } + return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)", + cp, + Arrays.stream(folding) + .mapToObj(f -> String.format("0x%04x", f)) + .collect(Collectors.joining(", ", "new int[] {", "}")) + ); + }) + .collect(Collectors.joining(",\n", "", "")); + // util.regex + var expandedSupportedTypes = "^.*; [CTS]; .*$"; + var expanded_caseFoldingEntries = Files.lines(caseFoldingTxt) + .filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes)) + .map(line -> { + String[] cols = line.split("; "); + return new String[]{cols[0], cols[1], cols[2]}; + }) + .filter(cols -> { + // the folding case doesn't map back to the original char. + var cp1 = Integer.parseInt(cols[0], 16); + var cp2 = Integer.parseInt(cols[2], 16); + return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1; + }) + .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2])) + .collect(Collectors.joining(",\n", "", "")); // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's. // 0049; T; 0131; # LATIN CAPITAL LETTER I final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49); - // Generate .java file Files.write( - genSrcFile, - Files.lines(templateFile) - .map(line -> line.contains("%%%Entries") ? 
T_0x0131_0x49 + caseFoldingEntries : line) - .collect(Collectors.toList()), - StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); + genSrcFile, + Files.lines(templateFile) + .map(line -> line.contains("%%%Entries") ? caseFoldingEntries : line) + .map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? T_0x0131_0x49 + expanded_caseFoldingEntries : line) + .collect(Collectors.toList()), + StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); } } diff --git a/make/modules/java.base/gensrc/GensrcCharacterData.gmk b/make/modules/java.base/gensrc/GensrcCharacterData.gmk index c05b126299b9e..d2f9f55ca393f 100644 --- a/make/modules/java.base/gensrc/GensrcCharacterData.gmk +++ b/make/modules/java.base/gensrc/GensrcCharacterData.gmk @@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA) ################################################################################ + +GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java + +STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template +CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt + +$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT) + $(call LogInfo, Generating $@) + $(call MakeTargetDir) + $(TOOL_GENERATECASEFOLDING) \ + $(STRINGCASEFOLDING_TEMPLATE) \ + $(CASEFOLDINGTXT) \ + $(GENSRC_STRINGCASEFOLDING) + +TARGETS += $(GENSRC_STRINGCASEFOLDING) + + endif # include guard include MakeIncludeEnd.gmk diff --git a/make/modules/java.base/gensrc/GensrcRegex.gmk b/make/modules/java.base/gensrc/GensrcRegex.gmk index a30f22b34d4bf..c46a029e2c255 100644 --- a/make/modules/java.base/gensrc/GensrcRegex.gmk +++ b/make/modules/java.base/gensrc/GensrcRegex.gmk @@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK) ################################################################################ -GENSRC_CASEFOLDING := 
$(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java - -CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template -CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt - -$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT) - $(call LogInfo, Generating $@) - $(call MakeTargetDir) - $(TOOL_GENERATECASEFOLDING) \ - $(CASEFOLDINGTEMP) \ - $(CASEFOLDINGTXT) \ - $(GENSRC_CASEFOLDING) - -TARGETS += $(GENSRC_CASEFOLDING) - -################################################################################ - endif # include guard include MakeIncludeEnd.gmk diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index a18ac3250dc86..b78c02dffd91d 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -2180,6 +2180,7 @@ public boolean contentEquals(CharSequence cs) { * false} otherwise * * @see #equals(Object) + * @see #equalsFoldCase(String) * @see #codePoints() */ public boolean equalsIgnoreCase(String anotherString) { @@ -2189,6 +2190,56 @@ public boolean equalsIgnoreCase(String anotherString) { && regionMatches(true, 0, anotherString, 0, length()); } + /** + * Compares this {@code String} to another {@code String} for equality, + * using {@index "Unicode case folding"}. Two strings are considered equal + * by this method if their case-folded forms are identical. + *
+ * Case folding is defined by the Unicode Standard in + * CaseFolding.txt, + * including 1:M mappings. For example, {@code "Fuß".equalsFoldCase("FUSS")} + * returns {@code true}, since the character {@code U+00DF} (sharp s) folds + * to {@code "ss"}. + *
+ * Case folding is locale-independent and language-neutral, unlike + * locale-sensitive transformations such as {@link #toLowerCase()} or + * {@link #toUpperCase()}. It is intended for caseless matching, + * searching, and indexing. + * + * @apiNote + * This method is the Unicode-compliant alternative to + * {@link #equalsIgnoreCase(String)}. It implements full case folding as + * defined by the Unicode Standard, which may differ from the simpler + * per-character mapping performed by {@code equalsIgnoreCase}. + * For example: + *
{@snippet lang=java : + * String a = "Fuß"; + * String b = "FUSS"; + * boolean equalsFoldCase = a.equalsFoldCase(b); // returns true + * boolean equalsIgnoreCase = a.equalsIgnoreCase(b); // returns false + * }+ * + * @param anotherString + * The {@code String} to compare this {@code String} against + * + * @return {@code true} if the given object is not {@code null} and represents + * the same sequence of characters as this string under Unicode case + * folding; {@code false} otherwise. + * + * @see #compareToFoldCase(String) + * @see #equalsIgnoreCase(String) + * @since 26 + */ + public boolean equalsFoldCase(String anotherString) { + if (this == anotherString) { + return true; + } + if (anotherString == null) { + return false; + } + return UNICODE_CASEFOLD_ORDER.compare(this, anotherString) == 0; + } + /** * Compares two strings lexicographically. * The comparison is based on the Unicode value of each character in @@ -2304,12 +2355,84 @@ public int compare(String s1, String s2) { * than this String, ignoring case considerations. * @see java.text.Collator * @see #codePoints() + * @see #compareToFoldCase(String) * @since 1.2 */ public int compareToIgnoreCase(String str) { return CASE_INSENSITIVE_ORDER.compare(this, str); } + /** + * A Comparator that orders {@code String} objects as by + * {@link #compareToFoldCase(String) compareToFoldCase()}. + * + * @see #compareToFoldCase(String) + * @since 26 + */ + public static final Comparator
+ * Case folding is a locale-independent, language-neutral form of case mapping, + * primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)}, + * which applies a simpler locale-insensitive uppercase mapping, this method + * follows the Unicode {@index "full"} case folding, providing stable and + * consistent results across all environments. + *
+ * Note that this method does not take locale into account, and may + * produce results that differ from locale-sensitive ordering. Use + * {@link java.text.Collator} for locale-sensitive comparison. + * + * @apiNote + * This method is the Unicode-compliant alternative to + * {@link #compareToIgnoreCase(String)}. It implements the + * {@index "full case folding"} as defined by the Unicode Standard, which + * may differ from the simpler per-character mapping performed by + * {@code compareToIgnoreCase}. + * For example: + *
{@snippet lang=java : + * String a = "Fuß"; + * String b = "FUSS"; + * int cmpFoldCase = a.compareToFoldCase(b); // returns 0 + * int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0 + * }+ * + * @param str the {@code String} to be compared. + * @return a negative integer, zero, or a positive integer as the specified + * String is greater than, equal to, or less than this String, + * ignoring case considerations by case folding. + * @see java.text.Collator + * @see #compareToIgnoreCase(String) + * @see #equalsFoldCase(String) + * @since 26 + */ + public int compareToFoldCase(String str) { + return UNICODE_CASEFOLD_ORDER.compare(this, str); + } + /** * Tests if two string regions are equal. *
diff --git a/src/java.base/share/classes/java/lang/StringLatin1.java b/src/java.base/share/classes/java/lang/StringLatin1.java index 61c62d049bcf1..7b99db9dc93bf 100644 --- a/src/java.base/share/classes/java/lang/StringLatin1.java +++ b/src/java.base/share/classes/java/lang/StringLatin1.java @@ -32,6 +32,8 @@ import java.util.function.IntConsumer; import java.util.stream.Stream; import java.util.stream.StreamSupport; + +import jdk.internal.java.lang.CaseFolding; import jdk.internal.util.ArraysSupport; import jdk.internal.vm.annotation.IntrinsicCandidate; @@ -62,6 +64,10 @@ static int length(byte[] value) { return value.length; } + static int codePointAt(byte[] value, int index, int end) { + return value[index] & 0xff; + } + static char[] toChars(byte[] value) { char[] dst = new char[value.length]; inflate(value, 0, dst, 0, value.length); @@ -179,6 +185,140 @@ static int compareToCI_UTF16(byte[] value, byte[] other) { return len1 - len2; } + private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) { + int[] folded1 = null; + int[] folded2 = null; + int k1 = off, k2 = ooff, fk1 = 0, fk2 = 0; + while ((k1 < last || folded1 != null && fk1 < folded1.length) && + (k2 < olast || folded2 != null && fk2 < folded2.length)) { + int c1, c2; + if (folded1 != null && fk1 < folded1.length) { + c1 = folded1[fk1++]; + } else { + c1 = getChar(value, k1++); + folded1 = CaseFolding.foldIfDefined(c1); + fk1 = 0; + if (folded1 != null) { + c1 = folded1[fk1++]; + } + } + if (folded2 != null && fk2 < folded2.length) { + c2 = folded2[fk2++]; + } else { + c2 = getChar(other, k2++); + folded2 = CaseFolding.foldIfDefined(c2); + fk2 = 0; + if (folded2 != null) { + c2 = folded2[fk2++]; + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < last || folded1 != null && fk1 < folded1.length) { + return 1; + } + if (k2 < olast || folded2 != null && fk2 < folded2.length) { + return -1; + } + return 0; + } + + static int compareToFC(byte[] value, 
byte[] other) { + int len = value.length; + int olen = other.length; + int lim = Math.min(len, olen); + for (int k = 0; k < lim; k++) { + byte b1 = value[k]; + byte b2 = other[k]; + if (CharacterDataLatin1.equalsIgnoreCase(b1, b2)) { + continue; + } + int c1 = b1 & 0xff; + int c2 = b2 & 0xff; + if (c1 == 0xdf || c2 == 0xdf) { + return compareToFC0(value, k, len, other, k, olen); + } + return Character.toLowerCase(c1) - Character.toLowerCase(c2); + + } + return len - olen; + } + + private static int compareToFC0_UTF16(byte[] value, int off, int last, byte[] other, int ooff, int olast) { + int[] folded1 = null; + int[] folded2 = null; + int k1 = off, k2 = ooff, fk1 = 0, fk2 = 0; + while ((k1 < last || folded1 != null && fk1 < folded1.length) && + (k2 < olast || folded2 != null && fk2 < folded2.length)) { + int c1, c2; + if (folded1 != null && fk1 < folded1.length) { + c1 = folded1[fk1++]; + } else { + c1 = getChar(value, k1++); + folded1 = CaseFolding.foldIfDefined(c1); + fk1 = 0; + if (folded1 != null) { + c1 = folded1[fk1++]; + } + } + if (folded2 != null && fk2 < folded2.length) { + c2 = folded2[fk2++]; + } else { + c2 = StringUTF16.codePointAt(other, k2, olast, true); + k2 += Character.charCount(c2); + folded2 = CaseFolding.foldIfDefined(c2); + fk2 = 0; + if (folded2 != null) { + c2 = folded2[fk2++]; + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < last || folded1 != null && fk1 < folded1.length) { + return 1; + } + if (k2 < olast || folded2 != null && fk2 < folded2.length) { + return -1; + } + return 0; + } + + // latin1 vs utf16 + static int compareToFC_UTF16(byte[] value, byte[] other) { + int last = length(value); + int olast = StringUTF16.length(other); + int lim = Math.min(last, olast); + for (int k = 0; k < lim; k++) { + int cp1 = getChar(value, k); + int cp2 = StringUTF16.codePointAt(other, k, olast, true); + if (cp1 == cp2) { + continue; + } + int[] folded = CaseFolding.foldIfDefined(cp1); + if (folded != null) { + if (folded.length > 1) { 
+ return compareToFC0_UTF16(value, k, last, other, k, olast); + } + cp1 = folded[0]; + } + folded = CaseFolding.foldIfDefined(cp2); + if (folded != null) { + if (folded.length > 1) { + return compareToFC0_UTF16(value, k, last, other, k, olast); + } + cp2 = folded[0]; + } + if (cp1 != cp2) { + return cp1 - cp2; + } + } + return last - olast; + } + static int hashCode(byte[] value) { return ArraysSupport.hashCodeOfUnsigned(value, 0, value.length, 0); } diff --git a/src/java.base/share/classes/java/lang/StringUTF16.java b/src/java.base/share/classes/java/lang/StringUTF16.java index 4e31c9728e9cd..f3bed250a0c4e 100644 --- a/src/java.base/share/classes/java/lang/StringUTF16.java +++ b/src/java.base/share/classes/java/lang/StringUTF16.java @@ -34,6 +34,7 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; +import jdk.internal.java.lang.CaseFolding; import jdk.internal.misc.Unsafe; import jdk.internal.util.ArraysSupport; import jdk.internal.vm.annotation.ForceInline; @@ -93,7 +94,7 @@ static int length(byte[] value) { return value.length >> 1; } - private static int codePointAt(byte[] value, int index, int end, boolean checked) { + static int codePointAt(byte[] value, int index, int end, boolean checked) { assert index < end; if (checked) { checkIndex(index, value); @@ -592,6 +593,71 @@ static int compareToCI_Latin1(byte[] value, byte[] other) { return -StringLatin1.compareToCI_UTF16(other, value); } + public static int compareToFC_Latin1(byte[] value, byte[] other) { + return -StringLatin1.compareToFC_UTF16(other, value); + } + + private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) { + int[] folded1 = null; + int[] folded2 = null; + int k1 = off, k2 = ooff, fk1 = 0, fk2 = 0; + while ((k1 < last || folded1 != null && fk1 < folded1.length) && + (k2 < olast || folded2 != null && fk2 < folded2.length)) { + int c1, c2; + if (folded1 != null && fk1 < folded1.length) { + c1 = folded1[fk1++]; + } else { + c1 = 
codePointAt(value, k1, last, true); + k1 += Character.charCount(c1); + folded1 = CaseFolding.foldIfDefined(c1); + fk1 = 0; + if (folded1 != null) { + c1 = folded1[fk1++]; + } + } + if (folded2 != null && fk2 < folded2.length) { + c2 = folded2[fk2++]; + } else { + c2 = codePointAt(other, k2, olast, true); + k2 += Character.charCount(c2); + folded2 = CaseFolding.foldIfDefined(c2); + fk2 = 0; + if (folded2 != null) { + c2 = folded2[fk2++]; + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < last || folded1 != null && fk1 < folded1.length) { + return 1; + } + if (k2 < olast || folded2 != null && fk2 < folded2.length) { + return -1; + } + return 0; + } + + public static int compareToFC(byte[] value, byte[] other) { + int tlast = length(value); + int olast = length(other); + int k = 0; + while (k < tlast && k < olast) { + int cp1 = codePointAt(value, k, tlast, true); + int cp2 = codePointAt(other, k, olast, true); + if (cp1 == cp2) { + k += Character.charCount(cp1); + continue; + } + if (CaseFolding.isDefined(cp1) || CaseFolding.isDefined(cp2)) { + return compareToFC0(value, k, tlast, other, k, olast); + } + return cp1 - cp2; + } + return tlast - olast; + } + static int hashCode(byte[] value) { return ArraysSupport.hashCodeOfUTF16(value, 0, value.length >> 1, 0); } diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java index 2908370acd551..8e50b66ca86e6 100644 --- a/src/java.base/share/classes/java/util/regex/Pattern.java +++ b/src/java.base/share/classes/java/util/regex/Pattern.java @@ -43,8 +43,8 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; +import jdk.internal.java.lang.CaseFolding; import jdk.internal.util.ArraysSupport; -import jdk.internal.util.regex.CaseFolding; import jdk.internal.util.regex.Grapheme; /** diff --git a/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template 
b/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template new file mode 100644 index 0000000000000..d63cd040271f1 --- /dev/null +++ b/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package jdk.internal.java.lang; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static java.util.Map.entry; + +/** + * Utility class that handles Unicode case folding properties defined in + * CasingFolding.txt, including 1:M full case folding. + */ + public final class CaseFolding { + + private CaseFolding() {} + + /** + * Tests whether the specified code point is already in its case-folded form. + *
+ * A code point is considered folded if it does not have an explicit case + * folding mapping in the Unicode CaseFolding data. + * + * @param cp + * the Unicode code point to test + * @return {@code true} if the given code point has no case + * folding mapping (that is, it is already folded); + * {@code false} otherwise + * + * @see #fold(int) + */ + public static boolean isFolded(int cp) { + return caseFoldingMap.get(cp) == null; + } + + /** + * Tests whether the specified code point has a folding mapping entry defined. + * + * @param cp + * the Unicode code point to test + * @return {@code true} if the given code point has a case folding mapping entry + * defined in {@code caseFoldingMap}, {@code false} otherwise + */ + public static boolean isDefined(int cp) { + return caseFoldingMap.get(cp) != null; + } + + /** + * Returns the case-folded form of the specified code point, according + * to the Unicode case folding mappings. + *
+ * If the code point has no case folding mapping, this method returns + * the original code point as a single-element array. Otherwise, it + * returns the mapped form, which may consist of one or more code + * points (to support 1:M mappings). + * + * @param cp + * the Unicode code point to fold + * @return an array of code points representing the + * case-folded form of the input code point + * + * @see #isFolded(int) + */ + public static int[] fold(int cp) { + var entry = caseFoldingMap.get(cp); + if (entry != null) + return entry.folding; + return new int[] { cp }; + } + + /** + * Returns the case-folded form of the specified code point defined + * by the Unicode case folding mappings. + *
+ * If the code point has no case folding mapping defined, this method + * returns {@code null}. Otherwise, it returns the mapped form, which may consist + * of one or more code points (to support 1:M mappings). + * + * @param cp + * the Unicode code point to fold + * @return an array of code points representing the + * case-folded form of the input code point, or {@code null} if + * there is no mapping defined. + */ + public static int[] foldIfDefined(int cp) { + var entry = caseFoldingMap.get(cp); + return entry != null ? entry.folding : null; + } + + /** + * Returns a case-folded copy of the given {@code String} object, using the + * Unicode case folding mappings defined in + * + * Unicode Case Folding Properties. + *
+ * This is a convenience method intended primarily for testing + * {@link #isFolded(int)} and {@link #fold(int)}. Its implementation is + * not optimized for performance and should not be used in performance- + * sensitive contexts. It exists only until a dedicated + * {@code String.toCaseFold()} method is introduced. + * + * @param s + * the input string + * @return a {@code String} containing the case-folded form of the input string + */ + public static String fold(String s) { + int first; + int len = s.length(); + int cpCnt = 1; + for (first = 0; first < len; first += cpCnt) { + int cp = s.codePointAt(first); + if (!CaseFolding.isFolded(cp)) { + break; + } + cpCnt = Character.charCount(cp); + } + if (first == len) { + return s; + } + StringBuilder sb = new StringBuilder(len); + sb.append(s, 0, first); + for (int i = first; i < len; i += cpCnt) { + int cp = s.codePointAt(i); + if (CaseFolding.isFolded(cp)) { + sb.appendCodePoint(cp); + } else { + int[] folded = CaseFolding.fold(cp); + if (folded != null) { + for (int f : folded) { + sb.appendCodePoint(f); + } + } + } + cpCnt = Character.charCount(cp); + } + return sb.toString(); + } + + /** + * Returns an expansion set to "close" a given regex Unicode character class range for case-sensitive + * matching, according to the + * Simple Loose Matches + * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions. + *
+ * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must + * be applied to literals and (optionally) to character classes. When applied to character classes, each + * character class is expected to be closed under simple case folding. See the standard for the + * detailed explanation and example of "closed". + *
+ * RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should + *
+ * In the {@code Pattern} implementation, 5 types of constructs may be case-sensitive when matching: + * back-refs, string slice (sequences), single, family(char-property) and class range. Single and + * family may appear independently or within a class. + *
+ * For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and + * {@code toLowerCase} to both the pattern and the input string. This effectively 'closes' the class for + * matching. + *
+ * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5, + * if their behavior is clearly specified. + *
+ * This method addresses that requirement for the "range" construct within a character class by computing + * the additional characters that should be included to close the range under simple case folding: + *
+ * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple + * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped + * character is not already in the range, then that mapped character (typically lowercase) is added to + * the expansion set. + *
+ * This allows regex character class "range" implementation to use the returned expansion set to support + * additional case-insensitive matching, without duplicating characters already covered by the existing + * regex range implementation. The expectation is the matching is done using both the uppercase and + * lowercase forms of the input character, for example + * + *
{@code + * + * ch -> inRange(lower, Character.toUpperCase(ch), upper) || + * inRange(lower, Character.toLowerCase(ch), upper) || + * additionalClosingCharacters.contains(Character.toUpperCase(ch)) || + * additionalClosingCharacters.contains(Character.toLowerCase(ch)) + * }+ * + *
+ * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
+ * @param start the starting code point of the character range
+ * @param end the ending code point of the character range
+ * @return an {@code int[]} containing all simple case equivalents of characters in the range, excluding
+ * those already in the range
+ */
+ public static int[] getClassRangeClosingCharacters(int start, int end) {
+ int[] expanded = new int[expanded_case_cps.length];
+ int off = 0;
+ for (int cp : expanded_case_cps) {
+ if (cp >= start && cp <= end) {
+ int folding = expanded_case_map.get(cp);
+ if (folding < start || folding > end) {
+ expanded[off++] = folding;
+ }
+ }
+ }
+ return Arrays.copyOf(expanded, off);
+ }
+
+ private static final Map
- * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
- * be applied to literals and (optionally) to character classes. When applied to character classes, each
- * character class is expected to be closed under simple case folding. See the standard for the
- * detailed explanation and example of "closed".
- *
- * RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should
- *
- * In the {@code Pattern} implementation, 5 types of constructs maybe case-sensitive when matching:
- * back-refs, string slice (sequences), single, family(char-property) and class range. Single and
- * family may appears independently or within a class.
- *
- * For loose/case-insensitive matching, the back-refs, slices and singles apply {code toUpperCase} and
- * {@code toLowerCase} to both the pattern and the input string. This effectively 'close' the class for
- * matching.
- *
- * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
- * if their behavior is clearly specified.
- *
- * This method addresses that requirement for the "range" construct within in character class by computing
- * the additional characters that should be included to close the range under simple case folding:
- *
- * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
- * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
- * character is not already in the range, then that mapped character (typically lowercase) is added to
- * the expansion set.
- *
- * This allows regex character class "range" implementation to use the returned expansion set to support
- * additional case-insensitive matching, without duplicating characters already covered by the existing
- * regex range implementation. The expectation is the matching is done using both the uppercase and
- * lowercase forms of the input character, for example
- *
- *
- * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
- * @param start the starting code point of the character range
- * @param end the ending code point of the character range
- * @return a {@code int[]} containing the all simple case equivalents of characters in the range, excluding
- * those already in the range
- */
- public static int[] getClassRangeClosingCharacters(int start, int end) {
- int[] expanded = new int[expanded_case_cps.length];
- int off = 0;
- for (int cp : expanded_case_cps) {
- if (cp >= start && cp <= end) {
- int folding = expanded_case_map.get(cp);
- if (folding < start || folding > end) {
- expanded[off++] = folding;
- }
- }
- }
- return Arrays.copyOf(expanded, off);
- }
-}
diff --git a/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java b/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java
new file mode 100644
index 0000000000000..dd7d73af4e735
--- /dev/null
+++ b/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @summary tests unicode case-folding based String comparison and equality
+ * @bug 4397357
+ * @library /lib/testlibrary/java/lang
+ * @compile --add-exports java.base/jdk.internal.java.lang=ALL-UNNAMED
+ * UnicodeCaseFoldingTest.java
+ * @run junit/othervm --add-exports java.base/jdk.internal.java.lang=ALL-UNNAMED
+ * UnicodeCaseFoldingTest
+ */
+import java.nio.file.Files;
+import java.util.stream.Stream;
+import java.util.stream.Collectors;
+import java.util.ArrayList;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import jdk.internal.java.lang.CaseFolding;
+
+public class UnicodeCaseFoldingTest {
+
+ @Test
+ void testAllCommnFullCodePointsListedInCaseFoldinigTxt() throws Throwable {
+ var filter = "^.*; [CF]; .*$"; // C=common, F=full, for full case folding
+ var results = Files.lines(UCDFiles.CASEFOLDING)
+ .filter(line -> !line.startsWith("#") && line.matches(filter))
+ .map(line -> {
+ var fields = line.split("; ");
+ var cp = Integer.parseInt(fields[0], 16);
+ fields = fields[2].trim().split(" ");
+ var folding = new int[fields.length];
+ for (int i = 0; i < folding.length; i++) {
+ folding[i] = Integer.parseInt(fields[i], 16);
+ }
+ var source = new String(Character.toChars(cp));
+ var expected = new String(folding, 0, folding.length);
+ // (1) Verify the folding result matches expected
+ assertEquals(expected, CaseFolding.fold(source), "CaseFolding.fold()");
+
+ // (2) Verify compareToFoldCase() result
+ assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)");
+ assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)");
+
+ // (3) Verify equalsFoldCase() result
+ assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)");
+ assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)");
+ return null;
+ })
+ .filter(error -> error != null)
+ .toArray();
+ assertEquals(0, results.length);
+ }
+
+ @Test
+ void testAllSimpleCodePointsListedInCaseFoldinigTxt() throws Throwable {
+ // S=simple, for simple case folding. The simple case folding should still match
+ var filter = "^.*; [S]; .*$";
+ var results = Files.lines(UCDFiles.CASEFOLDING)
+ .filter(line -> !line.startsWith("#") && line.matches(filter))
+ .map(line -> {
+ var fields = line.split("; ");
+ var cp = Integer.parseInt(fields[0], 16);
+ fields = fields[2].trim().split(" ");
+ var folding = new int[fields.length];
+ for (int i = 0; i < folding.length; i++) {
+ folding[i] = Integer.parseInt(fields[i], 16);
+ }
+ var source = new String(Character.toChars(cp));
+ var expected = new String(folding, 0, folding.length);
+
+ // (1) Verify compareToFoldCase() result
+ assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)");
+ assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)");
+
+ // (2) Verify equalsFoldCase() result
+ assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)");
+ assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)");
+ return null;
+ })
+ .filter(error -> error != null)
+ .toArray();
+ assertEquals(0, results.length);
+ }
+
+ @Test
+ public void testAllCodePointsFoldToThemselvesIfNotListed() throws Exception {
+ // Collect all code points that appear in CaseFolding.txt
+ var listed = Files.lines(UCDFiles.CASEFOLDING)
+ .filter(line -> !line.startsWith("#") && line.matches("^.*; [CF]; .*$"))
+ .map(line -> Integer.parseInt(line.split("; ")[0], 16))
+ .collect(Collectors.toSet());
+
+ var failures = new ArrayList
- *
- * {@code
- *
- * ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
- * inRange(lower, Character.toLower(ch), upper) ||
- * additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
- * additionalClosingCharacters.contains(Character.toUpperCase(ch))
- * }
- *
- *