diff --git a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java b/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java index 9abc2059b6a94..ff077912d2c28 100644 --- a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java +++ b/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java @@ -22,15 +22,14 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ - package build.tools.generatecharacter; -import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; +import java.util.Arrays; import java.util.stream.Collectors; -import java.util.stream.Stream; +import java.util.stream.IntStream; public class CaseFolding { @@ -42,32 +41,54 @@ public static void main(String[] args) throws Throwable { var templateFile = Paths.get(args[0]); var caseFoldingTxt = Paths.get(args[1]); var genSrcFile = Paths.get(args[2]); - var supportedTypes = "^.*; [CTS]; .*$"; + + // java.lang + var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding var caseFoldingEntries = Files.lines(caseFoldingTxt) - .filter(line -> !line.startsWith("#") && line.matches(supportedTypes)) - .map(line -> { - String[] cols = line.split("; "); - return new String[] {cols[0], cols[1], cols[2]}; - }) - .filter(cols -> { - // the folding case doesn't map back to the original char. 
- var cp1 = Integer.parseInt(cols[0], 16); - var cp2 = Integer.parseInt(cols[2], 16); - return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1; - }) - .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2])) - .collect(Collectors.joining(",\n", "", "")); + .filter(line -> !line.startsWith("#") && line.matches(supportedTypes)) + .map(line -> { + var fields = line.split("; "); + var cp = Integer.parseInt(fields[0], 16); + fields = fields[2].trim().split(" "); + var folding = new int[fields.length]; + for (int i = 0; i < folding.length; i++) { + folding[i] = Integer.parseInt(fields[i], 16); + } + return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)", + cp, + Arrays.stream(folding) + .mapToObj(f -> String.format("0x%04x", f)) + .collect(Collectors.joining(", ", "new int[] {", "}")) + ); + }) + .collect(Collectors.joining(",\n", "", "")); + // util.regex + var expandedSupportedTypes = "^.*; [CTS]; .*$"; + var expanded_caseFoldingEntries = Files.lines(caseFoldingTxt) + .filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes)) + .map(line -> { + String[] cols = line.split("; "); + return new String[]{cols[0], cols[1], cols[2]}; + }) + .filter(cols -> { + // the folding case doesn't map back to the original char. + var cp1 = Integer.parseInt(cols[0], 16); + var cp2 = Integer.parseInt(cols[2], 16); + return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1; + }) + .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2])) + .collect(Collectors.joining(",\n", "", "")); // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's. // 0049; T; 0131; # LATIN CAPITAL LETTER I final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49); - // Generate .java file Files.write( - genSrcFile, - Files.lines(templateFile) - .map(line -> line.contains("%%%Entries") ? 
T_0x0131_0x49 + caseFoldingEntries : line) - .collect(Collectors.toList()), - StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); + genSrcFile, + Files.lines(templateFile) + .map(line -> line.contains("%%%Entries") ? caseFoldingEntries : line) + .map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? T_0x0131_0x49 + expanded_caseFoldingEntries : line) + .collect(Collectors.toList()), + StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); } } diff --git a/make/modules/java.base/gensrc/GensrcCharacterData.gmk b/make/modules/java.base/gensrc/GensrcCharacterData.gmk index c05b126299b9e..d2f9f55ca393f 100644 --- a/make/modules/java.base/gensrc/GensrcCharacterData.gmk +++ b/make/modules/java.base/gensrc/GensrcCharacterData.gmk @@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA) ################################################################################ + +GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java + +STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template +CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt + +$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT) + $(call LogInfo, Generating $@) + $(call MakeTargetDir) + $(TOOL_GENERATECASEFOLDING) \ + $(STRINGCASEFOLDING_TEMPLATE) \ + $(CASEFOLDINGTXT) \ + $(GENSRC_STRINGCASEFOLDING) + +TARGETS += $(GENSRC_STRINGCASEFOLDING) + + endif # include guard include MakeIncludeEnd.gmk diff --git a/make/modules/java.base/gensrc/GensrcRegex.gmk b/make/modules/java.base/gensrc/GensrcRegex.gmk index a30f22b34d4bf..c46a029e2c255 100644 --- a/make/modules/java.base/gensrc/GensrcRegex.gmk +++ b/make/modules/java.base/gensrc/GensrcRegex.gmk @@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK) ################################################################################ -GENSRC_CASEFOLDING := 
$(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java - -CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template -CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt - -$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT) - $(call LogInfo, Generating $@) - $(call MakeTargetDir) - $(TOOL_GENERATECASEFOLDING) \ - $(CASEFOLDINGTEMP) \ - $(CASEFOLDINGTXT) \ - $(GENSRC_CASEFOLDING) - -TARGETS += $(GENSRC_CASEFOLDING) - -################################################################################ - endif # include guard include MakeIncludeEnd.gmk diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index a18ac3250dc86..b78c02dffd91d 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -2180,6 +2180,7 @@ public boolean contentEquals(CharSequence cs) { * false} otherwise * * @see #equals(Object) + * @see #equalsFoldCase(String) * @see #codePoints() */ public boolean equalsIgnoreCase(String anotherString) { @@ -2189,6 +2190,56 @@ public boolean equalsIgnoreCase(String anotherString) { && regionMatches(true, 0, anotherString, 0, length()); } + /** + * Compares this {@code String} to another {@code String} for equality, + * using {@index "Unicode case folding"}. Two strings are considered equal + * by this method if their case-folded forms are identical. + *

+ * Case folding is defined by the Unicode Standard in + * CaseFolding.txt, + * including 1:M mappings. For example, {@code "Fuß".equalsFoldCase("FUSS")} + * returns {@code true}, since the character {@code U+00DF} (sharp s) folds + * to {@code "ss"}. + *

+ * Case folding is locale-independent and language-neutral, unlike + * locale-sensitive transformations such as {@link #toLowerCase()} or + * {@link #toUpperCase()}. It is intended for caseless matching, + * searching, and indexing. + * + * @apiNote + * This method is the Unicode-compliant alternative to + * {@link #equalsIgnoreCase(String)}. It implements full case folding as + * defined by the Unicode Standard, which may differ from the simpler + * per-character mapping performed by {@code equalsIgnoreCase}. + * For example: + *

{@snippet lang=java :
+     * String a = "Fuß";
+     * String b = "FUSS";
+     * boolean equalsFoldCase = a.equalsFoldCase(b);       // returns true
+     * boolean equalsIgnoreCase = a.equalsIgnoreCase(b);   // returns false
+     * }
+ * + * @param anotherString + * The {@code String} to compare this {@code String} against + * + * @return {@code true} if the given object is not {@code null} and represents + * the same sequence of characters as this string under Unicode case + * folding; {@code false} otherwise. + * + * @see #compareToFoldCase(String) + * @see #equalsIgnoreCase(String) + * @since 26 + */ + public boolean equalsFoldCase(String anotherString) { + if (this == anotherString) { + return true; + } + if (anotherString == null) { + return false; + } + return UNICODE_CASEFOLD_ORDER.compare(this, anotherString) == 0; + } + /** * Compares two strings lexicographically. * The comparison is based on the Unicode value of each character in @@ -2304,12 +2355,84 @@ public int compare(String s1, String s2) { * than this String, ignoring case considerations. * @see java.text.Collator * @see #codePoints() + * @see #compareToFoldCase(String) * @since 1.2 */ public int compareToIgnoreCase(String str) { return CASE_INSENSITIVE_ORDER.compare(this, str); } + /** + * A Comparator that orders {@code String} objects as by + * {@link #compareToFoldCase(String) compareToFoldCase()}. + * + * @see #compareToFoldCase(String) + * @since 26 + */ + public static final Comparator UNICODE_CASEFOLD_ORDER + = new FoldCaseComparator(); + + private static class FoldCaseComparator implements Comparator { + + @Override + public int compare(String s1, String s2) { + byte[] v1 = s1.value; + byte[] v2 = s2.value; + if (s1.coder == s2.coder()) { + return s1.coder == LATIN1 ? StringLatin1.compareToFC(v1, v2) + : StringUTF16.compareToFC(v1, v2); + } + return s1.coder == LATIN1 ? StringLatin1.compareToFC_UTF16(v1, v2) + : StringUTF16.compareToFC_Latin1(v1, v2); + } + } + + /** + * Compares two strings lexicographically using {@index "Unicode case folding"}. + * This method returns an integer whose sign is that of calling {@code compareTo} + * on the Unicode case folded version of the strings. 
Unicode Case folding + * eliminates differences in case according to the Unicode Standard, using the + * mappings defined in + * CaseFolding.txt, + * including 1:M mappings, such as {@code "ß"} → {@code "ss"}. + *

+ * Case folding is a locale-independent, language-neutral form of case mapping, + * primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)}, + * which applies a simpler locale-insensitive uppercase mapping, this method + * follows the Unicode {@index "full"} case folding, providing stable and + * consistent results across all environments. + *

+ * Note that this method does not take locale into account, and may + * produce results that differ from locale-sensitive ordering. Use + * {@link java.text.Collator} for locale-sensitive comparison. + * + * @apiNote + * This method is the Unicode-compliant alternative to + * {@link #compareToIgnoreCase(String)}. It implements the + * {@index "full case folding"} as defined by the Unicode Standard, which + * may differ from the simpler per-character mapping performed by + * {@code compareToIgnoreCase}. + * For example: + *

{@snippet lang=java :
+     * String a = "Fuß";
+     * String b = "FUSS";
+     * int cmpFoldCase = a.compareToFoldCase(b);     // returns 0
+     * int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0
+     * }
+ * + * @param str the {@code String} to be compared. + * @return a negative integer, zero, or a positive integer as the specified + * String is greater than, equal to, or less than this String, + * ignoring case considerations by case folding. + * @see java.text.Collator + * @see #compareToIgnoreCase(String) + * @see #equalsFoldCase(String) + * @since 26 + */ + public int compareToFoldCase(String str) { + return UNICODE_CASEFOLD_ORDER.compare(this, str); + } + /** * Tests if two string regions are equal. *

diff --git a/src/java.base/share/classes/java/lang/StringLatin1.java b/src/java.base/share/classes/java/lang/StringLatin1.java index 61c62d049bcf1..7b99db9dc93bf 100644 --- a/src/java.base/share/classes/java/lang/StringLatin1.java +++ b/src/java.base/share/classes/java/lang/StringLatin1.java @@ -32,6 +32,8 @@ import java.util.function.IntConsumer; import java.util.stream.Stream; import java.util.stream.StreamSupport; + +import jdk.internal.java.lang.CaseFolding; import jdk.internal.util.ArraysSupport; import jdk.internal.vm.annotation.IntrinsicCandidate; @@ -62,6 +64,10 @@ static int length(byte[] value) { return value.length; } + static int codePointAt(byte[] value, int index, int end) { + return value[index] & 0xff; + } + static char[] toChars(byte[] value) { char[] dst = new char[value.length]; inflate(value, 0, dst, 0, value.length); @@ -179,6 +185,140 @@ static int compareToCI_UTF16(byte[] value, byte[] other) { return len1 - len2; } + private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) { + int[] folded1 = null; + int[] folded2 = null; + int k1 = off, k2 = ooff, fk1 = 0, fk2 = 0; + while ((k1 < last || folded1 != null && fk1 < folded1.length) && + (k2 < olast || folded2 != null && fk2 < folded2.length)) { + int c1, c2; + if (folded1 != null && fk1 < folded1.length) { + c1 = folded1[fk1++]; + } else { + c1 = getChar(value, k1++); + folded1 = CaseFolding.foldIfDefined(c1); + fk1 = 0; + if (folded1 != null) { + c1 = folded1[fk1++]; + } + } + if (folded2 != null && fk2 < folded2.length) { + c2 = folded2[fk2++]; + } else { + c2 = getChar(other, k2++); + folded2 = CaseFolding.foldIfDefined(c2); + fk2 = 0; + if (folded2 != null) { + c2 = folded2[fk2++]; + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < last || folded1 != null && fk1 < folded1.length) { + return 1; + } + if (k2 < olast || folded2 != null && fk2 < folded2.length) { + return -1; + } + return 0; + } + + static int compareToFC(byte[] value, 
byte[] other) { + int len = value.length; + int olen = other.length; + int lim = Math.min(len, olen); + for (int k = 0; k < lim; k++) { + byte b1 = value[k]; + byte b2 = other[k]; + if (CharacterDataLatin1.equalsIgnoreCase(b1, b2)) { + continue; + } + int c1 = b1 & 0xff; + int c2 = b2 & 0xff; + if (c1 == 0xdf || c2 == 0xdf) { + return compareToFC0(value, k, len, other, k, olen); + } + return Character.toLowerCase(c1) - Character.toLowerCase(c2); + + } + return len - olen; + } + + private static int compareToFC0_UTF16(byte[] value, int off, int last, byte[] other, int ooff, int olast) { + int[] folded1 = null; + int[] folded2 = null; + int k1 = off, k2 = ooff, fk1 = 0, fk2 = 0; + while ((k1 < last || folded1 != null && fk1 < folded1.length) && + (k2 < olast || folded2 != null && fk2 < folded2.length)) { + int c1, c2; + if (folded1 != null && fk1 < folded1.length) { + c1 = folded1[fk1++]; + } else { + c1 = getChar(value, k1++); + folded1 = CaseFolding.foldIfDefined(c1); + fk1 = 0; + if (folded1 != null) { + c1 = folded1[fk1++]; + } + } + if (folded2 != null && fk2 < folded2.length) { + c2 = folded2[fk2++]; + } else { + c2 = StringUTF16.codePointAt(other, k2, olast, true); + k2 += Character.charCount(c2); + folded2 = CaseFolding.foldIfDefined(c2); + fk2 = 0; + if (folded2 != null) { + c2 = folded2[fk2++]; + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < last || folded1 != null && fk1 < folded1.length) { + return 1; + } + if (k2 < olast || folded2 != null && fk2 < folded2.length) { + return -1; + } + return 0; + } + + // latin1 vs utf16 + static int compareToFC_UTF16(byte[] value, byte[] other) { + int last = length(value); + int olast = StringUTF16.length(other); + int lim = Math.min(last, olast); + for (int k = 0; k < lim; k++) { + int cp1 = getChar(value, k); + int cp2 = StringUTF16.codePointAt(other, k, olast, true); + if (cp1 == cp2) { + continue; + } + int[] folded = CaseFolding.foldIfDefined(cp1); + if (folded != null) { + if (folded.length > 1) { 
+ return compareToFC0_UTF16(value, k, last, other, k, olast); + } + cp1 = folded[0]; + } + folded = CaseFolding.foldIfDefined(cp2); + if (folded != null) { + if (folded.length > 1) { + return compareToFC0_UTF16(value, k, last, other, k, olast); + } + cp2 = folded[0]; + } + if (cp1 != cp2) { + return cp1 - cp2; + } + } + return last - olast; + } + static int hashCode(byte[] value) { return ArraysSupport.hashCodeOfUnsigned(value, 0, value.length, 0); } diff --git a/src/java.base/share/classes/java/lang/StringUTF16.java b/src/java.base/share/classes/java/lang/StringUTF16.java index 4e31c9728e9cd..f3bed250a0c4e 100644 --- a/src/java.base/share/classes/java/lang/StringUTF16.java +++ b/src/java.base/share/classes/java/lang/StringUTF16.java @@ -34,6 +34,7 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; +import jdk.internal.java.lang.CaseFolding; import jdk.internal.misc.Unsafe; import jdk.internal.util.ArraysSupport; import jdk.internal.vm.annotation.ForceInline; @@ -93,7 +94,7 @@ static int length(byte[] value) { return value.length >> 1; } - private static int codePointAt(byte[] value, int index, int end, boolean checked) { + static int codePointAt(byte[] value, int index, int end, boolean checked) { assert index < end; if (checked) { checkIndex(index, value); @@ -592,6 +593,71 @@ static int compareToCI_Latin1(byte[] value, byte[] other) { return -StringLatin1.compareToCI_UTF16(other, value); } + public static int compareToFC_Latin1(byte[] value, byte[] other) { + return -StringLatin1.compareToFC_UTF16(other, value); + } + + private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) { + int[] folded1 = null; + int[] folded2 = null; + int k1 = off, k2 = ooff, fk1 = 0, fk2 = 0; + while ((k1 < last || folded1 != null && fk1 < folded1.length) && + (k2 < olast || folded2 != null && fk2 < folded2.length)) { + int c1, c2; + if (folded1 != null && fk1 < folded1.length) { + c1 = folded1[fk1++]; + } else { + c1 = 
codePointAt(value, k1, last, true); + k1 += Character.charCount(c1); + folded1 = CaseFolding.foldIfDefined(c1); + fk1 = 0; + if (folded1 != null) { + c1 = folded1[fk1++]; + } + } + if (folded2 != null && fk2 < folded2.length) { + c2 = folded2[fk2++]; + } else { + c2 = codePointAt(other, k2, olast, true); + k2 += Character.charCount(c2); + folded2 = CaseFolding.foldIfDefined(c2); + fk2 = 0; + if (folded2 != null) { + c2 = folded2[fk2++]; + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < last || folded1 != null && fk1 < folded1.length) { + return 1; + } + if (k2 < olast || folded2 != null && fk2 < folded2.length) { + return -1; + } + return 0; + } + + public static int compareToFC(byte[] value, byte[] other) { + int tlast = length(value); + int olast = length(other); + int k = 0; + while (k < tlast && k < olast) { + int cp1 = codePointAt(value, k, tlast, true); + int cp2 = codePointAt(other, k, olast, true); + if (cp1 == cp2) { + k += Character.charCount(cp1); + continue; + } + if (CaseFolding.isDefined(cp1) || CaseFolding.isDefined(cp2)) { + return compareToFC0(value, k, tlast, other, k, olast); + } + return cp1 - cp2; + } + return tlast - olast; + } + static int hashCode(byte[] value) { return ArraysSupport.hashCodeOfUTF16(value, 0, value.length >> 1, 0); } diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java index 2908370acd551..8e50b66ca86e6 100644 --- a/src/java.base/share/classes/java/util/regex/Pattern.java +++ b/src/java.base/share/classes/java/util/regex/Pattern.java @@ -43,8 +43,8 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; +import jdk.internal.java.lang.CaseFolding; import jdk.internal.util.ArraysSupport; -import jdk.internal.util.regex.CaseFolding; import jdk.internal.util.regex.Grapheme; /** diff --git a/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template 
b/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template new file mode 100644 index 0000000000000..d63cd040271f1 --- /dev/null +++ b/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package jdk.internal.java.lang; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static java.util.Map.entry; + +/** + * Utility class that handles Unicode case folding properties defined in + * CasingFolding.txt, including 1:M full case folding. + */ + public final class CaseFolding { + + private CaseFolding() {} + + /** + * Tests whether the specified code point is already in its case-folded form. + *

+ * A code point is considered folded if it does not have an explicit case + * folding mapping in the Unicode CaseFolding data. + * + * @param cp + * the Unicode code point to test + * @return {@code true} if the given code point has no case + * folding mapping (that is, it is already folded); + * {@code false} otherwise + * + * @see #fold(int) + */ + public static boolean isFolded(int cp) { + return caseFoldingMap.get(cp) == null; + } + /** + * Tests whether the specified code point has a folding mapping entry defined. + * + * @param cp + * the Unicode code point to test + * @return {@code true} if the given code point has a case folding mapping entry + * defined in {@code caseFoldingMap}, {@code false} otherwise + */ + public static boolean isDefined(int cp) { + return caseFoldingMap.get(cp) != null; + } + /** + * Returns the case-folded form of the specified code point, according + * to the Unicode case folding mappings. + *

+ * If the code point has no case folding mapping, this method returns + * the original code point as a single-element array. Otherwise, it + * returns the mapped form, which may consist of one or more code + * points (to support 1:M mappings). + * + * @param cp + * the Unicode code point to fold + * @return an array of code points representing the + * case-folded form of the input code point + * + * @see #isFolded(int) + */ + public static int[] fold(int cp) { + var entry = caseFoldingMap.get(cp); + if (entry != null) + return entry.folding; + return new int[] { cp }; + } + /** + * Returns the case-folded form of the specified code point defined + * by the Unicode case folding mappings. + *

+ * If the code point has no case folding mapping defined, this method + * returns {@code null}. Otherwise, it returns the mapped form, which may consist + * of one or more code points (to support 1:M mappings). + * + * @param cp + * the Unicode code point to fold + * @return an array of code points representing the + * case-folded form of the input code point, or {@code null} if + * there is no mapping defined. + */ + public static int[] foldIfDefined(int cp) { + var entry = caseFoldingMap.get(cp); + return entry != null ? entry.folding : null; + } + /** + * Returns a case-folded copy of the given {@code String} object, using the + * Unicode case folding mappings defined in + * + * Unicode Case Folding Properties. + *

+ * This is a convenience method intended primarily for testing + * {@link #isFolded(int)} and {@link #fold(int)}. Its implementation is + * not optimized for performance and should not be used in performance- + * sensitive contexts. It exists only until a dedicated + * {@code String.toCaseFold()} method is introduced. + * + * @param s + * the input string + * @return a {@code String} containing the case-folded form of the input string + */ + public static String fold(String s) { + int first; + int len = s.length(); + int cpCnt = 1; + for (first = 0; first < len; first += cpCnt) { + int cp = s.codePointAt(first); + if (!CaseFolding.isFolded(cp)) { + break; + } + cpCnt = Character.charCount(cp); + } + if (first == len) { + return s; + } + StringBuilder sb = new StringBuilder(len); + sb.append(s, 0, first); + for (int i = first; i < len; i += cpCnt) { + int cp = s.codePointAt(i); + if (CaseFolding.isFolded(cp)) { + sb.appendCodePoint(cp); + } else { + int[] folded = CaseFolding.fold(cp); + if (folded != null) { + for (int f : folded) { + sb.appendCodePoint(f); + } + } + } + cpCnt = Character.charCount(cp); + } + return sb.toString(); + } + + /** + * Returns an expansion set to "close" a given regex Unicode character class range for case-sensitive + * matching, according to the + * Simple Loose Matches + * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions. + *

+ * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must + * be applied to literals and (optionally) to character classes. When applied to character classes, each + * character class is expected to be closed under simple case folding. See the standard for the + * detailed explanation and example of "closed". + *

+ * RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should + *

    + *
  1. Provide at least the simple, default Unicode case-insensitive matching, and
  2. + *
  3. Specify which character properties or constructs are closed under the matching.
  4. + *
+ *

+ * In the {@code Pattern} implementation, 5 types of constructs may be case-sensitive when matching: + * back-refs, string slice (sequences), single, family(char-property) and class range. Single and + * family may appear independently or within a class. + *

+ * For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and + * {@code toLowerCase} to both the pattern and the input string. This effectively 'closes' the class for + * matching. + *

+ * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5, + * if their behavior is clearly specified. + *

+ * This method addresses that requirement for the "range" construct within a character class by computing + * the additional characters that should be included to close the range under simple case folding: + *

+ * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple + * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped + * character is not already in the range, then that mapped character (typically lowercase) is added to + * the expansion set. + *

+ * This allows regex character class "range" implementation to use the returned expansion set to support + * additional case-insensitive matching, without duplicating characters already covered by the existing + * regex range implementation. The expectation is the matching is done using both the uppercase and + * lowercase forms of the input character, for example + * + *

{@code
+     *
+     *     ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
+     *           inRange(lower, Character.toLowerCase(ch), upper) ||
+     *           additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
+     *           additionalClosingCharacters.contains(Character.toLowerCase(ch))
+     * }
+ * + *

+ * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches + * @param start the starting code point of the character range + * @param end the ending code point of the character range + * @return a {@code int[]} containing the all simple case equivalents of characters in the range, excluding + * those already in the range + */ + public static int[] getClassRangeClosingCharacters(int start, int end) { + int[] expanded = new int[expanded_case_cps.length]; + int off = 0; + for (int cp : expanded_case_cps) { + if (cp >= start && cp <= end) { + int folding = expanded_case_map.get(cp); + if (folding < start || folding > end) { + expanded[off++] = folding; + } + } + } + return Arrays.copyOf(expanded, off); + } + + private static final Map expanded_case_map = Map.ofEntries( +%%%Expanded_Case_Map_Entries + ); + + private static final int[] expanded_case_cps = expanded_case_map.keySet() + .stream() + .mapToInt(Integer::intValue) + .toArray(); + + + private static class CaseFoldingEntry { + final int cp; + final int[] folding; + CaseFoldingEntry next = null; + + CaseFoldingEntry(int cp, int[] folding) { + this.cp = cp; + this.folding = folding; + }; + } + + private static class CaseFoldingMap { + + private final CaseFoldingEntry[] entries; + + CaseFoldingMap(CaseFoldingEntry[] entries) { + this.entries = new CaseFoldingEntry[entries.length << 1]; + for (var entry : entries) { + add(entry); + } + } + + public CaseFoldingEntry get(int cp) { + var entries = this.entries; + var index = cp % entries.length; + for (var e = entries[index]; e != null; e = e.next) { + if (e.cp == cp) + return e; + } + return null; + } + + public int getDepth(int cp) { + var entries = this.entries; + var index = cp % entries.length; + int depth = 0; + for (var e = entries[index]; e != null; e = e.next) { + if (e.cp == cp) + break; + depth++; + } + return depth; + } + + private void add(CaseFoldingEntry entry) { + int cp = entry.cp; + var entries = this.entries; + var index = cp % 
entries.length; + for (var e = entries[index]; e != null; e = e.next) { + if (e.cp == cp) { + throw new IllegalArgumentException("Duplicated mapping entry : " + + String.format("U+%04x", cp)); + } + } + entry.next = entries[index]; + entries[index] = entry; + } + } + + private final static CaseFoldingEntry[] caseFoldingEntries = { +%%%Entries + }; + + private final static CaseFoldingMap caseFoldingMap = new CaseFoldingMap(caseFoldingEntries); +} diff --git a/src/java.base/share/classes/jdk/internal/util/regex/CaseFolding.java.template b/src/java.base/share/classes/jdk/internal/util/regex/CaseFolding.java.template deleted file mode 100644 index 8ffbde6c535bb..0000000000000 --- a/src/java.base/share/classes/jdk/internal/util/regex/CaseFolding.java.template +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
- * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -package jdk.internal.util.regex; - -import java.util.Arrays; -import java.util.Map; -import java.util.Objects; - -import static java.util.Map.entry; - -public final class CaseFolding { - - private static final Map expanded_case_map = Map.ofEntries( -%%%Entries - ); - - private static final int[] expanded_case_cps = expanded_case_map.keySet() - .stream() - .mapToInt(Integer::intValue) - .toArray(); - - private CaseFolding() {} - - /** - * Returns an expansion set to "close" a given regex Unicode character class range for case-sensitive - * matching, according to the - * Simple Loose Matches - * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions. - *

- * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must - * be applied to literals and (optionally) to character classes. When applied to character classes, each - * character class is expected to be closed under simple case folding. See the standard for the - * detailed explanation and example of "closed". - *

- * RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should - *

    - *
  1. Provide at least the simple, default Unicode case-insensitive matching, and
  2. - *
  3. Specify which character properties or constructs are closed under the matching.
  4. - *
- *

- * In the {@code Pattern} implementation, 5 types of constructs maybe case-sensitive when matching: - * back-refs, string slice (sequences), single, family(char-property) and class range. Single and - * family may appears independently or within a class. - *

- * For loose/case-insensitive matching, the back-refs, slices and singles apply {code toUpperCase} and - * {@code toLowerCase} to both the pattern and the input string. This effectively 'close' the class for - * matching. - *

- * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5, - * if their behavior is clearly specified. - *

- * This method addresses that requirement for the "range" construct within in character class by computing - * the additional characters that should be included to close the range under simple case folding: - *

- * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple - * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped - * character is not already in the range, then that mapped character (typically lowercase) is added to - * the expansion set. - *

- * This allows regex character class "range" implementation to use the returned expansion set to support - * additional case-insensitive matching, without duplicating characters already covered by the existing - * regex range implementation. The expectation is the matching is done using both the uppercase and - * lowercase forms of the input character, for example - * - *

{@code
-     *
-     *     ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
-     *           inRange(lower, Character.toLower(ch), upper) ||
-     *           additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
-     *           additionalClosingCharacters.contains(Character.toUpperCase(ch))
-     * }
- * - *

- * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches - * @param start the starting code point of the character range - * @param end the ending code point of the character range - * @return a {@code int[]} containing the all simple case equivalents of characters in the range, excluding - * those already in the range - */ - public static int[] getClassRangeClosingCharacters(int start, int end) { - int[] expanded = new int[expanded_case_cps.length]; - int off = 0; - for (int cp : expanded_case_cps) { - if (cp >= start && cp <= end) { - int folding = expanded_case_map.get(cp); - if (folding < start || folding > end) { - expanded[off++] = folding; - } - } - } - return Arrays.copyOf(expanded, off); - } -} diff --git a/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java b/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java new file mode 100644 index 0000000000000..dd7d73af4e735 --- /dev/null +++ b/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * @test + * @summary tests unicode case-folding based String comparison and equality + * @bug 4397357 + * @library /lib/testlibrary/java/lang + * @compile --add-exports java.base/jdk.internal.java.lang=ALL-UNNAMED + * UnicodeCaseFoldingTest.java + * @run junit/othervm --add-exports java.base/jdk.internal.java.lang=ALL-UNNAMED + * UnicodeCaseFoldingTest + */ +import java.nio.file.Files; +import java.util.stream.Stream; +import java.util.stream.Collectors; +import java.util.ArrayList; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import jdk.internal.java.lang.CaseFolding; + +public class UnicodeCaseFoldingTest { + + @Test + void testAllCommnFullCodePointsListedInCaseFoldinigTxt() throws Throwable { + var filter = "^.*; [CF]; .*$"; // C=common, F=full, for full case folding + var results = Files.lines(UCDFiles.CASEFOLDING) + .filter(line -> !line.startsWith("#") && line.matches(filter)) + .map(line -> { + var fields = line.split("; "); + var cp = Integer.parseInt(fields[0], 16); + fields = fields[2].trim().split(" "); + var folding = new int[fields.length]; + for (int i = 0; i < folding.length; i++) { + folding[i] = Integer.parseInt(fields[i], 16); + } + var source = new String(Character.toChars(cp)); + var expected = new String(folding, 0, folding.length); + // (1) Verify the folding result matches expected + assertEquals(expected, CaseFolding.fold(source), "CaseFolding.fold()"); + + // (2) Verify compareToFoldCase() result + assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)"); + assertEquals(0, expected.compareToFoldCase(source), 
"expected.compareToFoldCase(source)"); + + // (3) Verify equalsFoldCase() result + assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)"); + assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)"); + return null; + }) + .filter(error -> error != null) + .toArray(); + assertEquals(0, results.length); + } + + @Test + void testAllSimpleCodePointsListedInCaseFoldinigTxt() throws Throwable { + // S=simple, for simple case folding. The simple case folding should still match + var filter = "^.*; [S]; .*$"; + var results = Files.lines(UCDFiles.CASEFOLDING) + .filter(line -> !line.startsWith("#") && line.matches(filter)) + .map(line -> { + var fields = line.split("; "); + var cp = Integer.parseInt(fields[0], 16); + fields = fields[2].trim().split(" "); + var folding = new int[fields.length]; + for (int i = 0; i < folding.length; i++) { + folding[i] = Integer.parseInt(fields[i], 16); + } + var source = new String(Character.toChars(cp)); + var expected = new String(folding, 0, folding.length); + + // (1) Verify compareToFoldCase() result + assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)"); + assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)"); + + // (2) Verify equalsFoldCase() result + assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)"); + assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)"); + return null; + }) + .filter(error -> error != null) + .toArray(); + assertEquals(0, results.length); + } + + @Test + public void testAllCodePointsFoldToThemselvesIfNotListed() throws Exception { + // Collect the code points that have a C or F entry in CaseFolding.txt + var listed = Files.lines(UCDFiles.CASEFOLDING) + .filter(line -> !line.startsWith("#") && line.matches("^.*; [CF]; .*$")) + .map(line -> Integer.parseInt(line.split("; ")[0], 16)) + .collect(Collectors.toSet()); + + var 
failures = new ArrayList(); + + // Scan BMP + Supplementary Plane 1 (U+0000..U+1FFFF) + for (int cp = Character.MIN_CODE_POINT; cp <= 0x1FFFF; cp++) { + if (!Character.isDefined(cp)) { + continue; // skip undefined + } + if (Character.isSurrogate((char) cp)) { + continue; // skip surrogates; note the (char) cast truncates, so U+1D800..U+1DFFF are skipped too + + } + if (listed.contains(cp)) { + continue; // already tested separately + } + String s = new String(Character.toChars(cp)); + String folded = CaseFolding.fold(s); + if (!s.equals(folded)) { + failures.add(String.format("Unexpected folding: U+%04X '%s' → '%s'", cp, s, folded)); + } + } + + assertEquals(0, failures.size(), + () -> "Some unlisted code points folded unexpectedly:\n" + + String.join("\n", failures)); + } + + @ParameterizedTest(name = "CaseFold \"{0}\" → \"{1}\"") + @MethodSource("caseFoldTestCases") + void testIndividualCaseFolding(String input, String expected) { + assertEquals(expected, CaseFolding.fold(input)); + } + + static Stream caseFoldTestCases() { + return Stream.of( + // ASCII simple cases + Arguments.of("ABC", "abc"), + Arguments.of("already", "already"), + Arguments.of("MiXeD123", "mixed123"), + // --- Latin-1 to non-Latin-1 fold --- + Arguments.of("aBc\u00B5Efg", "abc\u03BCefg"), // U+00B5 micro sign → U+03BC Greek small mu + Arguments.of("test\u00B5\ud801\udc00X", "test\u03bc\ud801\udc28x"), + // German Eszett + Arguments.of("Stra\u00DFe", "strasse"), // "Straße" + Arguments.of("\u1E9E", "ss"), // "ẞ" capital sharp S + // Turkish dotted I / dotless i + Arguments.of("I", "i"), + Arguments.of("\u0130", "i\u0307"), // capital dotted I → "i + dot above" + Arguments.of("\u0069\u0307", "i\u0307"), // small i + dot above remains + Arguments.of("\u0131", "\u0131"), // "ı" (dotless i stays dotless) + + // --- Greek special cases --- + Arguments.of("\u039F\u03A3", "\u03BF\u03C3"), // "ΟΣ" → "οσ" final sigma always folds to normal sigma + Arguments.of("\u1F88", "\u1F00\u03B9"), // "ᾈ" → "ἀι" Alpha with psili + ypogegrammeni + Arguments.of("\u039C\u03AC\u03CA\u03BF\u03C2", 
"\u03BC\u03AC\u03CA\u03BF\u03C3"), // "Μάϊος" → "μάϊοσ" + Arguments.of("\u1F08", "\u1F00"), // Ἀ (Capital Alpha with psili) → ἀ + + // Supplementary Plane characters + Arguments.of("\uD801\uDC00", "\uD801\uDC28"), // Deseret Capital Letter Long I → Small + Arguments.of("\uD801\uDC01", "\uD801\uDC29"), // Deseret Capital Letter Long E → Small + + // Supplementary inside ASCII + Arguments.of("abc\uD801\uDC00def", "abc\uD801\uDC28def"), + // Ligatures and compatibility folds + Arguments.of("\uFB00", "ff"), // U+FB00 "ff" ligature → ff + Arguments.of("\uFB03", "ffi"), // U+FB03 "ffi" ligature → ffi + Arguments.of("\u212A", "k"), // Kelvin sign → k + + Arguments.of("abc\uFB00def", "abcffdef"), // U+FB00 "ff" ligature → ff + Arguments.of("abc\uFB03def", "abcffidef"), // U+FB03 "ffi" ligature → ffi + Arguments.of("abc\u212Adef", "abckdef"), // Kelvin sign → k + + // --- Fullwidth --- + Arguments.of("\uFF21\uFF22\uFF23", "\uFF41\uFF42\uFF43"), // fullwidth "ABC" → fullwidth "abc" + + // --- Armenian --- + Arguments.of("\u0531", "\u0561"), // "Ա" → "ա" + + // --- Cherokee --- + Arguments.of("\u13A0", "\u13A0"), // Capital Cherokee A folds to itself + Arguments.of("\uAB70", "\u13A0") // Small Cherokee A folds to Capital Cherokee A + ); + } + + static Stream caseFoldEqualProvider() { + return Stream.of( + Arguments.of("abc", "ABC"), + Arguments.of("aBcDe", "AbCdE"), + Arguments.of("\u00C0\u00E7", "\u00E0\u00C7"), // Àç vs àÇ + Arguments.of("straße", "STRASSE"), // ß → ss + Arguments.of("\uD83C\uDDE6", "\uD83C\uDDE6"), // 🇦 vs 🇦 + Arguments.of("\u1E9E", "ss"), // ẞ (capital sharp S) + Arguments.of("\u03A3", "\u03C3"), // Σ vs σ (Greek Sigma) + Arguments.of("\u03C3", "\u03C2"), // σ vs ς (Greek sigma/final sigma) + Arguments.of("\u212B", "\u00E5"), // Å (Angstrom sign) vs å + Arguments.of("\uFB00", "ff"), // U+FB00 "ff" ligature vs ff + Arguments.of("\u01C5", "\u01C5"), // Dž (Latin capital D with small z with caron) + Arguments.of("Caf\u00E9", "CAF\u00C9"), // Café vs CAFÉ + Arguments.of("\u03BA\u03B1\u03BB\u03B7\u03BC\u03AD\u03C1\u03B1", 
"\u039A\u0391\u039B\u0397\u039C\u0388\u03A1\u0391"), // καλημέρα vs ΚΑΛΗΜΕΡΑ + Arguments.of("\u4E2D\u56FD", "\u4E2D\u56FD"), // 中国 + Arguments.of("\u03B1", "\u0391"), // α vs Α (Greek alpha) + Arguments.of("\u212B", "\u00C5"), // Å (Angstrom sign, U+212B) vs Å (U+00C5) + // from StringCompareToIgnoreCase + Arguments.of("\u0100\u0102\u0104\u0106\u0108", "\u0100\u0102\u0104\u0106\u0109"), // ĀĂĄĆĈ vs ĀĂĄĆĉ + Arguments.of("\u0101\u0103\u0105\u0107\u0109", "\u0100\u0102\u0104\u0106\u0109"), // āăąćĉ vs ĀĂĄĆĉ + Arguments.of("\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc04", + "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c"), // 𐐀𐐁𐐂𐐃𐐄 vs 𐐀𐐁𐐂𐐃𐐬 + Arguments.of("\ud801\udc28\ud801\udc29\ud801\udc2a\ud801\udc2b\ud801\udc2c", + "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c") // 𐐨𐐩𐐪𐐫𐐬 vs 𐐀𐐁𐐂𐐃𐐬 + ); + } + + @ParameterizedTest + @MethodSource("caseFoldEqualProvider") + void testcompareToFoldCaseEquals(String s1, String s2) { + assertEquals(0, s1.compareToFoldCase(s2)); + assertEquals(0, s2.compareToFoldCase(s1)); + assertEquals(true, s1.equalsFoldCase(s2)); + assertEquals(true, s2.equalsFoldCase(s1)); + assertEquals(CaseFolding.fold(s1), CaseFolding.fold(s2)); + } + + static Stream caseFoldOrderingProvider() { + return Stream.of( + Arguments.of("asa", "aß", -1), // ß → ss → "asa" < "ass" + Arguments.of("aß", "asa", +1), + Arguments.of("a\u00DF", "ass", 0), // aß vs ass + Arguments.of("\uFB03", "ffi", 0), // U+FB03 "ffi" ligature vs ffi + Arguments.of("\u00C5", "Z", 1), // Å vs Z + Arguments.of("A", "\u00C0", -1), // A vs À + Arguments.of("\u03A9", "\u03C9", 0), // Ω vs ω + Arguments.of("\u03C2", "\u03C3", 0), // ς vs σ + Arguments.of("\uD835\uDD23", "R", 1), // U+1D523 (mathematical fraktur small f) vs R + Arguments.of("\uFF26", "E", 1), // U+FF26 (full-width F) vs E + Arguments.of("\u00C9clair", "Eclair", 1) // Éclair vs Eclair + ); + } + + @ParameterizedTest + @MethodSource("caseFoldOrderingProvider") + void testcompareToFoldCaseOrdering(String s1, String s2, int expectedSign) { + int cmp = 
s1.compareToFoldCase(s2); + assertEquals(expectedSign, Integer.signum(cmp)); + } + + static Stream roundTripProvider() { + return Stream.of( + Arguments.of("abc"), + Arguments.of("ABC"), + Arguments.of("straße"), + Arguments.of("Àç"), + Arguments.of("aß"), + Arguments.of("\uFB02uff"), // fluff (ligature in "fluff") + Arguments.of("\u00C9COLE") // ÉCOLE + ); + } + + @ParameterizedTest + @MethodSource("roundTripProvider") + void testCaseFoldRoundTrip(String s) { + String folded = CaseFolding.fold(s); + assertEquals(0, s.compareToFoldCase(folded)); + assertEquals(0, folded.compareToFoldCase(s)); + assertEquals(true, s.equalsFoldCase(folded)); + assertEquals(true, folded.equalsFoldCase(s)); + } +} diff --git a/test/micro/org/openjdk/bench/java/lang/StringCompareToFoldCase.java b/test/micro/org/openjdk/bench/java/lang/StringCompareToFoldCase.java new file mode 100644 index 0000000000000..dff4d874705a7 --- /dev/null +++ b/test/micro/org/openjdk/bench/java/lang/StringCompareToFoldCase.java @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.lang; + +import org.openjdk.jmh.annotations.*; +import java.util.concurrent.TimeUnit; + +/* + * This benchmark naively explores String::compareToFoldCase performance + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Thread) +@Warmup(iterations = 5, time = 1) +@Measurement(iterations = 5, time = 1) +@Fork(3) +public class StringCompareToFoldCase { + + private String asciiUpper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + private String asciiUpperLower = "ABCDEFGHIJKLMNOpqrstuvwxyz"; + private String asciiLower = "abcdefghijklmnopqrstuvwxyz"; + + private String asciiWithDF = "abcdßßßßßßßßßßßßßßßßWXYZ"; + private String asciiWithDFSS = "abcdssssssssssssssssßßßßßßßßWXYZ"; + + private String asciiLatine1 = "ABCDEFGHIJKLMNOpqrstuvwxyz0"; + private String asciiLatin1UTF16 = "abcdefghijklmnopqrstuvwxyz\u0391"; + + private String greekUpper = "\u0391\u0392\u0393\u0394\u0395\u0391\u0392\u0393\u0394\u0395"; // ΑΒΓΔΕ + private String greekUpperLower = "\u0391\u0392\u0393\u0394\u0395\u0391\u0392\u0393\u0394\u03B5"; // ΑΒΓΔε + private String greekLower = "\u03B1\u03B2\u03B3\u03B4\u03B5\u03B1\u03B2\u03B3\u03B4\u03B5"; // αβγδε + + public String supUpper = "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc04"; + public String supUpperLower = "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c"; + public String supLower = "\ud801\udc28\ud801\udc29\ud801\udc2a\ud801\udc2b\ud801\udc2c"; + + @Benchmark + public int asciiUpperLower() { + return asciiUpper.compareToIgnoreCase(asciiUpperLower); + } + + @Benchmark + public int asciiLower() { + return asciiUpper.compareToIgnoreCase(asciiLower); + } + + @Benchmark + public int greekUpperLower() { + return greekUpper.compareToIgnoreCase(greekUpperLower); + } + + @Benchmark + public int 
greekLower() { + return greekUpper.compareToIgnoreCase(greekLower); + } + + @Benchmark + public int latin1UTF16() { + return asciiLatine1.compareToIgnoreCase(asciiLatin1UTF16); + } + + @Benchmark + public int supUpperLower() { + return supUpper.compareToIgnoreCase(supUpperLower); + } + + @Benchmark + public int supLower() { + return supUpper.compareToIgnoreCase(supLower); + } + + @Benchmark + public int asciiUpperLowerFC() { + return asciiUpper.compareToFoldCase(asciiUpperLower); + } + + @Benchmark + public int asciiLowerFC() { + return asciiUpper.compareToFoldCase(asciiLower); + } + + @Benchmark + public int asciiWithDFFC() { + return asciiWithDF.compareToFoldCase(asciiWithDFSS); + } + + @Benchmark + public int greekUpperLowerFC() { + return greekUpper.compareToFoldCase(greekUpperLower); + } + + @Benchmark + public int greekLowerFC() { + return greekUpper.compareToFoldCase(greekLower); + } + + @Benchmark + public int latin1UTF16FC() { + return asciiLatine1.compareToFoldCase(asciiLatin1UTF16); } + + @Benchmark + public int supUpperLowerFC() { + return supUpper.compareToFoldCase(supUpperLower); + } + + @Benchmark + public int supLowerFC() { + return supUpper.compareToFoldCase(supLower); + } + + @Benchmark + public boolean asciiUpperLowerEQ() { + return asciiUpper.equalsIgnoreCase(asciiUpperLower); + } + + @Benchmark + public boolean asciiLowerEQ() { + return asciiUpper.equalsIgnoreCase(asciiLower); + } + + @Benchmark + public boolean greekUpperLowerEQ() { + return greekUpper.equalsIgnoreCase(greekUpperLower); + } + + @Benchmark + public boolean greekLowerEQ() { + return greekUpper.equalsIgnoreCase(greekLower); + } + + @Benchmark + public boolean latin1UTF16EQ() { + return asciiLatine1.equalsIgnoreCase(asciiLatin1UTF16); + } + + @Benchmark + public boolean supUpperLowerEQ() { + return supUpper.equalsIgnoreCase(supUpperLower); + } + + @Benchmark + public boolean supLowerEQ() { + return supUpper.equalsIgnoreCase(supLower); + } + + @Benchmark + public boolean 
asciiUpperLowerEQFC() { + return asciiUpper.equalsFoldCase(asciiUpperLower); + } + + @Benchmark + public boolean asciiLowerEQFC() { + return asciiUpper.equalsFoldCase(asciiLower); + } + + @Benchmark + public boolean greekUpperLowerEQFC() { + return greekUpper.equalsFoldCase(greekUpperLower); + } + + @Benchmark + public boolean greekLowerEQFC() { + return greekUpper.equalsFoldCase(greekLower); + } + + @Benchmark + public boolean latin1UTF16EQFC() { + return asciiLatine1.equalsFoldCase(asciiLatin1UTF16); + } + + @Benchmark + public boolean supUpperLowerEQFC() { + return supUpper.equalsFoldCase(supUpperLower); + } + + @Benchmark + public boolean supLowerEQFC() { + return supUpper.equalsFoldCase(supLower); + } + }