Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 48 additions & 23 deletions make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

package build.tools.generatecharacter;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.IntStream;

public class CaseFolding {

Expand All @@ -42,32 +41,58 @@ public static void main(String[] args) throws Throwable {
var templateFile = Paths.get(args[0]);
var caseFoldingTxt = Paths.get(args[1]);
var genSrcFile = Paths.get(args[2]);
var supportedTypes = "^.*; [CTS]; .*$";

// java.lang
var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding
var caseFoldingEntries = Files.lines(caseFoldingTxt)
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
.map(line -> {
String[] cols = line.split("; ");
return new String[] {cols[0], cols[1], cols[2]};
})
.filter(cols -> {
// the folding case doesn't map back to the original char.
var cp1 = Integer.parseInt(cols[0], 16);
var cp2 = Integer.parseInt(cols[2], 16);
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
})
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
.collect(Collectors.joining(",\n", "", ""));
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
.map(line -> {
var fields = line.split("; ");
var cp = Integer.parseInt(fields[0], 16);
fields = fields[2].trim().split(" ");
var folding = new int[fields.length];
for (int i = 0; i < folding.length; i++) {
folding[i] = Integer.parseInt(fields[i], 16);
}
var foldingChars = Arrays.stream(folding)
.mapToObj(Character::toChars)
.flatMapToInt(chars -> IntStream.range(0, chars.length).map(i -> (int) chars[i]))
.toArray();
return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)",
cp,
Arrays.stream(foldingChars)
.mapToObj(c -> String.format("0x%04x", c))
.collect(Collectors.joining(", ", "new char[] {", "}"))
);
})
.collect(Collectors.joining(",\n", "", ""));
// util.regex
var expandedSupportedTypes = "^.*; [CTS]; .*$";
var expanded_caseFoldingEntries = Files.lines(caseFoldingTxt)
.filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes))
.map(line -> {
String[] cols = line.split("; ");
return new String[]{cols[0], cols[1], cols[2]};
})
.filter(cols -> {
// the folding case doesn't map back to the original char.
var cp1 = Integer.parseInt(cols[0], 16);
var cp2 = Integer.parseInt(cols[2], 16);
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
})
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
.collect(Collectors.joining(",\n", "", ""));

// hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
// 0049; T; 0131; # LATIN CAPITAL LETTER I
final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);

// Generate .java file
Files.write(
genSrcFile,
Files.lines(templateFile)
.map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
.collect(Collectors.toList()),
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
genSrcFile,
Files.lines(templateFile)
.map(line -> line.contains("%%%Entries") ? caseFoldingEntries : line)
.map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? T_0x0131_0x49 + expanded_caseFoldingEntries : line)
.collect(Collectors.toList()),
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
}
}
17 changes: 17 additions & 0 deletions make/modules/java.base/gensrc/GensrcCharacterData.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA)

################################################################################


GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java

STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt

$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
$(call LogInfo, Generating $@)
$(call MakeTargetDir)
$(TOOL_GENERATECASEFOLDING) \
$(STRINGCASEFOLDING_TEMPLATE) \
$(CASEFOLDINGTXT) \
$(GENSRC_STRINGCASEFOLDING)

TARGETS += $(GENSRC_STRINGCASEFOLDING)


endif # include guard
include MakeIncludeEnd.gmk
17 changes: 0 additions & 17 deletions make/modules/java.base/gensrc/GensrcRegex.gmk
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)

################################################################################

GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java

CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt

$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
$(call LogInfo, Generating $@)
$(call MakeTargetDir)
$(TOOL_GENERATECASEFOLDING) \
$(CASEFOLDINGTEMP) \
$(CASEFOLDINGTXT) \
$(GENSRC_CASEFOLDING)

TARGETS += $(GENSRC_CASEFOLDING)

################################################################################

endif # include guard
include MakeIncludeEnd.gmk
120 changes: 120 additions & 0 deletions src/java.base/share/classes/java/lang/String.java
Original file line number Diff line number Diff line change
Expand Up @@ -2189,6 +2189,56 @@ public boolean equalsIgnoreCase(String anotherString) {
&& regionMatches(true, 0, anotherString, 0, length());
}

/**
 * Tests whether this {@code String} and another {@code String} are equal
 * under <em>Unicode case folding</em>. The two strings are considered equal
 * when their case-folded forms are identical.
 * <p>
 * Case folding is defined by the Unicode Standard in
 * <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
 * including 1:M mappings. For example, {@code "Maße".equalsFoldCase("MASSE")}
 * returns {@code true}, because the character {@code U+00DF} (sharp s) folds
 * to {@code "ss"}.
 * <p>
 * Case folding is locale-independent and language-neutral, unlike
 * locale-sensitive transformations such as {@link #toLowerCase()} or
 * {@link #toUpperCase()}. It is intended for caseless matching,
 * searching, and indexing.
 *
 * @apiNote
 * This method is the Unicode-compliant alternative to
 * {@link #equalsIgnoreCase(String)}. It implements full case folding as
 * defined by the Unicode Standard, which may differ from the simpler
 * per-character mapping performed by {@code equalsIgnoreCase}.
 * For example:
 * {@snippet lang=java :
 * String a = "Maße";
 * String b = "MASSE";
 * boolean equalsFoldCase = a.equalsFoldCase(b);       // returns true
 * boolean equalsIgnoreCase = a.equalsIgnoreCase(b);   // returns false
 * }
 *
 * @param  anotherString
 *         The {@code String} to compare this {@code String} against
 *
 * @return {@code true} if the argument is not {@code null} and represents
 *         the same sequence of characters as this string under Unicode case
 *         folding; {@code false} otherwise.
 *
 * @see #compareToFoldCase(String)
 * @see #equalsIgnoreCase(String)
 * @since 26
 */
public boolean equalsFoldCase(String anotherString) {
    // Identity is trivially equal; null never is. Everything else is decided
    // by the case-folding comparator reporting a zero difference.
    return this == anotherString
            || (anotherString != null
                && UNICODE_CASEFOLD_ORDER.compare(this, anotherString) == 0);
}

/**
* Compares two strings lexicographically.
* The comparison is based on the Unicode value of each character in
Expand Down Expand Up @@ -2310,6 +2360,76 @@ public int compareToIgnoreCase(String str) {
return CASE_INSENSITIVE_ORDER.compare(this, str);
}

/**
 * A Comparator that orders {@code String} objects as by
 * {@link #compareToFoldCase(String) compareToFoldCase()}.
 *
 * @see #compareToFoldCase(String)
 * @since 26
 */
public static final Comparator<String> UNICODE_CASEFOLD_ORDER
        = new FoldCaseComparator();

/**
 * Comparator behind {@link #UNICODE_CASEFOLD_ORDER}. It selects one of four
 * case-folding comparison routines according to the coders of the two
 * operands (Latin-1/Latin-1, UTF-16/UTF-16, or one of each).
 */
private static class FoldCaseComparator implements Comparator<String> {

    /**
     * Compares two strings by their case-folded contents.
     *
     * @param s1 the first string; must not be {@code null}
     * @param s2 the second string; must not be {@code null}
     * @return the value produced by the selected {@code compareToFC*} routine
     */
    @Override
    public int compare(String s1, String s2) {
        byte[] v1 = s1.value;
        byte[] v2 = s2.value;
        // Read both coders through the accessor so the two operands are
        // treated symmetrically, instead of mixing raw field access on s1
        // with the accessor on s2.
        if (s1.coder() == s2.coder()) {
            return s1.coder() == LATIN1 ? StringLatin1.compareToFC(v1, v2)
                                        : StringUTF16.compareToFC(v1, v2);
        }
        return s1.coder() == LATIN1 ? StringLatin1.compareToFC_UTF16(v1, v2)
                                    : StringUTF16.compareToFC_Latin1(v1, v2);
    }
}

/**
* Compares two strings lexicographically using <em>Unicode case folding</em>.
* This method returns an integer whose sign is that of calling {@code compareTo}
* on the Unicode case folded version of the strings. Unicode case folding
* eliminates differences in case according to the Unicode Standard, using the
* mappings defined in
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
* including 1:M mappings, such as {@code "ß"} → {@code "ss"}.
* <p>
* Case folding is a locale-independent, language-neutral form of case mapping,
* primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)},
* which applies a simpler locale-insensitive uppercase mapping, this method
* follows the Unicode <em>full</em> case folding, providing stable and
* consistent results across all environments.
* <p>
* Note that this method does <em>not</em> take locale into account, and may
* produce results that differ from locale-sensitive ordering. Use
* {@link java.text.Collator} for locale-sensitive comparison.
*
* @apiNote
* This method is the Unicode-compliant alternative to
* {@link #compareToIgnoreCase(String)}. It implements the <em>full</em> case folding
* as defined by the Unicode Standard, which may differ from the simpler
* per-character mapping performed by {@code compareToIgnoreCase}.
* For example:
* {@snippet lang=java :
* String a = "Maße";
* String b = "MASSE";
* int cmpFoldCase = a.compareToFoldCase(b); // returns 0
* int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0
* }
*
* @param str the {@code String} to be compared.
* @return a negative integer, zero, or a positive integer as the specified
* String is greater than, equal to, or less than this String,
* ignoring case considerations by case folding.
* @see #equalsFoldCase(String)
* @see #compareToIgnoreCase(String)
* @see java.text.Collator
* @since 26
*/
public int compareToFoldCase(String str) {
// Delegates to the shared case-folding comparator.
return UNICODE_CASEFOLD_ORDER.compare(this, str);
}

/**
* Tests if two string regions are equal.
* <p>
Expand Down
100 changes: 100 additions & 0 deletions src/java.base/share/classes/java/lang/StringLatin1.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
import java.util.function.IntConsumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import jdk.internal.java.lang.CaseFolding;
import jdk.internal.util.ArraysSupport;
import jdk.internal.vm.annotation.IntrinsicCandidate;

Expand Down Expand Up @@ -62,6 +64,10 @@ static int length(byte[] value) {
return value.length;
}

/**
 * Returns the code point stored at {@code index} of a Latin-1 encoded
 * byte array, i.e. the byte at that position widened as an unsigned value.
 *
 * @param value the Latin-1 bytes
 * @param index position of the byte to read
 * @param end   not consulted by this method
 * @return the byte at {@code index} as a value in the range 0..255
 */
static int codePointAt(byte[] value, int index, int end) {
    return Byte.toUnsignedInt(value[index]);
}

static char[] toChars(byte[] value) {
char[] dst = new char[value.length];
inflate(value, 0, dst, 0, value.length);
Expand Down Expand Up @@ -179,6 +185,100 @@ static int compareToCI_UTF16(byte[] value, byte[] other) {
return len1 - len2;
}

/**
 * Compares two Latin-1 encoded byte arrays char by char after applying
 * {@code CaseFolding.foldIfDefined} to each code point.
 * <p>
 * A non-{@code null} fold result is consumed one char at a time before the
 * next source byte is read, so a single source character may contribute
 * several chars to the comparison.
 *
 * @param value Latin-1 bytes of the first string
 * @param other Latin-1 bytes of the second string
 * @return the difference of the first pair of differing chars; otherwise
 *         {@code 1} if only {@code value} has chars left, {@code -1} if only
 *         {@code other} has chars left, and {@code 0} if both are exhausted
 */
public static int compareToFC(byte[] value, byte[] other) {
    final int leftLen = value.length;
    final int rightLen = other.length;
    char[] leftFold = null;     // fold result of the last left code point, if any
    char[] rightFold = null;    // fold result of the last right code point, if any
    int leftPos = 0;
    int rightPos = 0;
    int leftFoldPos = 0;
    int rightFoldPos = 0;

    for (;;) {
        boolean leftPending = leftFold != null && leftFoldPos < leftFold.length;
        boolean rightPending = rightFold != null && rightFoldPos < rightFold.length;
        // Stop as soon as either side has neither source bytes nor pending fold chars.
        if (!(leftPos < leftLen || leftPending) || !(rightPos < rightLen || rightPending)) {
            break;
        }

        char left;
        if (leftPending) {
            left = leftFold[leftFoldPos++];
        } else {
            int cp = codePointAt(value, leftPos++, leftLen); // Latin-1: never a surrogate
            leftFold = CaseFolding.foldIfDefined(cp);
            leftFoldPos = 0;
            left = (leftFold == null) ? (char) cp : leftFold[leftFoldPos++];
        }

        char right;
        if (rightPending) {
            right = rightFold[rightFoldPos++];
        } else {
            int cp = codePointAt(other, rightPos++, rightLen);
            rightFold = CaseFolding.foldIfDefined(cp);
            rightFoldPos = 0;
            right = (rightFold == null) ? (char) cp : rightFold[rightFoldPos++];
        }

        if (left != right) {
            return left - right;
        }
    }

    // One side ran out: whichever still has chars to deliver is the greater one.
    boolean leftRemaining =
            leftPos < leftLen || leftFold != null && leftFoldPos < leftFold.length;
    if (leftRemaining) {
        return 1;
    }
    boolean rightRemaining =
            rightPos < rightLen || rightFold != null && rightFoldPos < rightFold.length;
    return rightRemaining ? -1 : 0;
}

/**
 * Compares a Latin-1 encoded byte array with a UTF-16 encoded byte array
 * char by char after applying {@code CaseFolding.foldIfDefined} to each
 * code point.
 * <p>
 * A non-{@code null} fold result is consumed one char at a time before the
 * next source code point is read. A supplementary code point on the UTF-16
 * side that has no fold result contributes both of its surrogate chars,
 * rather than being narrowed to a single {@code char}.
 *
 * @param value Latin-1 bytes of the first string
 * @param other UTF-16 bytes of the second string
 * @return the difference of the first pair of differing chars; otherwise
 *         {@code 1} if only {@code value} has chars left, {@code -1} if only
 *         {@code other} has chars left, and {@code 0} if both are exhausted
 */
public static int compareToFC_UTF16(byte[] value, byte[] other) {
    int len1 = value.length;
    int len2 = StringUTF16.length(other);
    char[] folded1 = null;
    char[] folded2 = null;
    int k1 = 0, k2 = 0, fk1 = 0, fk2 = 0;

    while ((k1 < len1 || folded1 != null && fk1 < folded1.length) &&
           (k2 < len2 || folded2 != null && fk2 < folded2.length)) {
        char c1, c2;
        if (folded1 != null && fk1 < folded1.length) {
            c1 = folded1[fk1++];
        } else {
            int cp = codePointAt(value, k1++, len1); // Latin-1: never a surrogate
            folded1 = CaseFolding.foldIfDefined(cp);
            fk1 = 0;
            if (folded1 == null) {
                c1 = (char)cp;
            } else {
                c1 = folded1[fk1++];
            }
        }
        if (folded2 != null && fk2 < folded2.length) {
            c2 = folded2[fk2++];
        } else {
            int cp = StringUTF16.codePointAt(other, k2, len2);
            k2 += Character.charCount(cp);
            folded2 = CaseFolding.foldIfDefined(cp);
            fk2 = 0;
            if (folded2 == null && Character.isSupplementaryCodePoint(cp)) {
                // k2 has already skipped both surrogates, so casting cp to
                // char would drop the high bits (U+10061 would compare as
                // 'a'). Queue the surrogate pair in the pending buffer.
                folded2 = Character.toChars(cp);
            }
            if (folded2 == null) {
                c2 = (char)cp;
            } else {
                c2 = folded2[fk2++];
            }
        }
        if (c1 != c2) {
            return c1 - c2;
        }
    }
    // One side ran out: whichever still has chars to deliver is the greater one.
    if (k1 < len1 || folded1 != null && fk1 < folded1.length) {
        return 1;
    }
    if (k2 < len2 || folded2 != null && fk2 < folded2.length) {
        return -1;
    }
    return 0;
}

static int hashCode(byte[] value) {
return ArraysSupport.hashCodeOfUnsigned(value, 0, value.length, 0);
}
Expand Down
Loading