From 143821b8347adb2d280b77a08190288598b92d6b Mon Sep 17 00:00:00 2001
From: Lars Vogel
Date: Thu, 14 May 2026 06:14:19 +0200
Subject: [PATCH] Implement proper surrogate pair handling in JavaWordFinder

Replace the simplistic surrogate skip (from #2977) with full code-point
checking: when a surrogate char is encountered, form the code point from
the pair and test it with Character.isJavaIdentifierPart(int), so
identifiers containing supplementary Unicode characters are correctly
included in the word region.

The pair is resolved regardless of which half the scan position is on,
so a start offset that points at the high surrogate (caret directly in
front of the character) or at the low surrogate still yields the whole
identifier and never a region that splits a pair.
---
Review note: the JavaWordFinder hunks and the test file were edited by
hand after review; hunk line counts and the diffstat were recomputed,
but the blob ids in the "index" lines are stale.

 .../jdt/text/tests/JavaWordFinderTest.java    | 132 ++++++++++++++++++
 .../jdt/text/tests/JdtTextTestSuite.java      |   1 +
 .../jdt/internal/ui/text/JavaWordFinder.java  |  34 ++++-
 3 files changed, 163 insertions(+), 4 deletions(-)
 create mode 100644 org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JavaWordFinderTest.java

diff --git a/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JavaWordFinderTest.java b/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JavaWordFinderTest.java
new file mode 100644
index 00000000000..9afe6cde47d
--- /dev/null
+++ b/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JavaWordFinderTest.java
@@ -0,0 +1,132 @@
+/*******************************************************************************
+ * Copyright (c) 2026 vogella GmbH and others.
+ *
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ *     Lars Vogel - initial API and implementation
+ *******************************************************************************/
+package org.eclipse.jdt.text.tests;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+import org.eclipse.jface.text.Document;
+import org.eclipse.jface.text.IDocument;
+import org.eclipse.jface.text.IRegion;
+import org.eclipse.jface.text.Region;
+
+import org.eclipse.jdt.internal.ui.text.JavaWordFinder;
+
+public class JavaWordFinderTest {
+
+	// U+10400 DESERET CAPITAL LETTER LONG I: a supplementary letter that is a
+	// valid Java identifier part, encoded as the surrogate pair D801 DC00.
+	private static final String DESERET_A = "\uD801\uDC00";
+
+	// U+1F600 GRINNING FACE: a supplementary character that is NOT a valid
+	// Java identifier part, encoded as the surrogate pair D83D DE00.
+	private static final String EMOJI = "\uD83D\uDE00";
+
+	private static IRegion findWord(String content, int offset) {
+		IDocument document = new Document(content);
+		return JavaWordFinder.findWord(document, offset);
+	}
+
+	// --- basic ASCII cases ---
+
+	@Test
+	public void testSimpleIdentifier_inside() {
+		assertEquals(new Region(2, 6), findWord("  foobar  ", 4));
+	}
+
+	@Test
+	public void testSimpleIdentifier_atStart() {
+		assertEquals(new Region(2, 6), findWord("  foobar  ", 2));
+	}
+
+	@Test
+	public void testSimpleIdentifier_atEnd() {
+		assertEquals(new Region(8, 0), findWord("  foobar  ", 8));
+	}
+
+	@Test
+	public void testNonIdentifier_returnsEmptyRegion() {
+		assertEquals(new Region(1, 0), findWord(" !! ", 1));
+	}
+
+	// --- supplementary identifier characters (surrogate pairs) ---
+
+	@Test
+	public void testSupplementaryIdentifier_offsetOnHighSurrogate() {
+		// Document: " <U+10400>bc "
+		// Indices:   0   1    2    3   4   5
+		//           ' ' D801 DC00 'b' 'c' ' '
+		// Word "<U+10400>bc" starts at index 1, length 4.
+		String content = " " + DESERET_A + "bc ";
+		assertEquals(new Region(1, 4), findWord(content, 1));
+	}
+
+	@Test
+	public void testSupplementaryIdentifier_offsetInsideAsciiPart() {
+		// Same document, offset on 'b' (index 3) -- word is still the full identifier.
+		String content = " " + DESERET_A + "bc ";
+		assertEquals(new Region(1, 4), findWord(content, 3));
+	}
+
+	@Test
+	public void testSupplementaryIdentifierAtEnd_offsetInAsciiPart() {
+		// Document: " abc<U+10400> "
+		// Indices:   0   1   2   3   4    5    6
+		//           ' ' 'a' 'b' 'c' D801 DC00 ' '
+		// Word "abc<U+10400>" starts at 1, length 5.
+		String content = " abc" + DESERET_A + " ";
+		assertEquals(new Region(1, 5), findWord(content, 2));
+	}
+
+	@Test
+	public void testSupplementaryIdentifierInMiddle_offsetOnHighSurrogate() {
+		// Document: "ab<U+10400>cd"
+		// Indices:   0   1   2    3    4   5
+		//           'a' 'b' D801 DC00 'c' 'd'
+		// The offset is on the high surrogate, yet the word also extends to the left.
+		String content = "ab" + DESERET_A + "cd";
+		assertEquals(new Region(0, 6), findWord(content, 2));
+	}
+
+	@Test
+	public void testSupplementaryIdentifierInMiddle_offsetOnLowSurrogate() {
+		// Same document, offset inside the pair (index 3) -- the pair must not be split.
+		String content = "ab" + DESERET_A + "cd";
+		assertEquals(new Region(0, 6), findWord(content, 3));
+	}
+
+	// --- non-identifier supplementary characters (emoji) ---
+
+	@Test
+	public void testNonIdentifierSurrogate_breaksWord() {
+		// Document: " <U+1F600>bc "
+		// The emoji is not a Java identifier part, so "bc" is its own word.
+		// offset on 'b' (index 3).
+		String content = " " + EMOJI + "bc ";
+		assertEquals(new Region(3, 2), findWord(content, 3));
+	}
+
+	@Test
+	public void testNonIdentifierSurrogate_betweenAsciiWords() {
+		// Document: "ab<U+1F600>cd"
+		// Indices:   0   1   2    3    4   5
+		//           'a' 'b' D83D DE00 'c' 'd'
+		// Offset on 'c' (index 4): word is "cd" = Region(4, 2).
+		// Offset on 'b' (index 1): word is "ab" = Region(0, 2).
+		String content = "ab" + EMOJI + "cd";
+		assertEquals(new Region(4, 2), findWord(content, 4));
+		assertEquals(new Region(0, 2), findWord(content, 1));
+	}
+}
diff --git a/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JdtTextTestSuite.java b/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JdtTextTestSuite.java
index a0e605fad1f..73f157b9c9b 100644
--- a/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JdtTextTestSuite.java
+++ b/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JdtTextTestSuite.java
@@ -64,6 +64,7 @@
 	IteratorForLoopJavaContextTest.class,
 	ArrayWithTempVarForLoopJavaContextTest.class,
 	JavaDoubleClickSelectorTest.class,
+	JavaWordFinderTest.class,
 	JavaStringDoubleClickStrategyTest.class,
 	BreakContinueTargetFinderTest.class,
 	EnumConstructorTargetFinderTest.class,
diff --git a/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/JavaWordFinder.java b/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/JavaWordFinder.java
index 2ccfc82a5b1..3057f18f5f8 100644
--- a/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/JavaWordFinder.java
+++ b/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/JavaWordFinder.java
@@ -32,8 +32,21 @@ public static IRegion findWord(IDocument document, int offset) {
 
 			while (pos >= 0) {
 				c= document.getChar(pos);
-				if (!Character.isJavaIdentifierPart(c) && !Character.isSurrogate(c)) {
-					break;
+				if (!Character.isJavaIdentifierPart(c)) {
+					// c may be one half of a surrogate pair: decide on the whole code point
+					if (Character.isLowSurrogate(c) && pos > 0 && Character.isHighSurrogate(document.getChar(pos - 1))) {
+						if (!Character.isJavaIdentifierPart(Character.toCodePoint(document.getChar(pos - 1), c))) {
+							break;
+						}
+						pos--; // also step over the high surrogate
+					} else if (Character.isHighSurrogate(c) && pos + 1 < document.getLength() && Character.isLowSurrogate(document.getChar(pos + 1))) {
+						// only reachable at the start offset, i.e. the offset is on the high surrogate
+						if (!Character.isJavaIdentifierPart(Character.toCodePoint(c, document.getChar(pos + 1)))) {
+							break;
+						}
+					} else {
+						break;
+					}
 				}
 				--pos;
 			}
@@ -44,8 +57,21 @@ public static IRegion findWord(IDocument document, int offset) {
 
 			while (pos < length) {
 				c= document.getChar(pos);
-				if (!Character.isJavaIdentifierPart(c) && !Character.isSurrogate(c)) {
-					break;
+				if (!Character.isJavaIdentifierPart(c)) {
+					// c may be one half of a surrogate pair: decide on the whole code point
+					if (Character.isHighSurrogate(c) && pos + 1 < length && Character.isLowSurrogate(document.getChar(pos + 1))) {
+						if (!Character.isJavaIdentifierPart(Character.toCodePoint(c, document.getChar(pos + 1)))) {
+							break;
+						}
+						pos++; // also step over the low surrogate
+					} else if (Character.isLowSurrogate(c) && pos > 0 && Character.isHighSurrogate(document.getChar(pos - 1))) {
+						// only reachable at the start offset, i.e. the offset is on the low surrogate
+						if (!Character.isJavaIdentifierPart(Character.toCodePoint(document.getChar(pos - 1), c))) {
+							break;
+						}
+					} else {
+						break;
+					}
 				}
 				++pos;
 			}