From 143821b8347adb2d280b77a08190288598b92d6b Mon Sep 17 00:00:00 2001
From: Lars Vogel <Lars.Vogel@vogella.com>
Date: Thu, 14 May 2026 06:14:19 +0200
Subject: [PATCH] Implement proper surrogate pair handling in JavaWordFinder

Replace the simplistic surrogate skip (from #2977) with full code-point
checking: when a surrogate char is encountered, form the code point from
the pair and test it with Character.isJavaIdentifierPart(int), so
identifiers containing supplementary Unicode characters are correctly
included in the word region.
---
 .../jdt/text/tests/JavaWordFinderTest.java    | 115 ++++++++++++++++++
 .../jdt/text/tests/JdtTextTestSuite.java      |   1 +
 .../jdt/internal/ui/text/JavaWordFinder.java  |  36 +++++-
 3 files changed, 148 insertions(+), 4 deletions(-)
 create mode 100644 org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JavaWordFinderTest.java

diff --git a/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JavaWordFinderTest.java b/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JavaWordFinderTest.java
new file mode 100644
index 00000000000..9afe6cde47d
--- /dev/null
+++ b/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JavaWordFinderTest.java
@@ -0,0 +1,115 @@
+/*******************************************************************************
+ * Copyright (c) 2026 vogella GmbH and others.
+ *
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ *     Lars Vogel <Lars.Vogel@vogella.com> - initial API and implementation
+ *******************************************************************************/
+package org.eclipse.jdt.text.tests;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+import org.eclipse.jface.text.Document;
+import org.eclipse.jface.text.IDocument;
+import org.eclipse.jface.text.IRegion;
+import org.eclipse.jface.text.Region;
+
+import org.eclipse.jdt.internal.ui.text.JavaWordFinder;
+
+public class JavaWordFinderTest {
+	// Exercises JavaWordFinder.findWord(IDocument, int) with ASCII and surrogate-pair input.
+	// U+10400 DESERET CAPITAL LETTER LONG I (the constant's name notwithstanding): a supplementary
+	// letter that is a valid Java identifier part; in UTF-16 it is the surrogate pair D801 DC00.
+	private static final String DESERET_A = "𐐀";
+
+	// U+1F600 GRINNING FACE: a supplementary character that is NOT a valid
+	// Java identifier part; in UTF-16 it is the surrogate pair D83D DE00.
+	private static final String EMOJI = "😀";
+	// Wraps content in a Document and returns what JavaWordFinder.findWord reports for offset.
+	private static IRegion findWord(String content, int offset) {
+		IDocument document = new Document(content);
+		return JavaWordFinder.findWord(document, offset);
+	}
+
+	// --- basic ASCII cases ---
+
+	@Test
+	public void testSimpleIdentifier_inside() {
+		assertEquals(new Region(2, 6), findWord("  foobar  ", 4));
+	}
+
+	@Test
+	public void testSimpleIdentifier_atStart() {
+		assertEquals(new Region(2, 6), findWord("  foobar  ", 2));
+	}
+
+	@Test
+	public void testSimpleIdentifier_atEnd() {
+		assertEquals(new Region(8, 0), findWord("  foobar  ", 8)); // index 8 is the space right after "foobar"
+	}
+
+	@Test
+	public void testNonIdentifier_returnsEmptyRegion() {
+		assertEquals(new Region(1, 0), findWord("  !!  ", 1));
+	}
+
+	// --- supplementary identifier characters (surrogate pairs) ---
+
+	@Test
+	public void testSupplementaryIdentifier_offsetOnHighSurrogate() {
+		// Document: " <DESERET_A>bc "
+		// Indices:   0  1  2  3  4  5
+		//            ' ' hi lo 'b' 'c' ' '   (hi lo = the two UTF-16 units of DESERET_A)
+		// Word "𐐀bc" starts at index 1, length 4.
+		String content = " " + DESERET_A + "bc ";
+		assertEquals(new Region(1, 4), findWord(content, 1));
+	}
+
+	@Test
+	public void testSupplementaryIdentifier_offsetInsideAsciiPart() {
+		// Same document, offset on 'b' (index 3) -- word is still the full identifier.
+		String content = " " + DESERET_A + "bc ";
+		assertEquals(new Region(1, 4), findWord(content, 3));
+	}
+
+	@Test
+	public void testSupplementaryIdentifierAtEnd_offsetInAsciiPart() {
+		// Document: " abc<DESERET_A> "
+		// Indices:   0  1  2  3  4  5  6
+		//            ' ' 'a' 'b' 'c' hi lo ' '   (hi lo = the two UTF-16 units of DESERET_A)
+		// Word "abc𐐀" starts at 1, length 5.
+		String content = " abc" + DESERET_A + " ";
+		assertEquals(new Region(1, 5), findWord(content, 2));
+	}
+
+	// --- non-identifier supplementary characters (emoji) ---
+
+	@Test
+	public void testNonIdentifierSurrogate_breaksWord() {
+		// Document: " <EMOJI>bc "
+		// The emoji is not a Java identifier part, so "bc" is its own word.
+		// offset on 'b' (index 3).
+		String content = " " + EMOJI + "bc ";
+		assertEquals(new Region(3, 2), findWord(content, 3));
+	}
+
+	@Test
+	public void testNonIdentifierSurrogate_betweenAsciiWords() {
+		// Document: "ab<EMOJI>cd"
+		// Indices:   0  1  2  3  4  5
+		//            'a' 'b' hi lo 'c' 'd'   (hi lo = the two UTF-16 units of EMOJI)
+		// Offset on 'c' (index 4): word is "cd" = Region(4, 2).
+		// Offset on 'b' (index 1): word is "ab" = Region(0, 2).
+		String content = "ab" + EMOJI + "cd";
+		assertEquals(new Region(4, 2), findWord(content, 4));
+		assertEquals(new Region(0, 2), findWord(content, 1));
+	}
+}
diff --git a/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JdtTextTestSuite.java b/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JdtTextTestSuite.java
index a0e605fad1f..73f157b9c9b 100644
--- a/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JdtTextTestSuite.java
+++ b/org.eclipse.jdt.text.tests/src/org/eclipse/jdt/text/tests/JdtTextTestSuite.java
@@ -64,6 +64,7 @@
 	IteratorForLoopJavaContextTest.class,
 	ArrayWithTempVarForLoopJavaContextTest.class,
 	JavaDoubleClickSelectorTest.class,
+	JavaWordFinderTest.class,
 	JavaStringDoubleClickStrategyTest.class,
 	BreakContinueTargetFinderTest.class,
 	EnumConstructorTargetFinderTest.class,
diff --git a/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/JavaWordFinder.java b/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/JavaWordFinder.java
index 2ccfc82a5b1..3057f18f5f8 100644
--- a/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/JavaWordFinder.java
+++ b/org.eclipse.jdt.ui/ui/org/eclipse/jdt/internal/ui/text/JavaWordFinder.java
@@ -32,8 +32,22 @@ public static IRegion findWord(IDocument document, int offset) {
 
 			while (pos >= 0) {
 				c= document.getChar(pos);
-				if (!Character.isJavaIdentifierPart(c) && !Character.isSurrogate(c)) {
-					break;
+				if (!Character.isJavaIdentifierPart(c)) {
+					// A single UTF-16 unit of a supplementary character never passes the char
+					// test, so test the whole code point. c is the low half when the scan steps
+					// back onto a pair, and the high half when offset itself starts a pair.
+					int high= Character.isHighSurrogate(c) ? pos : pos - 1;
+					if (!Character.isSurrogate(c) || high < 0 || high + 1 >= document.getLength()) {
+						break;
+					}
+					char hi= document.getChar(high);
+					char lo= document.getChar(high + 1);
+					if (!Character.isSurrogatePair(hi, lo)
+							|| !Character.isJavaIdentifierPart(Character.toCodePoint(hi, lo))) {
+						break;
+					}
+					// Land on the high half; the --pos below moves to the char before the pair.
+					pos= high;
 				}
 				--pos;
 			}
@@ -44,8 +58,22 @@ public static IRegion findWord(IDocument document, int offset) {
 
 			while (pos < length) {
 				c= document.getChar(pos);
-				if (!Character.isJavaIdentifierPart(c) && !Character.isSurrogate(c)) {
-					break;
+				if (!Character.isJavaIdentifierPart(c)) {
+					// A single UTF-16 unit of a supplementary character never passes the char
+					// test, so test the whole code point. c is the high half when the scan steps
+					// forward onto a pair, and the low half when offset points into a pair.
+					int high= Character.isHighSurrogate(c) ? pos : pos - 1;
+					if (!Character.isSurrogate(c) || high < 0 || high + 1 >= length) {
+						break;
+					}
+					char hi= document.getChar(high);
+					char lo= document.getChar(high + 1);
+					if (!Character.isSurrogatePair(hi, lo)
+							|| !Character.isJavaIdentifierPart(Character.toCodePoint(hi, lo))) {
+						break;
+					}
+					// Land on the low half; the ++pos below moves to the char after the pair.
+					pos= high + 1;
 				}
 				++pos;
 			}