Skip to content

Commit 116615b

Browse files
authored
Merge pull request #1 from dmsnell/issues/69-broken-surrogate-pairs
Stop breaking surrogate pairs in toDelta()/fromDelta()
2 parents 62f2e68 + 21aebb4 commit 116615b

File tree

11 files changed

+949
-76
lines changed

11 files changed

+949
-76
lines changed

java/src/name/fraser/neil/plaintext/diff_match_patch.java

+124-8
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package name.fraser.neil.plaintext;
2020

2121
import java.io.UnsupportedEncodingException;
22+
import java.lang.Character;
2223
import java.net.URLDecoder;
2324
import java.net.URLEncoder;
2425
import java.util.*;
@@ -1429,7 +1430,31 @@ public int diff_levenshtein(List<Diff> diffs) {
14291430
*/
14301431
public String diff_toDelta(List<Diff> diffs) {
14311432
StringBuilder text = new StringBuilder();
1433+
char lastEnd = 0;
1434+
boolean isFirst = true;
14321435
for (Diff aDiff : diffs) {
1436+
if (aDiff.text.isEmpty()) {
1437+
continue;
1438+
}
1439+
1440+
char thisTop = aDiff.text.charAt(0);
1441+
char thisEnd = aDiff.text.charAt(aDiff.text.length() - 1);
1442+
1443+
if (Character.isHighSurrogate(thisEnd)) {
1444+
lastEnd = thisEnd;
1445+
aDiff.text = aDiff.text.substring(0, aDiff.text.length() - 1);
1446+
}
1447+
1448+
if (! isFirst && Character.isHighSurrogate(lastEnd) && Character.isLowSurrogate(thisTop)) {
1449+
aDiff.text = lastEnd + aDiff.text;
1450+
}
1451+
1452+
isFirst = false;
1453+
1454+
if ( aDiff.text.isEmpty() ) {
1455+
continue;
1456+
}
1457+
14331458
switch (aDiff.operation) {
14341459
case INSERT:
14351460
try {
@@ -1457,6 +1482,103 @@ public String diff_toDelta(List<Diff> diffs) {
14571482
return delta;
14581483
}
14591484

1485+
private int digit16(char b) throws IllegalArgumentException {
1486+
switch (b) {
1487+
case '0': return 0;
1488+
case '1': return 1;
1489+
case '2': return 2;
1490+
case '3': return 3;
1491+
case '4': return 4;
1492+
case '5': return 5;
1493+
case '6': return 6;
1494+
case '7': return 7;
1495+
case '8': return 8;
1496+
case '9': return 9;
1497+
case 'A': case 'a': return 10;
1498+
case 'B': case 'b': return 11;
1499+
case 'C': case 'c': return 12;
1500+
case 'D': case 'd': return 13;
1501+
case 'E': case 'e': return 14;
1502+
case 'F': case 'f': return 15;
1503+
default:
1504+
throw new IllegalArgumentException();
1505+
}
1506+
}
1507+
1508+
private String decodeURI(String text) throws IllegalArgumentException {
1509+
int i = 0;
1510+
StringBuilder decoded = new StringBuilder(text.length());
1511+
1512+
while (i < text.length()) {
1513+
if (text.charAt(i) != '%') {
1514+
decoded.append(text.charAt(i++));
1515+
continue;
1516+
}
1517+
1518+
// start a percent-sequence
1519+
int byte1 = (digit16(text.charAt(i + 1)) << 4) + digit16(text.charAt(i + 2));
1520+
if ((byte1 & 0x80) == 0) {
1521+
decoded.append(Character.toChars(byte1));
1522+
i += 3;
1523+
continue;
1524+
}
1525+
1526+
if ( text.charAt(i + 3) != '%') {
1527+
throw new IllegalArgumentException();
1528+
}
1529+
1530+
int byte2 = (digit16(text.charAt(i + 4)) << 4) + digit16(text.charAt(i + 5));
1531+
if ((byte2 & 0xC0) != 0x80) {
1532+
throw new IllegalArgumentException();
1533+
}
1534+
byte2 = byte2 & 0x3F;
1535+
if ((byte1 & 0xE0) == 0xC0) {
1536+
decoded.append(Character.toChars(((byte1 & 0x1F) << 6) | byte2));
1537+
i += 6;
1538+
continue;
1539+
}
1540+
1541+
if (text.charAt(i + 6) != '%') {
1542+
throw new IllegalArgumentException();
1543+
}
1544+
1545+
int byte3 = (digit16(text.charAt(i + 7)) << 4) + digit16(text.charAt(i + 8));
1546+
if ((byte3 & 0xC0) != 0x80) {
1547+
throw new IllegalArgumentException();
1548+
}
1549+
byte3 = byte3 & 0x3F;
1550+
if ((byte1 & 0xF0) == 0xE0) {
1551+
// unpaired surrogate are fine here
1552+
decoded.append(Character.toChars(((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3));
1553+
i += 9;
1554+
continue;
1555+
}
1556+
1557+
if (text.charAt(i + 9) != '%') {
1558+
throw new IllegalArgumentException();
1559+
}
1560+
1561+
int byte4 = (digit16(text.charAt(i + 10)) << 4) + digit16(text.charAt(i + 11));
1562+
if ((byte4 & 0xC0) != 0x80) {
1563+
throw new IllegalArgumentException();
1564+
}
1565+
byte4 = byte4 & 0x3F;
1566+
if ((byte1 & 0xF8) == 0xF0) {
1567+
int codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) | (byte3 << 0x06) | byte4;
1568+
if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
1569+
decoded.append(Character.toChars((codePoint & 0xFFFF) >>> 10 & 0x3FF | 0xD800));
1570+
decoded.append(Character.toChars(0xDC00 | (codePoint & 0xFFFF) & 0x3FF));
1571+
i += 12;
1572+
continue;
1573+
}
1574+
}
1575+
1576+
throw new IllegalArgumentException();
1577+
}
1578+
1579+
return decoded.toString();
1580+
}
1581+
14601582
/**
14611583
* Given the original text1, and an encoded string which describes the
14621584
* operations required to transform text1 into text2, compute the full diff.
@@ -1483,10 +1605,7 @@ public LinkedList<Diff> diff_fromDelta(String text1, String delta)
14831605
// decode would change all "+" to " "
14841606
param = param.replace("+", "%2B");
14851607
try {
1486-
param = URLDecoder.decode(param, "UTF-8");
1487-
} catch (UnsupportedEncodingException e) {
1488-
// Not likely on modern system.
1489-
throw new Error("This system does not support UTF-8.", e);
1608+
param = this.decodeURI(param);
14901609
} catch (IllegalArgumentException e) {
14911610
// Malformed URI sequence.
14921611
throw new IllegalArgumentException(
@@ -2269,10 +2388,7 @@ public List<Patch> patch_fromText(String textline)
22692388
line = text.getFirst().substring(1);
22702389
line = line.replace("+", "%2B"); // decode would change all "+" to " "
22712390
try {
2272-
line = URLDecoder.decode(line, "UTF-8");
2273-
} catch (UnsupportedEncodingException e) {
2274-
// Not likely on modern system.
2275-
throw new Error("This system does not support UTF-8.", e);
2391+
line = this.decodeURI(line);
22762392
} catch (IllegalArgumentException e) {
22772393
// Malformed URI sequence.
22782394
throw new IllegalArgumentException(

java/tests/name/fraser/neil/plaintext/diff_match_patch_test.java

+36
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,42 @@ public static void testDiffDelta() {
424424

425425
assertEquals("diff_fromDelta: Unicode.", diffs, dmp.diff_fromDelta(text1, delta));
426426

427+
diffs = diffList(new Diff(EQUAL, "\ud83d\ude4b\ud83d"), new Diff(INSERT, "\ude4c\ud83d"), new Diff(EQUAL, "\ude4b"));
428+
delta = dmp.diff_toDelta(diffs);
429+
assertEquals("diff_toDelta: Surrogate Pairs.", "=2\t+%F0%9F%99%8C\t=2", delta);
430+
431+
assertEquals(
432+
"diff_toDelta: insert surrogate pair between similar high surrogates",
433+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd70"), new Diff(EQUAL, "\ud83c\udd71"))),
434+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c\udd70\ud83c"), new Diff(INSERT, "\udd70\ud83c"), new Diff(EQUAL, "\udd71")))
435+
);
436+
437+
assertEquals(
438+
"diff_toDelta: swap surrogate pairs delete/insert",
439+
dmp.diff_toDelta(diffList(new Diff(DELETE, "\ud83c\udd70"), new Diff(INSERT, "\ud83c\udd71"))),
440+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(DELETE, "\udd70"), new Diff(INSERT, "\udd71")))
441+
);
442+
443+
assertEquals(
444+
"diff_toDelta: swap surrogate pairs insert/delete",
445+
dmp.diff_toDelta(diffList(new Diff(INSERT, "\ud83c\udd70"), new Diff(DELETE, "\ud83c\udd71"))),
446+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "\ud83c"), new Diff(INSERT, "\udd70"), new Diff(DELETE, "\udd71")))
447+
);
448+
449+
assertEquals(
450+
"diff_toDelta: empty diff groups",
451+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(DELETE, ""), new Diff(INSERT, "ghijk"))),
452+
dmp.diff_toDelta(diffList(new Diff(EQUAL, "abcdef"), new Diff(INSERT, "ghijk")))
453+
);
454+
455+
// Different versions of the library may have created deltas with
456+
// half of a surrogate pair encoded as if it were valid UTF-8
457+
assertEquals(
458+
"diff_toDelta: surrogate half encoded as UTF8",
459+
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "-2\t+%F0%9F%85%B1")),
460+
dmp.diff_toDelta(dmp.diff_fromDelta("\ud83c\udd70", "=1\t-1\t+%ED%B5%B1"))
461+
);
462+
427463
// Verify pool of unchanged characters.
428464
diffs = diffList(new Diff(INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # "));
429465
String text2 = dmp.diff_text2(diffs);

0 commit comments

Comments
 (0)