Skip to content

Commit 550130a

Browse files
committed
Prevent splitting surrogate pairs when diffing
See google#69 In this patch I'm trying to follow the approach taken by @judofyr but starting from the top of the script and going through, auditing every place that perform string operations that split, index, or otherwise operate on a character level so that we can make sure that we don't split surrogate pairs. This contrasts with [attempt one] where I created a custom iterator for strings. Surprisingly I found this more "ad-hoc" approach easier to manage since it doesn't create a split universe of string/Unicode. As of this commit I haven't audited the cleanup functions but my own tests are passing so I'm given to believe that they might be safe. I have my own doubts that this is sound work and that the middle-snake algorithm might find the wrong snake when presented with variable-width characters.
1 parent 62f2e68 commit 550130a

File tree

2 files changed

+163
-2
lines changed

2 files changed

+163
-2
lines changed

javascript/diff_match_patch_uncompressed.js

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,35 @@ diff_match_patch.Diff.prototype.toString = function() {
8888
return this[0] + ',' + this[1];
8989
};
9090

91+
diff_match_patch.prototype.isHighSurrogate = function(c) {
92+
var v = c.charCodeAt(0);
93+
return v >= 0xD800 && v <= 0xDBFF;
94+
}
95+
96+
diff_match_patch.prototype.isLowSurrogate = function(c) {
97+
var v = c.charCodeAt(0);
98+
return v >= 0xDC00 && v <= 0xDFFF;
99+
}
100+
101+
diff_match_patch.prototype.scalarValues = function(str) {
102+
var length = str.length;
103+
var scalars = [];
104+
105+
for (var i = 0; i < length; i++) {
106+
var scalar = str[i];
107+
108+
// proper surrogate pairs will come through as the whole scalar value
109+
// but if the pairs are broken they will be passed-through unaltered
110+
if (i < length - 1 && this.isHighSurrogate(scalar) && this.isLowSurrogate(str[i+1])) {
111+
scalar += str[i+1];
112+
i++;
113+
}
114+
115+
scalars.push(scalar);
116+
}
117+
118+
return scalars;
119+
}
91120

92121
/**
93122
* Find the differences between two texts. Simplifies the problem by stripping
@@ -134,12 +163,18 @@ diff_match_patch.prototype.diff_main = function(text1, text2, opt_checklines,
134163

135164
// Trim off common prefix (speedup).
136165
var commonlength = this.diff_commonPrefix(text1, text2);
166+
if (commonlength > 0 && this.isHighSurrogate(text1[commonlength - 1])) {
167+
commonlength--;
168+
}
137169
var commonprefix = text1.substring(0, commonlength);
138170
text1 = text1.substring(commonlength);
139171
text2 = text2.substring(commonlength);
140172

141173
// Trim off common suffix (speedup).
142174
commonlength = this.diff_commonSuffix(text1, text2);
175+
if (commonlength > 0 && this.isLowSurrogate(text1[text1.length - commonlength])) {
176+
commonlength--;
177+
}
143178
var commonsuffix = text1.substring(text1.length - commonlength);
144179
text1 = text1.substring(0, text1.length - commonlength);
145180
text2 = text2.substring(0, text2.length - commonlength);
@@ -187,13 +222,23 @@ diff_match_patch.prototype.diff_compute_ = function(text1, text2, checklines,
187222

188223
var longtext = text1.length > text2.length ? text1 : text2;
189224
var shorttext = text1.length > text2.length ? text2 : text1;
225+
var shortlength = shorttext.length;
190226
var i = longtext.indexOf(shorttext);
191227
if (i != -1) {
228+
// skip leading unpaired surrogate
229+
if (this.isLowSurrogate(longtext[i])) {
230+
shortlength--;
231+
i++;
232+
}
233+
// skip trailing unpaired surrogate
234+
if (this.isHighSurrogate(longtext[i + shortlength])) {
235+
shortlength--;
236+
}
192237
// Shorter text is inside the longer text (speedup).
193238
diffs = [new diff_match_patch.Diff(DIFF_INSERT, longtext.substring(0, i)),
194239
new diff_match_patch.Diff(DIFF_EQUAL, shorttext),
195240
new diff_match_patch.Diff(DIFF_INSERT,
196-
longtext.substring(i + shorttext.length))];
241+
longtext.substring(i + shortlength))];
197242
// Swap insertions for deletions if diff is reversed.
198243
if (text1.length > text2.length) {
199244
diffs[0][0] = diffs[2][0] = DIFF_DELETE;
@@ -439,6 +484,15 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
439484
*/
440485
diff_match_patch.prototype.diff_bisectSplit_ = function(text1, text2, x, y,
441486
deadline) {
487+
// backup if we split a surrogate
488+
if (
489+
x > 0 && x < text1.length && this.isLowSurrogate(text1[x]) &&
490+
y > 0 && y < text2.length && this.isLowSurrogate(text2[y])
491+
) {
492+
x--;
493+
y--;
494+
}
495+
442496
var text1a = text1.substring(0, x);
443497
var text2a = text2.substring(0, y);
444498
var text1b = text1.substring(x);
@@ -569,6 +623,12 @@ diff_match_patch.prototype.diff_commonPrefix = function(text1, text2) {
569623
}
570624
pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin);
571625
}
626+
627+
// shorten the prefix if it splits a surrogate
628+
if (pointermid > 0 && this.isHighSurrogate(text1[pointermid-1])) {
629+
pointermid--;
630+
}
631+
572632
return pointermid;
573633
};
574634

@@ -601,6 +661,12 @@ diff_match_patch.prototype.diff_commonSuffix = function(text1, text2) {
601661
}
602662
pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin);
603663
}
664+
665+
// shorten the suffix if it splits a surrogate
666+
if (pointermid < length - 1 && this.isLowSurrogate(text1[pointermid])) {
667+
pointermid++;
668+
}
669+
604670
return pointermid;
605671
};
606672

@@ -749,6 +815,24 @@ diff_match_patch.prototype.diff_halfMatch_ = function(text1, text2) {
749815
text1_b = hm[3];
750816
}
751817
var mid_common = hm[4];
818+
819+
// move forward to prevent splitting a surrogate pair
820+
if (mid_common.length > 0 && this.isLowSurrogate(mid_common[0])) {
821+
text1_a = text1_a + mid_common[0];
822+
text2_a = text2_a + mid_common[0];
823+
mid_common = mid_common.substring(1);
824+
}
825+
826+
// back up to prevent splitting a surrogate pair
827+
if (
828+
text1_b.length > 0 && this.isLowSurrogate(text1_b[0]) &&
829+
text2_b.length > 0 && this.isLowSurrogate(text2_b[0])
830+
) {
831+
text1_b = mid_common[mid_common.length - 1] + text1_b;
832+
text2_b = mid_common[mid_common.length - 1] + text2_b;
833+
mid_common = mid_common.substring(0, -1);
834+
}
835+
752836
return [text1_a, text1_b, text2_a, text2_b, mid_common];
753837
};
754838

javascript/tests/diff_match_patch_test.js

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,83 @@ function testDiffDelta() {
492492
// Convert delta string into a diff.
493493
assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));
494494

495+
(function(){
496+
const originalText = `U+1F17x 🅰️ 🅱️ 🅾️ 🅿️ safhawifhkw
497+
U+1F18x 🆎
498+
0 1 2 3 4 5 6 7 8 9 A B C D E F
499+
U+1F19x 🆑 🆒 🆓 🆔 🆕 🆖 🆗 🆘 🆙 🆚
500+
U+1F20x 🈁 🈂️ sfss.,_||saavvvbbds
501+
U+1F21x 🈚
502+
U+1F22x 🈯
503+
U+1F23x 🈲 🈳 🈴 🈵 🈶 🈷️ 🈸 🈹 🈺
504+
U+1F25x 🉐 🉑
505+
U+1F30x 🌀 🌁 🌂 🌃 🌄 🌅 🌆 🌇 🌈 🌉 🌊 🌋 🌌 🌍 🌎 🌏
506+
U+1F31x 🌐 🌑 🌒 🌓 🌔 🌕 🌖 🌗 🌘 🌙 🌚 🌛 🌜 🌝 🌞 `;
507+
508+
// applies some random edits to string and returns new, edited string
509+
function applyRandomTextEdit(text) {
510+
let textArr = [...text];
511+
let r = Math.random();
512+
if(r < 1/3) { // swap
513+
let swapCount = Math.floor(Math.random()*5);
514+
for(let i = 0; i < swapCount; i++) {
515+
let swapPos1 = Math.floor(Math.random()*textArr.length);
516+
let swapPos2 = Math.floor(Math.random()*textArr.length);
517+
let char1 = textArr[swapPos1];
518+
let char2 = textArr[swapPos2];
519+
textArr[swapPos1] = char2;
520+
textArr[swapPos2] = char1;
521+
}
522+
} else if(r < 2/3) { // remove
523+
let removeCount = Math.floor(Math.random()*5);
524+
for(let i = 0; i < removeCount; i++) {
525+
let removePos = Math.floor(Math.random()*textArr.length);
526+
textArr[removePos] = "";
527+
}
528+
} else { // add
529+
let addCount = Math.floor(Math.random()*5);
530+
for(let i = 0; i < addCount; i++) {
531+
let addPos = Math.floor(Math.random()*textArr.length);
532+
let addFromPos = Math.floor(Math.random()*textArr.length);
533+
textArr[addPos] = textArr[addPos] + textArr[addFromPos];
534+
}
535+
}
536+
return textArr.join("");
537+
}
538+
539+
for(let i = 0; i < 1000; i++) {
540+
newText = applyRandomTextEdit(originalText);
541+
dmp.diff_toDelta(dmp.diff_main(originalText, newText));
542+
}
543+
})();
544+
545+
// Unicode - splitting surrogates
546+
assertEquivalent(
547+
'+%F0%9F%85%B1\t=4',
548+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71'))
549+
);
550+
551+
assertEquivalent(
552+
'-2\t=4',
553+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
554+
);
555+
556+
assertEquivalent(
557+
'=2\t-2\t=2',
558+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
559+
);
560+
561+
// Different versions of the library may have created deltas with
562+
// half of a surrogate pair encoded as if it were valid UTF-8
563+
try {
564+
assertEquivalent(
565+
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')),
566+
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1'))
567+
);
568+
} catch ( e ) {
569+
assertEquals('Decode UTF8-encoded surrogate half', 'crashed');
570+
}
571+
495572
// Verify pool of unchanged characters.
496573
diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
497574
var text2 = dmp.diff_text2(diffs);
@@ -963,4 +1040,4 @@ function testPatchApply() {
9631040
patches = dmp.patch_make('y', 'y123');
9641041
results = dmp.patch_apply(patches, 'x');
9651042
assertEquivalent(['x123', [true]], results);
966-
}
1043+
}

0 commit comments

Comments
 (0)