Prevent splitting surrogate pairs when diffing

dmsnell · dmsnell · commit 550130a13ebb · 2020-04-02T01:33:02.000-07:00
See google#69 In this patch I'm trying to follow the approach taken by @judofyr but starting from the top of the script and going through, auditing every place that perform string operations that split, index, or otherwise operate on a character level so that we can make sure that we don't split surrogate pairs. This contrasts with [attempt one] where I created a custom iterator for strings. Surprisingly I found this more "ad-hoc" approach easier to manage since it doesn't create a split universe of string/Unicode. As of this commit I haven't audited the cleanup functions but my own tests are passing so I'm given to believe that they might be safe. I have my own doubts that this is sound work and that the middle-snake algorithm might find the wrong snake when presented with variable-width characters.
diff --git a/javascript/diff_match_patch_uncompressed.js b/javascript/diff_match_patch_uncompressed.js
@@ -88,6 +88,35 @@ diff_match_patch.Diff.prototype.toString = function() {
   return this[0] + ',' + this[1];
 };
 
+diff_match_patch.prototype.isHighSurrogate = function(c) {
+  var v = c.charCodeAt(0);
+  return v >= 0xD800 && v <= 0xDBFF;
+}
+
+diff_match_patch.prototype.isLowSurrogate = function(c) {
+  var v = c.charCodeAt(0);
+  return v >= 0xDC00 && v <= 0xDFFF;
+}
+
+diff_match_patch.prototype.scalarValues = function(str) {
+  var length = str.length;
+  var scalars = [];
+
+  for (var i = 0; i < length; i++) {
+    var scalar = str[i];
+
+    // proper surrogate pairs will come through as the whole scalar value
+    // but if the pairs are broken they will be passed-through unaltered
+    if (i < length - 1 && this.isHighSurrogate(scalar) && this.isLowSurrogate(str[i+1])) {
+      scalar += str[i+1];
+      i++;
+    }
+
+    scalars.push(scalar);
+  }
+
+  return scalars;
+}
 
 /**
  * Find the differences between two texts.  Simplifies the problem by stripping
@@ -134,12 +163,18 @@ diff_match_patch.prototype.diff_main = function(text1, text2, opt_checklines,
 
   // Trim off common prefix (speedup).
   var commonlength = this.diff_commonPrefix(text1, text2);
+  if (commonlength > 0 && this.isHighSurrogate(text1[commonlength - 1])) {
+    commonlength--;
+  }
   var commonprefix = text1.substring(0, commonlength);
   text1 = text1.substring(commonlength);
   text2 = text2.substring(commonlength);
 
   // Trim off common suffix (speedup).
   commonlength = this.diff_commonSuffix(text1, text2);
+  if (commonlength > 0 && this.isLowSurrogate(text1[text1.length - commonlength])) {
+    commonlength--;
+  }
   var commonsuffix = text1.substring(text1.length - commonlength);
   text1 = text1.substring(0, text1.length - commonlength);
   text2 = text2.substring(0, text2.length - commonlength);
@@ -187,13 +222,23 @@ diff_match_patch.prototype.diff_compute_ = function(text1, text2, checklines,
 
   var longtext = text1.length > text2.length ? text1 : text2;
   var shorttext = text1.length > text2.length ? text2 : text1;
+  var shortlength = shorttext.length;
   var i = longtext.indexOf(shorttext);
   if (i != -1) {
+    // skip leading unpaired surrogate
+    if (this.isLowSurrogate(longtext[i])) {
+      shortlength--;
+      i++;
+    }
+    // skip trailing unpaired surrogate
+    if (this.isHighSurrogate(longtext[i + shortlength])) {
+      shortlength--;
+    }
     // Shorter text is inside the longer text (speedup).
     diffs = [new diff_match_patch.Diff(DIFF_INSERT, longtext.substring(0, i)),
              new diff_match_patch.Diff(DIFF_EQUAL, shorttext),
              new diff_match_patch.Diff(DIFF_INSERT,
-                 longtext.substring(i + shorttext.length))];
+                 longtext.substring(i + shortlength))];
     // Swap insertions for deletions if diff is reversed.
     if (text1.length > text2.length) {
       diffs[0][0] = diffs[2][0] = DIFF_DELETE;
@@ -439,6 +484,15 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
  */
 diff_match_patch.prototype.diff_bisectSplit_ = function(text1, text2, x, y,
     deadline) {
+  // backup if we split a surrogate
+  if (
+      x > 0 && x < text1.length && this.isLowSurrogate(text1[x]) &&
+      y > 0 && y < text2.length && this.isLowSurrogate(text2[y])
+  ) {
+    x--;
+    y--;
+  }
+
   var text1a = text1.substring(0, x);
   var text2a = text2.substring(0, y);
   var text1b = text1.substring(x);
@@ -569,6 +623,12 @@ diff_match_patch.prototype.diff_commonPrefix = function(text1, text2) {
     }
     pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin);
   }
+
+  // shorten the prefix if it splits a surrogate
+  if (pointermid > 0 && this.isHighSurrogate(text1[pointermid-1])) {
+    pointermid--;
+  }
+
   return pointermid;
 };
 
@@ -601,6 +661,12 @@ diff_match_patch.prototype.diff_commonSuffix = function(text1, text2) {
     }
     pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin);
   }
+
+  // shorten the suffix if it splits a surrogate
+  if (pointermid < length - 1 && this.isLowSurrogate(text1[pointermid])) {
+    pointermid++;
+  }
+
   return pointermid;
 };
 
@@ -749,6 +815,24 @@ diff_match_patch.prototype.diff_halfMatch_ = function(text1, text2) {
     text1_b = hm[3];
   }
   var mid_common = hm[4];
+
+  // move forward to prevent splitting a surrogate pair
+  if (mid_common.length > 0 && this.isLowSurrogate(mid_common[0])) {
+    text1_a = text1_a + mid_common[0];
+    text2_a = text2_a + mid_common[0];
+    mid_common = mid_common.substring(1);
+  }
+
+  // back up to prevent splitting a surrogate pair
+  if (
+    text1_b.length > 0 && this.isLowSurrogate(text1_b[0]) &&
+    text2_b.length > 0 && this.isLowSurrogate(text2_b[0])
+  ) {
+    text1_b = mid_common[mid_common.length - 1] + text1_b;
+    text2_b = mid_common[mid_common.length - 1] + text2_b;
+    mid_common = mid_common.substring(0, -1);
+  }
+
   return [text1_a, text1_b, text2_a, text2_b, mid_common];
 };
 
diff --git a/javascript/tests/diff_match_patch_test.js b/javascript/tests/diff_match_patch_test.js
@@ -492,6 +492,83 @@ function testDiffDelta() {
   // Convert delta string into a diff.
   assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));
 
+  (function(){
+    const originalText = `U+1F17x	🅰️	🅱️		🅾️	🅿️ safhawifhkw
+    U+1F18x															🆎	
+    0	1	2	3	4	5	6	7	8	9	A	B	C	D	E	F
+    U+1F19x		🆑	🆒	🆓	🆔	🆕	🆖	🆗	🆘	🆙	🆚					
+    U+1F20x		🈁	🈂️							sfss.,_||saavvvbbds						
+    U+1F21x	🈚					
+    U+1F22x			🈯
+    U+1F23x			🈲	🈳	🈴	🈵	🈶	🈷️	🈸	🈹	🈺					
+    U+1F25x	🉐	🉑		
+    U+1F30x	🌀	🌁	🌂	🌃	🌄	🌅	🌆	🌇	🌈	🌉	🌊	🌋	🌌	🌍	🌎	🌏
+    U+1F31x	🌐	🌑	🌒	🌓	🌔	🌕	🌖	🌗	🌘	🌙	🌚	🌛	🌜	🌝	🌞	`;
+
+    // applies some random edits to string and returns new, edited string
+    function applyRandomTextEdit(text) {
+      let textArr = [...text];
+      let r = Math.random();
+      if(r < 1/3) { // swap
+      let swapCount = Math.floor(Math.random()*5);
+        for(let i = 0; i < swapCount; i++) {
+        let swapPos1 = Math.floor(Math.random()*textArr.length);
+          let swapPos2 = Math.floor(Math.random()*textArr.length);
+          let char1 = textArr[swapPos1];
+          let char2 = textArr[swapPos2];
+          textArr[swapPos1] = char2;
+          textArr[swapPos2] = char1;
+        }
+      } else if(r < 2/3) { // remove
+        let removeCount = Math.floor(Math.random()*5);
+        for(let i = 0; i < removeCount; i++) {
+          let removePos = Math.floor(Math.random()*textArr.length);
+          textArr[removePos] = "";
+        }
+      } else { // add
+        let addCount = Math.floor(Math.random()*5);
+        for(let i = 0; i < addCount; i++) {
+          let addPos = Math.floor(Math.random()*textArr.length);
+          let addFromPos = Math.floor(Math.random()*textArr.length);
+          textArr[addPos] = textArr[addPos] + textArr[addFromPos];
+        }
+      }
+      return textArr.join("");
+    }
+
+    for(let i = 0; i < 1000; i++) {
+      newText = applyRandomTextEdit(originalText);
+      dmp.diff_toDelta(dmp.diff_main(originalText, newText));
+    }
+  })();
+
+  // Unicode - splitting surrogates
+  assertEquivalent(
+    '+%F0%9F%85%B1\t=4',
+    dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71'))
+  );
+
+  assertEquivalent(
+    '-2\t=4',
+    dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
+  );
+
+  assertEquivalent(
+    '=2\t-2\t=2',
+    dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
+  );
+
+  // Different versions of the library may have created deltas with
+  // half of a surrogate pair encoded as if it were valid UTF-8
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')),
+      dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1'))
+    );
+  } catch ( e ) {
+    assertEquals('Decode UTF8-encoded surrogate half', 'crashed');
+  }
+
   // Verify pool of unchanged characters.
   diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
   var text2 = dmp.diff_text2(diffs);
@@ -963,4 +1040,4 @@ function testPatchApply() {
   patches = dmp.patch_make('y', 'y123');
   results = dmp.patch_apply(patches, 'x');
   assertEquivalent(['x123', [true]], results);
-}
+}