Put escape sequences into separate token

rlidwka · rlidwka · commit 75037c6514e9 · 2022-04-15T20:02:47.000+03:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+
+## [13.0.0] - WIP
+### Added
+- Added a new token type `text_special` to store escaped characters, same as `text` but
+  unaffected by replacement plugins (smartquotes, typographer, linkifier, etc.).
+- Added a new rule `text_join` in `core` ruler. Text replacement plugins may choose to
+  insert themselves before it.
+
+### Changed
+- `text_collapse` rule is renamed to `fragments_join`.
+
+### Fixed
+- Smartquotes, typographic replacements and plain text links can now be escaped
+  with backslash (e.g. `\(c)` or `google\.com` are no longer replaced).
+
+
 ## [12.3.2] - 2022-01-08
 ### Security
 - Fix possible ReDOS in newline rule. Thanks to @MakeNowJust.
@@ -592,6 +608,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Renamed presets folder (configs -> presets).
 
 
+[13.0.0]: https://github.com/markdown-it/markdown-it/compare/12.3.2...13.0.0
 [12.3.2]: https://github.com/markdown-it/markdown-it/compare/12.3.1...12.3.2
 [12.3.1]: https://github.com/markdown-it/markdown-it/compare/12.3.0...12.3.1
 [12.3.0]: https://github.com/markdown-it/markdown-it/compare/12.2.0...12.3.0
diff --git a/lib/parser_core.js b/lib/parser_core.js
@@ -16,7 +16,10 @@ var _rules = [
   [ 'inline',         require('./rules_core/inline')         ],
   [ 'linkify',        require('./rules_core/linkify')        ],
   [ 'replacements',   require('./rules_core/replacements')   ],
-  [ 'smartquotes',    require('./rules_core/smartquotes')    ]
+  [ 'smartquotes',    require('./rules_core/smartquotes')    ],
+  // `text_join` finds `text_special` tokens (for escape sequences)
+  // and joins them with the rest of the text
+  [ 'text_join',      require('./rules_core/text_join')      ]
 ];
 
 
diff --git a/lib/parser_inline.js b/lib/parser_inline.js
@@ -26,11 +26,18 @@ var _rules = [
   [ 'entity',          require('./rules_inline/entity') ]
 ];
 
+// `rules2` ruleset was created specifically for emphasis/strikethrough
+// post-processing and may be changed in the future.
+//
+// Don't use this for anything except pairs (plugins working with `balance_pairs`).
+//
 var _rules2 = [
   [ 'balance_pairs',   require('./rules_inline/balance_pairs') ],
   [ 'strikethrough',   require('./rules_inline/strikethrough').postProcess ],
   [ 'emphasis',        require('./rules_inline/emphasis').postProcess ],
-  [ 'text_collapse',   require('./rules_inline/text_collapse') ]
+  // rules for pairs separate '**' into its own text tokens, which may be left unused,
+  // rule below merges unused segments back with the rest of the text
+  [ 'fragments_join',  require('./rules_inline/fragments_join') ]
 ];
 
 
diff --git a/lib/presets/commonmark.js b/lib/presets/commonmark.js
@@ -38,7 +38,8 @@ module.exports = {
       rules: [
         'normalize',
         'block',
-        'inline'
+        'inline',
+        'text_join'
       ]
     },
 
@@ -73,7 +74,7 @@ module.exports = {
       rules2: [
         'balance_pairs',
         'emphasis',
-        'text_collapse'
+        'fragments_join'
       ]
     }
   }
diff --git a/lib/presets/zero.js b/lib/presets/zero.js
@@ -39,7 +39,8 @@ module.exports = {
       rules: [
         'normalize',
         'block',
-        'inline'
+        'inline',
+        'text_join'
       ]
     },
 
@@ -55,7 +56,7 @@ module.exports = {
       ],
       rules2: [
         'balance_pairs',
-        'text_collapse'
+        'fragments_join'
       ]
     }
   }
diff --git a/lib/rules_core/text_join.js b/lib/rules_core/text_join.js
@@ -0,0 +1,45 @@
+// Join `text_special` tokens (escape sequences) with the rest of the text
+//
+// This is set as a separate rule to provide an opportunity for plugins to run text
+// replacements after inline fragments are joined, but before escapes are merged into text.
+//
+// For example, `\:)` shouldn't be replaced with an emoji.
+//
+'use strict';
+
+
+module.exports = function text_join(state) { // core rule: merges escape tokens into surrounding text
+  var j, l, tokens, curr, max, last,
+      blockTokens = state.tokens;
+
+  for (j = 0, l = blockTokens.length; j < l; j++) {
+    if (blockTokens[j].type !== 'inline') continue; // only the children of `inline` tokens are processed
+
+    tokens = blockTokens[j].children;
+    max = tokens.length;
+
+    for (curr = 0; curr < max; curr++) { // pass 1: retype every `text_special` child as plain `text`
+      if (tokens[curr].type === 'text_special') {
+        tokens[curr].type = 'text';
+      }
+    }
+
+    for (curr = last = 0; curr < max; curr++) { // pass 2: compact in place; `last` is the write index
+      if (tokens[curr].type === 'text' &&
+          curr + 1 < max &&
+          tokens[curr + 1].type === 'text') {
+
+        // collapse two adjacent text nodes: content is prepended to the next token, this one is dropped
+        tokens[curr + 1].content = tokens[curr].content + tokens[curr + 1].content;
+      } else {
+        if (curr !== last) { tokens[last] = tokens[curr]; } // shift kept token down over dropped slots
+
+        last++;
+      }
+    }
+
+    if (curr !== last) { // curr === max here, so this holds only when at least one token was merged
+      tokens.length = last; // truncate the stale tail left behind by compaction
+    }
+  }
+};
diff --git a/lib/rules_inline/escape.js b/lib/rules_inline/escape.js
@@ -13,40 +13,59 @@ for (var i = 0; i < 256; i++) { ESCAPED.push(0); }
 
 
 module.exports = function escape(state, silent) {
-  var ch, pos = state.pos, max = state.posMax;
-
-  if (state.src.charCodeAt(pos) !== 0x5C/* \ */) { return false; }
+  var ch1, ch2, origStr, escapedStr, token, pos = state.pos, max = state.posMax;
 
+  if (state.src.charCodeAt(pos) !== 0x5C/* \ */) return false;
   pos++;
 
-  if (pos < max) {
-    ch = state.src.charCodeAt(pos);
+  // '\' at the end of the inline block
+  if (pos >= max) return false;
+
+  ch1 = state.src.charCodeAt(pos);
 
-    if (ch < 256 && ESCAPED[ch] !== 0) {
-      if (!silent) { state.pending += state.src[pos]; }
-      state.pos += 2;
-      return true;
+  if (ch1 === 0x0A) {
+    if (!silent) {
+      state.push('hardbreak', 'br', 0);
     }
 
-    if (ch === 0x0A) {
-      if (!silent) {
-        state.push('hardbreak', 'br', 0);
-      }
+    pos++;
+    // skip leading whitespace from next line
+    while (pos < max) {
+      ch1 = state.src.charCodeAt(pos);
+      if (!isSpace(ch1)) break;
+      pos++;
+    }
+
+    state.pos = pos;
+    return true;
+  }
+
+  escapedStr = state.src[pos];
 
+  if (ch1 >= 0xD800 && ch1 <= 0xDBFF && pos + 1 < max) {
+    ch2 = state.src.charCodeAt(pos + 1);
+
+    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+      escapedStr += state.src[pos + 1];
       pos++;
-      // skip leading whitespaces from next line
-      while (pos < max) {
-        ch = state.src.charCodeAt(pos);
-        if (!isSpace(ch)) { break; }
-        pos++;
-      }
-
-      state.pos = pos;
-      return true;
     }
   }
 
-  if (!silent) { state.pending += '\\'; }
-  state.pos++;
+  origStr = '\\' + escapedStr;
+
+  if (!silent) {
+    token = state.push('text_special', '', 0);
+
+    if (ch1 < 256 && ESCAPED[ch1] !== 0) {
+      token.content = escapedStr;
+    } else {
+      token.content = origStr;
+    }
+
+    token.markup = origStr;
+    token.info   = 'escape';
+  }
+
+  state.pos = pos + 1;
   return true;
 };
diff --git a/lib/rules_inline/fragments_join.js b/lib/rules_inline/fragments_join.js
@@ -9,7 +9,7 @@
 'use strict';
 
 
-module.exports = function text_collapse(state) {
+module.exports = function fragments_join(state) {
   var curr, last,
       level = 0,
       tokens = state.tokens,
diff --git a/test/fixtures/markdown-it/smartquotes.txt b/test/fixtures/markdown-it/smartquotes.txt
@@ -164,3 +164,16 @@ Should parse quotes adjacent to inline html, #677:
 <p>“test <br>”</p>
 <p>“<br> test”</p>
 .
+
+Should be escapable:
+.
+"foo"
+
+\"foo"
+
+"foo\"
+.
+<p>“foo”</p>
+<p>&quot;foo&quot;</p>
+<p>&quot;foo&quot;</p>
+.
diff --git a/test/fixtures/markdown-it/typographer.txt b/test/fixtures/markdown-it/typographer.txt
@@ -60,6 +60,13 @@ dupes
 <p>!!! ??? ,</p>
 .
 
+copyright should be escapable
+.
+\(c)
+.
+<p>(c)</p>
+.
+
 
 dashes
 .
@@ -80,6 +87,16 @@ markdownit--awesome
 <p>markdownit–awesome</p>
 .
 
+dashes should be escapable
+.
+foo \-- bar
+
+foo -\- bar
+.
+<p>foo -- bar</p>
+<p>foo -- bar</p>
+.
+
 regression tests for #624
 .
 1---2---3
diff --git a/test/misc.js b/test/misc.js
@@ -254,6 +254,14 @@ describe('Misc', function () {
       md.render('# test\n\n - hello\n - world\n')
     );
   });
+
+  it('Should escape surrogate pairs (coverage)', function () {
+    var md = markdownit();
+
+    assert.strictEqual(md.render('\\\uD835\uDC9C'), '<p>\\\uD835\uDC9C</p>\n');
+    assert.strictEqual(md.render('\\\uD835x'), '<p>\\\uD835x</p>\n');
+    assert.strictEqual(md.render('\\\uD835'), '<p>\\\uD835</p>\n');
+  });
 });
 
 

Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,8 @@ module.exports = {`
`38`	`38`	`rules: [`
`39`	`39`	`'normalize',`
`40`	`40`	`'block',`
`41`		`- 'inline'`
	`41`	`+ 'inline',`
	`42`	`+ 'text_join'`
`42`	`43`	`]`
`43`	`44`	`},`
`44`	`45`
`@@ -73,7 +74,7 @@ module.exports = {`
`73`	`74`	`rules2: [`
`74`	`75`	`'balance_pairs',`
`75`	`76`	`'emphasis',`
`76`		`- 'text_collapse'`
	`77`	`+ 'fragments_join'`
`77`	`78`	`]`
`78`	`79`	`}`
`79`	`80`	`}`
Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,8 @@ module.exports = {`
`39`	`39`	`rules: [`
`40`	`40`	`'normalize',`
`41`	`41`	`'block',`
`42`		`- 'inline'`
	`42`	`+ 'inline',`
	`43`	`+ 'text_join'`
`43`	`44`	`]`
`44`	`45`	`},`
`45`	`46`
`@@ -55,7 +56,7 @@ module.exports = {`
`55`	`56`	`],`
`56`	`57`	`rules2: [`
`57`	`58`	`'balance_pairs',`
`58`		`- 'text_collapse'`
	`59`	`+ 'fragments_join'`
`59`	`60`	`]`
`60`	`61`	`}`
`61`	`62`	`}`