From 06c3206b85c310a61328f59f68619245a1cfc601 Mon Sep 17 00:00:00 2001 From: Mikkel Kjeldsen Date: Thu, 2 Nov 2023 19:35:13 +0100 Subject: [PATCH 1/3] Implement limited supported for fenced code blocks This is a functional implementation of a subset of CommonMark's fenced code blocks with backtick or tilde code fences: ```py print("Hello, fenced code block") ``` is now functionally equivalent to print("Hello, fenced code block") The implementation disregards some irrelevant strictness such as "infostring" and trailing character limitations, and because commitmsgfmt has no block scoping it is not possible to implement an implicit closing code fence when the enclosing block ends. Fenced code blocks tend to be used sparingly; generally much less than indented code blocks, which already are a small fraction of total commits. The two tables below show the total and relative commit counts according to $ git log --oneline --grep "$PATTERN" | wc -l for Linux, Git, and 8 other arbitrarily selected projects with >=10,000 commits: PROJECT | PATTERN | '' | '^ ' | '^ *```' | '^ *~~~' -------------------------------------+------------+----------+-----------+--------- ansible/ansible@v2.11.0 | 51,316 | 472 | 156 | 4 apache/lucene@releases/lucene/9.8.0 | 36,461 | 49 | 4 | 0 apache/maven@maven-3.9.5 | 10,931 | 34 | 1 | 0 apache/spark@v3.5.0 | 37,754 | 2,258 | 6,478 | 28 apache/tomcat@10.1.15 | 25,316 | 31 | 1 | 0 git/git@v2.42.0 | 70,876 | 3,250 | 8 | 9 postgres/postgres@REL_16_0 | 56,562 | 672 | 0 | 0 puppetlabs/puppet@8.3.0 | 34,657 | 594 | 85 | 1 spring-project/spring-boot@v3.1.5 | 44,688 | 40 | 5 | 0 torvalds/linux@v6.6 | 1,217,245 | 55,197 | 154 | 423 PROJECT | PATTERN | '' | '^ ' | '^ *```' | '^ *~~~' -------------------------------------+------------+----------+-----------+--------- a/ansible | 100.00% | 0.92% | 0.30% | 0.01% a/lucene | 100.00% | 0.13% | 0.01% | 0.00% a/maven | 100.00% | 0.31% | 0.01% | 0.00% a/spark | 100.00% | 5.98% | 17.16% | 0.07% a/tomcat | 100.00% | 
0.12% | 0.00% | 0.00% g/git | 100.00% | 4.59% | 0.01% | 0.01% p/postgres | 100.00% | 1.19% | 0.00% | 0.00% p/puppet | 100.00% | 1.71% | 0.25% | 0.00% s/spring-boot | 100.00% | 0.09% | 0.01% | 0.00% t/linux | 100.00% | 4.53% | 0.01% | 0.03% The tables show considerable deviation in overall code block usage, small deviation in backtick fenced code block usage, and fairly consistent tilde fenced code block (non-)usage. Apache Spark is a wild outlier in every code block dimension but spot checks suggest the stated numbers are not false positives, and further indicate that Apache Spark relies on squash-merging GitHub pull requests for integrating code -- exactly the use-case fenced code blocks is expected to support well. References: https://gitlab.com/mkjeldsen/commitmsgfmt/-/issues/7 References: https://spec.commonmark.org/0.30/#fenced-code-blocks --- src/commitmsgfmt.rs | 81 +++- src/parser.rs | 912 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 992 insertions(+), 1 deletion(-) diff --git a/src/commitmsgfmt.rs b/src/commitmsgfmt.rs index e862fc1..ec560b2 100644 --- a/src/commitmsgfmt.rs +++ b/src/commitmsgfmt.rs @@ -37,7 +37,8 @@ impl CommitMsgFmt { fn reflow_into(&self, buf: &mut String, msg: &[Token]) { for tok in msg { match *tok { - BlockQuote(s) | Comment(s) | Literal(s) | Scissored(s) | Trailer(s) => { + BlockQuote(s) | Comment(s) | FencedCodeBlock(s) | Literal(s) | Scissored(s) + | Trailer(s) => { buf.push_str(s); } ListItem(ref indent, ref li, ref s) => { @@ -214,6 +215,84 @@ foo assert_eq!(filter(72, &input), expected); } + #[test] + fn preserves_fenced_code_block() { + let input = " +foo + +``` +backtick +fenced +code +block +``` + +~~~ +tilde +fenced +code +block +~~~ +"; + + let expected = " +foo + +``` +backtick +fenced +code +block +``` + +~~~ +tilde +fenced +code +block +~~~ +"; + + assert_eq!(filter(72, &input), expected); + } + + #[test] + fn preserves_fenced_code_block_interrupting_paragraph() { + let input = " +foo + +a +``` 
+backtick +``` +b + +c +~~~ +tilde +~~~ +d +"; + + let expected = " +foo + +a +``` +backtick +``` +b + +c +~~~ +tilde +~~~ +d +"; + + assert_eq!(filter(72, &input), expected); + } + #[test] fn preserves_block_quote() { let input = " diff --git a/src/parser.rs b/src/parser.rs index 304fbfa..59a3402 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -12,6 +12,7 @@ type ReflowableStrBuf<'input> = Cow<'input, str>; #[derive(Debug, PartialEq, Eq)] pub enum Token<'input> { Comment(&'input str), + FencedCodeBlock(&'input str), Footnote(&'input str, ReflowableStrBuf<'input>), ListItem( ListIndent<'input>, @@ -27,6 +28,24 @@ pub enum Token<'input> { VerticalSpace, } +// Token::FencedCodeBlock is the first block construct that can interrupt another block construct; +// that cannot be identified purely from either the current line or the previous token; and that +// may not be wrapped. That means there are valid situations we cannot represent without more +// sophisticated state management. We also don't want to needlessly extend a String. We'll just +// track the opening code fence, match it against closing fences, and treat the entire block as a +// glorified Token::Literal sequence. We accept the loss of composability. 
+struct CodeFence<'input>(&'input str); + +impl CodeFence<'_> { + fn is_closed_by(&self, line: &str) -> bool { + // "until a closing code fence of the same type as the code block began with (backticks or + // tildes), and with at least as many backticks or tildes as the opening code fence" + line_as_code_fence(line) + .map(|fence| fence.0.starts_with(self.0)) + .unwrap_or(false) + } +} + pub fn parse(input: &str, comment_char: char) -> Vec { let mut toks = Vec::new(); @@ -34,9 +53,18 @@ pub fn parse(input: &str, comment_char: char) -> Vec { let mut has_scissors = false; let lines = input.lines(); let mut px = false; + let mut in_code_fence: Option = None; for line in lines { if has_scissors { toks.push(Token::Scissored(line)); + } else if let Some(ref fence) = in_code_fence { + toks.push(Token::FencedCodeBlock(line)); + if fence.is_closed_by(line) { + in_code_fence = None; + } + } else if let Some(fence) = line_as_code_fence(line) { + toks.push(Token::FencedCodeBlock(line)); + in_code_fence = Some(fence); } else if line.starts_with(comment_char) { let t = if &line[1..] 
== " ------------------------ >8 ------------------------" { has_scissors = true; @@ -317,6 +345,70 @@ fn line_as_list_item(line: &str) -> Option { }) } +fn line_as_code_fence(line: &'_ str) -> Option { + enum FenceState { + New, + IndentSp1, + IndentSp2, + IndentSp3, + Backtick, + Tilde, + } + + let mut fence_state = FenceState::New; + let mut ix_fence_start = 0; + let mut fence_length = 0; + let mut tally = || fence_length += 1; + // https://spec.commonmark.org/0.30/#fenced-code-blocks + for (ix, c) in line.char_indices() { + match fence_state { + FenceState::New + | FenceState::IndentSp1 + | FenceState::IndentSp2 + | FenceState::IndentSp3 => { + ix_fence_start = ix; + // "preceded by up to three spaces of indentation" + fence_state = match c { + ' ' => match fence_state { + FenceState::New => FenceState::IndentSp1, + FenceState::IndentSp1 => FenceState::IndentSp2, + FenceState::IndentSp2 => FenceState::IndentSp3, + _ => break, + }, + '`' => { + tally(); + FenceState::Backtick + } + '~' => { + tally(); + FenceState::Tilde + } + _ => break, + }; + } + // "Tildes and backticks cannot be mixed." 
+ FenceState::Backtick => match c { + '`' => tally(), + _ => break, + }, + FenceState::Tilde => match c { + '~' => tally(), + _ => break, + }, + } + } + + // "at least three consecutive backtick characters (`) or tildes (~)" + if fence_length >= 3 { + let ix_end = ix_fence_start + fence_length; + debug_assert!(ix_end <= line.len()); + let fence = &line[ix_fence_start..ix_end]; + Some(CodeFence(fence)) + } else { + None + } +} + fn line_as_line_block_quote(line: &str) -> Option { if line.starts_with('>') { Some(Token::BlockQuote(line)) @@ -677,6 +769,826 @@ some other paragraph ); } + #[test] + fn parses_codefence_backtick_verbatim() { + let input = " +subject + +``` +backtick +``` + + ``` + backtick + ``` + + ``` + backtick + ``` + + ``` + backtick + ``` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("```"), + FencedCodeBlock("backtick"), + FencedCodeBlock("```"), + VerticalSpace, + FencedCodeBlock(" ```"), + FencedCodeBlock(" backtick"), + FencedCodeBlock(" ```"), + VerticalSpace, + FencedCodeBlock(" ```"), + FencedCodeBlock(" backtick"), + FencedCodeBlock(" ```"), + VerticalSpace, + FencedCodeBlock(" ```"), + FencedCodeBlock(" backtick"), + FencedCodeBlock(" ```"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_indented_aligned_4sp_not_fenced_code_block() { + let input = " +subject + + ``` + backtick + ``` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Literal(" ```"), + Literal(" backtick"), + Literal(" ```"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_indented_unaligned() { + let input = " +subject + + ``` +backtick 1 0 +``` + ``` +backtick 2 0 +``` + ``` +backtick 3 0 +``` +``` +backtick 0 1 + ``` +``` +backtick 0 2 + ``` +``` +backtick 0 3 + ``` + ``` +backtick 2 1 + ``` + ``` +backtick 3 2 + ``` +"; + + let expected = 
vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock(" ```"), + FencedCodeBlock("backtick 1 0"), + FencedCodeBlock("```"), + FencedCodeBlock(" ```"), + FencedCodeBlock("backtick 2 0"), + FencedCodeBlock("```"), + FencedCodeBlock(" ```"), + FencedCodeBlock("backtick 3 0"), + FencedCodeBlock("```"), + FencedCodeBlock("```"), + FencedCodeBlock("backtick 0 1"), + FencedCodeBlock(" ```"), + FencedCodeBlock("```"), + FencedCodeBlock("backtick 0 2"), + FencedCodeBlock(" ```"), + FencedCodeBlock("```"), + FencedCodeBlock("backtick 0 3"), + FencedCodeBlock(" ```"), + FencedCodeBlock(" ```"), + FencedCodeBlock("backtick 2 1"), + FencedCodeBlock(" ```"), + FencedCodeBlock(" ```"), + FencedCodeBlock("backtick 3 2"), + FencedCodeBlock(" ```"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_fence_extra_long() { + let input = " +subject + +``` +backtick 3 4 +```` +```` +backtick 4 5 +````` +````` +backtick 5 6 +`````` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("```"), + FencedCodeBlock("backtick 3 4"), + FencedCodeBlock("````"), + FencedCodeBlock("````"), + FencedCodeBlock("backtick 4 5"), + FencedCodeBlock("`````"), + FencedCodeBlock("`````"), + FencedCodeBlock("backtick 5 6"), + FencedCodeBlock("``````"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_fence_too_short_not_fenced_code_block() { + let input = " +subject + +`` +backtick +`` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Paragraph("`` backtick ``".into()), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_with_infostring() { + let input = " +subject + +```info +backtick info no leading ws +``` + +``` info +backtick info leading sp +``` + +``` info +backtick info leading tab +``` + 
+```info` +backtick info accept illegal info with backtick +``` + +```info~ +backtick info accept legal info with tilde +``` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("```info"), + FencedCodeBlock("backtick info no leading ws"), + FencedCodeBlock("```"), + VerticalSpace, + FencedCodeBlock("``` info"), + FencedCodeBlock("backtick info leading sp"), + FencedCodeBlock("```"), + VerticalSpace, + FencedCodeBlock("```\tinfo"), + FencedCodeBlock("backtick info leading tab"), + FencedCodeBlock("```"), + VerticalSpace, + FencedCodeBlock("```info`"), + FencedCodeBlock("backtick info accept illegal info with backtick"), + FencedCodeBlock("```"), + VerticalSpace, + FencedCodeBlock("```info~"), + FencedCodeBlock("backtick info accept legal info with tilde"), + FencedCodeBlock("```"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_can_interrupt_paragraph() { + let input = " +subject + +a +``` +backtick +``` +b +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Paragraph("a".into()), + FencedCodeBlock("```"), + FencedCodeBlock("backtick"), + FencedCodeBlock("```"), + Paragraph("b".into()), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_fence_unmatched_length() { + let input = " +subject + +```` +backtick +``` + +backtick + +`` +backtick +```` +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("````"), + FencedCodeBlock("backtick"), + FencedCodeBlock("```"), + FencedCodeBlock(""), + FencedCodeBlock("backtick"), + FencedCodeBlock(""), + FencedCodeBlock("``"), + FencedCodeBlock("backtick"), + FencedCodeBlock("````"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_backtick_unterminated() { + let input = " +subject + +``` +backtick +"; + 
+ let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("```"), + FencedCodeBlock("backtick"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn bug_parses_codefence_backtick_enclosed_in_block_quote() { + let input = " +subject + +> a +> ``` +> backtick +> ``` +> b + +> c +> ``` +> backtick +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + BlockQuote("> a"), + BlockQuote("> ```"), + BlockQuote("> backtick"), + BlockQuote("> ```"), + BlockQuote("> b"), + VerticalSpace, + BlockQuote("> c"), + BlockQuote("> ```"), + BlockQuote("> backtick"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn bug_parses_codefence_backtick_enclosed_in_list_item() { + let input = " +subject + +- a + ``` + backtick + ``` + b + +- c + ``` + backtick +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + ListItem(ListIndent(""), ListType("- "), "a".into()), + FencedCodeBlock(" ```"), + FencedCodeBlock(" backtick"), + FencedCodeBlock(" ```"), + Paragraph("b".into()), + VerticalSpace, + ListItem(ListIndent(""), ListType("- "), "c".into()), + FencedCodeBlock(" ```"), + FencedCodeBlock(" backtick"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_tilde_verbatim() { + let input = " +subject + +~~~ +tilde +~~~ + + ~~~ + tilde + ~~~ + + ~~~ + tilde + ~~~ + + ~~~ + tilde + ~~~ +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("~~~"), + FencedCodeBlock("tilde"), + FencedCodeBlock("~~~"), + VerticalSpace, + FencedCodeBlock(" ~~~"), + FencedCodeBlock(" tilde"), + FencedCodeBlock(" ~~~"), + VerticalSpace, + FencedCodeBlock(" ~~~"), + FencedCodeBlock(" tilde"), + FencedCodeBlock(" ~~~"), + VerticalSpace, + FencedCodeBlock(" ~~~"), + FencedCodeBlock(" tilde"), + FencedCodeBlock(" ~~~"), + ]; + + let 
actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_tilde_indented_aligned_4sp_not_fenced_code_block() { + let input = " +subject + + ~~~ + tilde + ~~~ +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Literal(" ~~~"), + Literal(" tilde"), + Literal(" ~~~"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_tilde_indented_unaligned() { + let input = " +subject + + ~~~ +tilde 1 0 +~~~ + ~~~ +tilde 2 0 +~~~ + ~~~ +tilde 3 0 +~~~ +~~~ +tilde 0 1 + ~~~ +~~~ +tilde 0 2 + ~~~ +~~~ +tilde 0 3 + ~~~ + ~~~ +tilde 2 1 + ~~~ + ~~~ +tilde 3 2 + ~~~ +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock(" ~~~"), + FencedCodeBlock("tilde 1 0"), + FencedCodeBlock("~~~"), + FencedCodeBlock(" ~~~"), + FencedCodeBlock("tilde 2 0"), + FencedCodeBlock("~~~"), + FencedCodeBlock(" ~~~"), + FencedCodeBlock("tilde 3 0"), + FencedCodeBlock("~~~"), + FencedCodeBlock("~~~"), + FencedCodeBlock("tilde 0 1"), + FencedCodeBlock(" ~~~"), + FencedCodeBlock("~~~"), + FencedCodeBlock("tilde 0 2"), + FencedCodeBlock(" ~~~"), + FencedCodeBlock("~~~"), + FencedCodeBlock("tilde 0 3"), + FencedCodeBlock(" ~~~"), + FencedCodeBlock(" ~~~"), + FencedCodeBlock("tilde 2 1"), + FencedCodeBlock(" ~~~"), + FencedCodeBlock(" ~~~"), + FencedCodeBlock("tilde 3 2"), + FencedCodeBlock(" ~~~"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_tilde_fence_extra_long() { + let input = " +subject + +~~~ +tilde 3 4 +~~~~ +~~~~ +tilde 4 5 +~~~~~ +~~~~~ +tilde 5 6 +~~~~~~ +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("~~~"), + FencedCodeBlock("tilde 3 4"), + FencedCodeBlock("~~~~"), + FencedCodeBlock("~~~~"), + FencedCodeBlock("tilde 4 5"), + FencedCodeBlock("~~~~~"), + FencedCodeBlock("~~~~~"), + FencedCodeBlock("tilde 5 
6"), + FencedCodeBlock("~~~~~~"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_tilde_fence_too_short_not_fenced_code_block() { + let input = " +subject + +~~ +tilde +~~ +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Paragraph("~~ tilde ~~".into()), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_tilde_with_infostring() { + let input = " +subject + +~~~info +tilde info no leading ws +~~~ + +~~~ info +tilde info leading sp +~~~ + +~~~ info +tilde info leading tab +~~~ + +~~~info` +tilde info accept illegal info with tilde +~~~ + +~~~info~ +tilde info accept legal info with tilde +~~~ +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("~~~info"), + FencedCodeBlock("tilde info no leading ws"), + FencedCodeBlock("~~~"), + VerticalSpace, + FencedCodeBlock("~~~ info"), + FencedCodeBlock("tilde info leading sp"), + FencedCodeBlock("~~~"), + VerticalSpace, + FencedCodeBlock("~~~\tinfo"), + FencedCodeBlock("tilde info leading tab"), + FencedCodeBlock("~~~"), + VerticalSpace, + FencedCodeBlock("~~~info`"), + FencedCodeBlock("tilde info accept illegal info with tilde"), + FencedCodeBlock("~~~"), + VerticalSpace, + FencedCodeBlock("~~~info~"), + FencedCodeBlock("tilde info accept legal info with tilde"), + FencedCodeBlock("~~~"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_tilde_can_interrupt_paragraph() { + let input = " +subject + +a +~~~ +tilde +~~~ +b +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Paragraph("a".into()), + FencedCodeBlock("~~~"), + FencedCodeBlock("tilde"), + FencedCodeBlock("~~~"), + Paragraph("b".into()), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn 
parses_codefence_tilde_fence_unmatched_length() { + let input = " +subject + +~~~~ +tilde +~~~ + +tilde + +~~ +tilde +~~~~ +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("~~~~"), + FencedCodeBlock("tilde"), + FencedCodeBlock("~~~"), + FencedCodeBlock(""), + FencedCodeBlock("tilde"), + FencedCodeBlock(""), + FencedCodeBlock("~~"), + FencedCodeBlock("tilde"), + FencedCodeBlock("~~~~"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn parses_codefence_tilde_unterminated() { + let input = " +subject + +~~~ +tilde +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + FencedCodeBlock("~~~"), + FencedCodeBlock("tilde"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn bug_parses_codefence_tilde_enclosed_in_block_quote() { + let input = " +subject + +> a +> ~~~ +> tilde +> ~~~ +> b + +> c +> ~~~ +> tilde +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + BlockQuote("> a"), + BlockQuote("> ~~~"), + BlockQuote("> tilde"), + BlockQuote("> ~~~"), + BlockQuote("> b"), + VerticalSpace, + BlockQuote("> c"), + BlockQuote("> ~~~"), + BlockQuote("> tilde"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + + #[test] + fn bug_parses_codefence_tilde_enclosed_in_list_item() { + let input = " +subject + +- a + ~~~ + tilde + ~~~ + b + +- c + ~~~ + tilde +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + ListItem(ListIndent(""), ListType("- "), "a".into()), + FencedCodeBlock(" ~~~"), + FencedCodeBlock(" tilde"), + FencedCodeBlock(" ~~~"), + Paragraph("b".into()), + VerticalSpace, + ListItem(ListIndent(""), ListType("- "), "c".into()), + FencedCodeBlock(" ~~~"), + FencedCodeBlock(" tilde"), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + #[test] fn parses_block_quote_verbatim() { assert_eq!( 
From 1e6ea0ae2d792c396d32658d83334dabecfb09f4 Mon Sep 17 00:00:00 2001 From: Mikkel Kjeldsen Date: Sat, 4 Nov 2023 21:55:50 +0100 Subject: [PATCH 2/3] Remove support for tilde fenced code blocks CommonMark allows code fences using the tilde character in addition to the backtick character. Commit 0a10f82 (Implement limited supported for fenced code blocks, 2023-11-02) indicates that this is rarely used in the wild, and in Linux 6.6 and Git 2.42.0 it is used almost exclusively either to emphasize compilation error output or underline headers. The variability in style does not serve commitmsgfmt, nor does the enormous risk of false positives serve users, so remove tilde support entirely. References: https://gitlab.com/mkjeldsen/commitmsgfmt/-/issues/7 References: https://spec.commonmark.org/0.30/#fenced-code-blocks --- src/commitmsgfmt.rs | 42 ++--- src/parser.rs | 444 +++----------------------------------------- 2 files changed, 45 insertions(+), 441 deletions(-) diff --git a/src/commitmsgfmt.rs b/src/commitmsgfmt.rs index ec560b2..98ae48c 100644 --- a/src/commitmsgfmt.rs +++ b/src/commitmsgfmt.rs @@ -226,13 +226,6 @@ fenced code block ``` - -~~~ -tilde -fenced -code -block -~~~ "; let expected = " @@ -244,13 +237,6 @@ fenced code block ``` - -~~~ -tilde -fenced -code -block -~~~ "; assert_eq!(filter(72, &input), expected); @@ -266,12 +252,6 @@ a backtick ``` b - -c -~~~ -tilde -~~~ -d "; let expected = " @@ -282,12 +262,30 @@ a backtick ``` b +"; + + assert_eq!(filter(72, &input), expected); + } + + #[test] + fn ignores_fenced_code_block_with_tilde() { + let input = " +foo -c ~~~ tilde +fenced +code +block +not +supported ~~~ -d +"; + + let expected = " +foo + +~~~ tilde fenced code block not supported ~~~ "; assert_eq!(filter(72, &input), expected); diff --git a/src/parser.rs b/src/parser.rs index 59a3402..f1b4909 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -352,7 +352,6 @@ fn line_as_code_fence(line: &'_ str) -> Option { IndentSp2, IndentSp3, Backtick, - 
Tilde, } let mut fence_state = FenceState::New; @@ -360,6 +359,9 @@ fn line_as_code_fence(line: &'_ str) -> Option { let mut fence_length = 0; let mut tally = || fence_length += 1; // https://spec.commonmark.org/0.30/#fenced-code-blocks + // Backtick fenced code blocks appear relatively safe to support. Tilde fenced code blocks, on + // the other hand, are unsafe: tildes are often used for emphasizing compilation error output + // or underlining headers. for (ix, c) in line.char_indices() { match fence_state { FenceState::New @@ -379,10 +381,6 @@ fn line_as_code_fence(line: &'_ str) -> Option { tally(); FenceState::Backtick } - '~' => { - tally(); - FenceState::Tilde - } _ => break, }; } @@ -391,10 +389,6 @@ fn line_as_code_fence(line: &'_ str) -> Option { '`' => tally(), _ => break, }, - FenceState::Tilde => match c { - '~' => tally(), - _ => break, - }, } } @@ -817,6 +811,28 @@ backtick assert_eq!(expected, actual); } + #[test] + fn parses_codefence_tilde_not_fenced_code_block() { + let input = " +subject + +~~~ +tilde +~~~ +"; + + let expected = vec![ + VerticalSpace, + Subject("subject"), + VerticalSpace, + Paragraph("~~~ tilde ~~~".into()), + ]; + + let actual = parse(&input); + + assert_eq!(expected, actual); + } + #[test] fn parses_codefence_backtick_indented_aligned_4sp_not_fenced_code_block() { let input = " @@ -1179,416 +1195,6 @@ subject assert_eq!(expected, actual); } - #[test] - fn parses_codefence_tilde_verbatim() { - let input = " -subject - -~~~ -tilde -~~~ - - ~~~ - tilde - ~~~ - - ~~~ - tilde - ~~~ - - ~~~ - tilde - ~~~ -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - FencedCodeBlock("~~~"), - FencedCodeBlock("tilde"), - FencedCodeBlock("~~~"), - VerticalSpace, - FencedCodeBlock(" ~~~"), - FencedCodeBlock(" tilde"), - FencedCodeBlock(" ~~~"), - VerticalSpace, - FencedCodeBlock(" ~~~"), - FencedCodeBlock(" tilde"), - FencedCodeBlock(" ~~~"), - VerticalSpace, - FencedCodeBlock(" ~~~"), - FencedCodeBlock(" 
tilde"), - FencedCodeBlock(" ~~~"), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - - #[test] - fn parses_codefence_tilde_indented_aligned_4sp_not_fenced_code_block() { - let input = " -subject - - ~~~ - tilde - ~~~ -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - Literal(" ~~~"), - Literal(" tilde"), - Literal(" ~~~"), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - - #[test] - fn parses_codefence_tilde_indented_unaligned() { - let input = " -subject - - ~~~ -tilde 1 0 -~~~ - ~~~ -tilde 2 0 -~~~ - ~~~ -tilde 3 0 -~~~ -~~~ -tilde 0 1 - ~~~ -~~~ -tilde 0 2 - ~~~ -~~~ -tilde 0 3 - ~~~ - ~~~ -tilde 2 1 - ~~~ - ~~~ -tilde 3 2 - ~~~ -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - FencedCodeBlock(" ~~~"), - FencedCodeBlock("tilde 1 0"), - FencedCodeBlock("~~~"), - FencedCodeBlock(" ~~~"), - FencedCodeBlock("tilde 2 0"), - FencedCodeBlock("~~~"), - FencedCodeBlock(" ~~~"), - FencedCodeBlock("tilde 3 0"), - FencedCodeBlock("~~~"), - FencedCodeBlock("~~~"), - FencedCodeBlock("tilde 0 1"), - FencedCodeBlock(" ~~~"), - FencedCodeBlock("~~~"), - FencedCodeBlock("tilde 0 2"), - FencedCodeBlock(" ~~~"), - FencedCodeBlock("~~~"), - FencedCodeBlock("tilde 0 3"), - FencedCodeBlock(" ~~~"), - FencedCodeBlock(" ~~~"), - FencedCodeBlock("tilde 2 1"), - FencedCodeBlock(" ~~~"), - FencedCodeBlock(" ~~~"), - FencedCodeBlock("tilde 3 2"), - FencedCodeBlock(" ~~~"), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - - #[test] - fn parses_codefence_tilde_fence_extra_long() { - let input = " -subject - -~~~ -tilde 3 4 -~~~~ -~~~~ -tilde 4 5 -~~~~~ -~~~~~ -tilde 5 6 -~~~~~~ -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - FencedCodeBlock("~~~"), - FencedCodeBlock("tilde 3 4"), - FencedCodeBlock("~~~~"), - FencedCodeBlock("~~~~"), - FencedCodeBlock("tilde 4 5"), - FencedCodeBlock("~~~~~"), - 
FencedCodeBlock("~~~~~"), - FencedCodeBlock("tilde 5 6"), - FencedCodeBlock("~~~~~~"), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - - #[test] - fn parses_codefence_tilde_fence_too_short_not_fenced_code_block() { - let input = " -subject - -~~ -tilde -~~ -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - Paragraph("~~ tilde ~~".into()), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - - #[test] - fn parses_codefence_tilde_with_infostring() { - let input = " -subject - -~~~info -tilde info no leading ws -~~~ - -~~~ info -tilde info leading sp -~~~ - -~~~ info -tilde info leading tab -~~~ - -~~~info` -tilde info accept illegal info with tilde -~~~ - -~~~info~ -tilde info accept legal info with tilde -~~~ -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - FencedCodeBlock("~~~info"), - FencedCodeBlock("tilde info no leading ws"), - FencedCodeBlock("~~~"), - VerticalSpace, - FencedCodeBlock("~~~ info"), - FencedCodeBlock("tilde info leading sp"), - FencedCodeBlock("~~~"), - VerticalSpace, - FencedCodeBlock("~~~\tinfo"), - FencedCodeBlock("tilde info leading tab"), - FencedCodeBlock("~~~"), - VerticalSpace, - FencedCodeBlock("~~~info`"), - FencedCodeBlock("tilde info accept illegal info with tilde"), - FencedCodeBlock("~~~"), - VerticalSpace, - FencedCodeBlock("~~~info~"), - FencedCodeBlock("tilde info accept legal info with tilde"), - FencedCodeBlock("~~~"), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - - #[test] - fn parses_codefence_tilde_can_interrupt_paragraph() { - let input = " -subject - -a -~~~ -tilde -~~~ -b -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - Paragraph("a".into()), - FencedCodeBlock("~~~"), - FencedCodeBlock("tilde"), - FencedCodeBlock("~~~"), - Paragraph("b".into()), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - - 
#[test] - fn parses_codefence_tilde_fence_unmatched_length() { - let input = " -subject - -~~~~ -tilde -~~~ - -tilde - -~~ -tilde -~~~~ -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - FencedCodeBlock("~~~~"), - FencedCodeBlock("tilde"), - FencedCodeBlock("~~~"), - FencedCodeBlock(""), - FencedCodeBlock("tilde"), - FencedCodeBlock(""), - FencedCodeBlock("~~"), - FencedCodeBlock("tilde"), - FencedCodeBlock("~~~~"), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - - #[test] - fn parses_codefence_tilde_unterminated() { - let input = " -subject - -~~~ -tilde -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - FencedCodeBlock("~~~"), - FencedCodeBlock("tilde"), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - - #[test] - fn bug_parses_codefence_tilde_enclosed_in_block_quote() { - let input = " -subject - -> a -> ~~~ -> tilde -> ~~~ -> b - -> c -> ~~~ -> tilde -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - BlockQuote("> a"), - BlockQuote("> ~~~"), - BlockQuote("> tilde"), - BlockQuote("> ~~~"), - BlockQuote("> b"), - VerticalSpace, - BlockQuote("> c"), - BlockQuote("> ~~~"), - BlockQuote("> tilde"), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - - #[test] - fn bug_parses_codefence_tilde_enclosed_in_list_item() { - let input = " -subject - -- a - ~~~ - tilde - ~~~ - b - -- c - ~~~ - tilde -"; - - let expected = vec![ - VerticalSpace, - Subject("subject"), - VerticalSpace, - ListItem(ListIndent(""), ListType("- "), "a".into()), - FencedCodeBlock(" ~~~"), - FencedCodeBlock(" tilde"), - FencedCodeBlock(" ~~~"), - Paragraph("b".into()), - VerticalSpace, - ListItem(ListIndent(""), ListType("- "), "c".into()), - FencedCodeBlock(" ~~~"), - FencedCodeBlock(" tilde"), - ]; - - let actual = parse(&input); - - assert_eq!(expected, actual); - } - #[test] fn parses_block_quote_verbatim() { 
assert_eq!( From c721e8f7bd4bae3df24036276e22e2241b08c15d Mon Sep 17 00:00:00 2001 From: Mikkel Kjeldsen Date: Sun, 5 Nov 2023 15:49:57 +0100 Subject: [PATCH 3/3] Document fenced code blocks References: https://gitlab.com/mkjeldsen/commitmsgfmt/-/issues/7 --- CHANGELOG.md | 8 ++++ doc/commitmsgfmt.1.adoc | 88 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 86 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c10f7e..6fe2bb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,14 @@ understanding of patterns often seen in commit messages. in their entirety, and allow them to follow a preceding paragraph without the empty line that is otherwise usually required. +- #7: Recognize fenced code blocks with backtick code fences (` ``` `) and + preserve them in their entirety. Do not recognize tilde code fences (`~~~`), + which are virtually never used in practice and which would interfere with + many other uses. Per CommonMark 0.30 a code fence must be at least three + characters long and may optionally be indented up to three spaces, and the + closing code fence must be at least as long as the opening code fence + ignoring whitespace. + - If `--width` is specified multiple times, ignore all but the last occurrence. ## 1.5.0 - 2022-07-30 diff --git a/doc/commitmsgfmt.1.adoc b/doc/commitmsgfmt.1.adoc index 6eec6e5..43acaa7 100644 --- a/doc/commitmsgfmt.1.adoc +++ b/doc/commitmsgfmt.1.adoc @@ -58,6 +58,10 @@ The exact formatting behavior is implementation defined. This section attempts to describe the rules but deviation may be either an inexpediency in the implementation or an error in the description. +{self} does not attempt to recognize AsciiDoc, CommonMark, Markdown, +reStructuredText, or any other markup language. Formally, {self} recognizes +only plain text, and similarity to any markup language is incidental. 
+ NOTE: Depending on your expectations of merge commit messages you may find {self} unsuitable for use in such messages. If you write them precisely like you write non-merge commit messages, go ahead and format them with {self}. @@ -95,9 +99,9 @@ those cases and avoid them by preventing wrapping: it will references both preserve their context and don't degenerate into _list items_. -_Block quotes_ are exempt from the requirement of surrounding blank lines and -will never be considered to belong to a paragraph. A block quote embedded -inside a paragraph has the same effect on that paragraph as an empty line has. +A paragraph may be interrupted by _block quotes_ and _fenced code blocks_, +meaning these are exempt from the requirement of surrounding blank lines and +will never be considered to belong to a paragraph. === Subject line @@ -196,8 +200,64 @@ literals. === Literal -A line starting with one tab or four spaces is considered a _literal_. Literals -are printed verbatim, making them suitable for listings and tables. +A line starting with one tab or four spaces is considered a _literal_: + +---- +paragraph + + literal + +paragraph +---- + +Literals are printed verbatim, making them suitable for listings and tables. + +See also _fenced code block_. + +=== Code fence + +Outside of a _fenced code block_ a line starting with up to 3 spaces followed +by at least 3 consecutive backticks (*`*) is considered an _opening code +fence_: + +---- +```opening +---- + +Within a fenced code block a line starting with up to 3 spaces followed by at +least as many consecutive backticks as the preceding opening code fence is +considered a _closing code fence_; any sequence of fewer backticks is ignored: + +---- + ````opening +``` + ````` +---- + +NOTE: For the sake of compatibility, tilde (*~*) cannot be used in place of +backtick. 
+ +=== Fenced code block + +A _fenced code block_ begins with an _opening code fence_ and ends with the +first following _closing code fence_: + +---- +Compare the previous version of origin/topic with the current version: +```sh +$ git range-diff origin/main origin/topic@{1} origin/topic +``` +---- + +The fenced code block includes both code fences and all contents in-between the +code fences. + +Fenced code blocks are printed verbatim, making them suitable for listings. +Fenced code blocks are more flexible in their use than _literals_ are but +otherwise solve the same problem. + +A fenced code block may interrupt a _paragraph_; it needs no preceding or +following blank line. === Block quote @@ -226,11 +286,9 @@ vip:!fmt -w72 -p'>' ---- ==== -Unlike other constructs a block quote may be embedded inside a _paragraph_ with -no preceding or following blank line; the block quote will not be folded into -the paragraph and the paragraph will otherwise observe standard behavior. This -enables a common pattern of immediately preceding the block quote with an -author attribution, illustrated above. +A block quote may interrupt a _paragraph_; it needs no preceding or following +blank line. This enables a common pattern of immediately preceding the block +quote with an author attribution, illustrated above. === Comment @@ -309,6 +367,11 @@ foo baar -- baz qux wupwupwup [1][2] [wup] hex: > 0 1 2 3 4 5 6 7 8 9 a b c d e f +chicken: +```chicken +chicken chicken +``` + - foo 1. foo bar baz @@ -332,6 +395,11 @@ wupwupwup [1][2] [wup] hex: > 0 1 2 3 4 5 6 7 8 9 a b c d e f +chicken: +```chicken +chicken chicken +``` + - foo 1. foo bar baz