Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,16 @@

### Bug Fixes

- [#895]: Fix incorrect normalization of `\rX` EOL sequences where `X` is a char which is
UTF-8 encoded as [c2 xx], except [c2 85].

### Misc Changes

- [#895]: Add new `xml10_content()` and `xml11_content()` methods which behave the same as the
`html_content()` and `xml_content()` methods, respectively, but express the intention more clearly.

[#895]: https://github.com/tafia/quick-xml/pull/895


## 0.38.2 -- 2025-08-19

Expand Down
77 changes: 44 additions & 33 deletions src/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ where
////////////////////////////////////////////////////////////////////////////////////////////////////

// TODO: It would be better to reuse buffer after decoding if possible
pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str> {
pub(crate) fn normalize_xml11_eols<'input>(text: &'input str) -> Cow<'input, str> {
let bytes = text.as_bytes();

// The following sequences of UTF-8 encoded input should be translated into
Expand All @@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
// we are sure that index within string
normalized.push_str(&text[0..i]);

let mut pos = normalize_xml_eol_step(&mut normalized, text, i, '\n');
let mut pos = normalize_xml11_eol_step(&mut normalized, text, i, '\n');
while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) {
let index = pos + i;
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
// we are sure that the index is within the string
normalized.push_str(&text[pos..index]);
pos = normalize_xml_eol_step(&mut normalized, text, index, '\n');
pos = normalize_xml11_eol_step(&mut normalized, text, index, '\n');
}
if let Some(rest) = text.get(pos..) {
normalized.push_str(rest);
Expand Down Expand Up @@ -378,7 +378,7 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
///
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
fn normalize_xml11_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
let input = text.as_bytes();
match input[index] {
b'\r' => {
Expand All @@ -388,15 +388,15 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
normalized.push(ch);
return index + 2; // skip \r\n
}
// Because input is correct UTF-8 and in UTF-8 every character has
// a unique prefix, byte C2 can only be the start of a two-byte character
// [c2 xx]; the #x85 character is just one of them, so the second byte is checked
if next == 0xC2 {
// UTF-8 encoding of #x85 character is [c2 85]
if index + 2 < input.len() && input[index + 2] == 0x85 {
normalized.push(ch);
} else {
normalized.push(ch);
// NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because
// we are sure that index within string
normalized.push_str(&text[index..index + 3]);
normalized.push_str(&text[index + 1..index + 3]);
}
return index + 3; // skip \r + UTF-8 encoding of character (c2 xx)
}
Expand Down Expand Up @@ -441,7 +441,7 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
////////////////////////////////////////////////////////////////////////////////////////////////////

// TODO: It would be better to reuse buffer after decoding if possible
pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str> {
pub(crate) fn normalize_xml10_eols<'input>(text: &'input str) -> Cow<'input, str> {
let bytes = text.as_bytes();

// The following sequences of UTF-8 encoded input should be translated into
Expand All @@ -459,13 +459,13 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
// we are sure that index within string
normalized.push_str(&text[0..i]);

let mut pos = normalize_html_eol_step(&mut normalized, bytes, i, '\n');
let mut pos = normalize_xml10_eol_step(&mut normalized, bytes, i, '\n');
while let Some(i) = memchr(b'\r', &bytes[pos..]) {
let index = pos + i;
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
// we are sure that the index is within the string
normalized.push_str(&text[pos..index]);
pos = normalize_html_eol_step(&mut normalized, bytes, index, '\n');
pos = normalize_xml10_eol_step(&mut normalized, bytes, index, '\n');
}
if let Some(rest) = text.get(pos..) {
normalized.push_str(rest);
Expand All @@ -487,7 +487,12 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
/// - `ch`: a character that should be put to the string instead of newline sequence
///
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
fn normalize_html_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
fn normalize_xml10_eol_step(
normalized: &mut String,
input: &[u8],
index: usize,
ch: char,
) -> usize {
match input[index] {
b'\r' => {
normalized.push(ch);
Expand Down Expand Up @@ -2062,56 +2067,59 @@ mod normalization {
mod eol {
use super::*;

mod xml {
mod xml11 {
use super::*;
use pretty_assertions::assert_eq;

#[test]
fn empty() {
assert_eq!(normalize_xml_eols(""), "");
assert_eq!(normalize_xml11_eols(""), "");
}

#[test]
fn already_normalized() {
assert_eq!(
normalize_xml_eols("\nalready \n\n normalized\n"),
normalize_xml11_eols("\nalready \n\n normalized\n"),
"\nalready \n\n normalized\n",
);
}

#[test]
fn cr_lf() {
assert_eq!(normalize_xml_eols("\r\nsome\r\n\r\ntext"), "\nsome\n\ntext");
assert_eq!(
normalize_xml11_eols("\r\nsome\r\n\r\ntext"),
"\nsome\n\ntext"
);
}

#[test]
fn cr_u0085() {
assert_eq!(
normalize_xml_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
normalize_xml11_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
"\nsome\n\ntext",
);
}

#[test]
fn u0085() {
assert_eq!(
normalize_xml_eols("\u{0085}some\u{0085}\u{0085}text"),
normalize_xml11_eols("\u{0085}some\u{0085}\u{0085}text"),
"\nsome\n\ntext",
);
}

#[test]
fn u2028() {
assert_eq!(
normalize_xml_eols("\u{2028}some\u{2028}\u{2028}text"),
normalize_xml11_eols("\u{2028}some\u{2028}\u{2028}text"),
"\nsome\n\ntext",
);
}

#[test]
fn mixed() {
assert_eq!(
normalize_xml_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
normalize_xml11_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
"\n\n\n\n\n\nsome\n\n\ntext",
);
}
Expand All @@ -2138,9 +2146,9 @@ mod normalization {

dbg!((input, &description));
if ch == '\u{0085}' {
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
} else {
assert_eq!(normalize_xml_eols(input), input, "{}", description);
assert_eq!(normalize_xml11_eols(input), input, "{}", description);
}
}
assert_eq!((first..=last).count(), 64);
Expand Down Expand Up @@ -2171,9 +2179,12 @@ mod normalization {

dbg!((input, &description));
if ch == '\u{0085}' {
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
} else {
assert_eq!(normalize_xml_eols(input), input, "{}", description);
let mut expected = utf8.clone();
expected[0] = b'\n';
let expected = std::str::from_utf8(&expected).expect(&description);
assert_eq!(normalize_xml11_eols(input), expected, "{}", description);
}
}
assert_eq!((first..=last).count(), 64);
Expand Down Expand Up @@ -2204,68 +2215,68 @@ mod normalization {

dbg!((input, &description));
if ch == '\u{2028}' {
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
} else {
assert_eq!(normalize_xml_eols(input), input, "{}", description);
assert_eq!(normalize_xml11_eols(input), input, "{}", description);
}
}
assert_eq!((first..=last).count(), 4096);
}
}

mod html {
mod xml10 {
use super::*;
use pretty_assertions::assert_eq;

#[test]
fn empty() {
assert_eq!(normalize_html_eols(""), "");
assert_eq!(normalize_xml10_eols(""), "");
}

#[test]
fn already_normalized() {
assert_eq!(
normalize_html_eols("\nalready \n\n normalized\n"),
normalize_xml10_eols("\nalready \n\n normalized\n"),
"\nalready \n\n normalized\n",
);
}

#[test]
fn cr_lf() {
assert_eq!(
normalize_html_eols("\r\nsome\r\n\r\ntext"),
normalize_xml10_eols("\r\nsome\r\n\r\ntext"),
"\nsome\n\ntext"
);
}

#[test]
fn cr_u0085() {
assert_eq!(
normalize_html_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
normalize_xml10_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
"\n\u{0085}some\n\u{0085}\n\u{0085}text",
);
}

#[test]
fn u0085() {
assert_eq!(
normalize_html_eols("\u{0085}some\u{0085}\u{0085}text"),
normalize_xml10_eols("\u{0085}some\u{0085}\u{0085}text"),
"\u{0085}some\u{0085}\u{0085}text",
);
}

#[test]
fn u2028() {
assert_eq!(
normalize_html_eols("\u{2028}some\u{2028}\u{2028}text"),
normalize_xml10_eols("\u{2028}some\u{2028}\u{2028}text"),
"\u{2028}some\u{2028}\u{2028}text",
);
}

#[test]
fn mixed() {
assert_eq!(
normalize_html_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
normalize_xml10_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
"\n\n\n\u{2028}\n\nsome\n\u{0085}\n\u{0085}text",
);
}
Expand Down
Loading
Loading