Skip to content

Commit 7e83b32

Browse files
committed
Implement an attribute normalization routine as described in "3.3.3 Attribute-Value Normalization" section of XML 1.1. spec
https://www.w3.org/TR/xml11/#AVNormalize
1 parent 5ab2d4a commit 7e83b32

File tree

2 files changed

+340
-0
lines changed

2 files changed

+340
-0
lines changed

Changelog.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@
3030

3131
### Misc Changes
3232

33+
- [#371]: New error variant `EscapeError::TooManyNestedEntities` was added.
34+
35+
[#371]: https://github.com/tafia/quick-xml/issues/371
3336
[#806]: https://github.com/tafia/quick-xml/issues/806
3437
[#878]: https://github.com/tafia/quick-xml/pull/878
3538
[#882]: https://github.com/tafia/quick-xml/pull/882

src/escape.rs

Lines changed: 337 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use memchr::{memchr2_iter, memchr3};
44
use std::borrow::Cow;
55
use std::num::ParseIntError;
66
use std::ops::Range;
7+
use std::slice::Iter;
78

89
/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
910
#[derive(Clone, Debug, PartialEq)]
@@ -50,6 +51,12 @@ pub enum EscapeError {
5051
/// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
5152
/// was unsuccessful, not all characters are decimal or hexadecimal numbers.
5253
InvalidCharRef(ParseCharRefError),
54+
/// More than the maximum allowed number of nested entities were expanded during attribute normalization.
55+
///
56+
/// Attribute normalization includes expanding of general entities (`&entity;`)
57+
/// whose replacement text could also contain entities, which must also be expanded.
58+
/// If more than 128 entities would be expanded, this error is returned.
59+
TooManyNestedEntities,
5360
}
5461

5562
impl std::fmt::Display for EscapeError {
@@ -66,6 +73,9 @@ impl std::fmt::Display for EscapeError {
6673
Self::InvalidCharRef(e) => {
6774
write!(f, "invalid character reference: {}", e)
6875
}
76+
Self::TooManyNestedEntities => {
77+
f.write_str("too many nested entities in an attribute value")
78+
}
6979
}
7080
}
7181
}
@@ -419,6 +429,218 @@ fn normalize_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: c
419429

420430
////////////////////////////////////////////////////////////////////////////////////////////////////
421431

432+
const fn is_normalization_char(b: &u8) -> bool {
433+
// The following sequences should be translated into a single `\n` (U+000a) character
434+
// to normalize EOLs:
435+
//
436+
// |UTF-8 |String|
437+
// |--------|------|
438+
// |0d 0a |\r\n |
439+
// |0d c2 85|\r\x85|
440+
// |0d |\r |
441+
// |c2 85 |\x85 |
442+
// |e2 80 a8|\u2028|
443+
matches!(*b, b'\t' | b'\r' | b'\n' | 0xC2 | 0xE2 | b'&')
444+
}
445+
446+
/// Returns the attribute value normalized as per [the XML specification],
447+
/// using a custom entity resolver.
448+
///
449+
/// Do not use this method with HTML attributes.
450+
///
451+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
452+
/// and the characters `\t`, `\r`, `\n` are replaced with a space character. A function
453+
/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
454+
/// take precedence.
455+
///
456+
/// This will allocate unless the raw attribute value does not require normalization.
457+
///
458+
/// # Parameters
459+
///
460+
/// - `value`: unnormalized attribute value
461+
/// - `depth`: maximum number of nested entities that can be expanded. If expansion
462+
/// chain will be more than this value, the function will return [`EscapeError::TooManyNestedEntities`]
463+
/// - `resolve_entity`: a function to resolve entity. This function could be called
464+
/// multiple times on the same input and can return different values in each case
465+
/// for the same input, although it is not recommended
466+
///
467+
/// # Lifetimes
468+
///
469+
/// - `'input`: lifetime of the unnormalized attribute. If normalization is not required,
470+
/// the input is returned unchanged with the same lifetime
471+
/// - `'entity`: lifetime of all entities that are returned by the entity resolution routine
472+
///
473+
/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
474+
pub(crate) fn normalize_attribute_value<'input, 'entity, F>(
475+
value: &'input str,
476+
depth: usize,
477+
mut resolve_entity: F,
478+
) -> Result<Cow<'input, str>, EscapeError>
479+
where
480+
// the lifetime of the output comes from a capture or is `'static`
481+
F: FnMut(&str) -> Option<&'entity str>,
482+
{
483+
let mut iter = value.as_bytes().iter();
484+
485+
// If we find a character that requires normalization, create a normalized
486+
// version of the attribute, otherwise return the value unchanged
487+
if let Some(i) = iter.position(is_normalization_char) {
488+
let mut normalized = String::with_capacity(value.len());
489+
let pos = normalize_step(
490+
&mut normalized,
491+
&mut iter,
492+
value,
493+
0,
494+
i,
495+
depth,
496+
&mut resolve_entity,
497+
)?;
498+
499+
normalize_steps(
500+
&mut normalized,
501+
&mut iter,
502+
value,
503+
pos,
504+
depth,
505+
&mut resolve_entity,
506+
)?;
507+
return Ok(normalized.into());
508+
}
509+
Ok(Cow::Borrowed(value))
510+
}
511+
512+
fn normalize_steps<'entity, F>(
513+
normalized: &mut String,
514+
iter: &mut Iter<u8>,
515+
input: &str,
516+
mut pos: usize,
517+
depth: usize,
518+
resolve_entity: &mut F,
519+
) -> Result<(), EscapeError>
520+
where
521+
// the lifetime of the output comes from a capture or is `'static`
522+
F: FnMut(&str) -> Option<&'entity str>,
523+
{
524+
while let Some(i) = iter.position(is_normalization_char) {
525+
pos = normalize_step(normalized, iter, input, pos, pos + i, depth, resolve_entity)?;
526+
}
527+
if let Some(rest) = input.get(pos..) {
528+
normalized.push_str(rest);
529+
}
530+
Ok(())
531+
}
532+
533+
/// Performs one step of the [normalization algorithm] (but with recursive part):
534+
///
535+
/// 1. For a character reference, append the referenced character
536+
/// to the normalized value.
537+
/// 2. For an entity reference, recursively apply this algorithm
538+
/// to the replacement text of the entity.
539+
/// 3. For a white space character (#x20, #xD, #xA, #x9), append
540+
/// a space character (#x20) to the normalized value.
541+
/// 4. For another character, append the character to the normalized value.
542+
///
543+
/// Because [according to the specification], an XML parser should parse end-of-line
544+
/// normalized input, but quick-xml does not do that, this function also performs
545+
/// normalization of EOL characters. That should be done before expanding entities
546+
/// and character references, so cannot be processed later.
547+
///
548+
/// This function could be used also just to normalize line ends if the iterator
549+
/// does not stop on `&` characters.
550+
///
551+
/// # Parameters
552+
///
553+
/// - `normalized`: Output of the algorithm. Normalized value will be placed here
554+
/// - `iter`: Iterator over bytes of `input`
555+
/// - `input`: Original non-normalized value
556+
/// - `last_pos`: Index of the first byte in `input` that was not yet processed
557+
/// - `index`: Index of the byte in `input` that should be processed now
558+
/// - `seen_cr`: `\r\n` and `\r\x85` sequences should be normalized into one space
559+
/// so this parameter tracks if we seen the `\r` before processing the current byte
560+
/// - `depth`: Remaining recursion depth. When it reaches zero, the algorithm is interrupted with an error
561+
/// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
562+
///
563+
/// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
564+
/// [according to the specification]: https://www.w3.org/TR/xml11/#sec-line-ends
565+
fn normalize_step<'entity, F>(
566+
normalized: &mut String,
567+
iter: &mut Iter<u8>,
568+
input: &str,
569+
last_pos: usize,
570+
index: usize,
571+
depth: usize,
572+
resolve_entity: &mut F,
573+
) -> Result<usize, EscapeError>
574+
where
575+
// the lifetime of the output comes from a capture or is `'static`
576+
F: FnMut(&str) -> Option<&'entity str>,
577+
{
578+
if depth == 0 {
579+
return Err(EscapeError::TooManyNestedEntities);
580+
}
581+
// 4. For another character, append the character to the normalized value.
582+
normalized.push_str(&input[last_pos..index]);
583+
584+
match input.as_bytes()[index] {
585+
b'&' => {
586+
let start = index + 1; // +1 - skip `&`
587+
let end = start
588+
+ match iter.position(|&b| b == b';') {
589+
Some(end) => end,
590+
None => return Err(EscapeError::UnterminatedEntity(index..input.len())),
591+
};
592+
593+
// Content between & and ; - &pat;
594+
// Note that this content does not have its EOLs normalized as required by the specification,
595+
// but because numbers in any case cannot have spaces inside, this is not a problem.
596+
// Normalization of spaces in entity references and checking that they correspond to
597+
// [`Name`] production is the responsibility of `resolve_entity`.
598+
//
599+
// [`Name`]: https://www.w3.org/TR/xml11/#NT-Name
600+
let pat = &input[start..end];
601+
// 1. For a character reference, append the referenced character
602+
// to the normalized value.
603+
if pat.starts_with('#') {
604+
let entity = &pat[1..]; // starts after the #
605+
let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?;
606+
normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
607+
} else
608+
// 2. For an entity reference, recursively apply this algorithm
609+
// to the replacement text of the entity.
610+
if let Some(value) = resolve_entity(pat) {
611+
normalize_steps(
612+
normalized,
613+
&mut value.as_bytes().iter(),
614+
value,
615+
0,
616+
depth.saturating_sub(1),
617+
resolve_entity,
618+
)?;
619+
} else {
620+
return Err(EscapeError::UnrecognizedEntity(start..end, pat.to_string()));
621+
}
622+
Ok(end + 1) // +1 - skip `;`
623+
}
624+
// 3. For a white space character (#x20, #xD, #xA, #x9), append
625+
// a space character (#x20) to the normalized value.
626+
// Space character has no special meaning, so it is handled on step 4
627+
b'\t' => {
628+
normalized.push(' ');
629+
Ok(index + 1) // +1 - skip \t
630+
}
631+
_ => {
632+
let pos = normalize_eol_step(normalized, input.as_bytes(), index, ' ');
633+
// We should advance iterator because we may skip several characters
634+
for _ in 0..pos - index - 1 {
635+
iter.next();
636+
}
637+
Ok(pos)
638+
}
639+
}
640+
}
641+
642+
////////////////////////////////////////////////////////////////////////////////////////////////////
643+
422644
/// Resolves predefined XML entities or all HTML5 entities depending on the feature
423645
/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
424646
///
@@ -2022,4 +2244,119 @@ mod normalization {
20222244
);
20232245
}
20242246
}
2247+
2248+
mod attribute {
2249+
use super::*;
2250+
use pretty_assertions::assert_eq;
2251+
2252+
#[test]
2253+
fn empty() {
2254+
assert_eq!(
2255+
normalize_attribute_value("", 5, |_| { None }),
2256+
Ok("".into())
2257+
);
2258+
}
2259+
2260+
#[test]
2261+
fn only_spaces() {
2262+
assert_eq!(
2263+
normalize_attribute_value(" ", 5, |_| { None }),
2264+
Ok(" ".into())
2265+
);
2266+
assert_eq!(
2267+
normalize_attribute_value("\t\t\t", 5, |_| { None }),
2268+
Ok(" ".into())
2269+
);
2270+
assert_eq!(
2271+
normalize_attribute_value("\r\r\r", 5, |_| { None }),
2272+
Ok(" ".into())
2273+
);
2274+
assert_eq!(
2275+
normalize_attribute_value("\n\n\n", 5, |_| { None }),
2276+
Ok(" ".into())
2277+
);
2278+
}
2279+
2280+
#[test]
2281+
fn already_normalized() {
2282+
assert_eq!(
2283+
normalize_attribute_value("already normalized", 5, |_| { None }),
2284+
Ok("already normalized".into())
2285+
);
2286+
}
2287+
2288+
#[test]
2289+
fn characters() {
2290+
assert_eq!(
2291+
normalize_attribute_value("string with &#32; character", 5, |_| { None }),
2292+
Ok("string with character".into())
2293+
);
2294+
assert_eq!(
2295+
normalize_attribute_value("string with &#x20; character", 5, |_| { None }),
2296+
Ok("string with character".into())
2297+
);
2298+
}
2299+
2300+
#[test]
2301+
fn entities() {
2302+
assert_eq!(
2303+
normalize_attribute_value("string with &entity; reference", 5, |_| {
2304+
Some("replacement")
2305+
}),
2306+
Ok("string with replacement reference".into())
2307+
);
2308+
assert_eq!(
2309+
normalize_attribute_value("string with &entity-1; reference", 5, |entity| {
2310+
match entity {
2311+
"entity-1" => Some("recursive &entity-2;"),
2312+
"entity-2" => Some("entity&#32;2"),
2313+
_ => None,
2314+
}
2315+
}),
2316+
Ok("string with recursive entity 2 reference".into())
2317+
);
2318+
}
2319+
2320+
#[test]
2321+
fn unclosed_entity() {
2322+
assert_eq!(
2323+
normalize_attribute_value("string with unclosed &entity reference", 5, |_| {
2324+
// 0 ^ = 21 ^ = 38
2325+
Some("replacement")
2326+
}),
2327+
Err(EscapeError::UnterminatedEntity(21..38))
2328+
);
2329+
assert_eq!(
2330+
normalize_attribute_value(
2331+
"string with unclosed &#32 (character) reference",
2332+
// ^ = 21 ^ = 47
2333+
5,
2334+
|_| { None }
2335+
),
2336+
Err(EscapeError::UnterminatedEntity(21..47))
2337+
);
2338+
}
2339+
2340+
#[test]
2341+
fn unknown_entity() {
2342+
assert_eq!(
2343+
normalize_attribute_value("string with unknown &entity; reference", 5, |_| {
2344+
// 0 ^ ^ = 21..27
2345+
None
2346+
}),
2347+
Err(EscapeError::UnrecognizedEntity(
2348+
21..27,
2349+
"entity".to_string(),
2350+
))
2351+
);
2352+
}
2353+
2354+
#[test]
2355+
fn recursive_entity() {
2356+
assert_eq!(
2357+
normalize_attribute_value("&entity; reference", 5, |_| Some("recursive &entity;")),
2358+
Err(EscapeError::TooManyNestedEntities),
2359+
);
2360+
}
2361+
}
20252362
}

0 commit comments

Comments
 (0)