@@ -4,6 +4,7 @@ use memchr::{memchr2_iter, memchr3};
44use std:: borrow:: Cow ;
55use std:: num:: ParseIntError ;
66use std:: ops:: Range ;
7+ use std:: slice:: Iter ;
78
89/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
910#[ derive( Clone , Debug , PartialEq ) ]
@@ -50,6 +51,12 @@ pub enum EscapeError {
5051 /// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
5152 /// was unsuccessful, not all characters are decimal or hexadecimal numbers.
5253 InvalidCharRef ( ParseCharRefError ) ,
54+ /// Expanded more than maximum possible entities during attribute normalization.
55+ ///
56+ /// Attribute normalization includes expanding of general entities (`&entity;`)
57+ /// whose replacement text may itself contain entities, which must also be expanded.
58+ /// If more than 128 entities would be expanded, this error is returned.
59+ TooManyNestedEntities ,
5360}
5461
5562impl std:: fmt:: Display for EscapeError {
@@ -66,6 +73,9 @@ impl std::fmt::Display for EscapeError {
6673 Self :: InvalidCharRef ( e) => {
6774 write ! ( f, "invalid character reference: {}" , e)
6875 }
76+ Self :: TooManyNestedEntities => {
77+ f. write_str ( "too many nested entities in an attribute value" )
78+ }
6979 }
7080 }
7181}
@@ -419,6 +429,218 @@ fn normalize_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: c
419429
420430////////////////////////////////////////////////////////////////////////////////////////////////////
421431
432+ const fn is_normalization_char ( b : & u8 ) -> bool {
433+ // The following sequences should be translated into a single `\n` (U+000a) character
434+ // to normalize EOLs:
435+ //
436+ // |UTF-8 |String|
437+ // |--------|------|
438+ // |0d 0a |\r\n |
439+ // |0d c2 85|\r\x85|
440+ // |0d |\r |
441+ // |c2 85 |\x85 |
442+ // |e2 80 a8|\x2028|
443+ matches ! ( * b, b'\t' | b'\r' | b'\n' | 0xC2 | 0xE2 | b'&' )
444+ }
445+
446+ /// Returns the attribute value normalized as per [the XML specification],
447+ /// using a custom entity resolver.
448+ ///
449+ /// Do not use this method with HTML attributes.
450+ ///
451+ /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>`
452+ /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
453+ /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
454+ /// take precedence.
455+ ///
456+ /// This will allocate unless the raw attribute value does not require normalization.
457+ ///
458+ /// # Parameters
459+ ///
460+ /// - `value`: unnormalized attribute value
461+ /// - `depth`: maximum number of nested entities that can be expanded. If expansion
462+ /// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`]
463+ /// - `resolve_entity`: a function to resolve entity. This function could be called
464+ /// multiple times on the same input and can return different values in each case
465+ /// for the same input, although it is not recommended
466+ ///
467+ /// # Lifetimes
468+ ///
469+ /// - `'input`: lifetime of the unnormalized attribute. If normalization is not requred,
470+ /// the input returned unchanged with the same lifetime
471+ /// - `'entity`: lifetime of all entities that is returned by the entity resolution routine
472+ ///
473+ /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
474+ pub ( crate ) fn normalize_attribute_value < ' input , ' entity , F > (
475+ value : & ' input str ,
476+ depth : usize ,
477+ mut resolve_entity : F ,
478+ ) -> Result < Cow < ' input , str > , EscapeError >
479+ where
480+ // the lifetime of the output comes from a capture or is `'static`
481+ F : FnMut ( & str ) -> Option < & ' entity str > ,
482+ {
483+ let mut iter = value. as_bytes ( ) . iter ( ) ;
484+
485+ // If we found the charater that requires normalization, create a normalized
486+ // version of the attribute, otherwise return the value unchanged
487+ if let Some ( i) = iter. position ( is_normalization_char) {
488+ let mut normalized = String :: with_capacity ( value. len ( ) ) ;
489+ let pos = normalize_step (
490+ & mut normalized,
491+ & mut iter,
492+ value,
493+ 0 ,
494+ i,
495+ depth,
496+ & mut resolve_entity,
497+ ) ?;
498+
499+ normalize_steps (
500+ & mut normalized,
501+ & mut iter,
502+ value,
503+ pos,
504+ depth,
505+ & mut resolve_entity,
506+ ) ?;
507+ return Ok ( normalized. into ( ) ) ;
508+ }
509+ Ok ( Cow :: Borrowed ( value) )
510+ }
511+
512+ fn normalize_steps < ' entity , F > (
513+ normalized : & mut String ,
514+ iter : & mut Iter < u8 > ,
515+ input : & str ,
516+ mut pos : usize ,
517+ depth : usize ,
518+ resolve_entity : & mut F ,
519+ ) -> Result < ( ) , EscapeError >
520+ where
521+ // the lifetime of the output comes from a capture or is `'static`
522+ F : FnMut ( & str ) -> Option < & ' entity str > ,
523+ {
524+ while let Some ( i) = iter. position ( is_normalization_char) {
525+ pos = normalize_step ( normalized, iter, input, pos, pos + i, depth, resolve_entity) ?;
526+ }
527+ if let Some ( rest) = input. get ( pos..) {
528+ normalized. push_str ( rest) ;
529+ }
530+ Ok ( ( ) )
531+ }
532+
533+ /// Performs one step of the [normalization algorithm] (but with recursive part):
534+ ///
535+ /// 1. For a character reference, append the referenced character
536+ /// to the normalized value.
537+ /// 2. For an entity reference, recursively apply this algorithm
538+ /// to the replacement text of the entity.
539+ /// 3. For a white space character (#x20, #xD, #xA, #x9), append
540+ /// a space character (#x20) to the normalized value.
541+ /// 4. For another character, append the character to the normalized value.
542+ ///
543+ /// Because [according to the specification], XML parser should parse line-of-end
544+ /// normalized input, but quick-xml does not do that, this function also performs
545+ /// normalization of EOL characters. That should be done before expanding entities
546+ /// and character references, so cannot be processed later.
547+ ///
548+ /// This function could be used also just to normalize line ends if the iterator
549+ /// won't be stop on `&` characters.
550+ ///
551+ /// # Parameters
552+ ///
553+ /// - `normalized`: Output of the algorithm. Normalized value will be placed here
554+ /// - `iter`: Iterator over bytes of `input`
555+ /// - `input`: Original non-normalized value
556+ /// - `last_pos`: Index of the last byte in `input` that was processed
557+ /// - `index`: Index of the byte in `input` that should be processed now
558+ /// - `seen_cr`: `\r\n` and `\r\x85` sequences should be normalized into one space
559+ /// so this parameter tracks if we seen the `\r` before processing the current byte
560+ /// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm
561+ /// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
562+ ///
563+ /// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
564+ /// [according to the specification]: https://www.w3.org/TR/xml11/#sec-line-ends
565+ fn normalize_step < ' entity , F > (
566+ normalized : & mut String ,
567+ iter : & mut Iter < u8 > ,
568+ input : & str ,
569+ last_pos : usize ,
570+ index : usize ,
571+ depth : usize ,
572+ resolve_entity : & mut F ,
573+ ) -> Result < usize , EscapeError >
574+ where
575+ // the lifetime of the output comes from a capture or is `'static`
576+ F : FnMut ( & str ) -> Option < & ' entity str > ,
577+ {
578+ if depth == 0 {
579+ return Err ( EscapeError :: TooManyNestedEntities ) ;
580+ }
581+ // 4. For another character, append the character to the normalized value.
582+ normalized. push_str ( & input[ last_pos..index] ) ;
583+
584+ match input. as_bytes ( ) [ index] {
585+ b'&' => {
586+ let start = index + 1 ; // +1 - skip `&`
587+ let end = start
588+ + match iter. position ( |& b| b == b';' ) {
589+ Some ( end) => end,
590+ None => return Err ( EscapeError :: UnterminatedEntity ( index..input. len ( ) ) ) ,
591+ } ;
592+
593+ // Content between & and ; - &pat;
594+ // Note, that this content have non-normalized EOLs as required by the specification,
595+ // but because numbers in any case cannot have spaces inside, this is not the problem.
596+ // Normalization of spaces in entity references and checking that they corresponds to
597+ // [`Name`] production on conscience `resolve_entity`.
598+ //
599+ // [`Name`]: https://www.w3.org/TR/xml11/#NT-Name
600+ let pat = & input[ start..end] ;
601+ // 1. For a character reference, append the referenced character
602+ // to the normalized value.
603+ if pat. starts_with ( '#' ) {
604+ let entity = & pat[ 1 ..] ; // starts after the #
605+ let codepoint = parse_number ( entity) . map_err ( EscapeError :: InvalidCharRef ) ?;
606+ normalized. push_str ( codepoint. encode_utf8 ( & mut [ 0u8 ; 4 ] ) ) ;
607+ } else
608+ // 2. For an entity reference, recursively apply this algorithm
609+ // to the replacement text of the entity.
610+ if let Some ( value) = resolve_entity ( pat) {
611+ normalize_steps (
612+ normalized,
613+ & mut value. as_bytes ( ) . iter ( ) ,
614+ value,
615+ 0 ,
616+ depth. saturating_sub ( 1 ) ,
617+ resolve_entity,
618+ ) ?;
619+ } else {
620+ return Err ( EscapeError :: UnrecognizedEntity ( start..end, pat. to_string ( ) ) ) ;
621+ }
622+ Ok ( end + 1 ) // +1 - skip `;`
623+ }
624+ // 3. For a white space character (#x20, #xD, #xA, #x9), append
625+ // a space character (#x20) to the normalized value.
626+ // Space character has no special meaning, so it is handled on step 4
627+ b'\t' => {
628+ normalized. push ( ' ' ) ;
629+ Ok ( index + 1 ) // +1 - skip \t
630+ }
631+ _ => {
632+ let pos = normalize_eol_step ( normalized, input. as_bytes ( ) , index, ' ' ) ;
633+ // We should advance iterator because we may skip several characters
634+ for _ in 0 ..pos - index - 1 {
635+ iter. next ( ) ;
636+ }
637+ Ok ( pos)
638+ }
639+ }
640+ }
641+
642+ ////////////////////////////////////////////////////////////////////////////////////////////////////
643+
422644/// Resolves predefined XML entities or all HTML5 entities depending on the feature
423645/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
424646///
@@ -2022,4 +2244,119 @@ mod normalization {
20222244 ) ;
20232245 }
20242246 }
2247+
2248+ mod attribute {
2249+ use super :: * ;
2250+ use pretty_assertions:: assert_eq;
2251+
2252+ #[ test]
2253+ fn empty ( ) {
2254+ assert_eq ! (
2255+ normalize_attribute_value( "" , 5 , |_| { None } ) ,
2256+ Ok ( "" . into( ) )
2257+ ) ;
2258+ }
2259+
2260+ #[ test]
2261+ fn only_spaces ( ) {
2262+ assert_eq ! (
2263+ normalize_attribute_value( " " , 5 , |_| { None } ) ,
2264+ Ok ( " " . into( ) )
2265+ ) ;
2266+ assert_eq ! (
2267+ normalize_attribute_value( "\t \t \t " , 5 , |_| { None } ) ,
2268+ Ok ( " " . into( ) )
2269+ ) ;
2270+ assert_eq ! (
2271+ normalize_attribute_value( "\r \r \r " , 5 , |_| { None } ) ,
2272+ Ok ( " " . into( ) )
2273+ ) ;
2274+ assert_eq ! (
2275+ normalize_attribute_value( "\n \n \n " , 5 , |_| { None } ) ,
2276+ Ok ( " " . into( ) )
2277+ ) ;
2278+ }
2279+
2280+ #[ test]
2281+ fn already_normalized ( ) {
2282+ assert_eq ! (
2283+ normalize_attribute_value( "already normalized" , 5 , |_| { None } ) ,
2284+ Ok ( "already normalized" . into( ) )
2285+ ) ;
2286+ }
2287+
2288+ #[ test]
2289+ fn characters ( ) {
2290+ assert_eq ! (
2291+ normalize_attribute_value( "string with   character" , 5 , |_| { None } ) ,
2292+ Ok ( "string with character" . into( ) )
2293+ ) ;
2294+ assert_eq ! (
2295+ normalize_attribute_value( "string with   character" , 5 , |_| { None } ) ,
2296+ Ok ( "string with character" . into( ) )
2297+ ) ;
2298+ }
2299+
2300+ #[ test]
2301+ fn entities ( ) {
2302+ assert_eq ! (
2303+ normalize_attribute_value( "string with &entity; reference" , 5 , |_| {
2304+ Some ( "replacement" )
2305+ } ) ,
2306+ Ok ( "string with replacement reference" . into( ) )
2307+ ) ;
2308+ assert_eq ! (
2309+ normalize_attribute_value( "string with &entity-1; reference" , 5 , |entity| {
2310+ match entity {
2311+ "entity-1" => Some ( "recursive &entity-2;" ) ,
2312+ "entity-2" => Some ( "entity 2" ) ,
2313+ _ => None ,
2314+ }
2315+ } ) ,
2316+ Ok ( "string with recursive entity 2 reference" . into( ) )
2317+ ) ;
2318+ }
2319+
2320+ #[ test]
2321+ fn unclosed_entity ( ) {
2322+ assert_eq ! (
2323+ normalize_attribute_value( "string with unclosed &entity reference" , 5 , |_| {
2324+ // 0 ^ = 21 ^ = 38
2325+ Some ( "replacement" )
2326+ } ) ,
2327+ Err ( EscapeError :: UnterminatedEntity ( 21 ..38 ) )
2328+ ) ;
2329+ assert_eq ! (
2330+ normalize_attribute_value(
2331+ "string with unclosed   (character) reference" ,
2332+ // ^ = 21 ^ = 47
2333+ 5 ,
2334+ |_| { None }
2335+ ) ,
2336+ Err ( EscapeError :: UnterminatedEntity ( 21 ..47 ) )
2337+ ) ;
2338+ }
2339+
2340+ #[ test]
2341+ fn unknown_entity ( ) {
2342+ assert_eq ! (
2343+ normalize_attribute_value( "string with unknown &entity; reference" , 5 , |_| {
2344+ // 0 ^ ^ = 21..27
2345+ None
2346+ } ) ,
2347+ Err ( EscapeError :: UnrecognizedEntity (
2348+ 21 ..27 ,
2349+ "entity" . to_string( ) ,
2350+ ) )
2351+ ) ;
2352+ }
2353+
2354+ #[ test]
2355+ fn recursive_entity ( ) {
2356+ assert_eq ! (
2357+ normalize_attribute_value( "&entity; reference" , 5 , |_| Some ( "recursive &entity;" ) ) ,
2358+ Err ( EscapeError :: TooManyNestedEntities ) ,
2359+ ) ;
2360+ }
2361+ }
20252362}
0 commit comments