@@ -4,6 +4,7 @@ use memchr::memchr2_iter;
44use std:: borrow:: Cow ;
55use std:: num:: ParseIntError ;
66use std:: ops:: Range ;
7+ use std:: slice:: Iter ;
78
89/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
910#[ derive( Clone , Debug , PartialEq ) ]
@@ -50,6 +51,12 @@ pub enum EscapeError {
5051 /// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
5152 /// was unsuccessful, not all characters are decimal or hexadecimal numbers.
5253 InvalidCharRef ( ParseCharRefError ) ,
54+ /// Expanded more than maximum possible entities during attribute normalization.
55+ ///
56+ /// Attribute normalization includes expanding of general entities (`&entity;`)
57+ /// which replacement text also could contain entities, which is also must be expanded.
58+ /// If more than 128 entities would be expanded, this error is returned.
59+ TooManyNestedEntities ,
5360}
5461
5562impl std:: fmt:: Display for EscapeError {
@@ -66,6 +73,9 @@ impl std::fmt::Display for EscapeError {
6673 Self :: InvalidCharRef ( e) => {
6774 write ! ( f, "invalid character reference: {}" , e)
6875 }
76+ Self :: TooManyNestedEntities => {
77+ f. write_str ( "too many nested entities in an attribute value" )
78+ }
6979 }
7080 }
7181}
@@ -302,6 +312,182 @@ where
302312 }
303313}
304314
/// Tells whether a byte of an attribute value needs special handling during
/// normalization: one of the whitespace bytes `\t`, `\r`, `\n`, space, or the
/// `&` that starts a reference.
///
/// The byte is taken by reference so the function can be handed directly to
/// `Iterator::position` on a byte-slice iterator.
const fn is_normalization_char(b: &u8) -> bool {
    let byte = *b;
    byte == b'\t' || byte == b'\r' || byte == b'\n' || byte == b' ' || byte == b'&'
}
318+
319+ /// Returns the attribute value normalized as per [the XML specification],
320+ /// using a custom entity resolver.
321+ ///
322+ /// Do not use this method with HTML attributes.
323+ ///
324+ /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>`
325+ /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
326+ /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
327+ /// take precedence.
328+ ///
329+ /// This will allocate unless the raw attribute value does not require normalization.
330+ ///
331+ /// # Parameters
332+ ///
333+ /// - `value`: unnormalized attribute value
334+ /// - `depth`: maximum number of nested entities that can be expanded. If expansion
335+ /// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`]
336+ /// - `resolve_entity`: a function to resolve entity. This function could be called
337+ /// multiple times on the same input and can return different values in each case
338+ /// for the same input, although it is not recommended
339+ ///
340+ /// # Lifetimes
341+ ///
342+ /// - `'input`: lifetime of the unnormalized attribute. If normalization is not requred,
343+ /// the input returned unchanged with the same lifetime
344+ /// - `'entity`: lifetime of all entities that is returned by the entity resolution routine
345+ ///
346+ /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
347+ pub ( crate ) fn normalize_attribute_value < ' input , ' entity , F > (
348+ value : & ' input str ,
349+ depth : usize ,
350+ resolve_entity : F ,
351+ ) -> Result < Cow < ' input , str > , EscapeError >
352+ where
353+ // the lifetime of the output comes from a capture or is `'static`
354+ F : Fn ( & str ) -> Option < & ' entity str > ,
355+ {
356+ let mut iter = value. as_bytes ( ) . iter ( ) ;
357+
358+ // If we found the charater that requires normalization, create a normalized
359+ // version of the attribute, otherwise return the value unchanged
360+ if let Some ( i) = iter. position ( is_normalization_char) {
361+ let mut normalized = String :: with_capacity ( value. len ( ) ) ;
362+ let pos = normalize_step (
363+ & mut normalized,
364+ & mut iter,
365+ value,
366+ 0 ,
367+ i,
368+ depth,
369+ & resolve_entity,
370+ ) ?;
371+
372+ normalize_steps (
373+ & mut normalized,
374+ & mut iter,
375+ value,
376+ pos,
377+ depth,
378+ & resolve_entity,
379+ ) ?;
380+ return Ok ( normalized. into ( ) ) ;
381+ }
382+ Ok ( Cow :: Borrowed ( value) )
383+ }
384+
385+ fn normalize_steps < ' entity , F > (
386+ normalized : & mut String ,
387+ iter : & mut Iter < u8 > ,
388+ input : & str ,
389+ mut pos : usize ,
390+ depth : usize ,
391+ resolve_entity : & F ,
392+ ) -> Result < ( ) , EscapeError >
393+ where
394+ // the lifetime of the output comes from a capture or is `'static`
395+ F : Fn ( & str ) -> Option < & ' entity str > ,
396+ {
397+ while let Some ( i) = iter. position ( is_normalization_char) {
398+ pos = normalize_step ( normalized, iter, input, pos, pos + i, depth, resolve_entity) ?;
399+ }
400+ if let Some ( rest) = input. get ( pos..) {
401+ normalized. push_str ( rest) ;
402+ }
403+ Ok ( ( ) )
404+ }
405+
406+ /// Performs one step of the [normalization algorithm] (but with recursive part):
407+ ///
408+ /// 1. For a character reference, append the referenced character
409+ /// to the normalized value.
410+ /// 2. For an entity reference, recursively apply this algorithm
411+ /// to the replacement text of the entity.
412+ /// 3. For a white space character (#x20, #xD, #xA, #x9), append
413+ /// a space character (#x20) to the normalized value.
414+ /// 4. For another character, append the character to the normalized value.
415+ ///
416+ /// # Parameters
417+ ///
418+ /// - `normalized`: Output of the algorithm. Normalized value will be placed here
419+ /// - `iter`: Iterator over bytes of `input`
420+ /// - `input`: Original non-normalized value
421+ /// - `last_pos`: Index of the last byte in `input` that was processed
422+ /// - `index`: Index of the byte in `input` that should be processed now
423+ /// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm
424+ /// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
425+ ///
426+ /// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
427+ fn normalize_step < ' entity , F > (
428+ normalized : & mut String ,
429+ iter : & mut Iter < u8 > ,
430+ input : & str ,
431+ last_pos : usize ,
432+ index : usize ,
433+ depth : usize ,
434+ resolve_entity : & F ,
435+ ) -> Result < usize , EscapeError >
436+ where
437+ // the lifetime of the output comes from a capture or is `'static`
438+ F : Fn ( & str ) -> Option < & ' entity str > ,
439+ {
440+ if depth == 0 {
441+ return Err ( EscapeError :: TooManyNestedEntities ) ;
442+ }
443+ // 4. For another character, append the character to the normalized value.
444+ normalized. push_str ( & input[ last_pos..index] ) ;
445+
446+ match input. as_bytes ( ) [ index] {
447+ b'&' => {
448+ let start = index + 1 ; // +1 - skip `&`
449+ let end = start
450+ + match iter. position ( |& b| b == b';' ) {
451+ Some ( end) => end,
452+ None => return Err ( EscapeError :: UnterminatedEntity ( index..input. len ( ) ) ) ,
453+ } ;
454+
455+ // Content between & and ; - &pat;
456+ let pat = & input[ start..end] ;
457+ // 1. For a character reference, append the referenced character
458+ // to the normalized value.
459+ if pat. starts_with ( '#' ) {
460+ let entity = & pat[ 1 ..] ; // starts after the #
461+ let codepoint = parse_number ( entity) . map_err ( EscapeError :: InvalidCharRef ) ?;
462+ normalized. push_str ( codepoint. encode_utf8 ( & mut [ 0u8 ; 4 ] ) ) ;
463+ } else
464+ // 2. For an entity reference, recursively apply this algorithm
465+ // to the replacement text of the entity.
466+ if let Some ( value) = resolve_entity ( pat) {
467+ normalize_steps (
468+ normalized,
469+ & mut value. as_bytes ( ) . iter ( ) ,
470+ value,
471+ 0 ,
472+ depth. saturating_sub ( 1 ) ,
473+ resolve_entity,
474+ ) ?;
475+ } else {
476+ return Err ( EscapeError :: UnrecognizedEntity ( start..end, pat. to_string ( ) ) ) ;
477+ }
478+ Ok ( end + 1 ) // +1 - skip `;`
479+ }
480+ // 3. For a white space character (#x20, #xD, #xA, #x9), append
481+ // a space character (#x20) to the normalized value.
482+ b'\t' | b'\n' | b'\r' | b' ' => {
483+ normalized. push ( ' ' ) ;
484+ Ok ( index + 1 ) // +1 - skip character
485+ }
486+
487+ _ => unreachable ! ( "Only '\\ t', '\\ n', '\\ r', ' ', and '&' are possible here" ) ,
488+ }
489+ }
490+
305491/// Resolves predefined XML entities or all HTML5 entities depending on the feature
306492/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
307493///
@@ -1844,3 +2030,115 @@ fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> {
18442030 _ => u32:: from_str_radix ( src, radix) . map_err ( ParseCharRefError :: InvalidNumber ) ,
18452031 }
18462032}
2033+
/// Tests of `normalize_attribute_value`.
#[cfg(test)]
mod normalization {
    use super::*;
    use pretty_assertions::assert_eq;

    /// An empty value needs no normalization.
    #[test]
    fn empty() {
        assert_eq!(
            normalize_attribute_value("", 5, |_| { None }),
            Ok("".into())
        );
    }

    /// Every whitespace character is replaced by exactly one space,
    /// runs of whitespace are not collapsed.
    #[test]
    fn only_spaces() {
        assert_eq!(
            normalize_attribute_value("   ", 5, |_| { None }),
            Ok("   ".into())
        );
        assert_eq!(
            normalize_attribute_value("\t\t\t", 5, |_| { None }),
            Ok("   ".into())
        );
        assert_eq!(
            normalize_attribute_value("\r\r\r", 5, |_| { None }),
            Ok("   ".into())
        );
        assert_eq!(
            normalize_attribute_value("\n\n\n", 5, |_| { None }),
            Ok("   ".into())
        );
    }

    /// A value that contains only single spaces is returned with the same content.
    #[test]
    fn already_normalized() {
        assert_eq!(
            normalize_attribute_value("already normalized", 5, |_| { None }),
            Ok("already normalized".into())
        );
    }

    /// Decimal and hexadecimal character references are replaced by the
    /// referenced character; the resolver is not consulted for them.
    #[test]
    fn characters() {
        assert_eq!(
            normalize_attribute_value("string with &#32; character", 5, |_| { None }),
            Ok("string with   character".into())
        );
        assert_eq!(
            normalize_attribute_value("string with &#x20; character", 5, |_| { None }),
            Ok("string with   character".into())
        );
    }

    /// Entity references are replaced by their replacement text, which is
    /// itself normalized, including references nested in it.
    #[test]
    fn entities() {
        assert_eq!(
            normalize_attribute_value("string with &entity; reference", 5, |_| {
                Some("replacement")
            }),
            Ok("string with replacement reference".into())
        );
        assert_eq!(
            normalize_attribute_value("string with &entity-1; reference", 5, |entity| {
                match entity {
                    "entity-1" => Some("recursive &entity-2;"),
                    "entity-2" => Some("entity 2"),
                    _ => None,
                }
            }),
            Ok("string with recursive entity 2 reference".into())
        );
    }

    /// A reference without the closing `;` is reported with the range from `&`
    /// to the end of the value.
    #[test]
    fn unclosed_entity() {
        assert_eq!(
            normalize_attribute_value("string with unclosed &entity reference", 5, |_| {
                //                     0                    ^ = 21           ^ = 38
                Some("replacement")
            }),
            Err(EscapeError::UnterminatedEntity(21..38))
        );
        assert_eq!(
            normalize_attribute_value("string with unclosed &#32 (character) reference", 5, |_| {
                //                     0                    ^ = 21                    ^ = 47
                None
            }),
            Err(EscapeError::UnterminatedEntity(21..47))
        );
    }

    /// An entity unknown to the resolver is reported with the range of its name.
    #[test]
    fn unknown_entity() {
        assert_eq!(
            normalize_attribute_value("string with unknown &entity; reference", 5, |_| { None }),
            //                         0                    ^     ^ = 21..27
            Err(EscapeError::UnrecognizedEntity(
                21..27,
                "entity".to_string(),
            ))
        );
    }

    /// An entity that references itself exhausts the nesting limit instead of
    /// recursing forever.
    #[test]
    fn recursive_entity() {
        assert_eq!(
            normalize_attribute_value("&entity; reference", 5, |_| Some("recursive &entity;")),
            Err(EscapeError::TooManyNestedEntities),
        );
    }
}
0 commit comments