@@ -305,7 +305,7 @@ where
305305////////////////////////////////////////////////////////////////////////////////////////////////////
306306
307307// TODO: It would be better to reuse buffer after decoding if possible
308- pub ( crate ) fn normalize_xml_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
308+ pub ( crate ) fn normalize_xml11_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
309309 let bytes = text. as_bytes ( ) ;
310310
311311 // The following sequences of UTF-8 encoded input should be translated into
@@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
326326 // we are sure that index is within the string
327327 normalized. push_str ( & text[ 0 ..i] ) ;
328328
329- let mut pos = normalize_xml_eol_step ( & mut normalized, text, i, '\n' ) ;
329+ let mut pos = normalize_xml11_eol_step ( & mut normalized, text, i, '\n' ) ;
330330 while let Some ( i) = memchr3 ( b'\r' , 0xC2 , 0xE2 , & bytes[ pos..] ) {
331331 let index = pos + i;
332332 // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
333333 // we are sure that index is within the string
334334 normalized. push_str ( & text[ pos..index] ) ;
335- pos = normalize_xml_eol_step ( & mut normalized, text, index, '\n' ) ;
335+ pos = normalize_xml11_eol_step ( & mut normalized, text, index, '\n' ) ;
336336 }
337337 if let Some ( rest) = text. get ( pos..) {
338338 normalized. push_str ( rest) ;
@@ -378,7 +378,7 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
378378///
379379/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
380380/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
381- fn normalize_xml_eol_step ( normalized : & mut String , text : & str , index : usize , ch : char ) -> usize {
381+ fn normalize_xml11_eol_step ( normalized : & mut String , text : & str , index : usize , ch : char ) -> usize {
382382 let input = text. as_bytes ( ) ;
383383 match input[ index] {
384384 b'\r' => {
@@ -388,15 +388,15 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
388388 normalized. push ( ch) ;
389389 return index + 2 ; // skip \r\n
390390 }
391- // Because input is correct UTF-8 and in UTF-8 every character has
392- // an unique prefix, byte C2 means only start of #x85 character
393391 if next == 0xC2 {
392+ // UTF-8 encoding of #x85 character is [c2 85]
394393 if index + 2 < input. len ( ) && input[ index + 2 ] == 0x85 {
395394 normalized. push ( ch) ;
396395 } else {
396+ normalized. push ( ch) ;
397397 // NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because
398398 // we are sure that index is within the string
399- normalized. push_str ( & text[ index..index + 3 ] ) ;
399+ normalized. push_str ( & text[ index + 1 ..index + 3 ] ) ;
400400 }
401401 return index + 3 ; // skip \r + UTF-8 encoding of character (c2 xx)
402402 }
@@ -441,7 +441,7 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
441441////////////////////////////////////////////////////////////////////////////////////////////////////
442442
443443// TODO: It would be better to reuse buffer after decoding if possible
444- pub ( crate ) fn normalize_html_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
444+ pub ( crate ) fn normalize_xml10_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
445445 let bytes = text. as_bytes ( ) ;
446446
447447 // The following sequences of UTF-8 encoded input should be translated into
@@ -459,13 +459,13 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
459459 // we are sure that index is within the string
460460 normalized. push_str ( & text[ 0 ..i] ) ;
461461
462- let mut pos = normalize_html_eol_step ( & mut normalized, bytes, i, '\n' ) ;
462+ let mut pos = normalize_xml10_eol_step ( & mut normalized, bytes, i, '\n' ) ;
463463 while let Some ( i) = memchr ( b'\r' , & bytes[ pos..] ) {
464464 let index = pos + i;
465465 // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
466466 // we are sure that index is within the string
467467 normalized. push_str ( & text[ pos..index] ) ;
468- pos = normalize_html_eol_step ( & mut normalized, bytes, index, '\n' ) ;
468+ pos = normalize_xml10_eol_step ( & mut normalized, bytes, index, '\n' ) ;
469469 }
470470 if let Some ( rest) = text. get ( pos..) {
471471 normalized. push_str ( rest) ;
@@ -487,7 +487,12 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
487487/// - `ch`: the character that should be pushed to the string in place of the newline sequence
488488///
489489/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
490- fn normalize_html_eol_step ( normalized : & mut String , input : & [ u8 ] , index : usize , ch : char ) -> usize {
490+ fn normalize_xml10_eol_step (
491+ normalized : & mut String ,
492+ input : & [ u8 ] ,
493+ index : usize ,
494+ ch : char ,
495+ ) -> usize {
491496 match input[ index] {
492497 b'\r' => {
493498 normalized. push ( ch) ;
@@ -2062,56 +2067,59 @@ mod normalization {
20622067 mod eol {
20632068 use super :: * ;
20642069
2065- mod xml {
2070+ mod xml11 {
20662071 use super :: * ;
20672072 use pretty_assertions:: assert_eq;
20682073
20692074 #[ test]
20702075 fn empty ( ) {
2071- assert_eq ! ( normalize_xml_eols ( "" ) , "" ) ;
2076+ assert_eq ! ( normalize_xml11_eols ( "" ) , "" ) ;
20722077 }
20732078
20742079 #[ test]
20752080 fn already_normalized ( ) {
20762081 assert_eq ! (
2077- normalize_xml_eols ( "\n already \n \n normalized\n " ) ,
2082+ normalize_xml11_eols ( "\n already \n \n normalized\n " ) ,
20782083 "\n already \n \n normalized\n " ,
20792084 ) ;
20802085 }
20812086
20822087 #[ test]
20832088 fn cr_lf ( ) {
2084- assert_eq ! ( normalize_xml_eols( "\r \n some\r \n \r \n text" ) , "\n some\n \n text" ) ;
2089+ assert_eq ! (
2090+ normalize_xml11_eols( "\r \n some\r \n \r \n text" ) ,
2091+ "\n some\n \n text"
2092+ ) ;
20852093 }
20862094
20872095 #[ test]
20882096 fn cr_u0085 ( ) {
20892097 assert_eq ! (
2090- normalize_xml_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
2098+ normalize_xml11_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
20912099 "\n some\n \n text" ,
20922100 ) ;
20932101 }
20942102
20952103 #[ test]
20962104 fn u0085 ( ) {
20972105 assert_eq ! (
2098- normalize_xml_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
2106+ normalize_xml11_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
20992107 "\n some\n \n text" ,
21002108 ) ;
21012109 }
21022110
21032111 #[ test]
21042112 fn u2028 ( ) {
21052113 assert_eq ! (
2106- normalize_xml_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
2114+ normalize_xml11_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
21072115 "\n some\n \n text" ,
21082116 ) ;
21092117 }
21102118
21112119 #[ test]
21122120 fn mixed ( ) {
21132121 assert_eq ! (
2114- normalize_xml_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
2122+ normalize_xml11_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
21152123 "\n \n \n \n \n \n some\n \n \n text" ,
21162124 ) ;
21172125 }
@@ -2138,9 +2146,9 @@ mod normalization {
21382146
21392147 dbg ! ( ( input, & description) ) ;
21402148 if ch == '\u{0085}' {
2141- assert_eq ! ( normalize_xml_eols ( input) , "\n " , "{}" , description) ;
2149+ assert_eq ! ( normalize_xml11_eols ( input) , "\n " , "{}" , description) ;
21422150 } else {
2143- assert_eq ! ( normalize_xml_eols ( input) , input, "{}" , description) ;
2151+ assert_eq ! ( normalize_xml11_eols ( input) , input, "{}" , description) ;
21442152 }
21452153 }
21462154 assert_eq ! ( ( first..=last) . count( ) , 64 ) ;
@@ -2171,9 +2179,12 @@ mod normalization {
21712179
21722180 dbg ! ( ( input, & description) ) ;
21732181 if ch == '\u{0085}' {
2174- assert_eq ! ( normalize_xml_eols ( input) , "\n " , "{}" , description) ;
2182+ assert_eq ! ( normalize_xml11_eols ( input) , "\n " , "{}" , description) ;
21752183 } else {
2176- assert_eq ! ( normalize_xml_eols( input) , input, "{}" , description) ;
2184+ let mut expected = utf8. clone ( ) ;
2185+ expected[ 0 ] = b'\n' ;
2186+ let expected = std:: str:: from_utf8 ( & expected) . expect ( & description) ;
2187+ assert_eq ! ( normalize_xml11_eols( input) , expected, "{}" , description) ;
21772188 }
21782189 }
21792190 assert_eq ! ( ( first..=last) . count( ) , 64 ) ;
@@ -2204,68 +2215,68 @@ mod normalization {
22042215
22052216 dbg ! ( ( input, & description) ) ;
22062217 if ch == '\u{2028}' {
2207- assert_eq ! ( normalize_xml_eols ( input) , "\n " , "{}" , description) ;
2218+ assert_eq ! ( normalize_xml11_eols ( input) , "\n " , "{}" , description) ;
22082219 } else {
2209- assert_eq ! ( normalize_xml_eols ( input) , input, "{}" , description) ;
2220+ assert_eq ! ( normalize_xml11_eols ( input) , input, "{}" , description) ;
22102221 }
22112222 }
22122223 assert_eq ! ( ( first..=last) . count( ) , 4096 ) ;
22132224 }
22142225 }
22152226
2216- mod html {
2227+ mod xml10 {
22172228 use super :: * ;
22182229 use pretty_assertions:: assert_eq;
22192230
22202231 #[ test]
22212232 fn empty ( ) {
2222- assert_eq ! ( normalize_html_eols ( "" ) , "" ) ;
2233+ assert_eq ! ( normalize_xml10_eols ( "" ) , "" ) ;
22232234 }
22242235
22252236 #[ test]
22262237 fn already_normalized ( ) {
22272238 assert_eq ! (
2228- normalize_html_eols ( "\n already \n \n normalized\n " ) ,
2239+ normalize_xml10_eols ( "\n already \n \n normalized\n " ) ,
22292240 "\n already \n \n normalized\n " ,
22302241 ) ;
22312242 }
22322243
22332244 #[ test]
22342245 fn cr_lf ( ) {
22352246 assert_eq ! (
2236- normalize_html_eols ( "\r \n some\r \n \r \n text" ) ,
2247+ normalize_xml10_eols ( "\r \n some\r \n \r \n text" ) ,
22372248 "\n some\n \n text"
22382249 ) ;
22392250 }
22402251
22412252 #[ test]
22422253 fn cr_u0085 ( ) {
22432254 assert_eq ! (
2244- normalize_html_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
2255+ normalize_xml10_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
22452256 "\n \u{0085} some\n \u{0085} \n \u{0085} text" ,
22462257 ) ;
22472258 }
22482259
22492260 #[ test]
22502261 fn u0085 ( ) {
22512262 assert_eq ! (
2252- normalize_html_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
2263+ normalize_xml10_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
22532264 "\u{0085} some\u{0085} \u{0085} text" ,
22542265 ) ;
22552266 }
22562267
22572268 #[ test]
22582269 fn u2028 ( ) {
22592270 assert_eq ! (
2260- normalize_html_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
2271+ normalize_xml10_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
22612272 "\u{2028} some\u{2028} \u{2028} text" ,
22622273 ) ;
22632274 }
22642275
22652276 #[ test]
22662277 fn mixed ( ) {
22672278 assert_eq ! (
2268- normalize_html_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
2279+ normalize_xml10_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
22692280 "\n \n \n \u{2028} \n \n some\n \u{0085} \n \u{0085} text" ,
22702281 ) ;
22712282 }
0 commit comments