@@ -24,66 +24,156 @@ const U_REP = '\u{FFFD}'
/// Decodes UTF-16 encoded `bytes` into a `String`, validating every code unit.
///
/// - `ignore_bom`: when `true`, a leading byte-order mark that matches
///   `endianness` (`FF FE` for `Little`, `FE FF` for `Big`) is skipped.
/// - `endianness`: byte order of the 16-bit code units; defaults to `Little`.
///
/// Raises `Malformed`, carrying the view that begins at the offending
/// position, when a surrogate is unpaired or the remaining input cannot be
/// read as a whole 16-bit code unit.
pub fn decode(
  bytes : BytesView,
  ignore_bom? : Bool = false,
  endianness? : Endian = Little,
) -> String raise Malformed {
  // Only a BOM that agrees with the requested byte order is stripped.
  let bytes = if ignore_bom {
    if endianness is Little && bytes is [.. "\xff\xfe", .. rest] {
      rest
    } else if endianness is Big && bytes is [.. "\xfe\xff", .. rest] {
      rest
    } else {
      bytes
    }
  } else {
    bytes
  }
  if endianness is Little {
    // check the string
    loop bytes {
      [] => ()
      [
        u16le(0xD800..=0xDBFF as higher),
        u16le(0xDC00..=0xDFFF as lower),
        .. rest,
      ] as bytes => {
        // With both surrogates range-checked by the pattern, the combined
        // value tops out at exactly 0x10FFFF; the test is purely defensive.
        if ((higher.reinterpret_as_int() - 0xD800) << 10) +
          (lower.reinterpret_as_int() - 0xDC00) +
          0x10000 >
          0x10FFFF {
          raise Malformed(bytes)
        }
        continue rest
      }
      // Any surrogate that did not form a high/low pair above is unpaired.
      [u16le(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
      [u16le(_), .. rest] => continue rest
      // Leftover input too short to hold a 16-bit code unit.
      _ as bytes => raise Malformed(bytes)
    }
    // The validated range is handed to `to_unchecked_string` as-is.
    bytes
    .data()
    .to_unchecked_string(offset=bytes.start_offset(), length=bytes.length())
  } else {
    // Big-endian input: validate and, at the same time, write every code unit
    // into a fresh buffer with its bytes swapped (low byte first).
    let string_bytes = FixedArray::make(bytes.length(), b'\x00')
    let mut i = 0
    loop bytes {
      [] => ()
      [
        u16be(0xD800..=0xDBFF as higher),
        u16be(0xDC00..=0xDFFF as lower),
        .. rest,
      ] as bytes => {
        // Defensive only: a range-checked surrogate pair cannot exceed
        // 0x10FFFF.
        if ((higher.reinterpret_as_int() - 0xD800) << 10) +
          (lower.reinterpret_as_int() - 0xDC00) +
          0x10000 >
          0x10FFFF {
          raise Malformed(bytes)
        }
        string_bytes[i] = (higher & 0xFF).to_byte()
        string_bytes[i + 1] = (higher >> 8).to_byte()
        string_bytes[i + 2] = (lower & 0xFF).to_byte()
        string_bytes[i + 3] = (lower >> 8).to_byte()
        i += 4
        continue rest
      }
      // Any surrogate that did not form a high/low pair above is unpaired.
      [u16be(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
      [u16be(code_unit), .. rest] => {
        string_bytes[i] = (code_unit & 0xFF).to_byte()
        string_bytes[i + 1] = (code_unit >> 8).to_byte()
        i += 2
        continue rest
      }
      // Leftover input too short to hold a 16-bit code unit.
      _ as bytes => raise Malformed(bytes)
    }
    string_bytes.unsafe_reinterpret_as_bytes().to_unchecked_string()
  }
}
5399
///|
/// Decodes UTF-16 encoded `bytes` into a `String` without ever failing:
/// every unpaired surrogate, and any leftover input too short to hold a
/// 16-bit code unit, is replaced by `U_REP`.
///
/// - `ignore_bom`: when `true`, a leading byte-order mark that matches
///   `endianness` (`FF FE` for `Little`, `FE FF` for `Big`) is skipped.
/// - `endianness`: byte order of the 16-bit code units; defaults to `Little`.
///
/// References :
/// - https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G66453
pub fn decode_lossy(
  bytes : BytesView,
  ignore_bom? : Bool = false,
  endianness? : Endian = Little,
) -> String {
  // Only a BOM that agrees with the requested byte order is stripped.
  let bytes = if ignore_bom {
    if endianness is Little && bytes is [.. "\xff\xfe", .. rest] {
      rest
    } else if endianness is Big && bytes is [.. "\xfe\xff", .. rest] {
      rest
    } else {
      bytes
    }
  } else {
    bytes
  }
  let builder = StringBuilder::new(size_hint=bytes.length())
  if endianness is Little {
    loop bytes {
      [] => ()
      [
        u16le(0xD800..=0xDBFF as higher),
        u16le(0xDC00..=0xDFFF as lower),
        .. rest,
      ] => {
        let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
          (lower.reinterpret_as_int() - 0xDC00) +
          0x10000
        // Defensive only: a range-checked surrogate pair cannot exceed
        // 0x10FFFF.
        if ch > 0x10FFFF {
          builder.write_char(U_REP)
        } else {
          builder.write_char(ch.unsafe_to_char())
        }
        continue rest
      }
      // A surrogate that did not pair up above: substitute and move on by one
      // code unit.
      [u16le(0xD800..=0xDFFF), .. rest] => {
        builder.write_char(U_REP)
        continue rest
      }
      [u16le(ch), .. rest] => {
        builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
        continue rest
      }
      // Leftover input too short for a code unit: substitute once and stop.
      _ => builder.write_char(U_REP)
    }
  } else {
    loop bytes {
      [] => ()
      [
        u16be(0xD800..=0xDBFF as higher),
        u16be(0xDC00..=0xDFFF as lower),
        .. rest,
      ] => {
        let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
          (lower.reinterpret_as_int() - 0xDC00) +
          0x10000
        // Defensive only: a range-checked surrogate pair cannot exceed
        // 0x10FFFF.
        if ch > 0x10FFFF {
          builder.write_char(U_REP)
        } else {
          builder.write_char(ch.unsafe_to_char())
        }
        continue rest
      }
      // A surrogate that did not pair up above: substitute and move on by one
      // code unit.
      [u16be(0xD800..=0xDFFF), .. rest] => {
        builder.write_char(U_REP)
        continue rest
      }
      [u16be(ch), .. rest] => {
        builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
        continue rest
      }
      // Leftover input too short for a code unit: substitute once and stop.
      _ => builder.write_char(U_REP)
    }
  }
  builder.to_string()
}
0 commit comments