Skip to content

Commit 37a44a6

Browse files
feat: add utf16be support
1 parent 2fd6675 commit 37a44a6

File tree

6 files changed

+259
-49
lines changed

6 files changed

+259
-49
lines changed

encoding/utf16/decode.mbt

Lines changed: 129 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -24,66 +24,156 @@ const U_REP = '\u{FFFD}'
2424
pub fn decode(
2525
bytes : BytesView,
2626
ignore_bom? : Bool = false,
27+
endianness? : Endian = Little,
2728
) -> String raise Malformed {
28-
let bytes = if ignore_bom && bytes is [.. "\xff\xfe", .. rest] {
29-
rest
29+
let bytes = if ignore_bom {
30+
if endianness is Little && bytes is [.. "\xff\xfe", .. rest] {
31+
rest
32+
} else if endianness is Big && bytes is [.. "\xfe\xff", .. rest] {
33+
rest
34+
} else {
35+
bytes
36+
}
3037
} else {
3138
bytes
3239
}
33-
// check the string
34-
loop bytes {
35-
[] => ()
36-
[u16le(0xD800..=0xDBFF as higher), u16le(0xDC00..=0xDFFF as lower), .. rest] as bytes => {
37-
if ((higher.reinterpret_as_int() - 0xD800) << 10) +
38-
(lower.reinterpret_as_int() - 0xDC00) +
39-
0x10000 >
40-
0x10FFFF {
41-
raise Malformed(bytes)
40+
if endianness is Little {
41+
// check the string
42+
loop bytes {
43+
[] => ()
44+
[
45+
u16le(0xD800..=0xDBFF as higher),
46+
u16le(0xDC00..=0xDFFF as lower),
47+
.. rest,
48+
] as bytes => {
49+
if ((higher.reinterpret_as_int() - 0xD800) << 10) +
50+
(lower.reinterpret_as_int() - 0xDC00) +
51+
0x10000 >
52+
0x10FFFF {
53+
raise Malformed(bytes)
54+
}
55+
continue rest
4256
}
43-
continue rest
57+
[u16le(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
58+
[u16le(_), .. rest] => continue rest
59+
_ as bytes => raise Malformed(bytes)
4460
}
45-
[u16le(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
46-
[u16le(_), .. rest] => continue rest
47-
_ as bytes => raise Malformed(bytes)
61+
bytes
62+
.data()
63+
.to_unchecked_string(offset=bytes.start_offset(), length=bytes.length())
64+
} else {
65+
let string_bytes = FixedArray::make(bytes.length(), b'\x00')
66+
let mut i = 0
67+
loop bytes {
68+
[] => ()
69+
[
70+
u16be(0xD800..=0xDBFF as higher),
71+
u16be(0xDC00..=0xDFFF as lower),
72+
.. rest,
73+
] as bytes => {
74+
if ((higher.reinterpret_as_int() - 0xD800) << 10) +
75+
(lower.reinterpret_as_int() - 0xDC00) +
76+
0x10000 >
77+
0x10FFFF {
78+
raise Malformed(bytes)
79+
}
80+
string_bytes[i] = (higher & 0xFF).to_byte()
81+
string_bytes[i + 1] = (higher >> 8).to_byte()
82+
string_bytes[i + 2] = (lower & 0xFF).to_byte()
83+
string_bytes[i + 3] = (lower >> 8).to_byte()
84+
i += 4
85+
continue rest
86+
}
87+
[u16be(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
88+
[u16be(code_unit), .. rest] => {
89+
string_bytes[i] = (code_unit & 0xFF).to_byte()
90+
string_bytes[i + 1] = (code_unit >> 8).to_byte()
91+
i += 2
92+
continue rest
93+
}
94+
_ as bytes => raise Malformed(bytes)
95+
}
96+
string_bytes.unsafe_reinterpret_as_bytes().to_unchecked_string()
4897
}
49-
bytes
50-
.data()
51-
.to_unchecked_string(offset=bytes.start_offset(), length=bytes.length())
5298
}
5399

54100
///|
55101
///
56102
/// References :
57103
/// - https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G66453
58-
pub fn decode_lossy(bytes : BytesView, ignore_bom? : Bool = false) -> String {
59-
let bytes = if ignore_bom && bytes is [.. "\xff\xfe", .. rest] {
60-
rest
104+
pub fn decode_lossy(
105+
bytes : BytesView,
106+
ignore_bom? : Bool = false,
107+
endianness? : Endian = Little,
108+
) -> String {
109+
let bytes = if ignore_bom {
110+
if endianness is Little && bytes is [.. "\xff\xfe", .. rest] {
111+
rest
112+
} else if endianness is Big && bytes is [.. "\xfe\xff", .. rest] {
113+
rest
114+
} else {
115+
bytes
116+
}
61117
} else {
62118
bytes
63119
}
64120
let builder = StringBuilder::new(size_hint=bytes.length())
65-
loop bytes {
66-
[] => ()
67-
[u16le(0xD800..=0xDBFF as higher), u16le(0xDC00..=0xDFFF as lower), .. rest] => {
68-
let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
69-
(lower.reinterpret_as_int() - 0xDC00) +
70-
0x10000
71-
if ch > 0x10FFFF {
121+
if endianness is Little {
122+
loop bytes {
123+
[] => ()
124+
[
125+
u16le(0xD800..=0xDBFF as higher),
126+
u16le(0xDC00..=0xDFFF as lower),
127+
.. rest,
128+
] => {
129+
let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
130+
(lower.reinterpret_as_int() - 0xDC00) +
131+
0x10000
132+
if ch > 0x10FFFF {
133+
builder.write_char(U_REP)
134+
} else {
135+
builder.write_char(ch.unsafe_to_char())
136+
}
137+
continue rest
138+
}
139+
[u16le(0xD800..=0xDFFF), .. rest] => {
72140
builder.write_char(U_REP)
73-
} else {
74-
builder.write_char(ch.unsafe_to_char())
141+
continue rest
75142
}
76-
continue rest
77-
}
78-
[u16le(0xD800..=0xDFFF), .. rest] => {
79-
builder.write_char(U_REP)
80-
continue rest
143+
[u16le(ch), .. rest] => {
144+
builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
145+
continue rest
146+
}
147+
_ => builder.write_char(U_REP)
81148
}
82-
[u16le(ch), .. rest] => {
83-
builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
84-
continue rest
149+
} else {
150+
loop bytes {
151+
[] => ()
152+
[
153+
u16be(0xD800..=0xDBFF as higher),
154+
u16be(0xDC00..=0xDFFF as lower),
155+
.. rest,
156+
] => {
157+
let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
158+
(lower.reinterpret_as_int() - 0xDC00) +
159+
0x10000
160+
if ch > 0x10FFFF {
161+
builder.write_char(U_REP)
162+
} else {
163+
builder.write_char(ch.unsafe_to_char())
164+
}
165+
continue rest
166+
}
167+
[u16be(0xD800..=0xDFFF), .. rest] => {
168+
builder.write_char(U_REP)
169+
continue rest
170+
}
171+
[u16be(ch), .. rest] => {
172+
builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
173+
continue rest
174+
}
175+
_ => builder.write_char(U_REP)
85176
}
86-
_ => builder.write_char(U_REP)
87177
}
88178
builder.to_string()
89179
}

encoding/utf16/decode_test.mbt

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,13 +18,27 @@ test "decoding UTF16 encoded data to String" {
1818
b"\x61\x00\x62\x00\x63\x00\x60\x4f\x7d\x59\x3d\xd8\x40\xdc",
1919
)
2020
inspect(chars, content="abc你好👀")
21+
let chars_be = @utf16.decode(
22+
b"\x00\x61\x00\x62\x00\x63\x4f\x60\x59\x7d\xd8\x3d\xdc\x40",
23+
endianness=Big,
24+
)
25+
inspect(chars_be, content="abc你好👀")
2126
}
2227

2328
///|
2429
test "decoding UTF16 with bom" {
2530
let text = b"\xff\xfe\x61\x00\x62\x00\x63\x00\x60\x4f\x7d\x59\x3d\xd8\x40\xdc"
2631
inspect(try! @utf16.decode(text), content="abc你好👀")
2732
inspect(try! @utf16.decode(text, ignore_bom=true), content="abc你好👀")
33+
let text_be = b"\xfe\xff\x00\x61\x00\x62\x00\x63\x4f\x60\x59\x7d\xd8\x3d\xdc\x40"
34+
inspect(
35+
try! @utf16.decode(text_be, endianness=Big),
36+
content="abc你好👀",
37+
)
38+
inspect(
39+
try! @utf16.decode(text_be, ignore_bom=true, endianness=Big),
40+
content="abc你好👀",
41+
)
2842
}
2943

3044
///|
@@ -35,6 +49,12 @@ test "decoding UTF16 invalid data with replacement" {
3549
inspect(@utf16.decode_lossy(high_surrogate), content="�")
3650
let low_surrogate = b"\x00\xdc"
3751
inspect(@utf16.decode_lossy(low_surrogate), content="�")
52+
let unpaired_be = b"\x00\x61\x00"
53+
inspect(@utf16.decode_lossy(unpaired_be, endianness=Big), content="a�")
54+
let high_surrogate_be = b"\xd8\x00"
55+
inspect(@utf16.decode_lossy(high_surrogate_be, endianness=Big), content="�")
56+
let low_surrogate_be = b"\xdc\x00"
57+
inspect(@utf16.decode_lossy(low_surrogate_be, endianness=Big), content="�")
3858
}
3959

4060
///|
@@ -52,6 +72,19 @@ test "decoding UTF16 invalid data to String" {
5272
),
5373
)
5474
}
75+
let unpaired_be = b"\x00\x61\x00"
76+
try {
77+
let _ = @utf16.decode(unpaired_be, endianness=Big)
78+
panic()
79+
} catch {
80+
Malformed(e) =>
81+
inspect(
82+
e,
83+
content=(
84+
#|b"\x00"
85+
),
86+
)
87+
}
5588
let high_surrogate = b"\x00\xd8"
5689
try {
5790
let _ = @utf16.decode(high_surrogate)
@@ -65,6 +98,19 @@ test "decoding UTF16 invalid data to String" {
6598
),
6699
)
67100
}
101+
let high_surrogate_be = b"\xd8\x00"
102+
try {
103+
let _ = @utf16.decode(high_surrogate_be, endianness=Big)
104+
panic()
105+
} catch {
106+
Malformed(e) =>
107+
inspect(
108+
e,
109+
content=(
110+
#|b"\xd8\x00"
111+
),
112+
)
113+
}
68114
let low_surrogate = b"\x00\xdc"
69115
try {
70116
let _ = @utf16.decode(low_surrogate)
@@ -78,4 +124,17 @@ test "decoding UTF16 invalid data to String" {
78124
),
79125
)
80126
}
127+
let low_surrogate_be = b"\xdc\x00"
128+
try {
129+
let _ = @utf16.decode(low_surrogate_be, endianness=Big)
130+
panic()
131+
} catch {
132+
Malformed(e) =>
133+
inspect(
134+
e,
135+
content=(
136+
#|b"\xdc\x00"
137+
),
138+
)
139+
}
81140
}

encoding/utf16/encode.mbt

Lines changed: 30 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -16,16 +16,40 @@
1616
/// Encodes a string into a UTF-16 byte array.
1717
///
1818
/// Assuming the string is valid.
19-
pub fn encode(str : StringView, bom? : Bool = false) -> Bytes {
20-
if bom is true {
19+
pub fn encode(
20+
str : StringView,
21+
bom? : Bool = false,
22+
endianness? : Endian = Little,
23+
) -> Bytes {
24+
if endianness is Little {
25+
if bom is true {
26+
let arr = FixedArray::make(str.length() * 2 + 2, b'\x00')
27+
arr[0] = 0xFF
28+
arr[1] = 0xFE
29+
arr.blit_from_string(2, str.data(), str.start_offset(), str.length())
30+
arr.unsafe_reinterpret_as_bytes()
31+
} else {
32+
let arr = FixedArray::make(str.length() * 2, b'\x00')
33+
arr.blit_from_string(0, str.data(), str.start_offset(), str.length())
34+
arr.unsafe_reinterpret_as_bytes()
35+
}
36+
} else if bom is true {
2137
let arr = FixedArray::make(str.length() * 2 + 2, b'\x00')
22-
arr[0] = 0xFF
23-
arr[1] = 0xFE
24-
arr.blit_from_string(2, str.data(), str.start_offset(), str.length())
38+
arr[0] = 0xFE
39+
arr[1] = 0xFF
40+
for i in 0..<str.length() {
41+
let code_unit = str[i]
42+
arr[2 + i * 2] = (code_unit >> 8).to_byte()
43+
arr[2 + i * 2 + 1] = (code_unit & 0xFF).to_byte()
44+
}
2545
arr.unsafe_reinterpret_as_bytes()
2646
} else {
2747
let arr = FixedArray::make(str.length() * 2, b'\x00')
28-
arr.blit_from_string(0, str.data(), str.start_offset(), str.length())
48+
for i in 0..<str.length() {
49+
let code_unit = str[i]
50+
arr[i * 2] = (code_unit >> 8).to_byte()
51+
arr[i * 2 + 1] = (code_unit & 0xFF).to_byte()
52+
}
2953
arr.unsafe_reinterpret_as_bytes()
3054
}
3155
}

encoding/utf16/encode_test.mbt

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,11 +22,25 @@ test "encode" {
2222
#|b"a\x00b\x00c\x00`O}Y=\xd8@\xdc"
2323
),
2424
)
25+
let encoded_be = @utf16.encode(s, endianness=Big)
26+
inspect(
27+
encoded_be,
28+
content=(
29+
#|b"\x00a\x00b\x00cO`Y}\xd8=\xdc@"
30+
),
31+
)
2532
let encoded_with_bom = @utf16.encode(s, bom=true)
2633
inspect(
2734
encoded_with_bom,
2835
content=(
2936
#|b"\xff\xfea\x00b\x00c\x00`O}Y=\xd8@\xdc"
3037
),
3138
)
39+
let encoded_be_with_bom = @utf16.encode(s, bom=true, endianness=Big)
40+
inspect(
41+
encoded_be_with_bom,
42+
content=(
43+
#|b"\xfe\xff\x00a\x00b\x00cO`Y}\xd8=\xdc@"
44+
),
45+
)
3246
}

encoding/utf16/pkg.generated.mbti

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,21 @@
11
// Generated using `moon info`, DON'T EDIT IT
2-
package "moonbitlang/core/encoding/utf16le"
2+
package "moonbitlang/core/encoding/utf16"
33

44
// Values
5-
fn decode(BytesView, ignore_bom? : Bool) -> String raise Malformed
5+
fn decode(BytesView, ignore_bom? : Bool, endianness? : Endian) -> String raise Malformed
66

7-
fn decode_lossy(BytesView, ignore_bom? : Bool) -> String
7+
fn decode_lossy(BytesView, ignore_bom? : Bool, endianness? : Endian) -> String
88

9-
fn encode(StringView, bom? : Bool) -> Bytes
9+
fn encode(StringView, bom? : Bool, endianness? : Endian) -> Bytes
1010

1111
// Errors
1212
pub suberror Malformed BytesView
1313

1414
// Types and methods
15+
pub(all) enum Endian {
16+
Little
17+
Big
18+
}
1519

1620
// Type aliases
1721

0 commit comments

Comments
 (0)