Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 2 additions & 15 deletions builtin/deprecated.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -219,21 +219,8 @@ pub fn Double::upto(
}

///|
/// Search the index of the first element that satisfies the predicate.
///

///|
/// Creates a byte sequence from a UTF-16 encoded string. Each character in the
/// string is encoded as a pair of bytes in little-endian order.
///
/// Parameters:
///
/// * `string` : The input string to be converted to a byte sequence.
///
/// Returns a new byte sequence containing the UTF-16LE encoded representation of
/// the input string.
///
#deprecated("check @encoding/utf8.encode")
#deprecated("check `@encoding/utf8.encode`")
#coverage.skip
pub fn Bytes::of_string(str : String) -> Bytes {
FixedArray::make(str.length() * 2, Byte::default())
..blit_from_string(0, str, 0, str.length())
Expand Down
179 changes: 179 additions & 0 deletions encoding/utf16/decode.mbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
// Copyright 2025 International Digital Economy Academy
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

///|
/// Error raised by `decode` when the input is not well-formed UTF-16.
/// The payload is the view of the input starting at the first offending byte.
pub suberror Malformed BytesView

///|
/// The Unicode replacement character (U+FFFD), written by lossy decoding in
/// place of each invalid or unrecognized sequence.
/// https://unicode.org/charts/nameslist/n_FFF0.html
const U_REP = '\u{FFFD}'

///|
/// Decodes UTF-16 encoded `bytes` into a `String`, raising on malformed input.
///
/// Parameters:
///
/// * `bytes` : The UTF-16 encoded input.
/// * `ignore_bom` : When `true`, a leading byte order mark matching
///   `endianness` (`FF FE` for `Little`, `FE FF` for `Big`) is dropped before
///   decoding. Defaults to `false`, in which case a BOM is decoded like any
///   other code unit.
/// * `endianness` : Byte order of the 16-bit code units. Defaults to `Little`.
///
/// Returns the decoded string.
///
/// Raises `Malformed`, carrying the view that starts at the first offending
/// byte, when the input contains an unpaired surrogate or ends with a dangling
/// byte that does not form a complete code unit.
pub fn decode(
  bytes : BytesView,
  ignore_bom? : Bool = false,
  endianness? : Endian = Little,
) -> String raise Malformed {
  let bytes = if ignore_bom {
    if endianness is Little && bytes is [.. "\xff\xfe", .. rest] {
      rest
    } else if endianness is Big && bytes is [.. "\xfe\xff", .. rest] {
      rest
    } else {
      bytes
    }
  } else {
    bytes
  }
  if endianness is Little {
    // Little-endian input is only validated here; the same bytes are then
    // handed to `to_unchecked_string` without being copied or reordered.
    loop bytes {
      [] => ()
      // A high surrogate followed by a low surrogate always combines to a
      // value in 0x10000..=0x10FFFF, so a matched pair needs no range check.
      [u16le(0xD800..=0xDBFF), u16le(0xDC00..=0xDFFF), .. rest] => continue rest
      // Any surrogate that did not match as part of a pair is unpaired.
      [u16le(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
      [u16le(_), .. rest] => continue rest
      // Fewer than two bytes remain: dangling trailing byte.
      _ as bytes => raise Malformed(bytes)
    }
    bytes
    .data()
    .to_unchecked_string(offset=bytes.start_offset(), length=bytes.length())
  } else {
    // Big-endian input is validated and byte-swapped into a fresh buffer,
    // low byte first, before being reinterpreted as a string.
    let string_bytes = FixedArray::make(bytes.length(), b'\x00')
    let mut i = 0
    loop bytes {
      [] => ()
      // A matched surrogate pair is always in range (see the little-endian
      // branch), so both code units are copied through unchanged.
      [
        u16be(0xD800..=0xDBFF as higher),
        u16be(0xDC00..=0xDFFF as lower),
        .. rest,
      ] => {
        string_bytes[i] = (higher & 0xFF).to_byte()
        string_bytes[i + 1] = (higher >> 8).to_byte()
        string_bytes[i + 2] = (lower & 0xFF).to_byte()
        string_bytes[i + 3] = (lower >> 8).to_byte()
        i += 4
        continue rest
      }
      // Any surrogate that did not match as part of a pair is unpaired.
      [u16be(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
      [u16be(code_unit), .. rest] => {
        string_bytes[i] = (code_unit & 0xFF).to_byte()
        string_bytes[i + 1] = (code_unit >> 8).to_byte()
        i += 2
        continue rest
      }
      // Fewer than two bytes remain: dangling trailing byte.
      _ as bytes => raise Malformed(bytes)
    }
    string_bytes.unsafe_reinterpret_as_bytes().to_unchecked_string()
  }
}

///|
/// Decodes UTF-16 encoded `bytes` into a `String`, writing U+FFFD in place of
/// malformed input instead of raising.
///
/// Parameters:
///
/// * `bytes` : The UTF-16 encoded input.
/// * `ignore_bom` : When `true`, a leading byte order mark matching
///   `endianness` (`FF FE` for `Little`, `FE FF` for `Big`) is dropped before
///   decoding. Defaults to `false`, in which case a BOM is decoded like any
///   other code unit.
/// * `endianness` : Byte order of the 16-bit code units. Defaults to `Little`.
///
/// Returns the decoded string. Each unpaired surrogate code unit is replaced
/// by one U+FFFD, and so is a dangling trailing byte that does not form a
/// complete code unit.
///
/// References :
/// - https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G66453
pub fn decode_lossy(
  bytes : BytesView,
  ignore_bom? : Bool = false,
  endianness? : Endian = Little,
) -> String {
  let bytes = if ignore_bom {
    if endianness is Little && bytes is [.. "\xff\xfe", .. rest] {
      rest
    } else if endianness is Big && bytes is [.. "\xfe\xff", .. rest] {
      rest
    } else {
      bytes
    }
  } else {
    bytes
  }
  let builder = StringBuilder::new(size_hint=bytes.length())
  if endianness is Little {
    loop bytes {
      [] => ()
      [
        u16le(0xD800..=0xDBFF as higher),
        u16le(0xDC00..=0xDFFF as lower),
        .. rest,
      ] => {
        // A high surrogate followed by a low surrogate always combines to a
        // value in 0x10000..=0x10FFFF, so no range check is needed.
        let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
          (lower.reinterpret_as_int() - 0xDC00) +
          0x10000
        builder.write_char(ch.unsafe_to_char())
        continue rest
      }
      // Any surrogate that did not match as part of a pair is unpaired.
      [u16le(0xD800..=0xDFFF), .. rest] => {
        builder.write_char(U_REP)
        continue rest
      }
      [u16le(ch), .. rest] => {
        builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
        continue rest
      }
      // Fewer than two bytes remain: dangling trailing byte.
      _ => builder.write_char(U_REP)
    }
  } else {
    loop bytes {
      [] => ()
      [
        u16be(0xD800..=0xDBFF as higher),
        u16be(0xDC00..=0xDFFF as lower),
        .. rest,
      ] => {
        // A matched surrogate pair is always in range; see the
        // little-endian branch.
        let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
          (lower.reinterpret_as_int() - 0xDC00) +
          0x10000
        builder.write_char(ch.unsafe_to_char())
        continue rest
      }
      // Any surrogate that did not match as part of a pair is unpaired.
      [u16be(0xD800..=0xDFFF), .. rest] => {
        builder.write_char(U_REP)
        continue rest
      }
      [u16be(ch), .. rest] => {
        builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
        continue rest
      }
      // Fewer than two bytes remain: dangling trailing byte.
      _ => builder.write_char(U_REP)
    }
  }
  builder.to_string()
}
140 changes: 140 additions & 0 deletions encoding/utf16/decode_test.mbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// Copyright 2025 International Digital Economy Academy
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

///|
test "decoding UTF16 encoded data to String" {
  // Little-endian is the default byte order.
  let encoded_le = b"\x61\x00\x62\x00\x63\x00\x60\x4f\x7d\x59\x3d\xd8\x40\xdc"
  let decoded_le = @utf16.decode(encoded_le)
  inspect(decoded_le, content="abc你好👀")
  // The same text with every code unit byte-swapped, decoded as big-endian.
  let encoded_be = b"\x00\x61\x00\x62\x00\x63\x4f\x60\x59\x7d\xd8\x3d\xdc\x40"
  let decoded_be = @utf16.decode(encoded_be, endianness=Big)
  inspect(decoded_be, content="abc你好👀")
}

///|
test "decoding UTF16 with bom" {
  // Little-endian input that starts with the BOM bytes FF FE.
  let text = b"\xff\xfe\x61\x00\x62\x00\x63\x00\x60\x4f\x7d\x59\x3d\xd8\x40\xdc"
  // With the default `ignore_bom=false` the BOM is decoded like any other
  // code unit, so the result starts with U+FEFF. It is written as an escape
  // because a literal U+FEFF is invisible in source.
  inspect(try! @utf16.decode(text), content="\u{FEFF}abc你好👀")
  // With `ignore_bom=true` the matching BOM is dropped before decoding.
  inspect(try! @utf16.decode(text, ignore_bom=true), content="abc你好👀")
  // Big-endian input that starts with the BOM bytes FE FF.
  let text_be = b"\xfe\xff\x00\x61\x00\x62\x00\x63\x4f\x60\x59\x7d\xd8\x3d\xdc\x40"
  inspect(
    try! @utf16.decode(text_be, endianness=Big),
    content="\u{FEFF}abc你好👀",
  )
  inspect(
    try! @utf16.decode(text_be, ignore_bom=true, endianness=Big),
    content="abc你好👀",
  )
}

///|
test "decoding UTF16 invalid data with replacement" {
  // Little-endian: dangling trailing byte after a complete code unit.
  let dangling_le = b"\x61\x00\x00"
  inspect(@utf16.decode_lossy(dangling_le), content="a�")
  // Little-endian: lone high surrogate.
  let lone_high_le = b"\x00\xd8"
  inspect(@utf16.decode_lossy(lone_high_le), content="�")
  // Little-endian: lone low surrogate.
  let lone_low_le = b"\x00\xdc"
  inspect(@utf16.decode_lossy(lone_low_le), content="�")
  // Big-endian: dangling trailing byte after a complete code unit.
  let dangling_be = b"\x00\x61\x00"
  inspect(@utf16.decode_lossy(dangling_be, endianness=Big), content="a�")
  // Big-endian: lone high surrogate.
  let lone_high_be = b"\xd8\x00"
  inspect(@utf16.decode_lossy(lone_high_be, endianness=Big), content="�")
  // Big-endian: lone low surrogate.
  let lone_low_be = b"\xdc\x00"
  inspect(@utf16.decode_lossy(lone_low_be, endianness=Big), content="�")
}

///|
test "decoding UTF16 invalid data to String" {
  // Every case must raise `Malformed`; reaching `panic()` means `decode`
  // accepted malformed input. The payload is the view starting at the
  // offending byte.

  // Little-endian: dangling trailing byte.
  let dangling_le = b"\x61\x00\x00"
  try {
    ignore(@utf16.decode(dangling_le))
    panic()
  } catch {
    Malformed(remaining) =>
      inspect(
        remaining,
        content=(
          #|b"\x00"
        ),
      )
  }

  // Big-endian: dangling trailing byte.
  let dangling_be = b"\x00\x61\x00"
  try {
    ignore(@utf16.decode(dangling_be, endianness=Big))
    panic()
  } catch {
    Malformed(remaining) =>
      inspect(
        remaining,
        content=(
          #|b"\x00"
        ),
      )
  }

  // Little-endian: lone high surrogate.
  let lone_high_le = b"\x00\xd8"
  try {
    ignore(@utf16.decode(lone_high_le))
    panic()
  } catch {
    Malformed(remaining) =>
      inspect(
        remaining,
        content=(
          #|b"\x00\xd8"
        ),
      )
  }

  // Big-endian: lone high surrogate.
  let lone_high_be = b"\xd8\x00"
  try {
    ignore(@utf16.decode(lone_high_be, endianness=Big))
    panic()
  } catch {
    Malformed(remaining) =>
      inspect(
        remaining,
        content=(
          #|b"\xd8\x00"
        ),
      )
  }

  // Little-endian: lone low surrogate.
  let lone_low_le = b"\x00\xdc"
  try {
    ignore(@utf16.decode(lone_low_le))
    panic()
  } catch {
    Malformed(remaining) =>
      inspect(
        remaining,
        content=(
          #|b"\x00\xdc"
        ),
      )
  }

  // Big-endian: lone low surrogate.
  let lone_low_be = b"\xdc\x00"
  try {
    ignore(@utf16.decode(lone_low_be, endianness=Big))
    panic()
  } catch {
    Malformed(remaining) =>
      inspect(
        remaining,
        content=(
          #|b"\xdc\x00"
        ),
      )
  }
}
Loading