moonbitlang · peter-jerry-ye · Oct 28, 2025 · Sep 18, 2025 · Sep 18, 2025 · Oct 17, 2025
diff --git a/builtin/deprecated.mbt b/builtin/deprecated.mbt
@@ -219,21 +219,8 @@ pub fn Double::upto(
 }
 
 ///|
-/// Search the index of the first element that satisfies the predicate.
-///
-
-///|
-/// Creates a byte sequence from a UTF-16 encoded string. Each character in the
-/// string is encoded as a pair of bytes in little-endian order.
-///
-/// Parameters:
-///
-/// * `string` : The input string to be converted to a byte sequence.
-///
-/// Returns a new byte sequence containing the UTF-16LE encoded representation of
-/// the input string.
-///
-#deprecated("check @encoding/utf8.encode")
+#deprecated("check `@encoding/utf8.encode`")
+#coverage.skip
 pub fn Bytes::of_string(str : String) -> Bytes {
   FixedArray::make(str.length() * 2, Byte::default())
   ..blit_from_string(0, str, 0, str.length())

diff --git a/encoding/utf16/decode.mbt b/encoding/utf16/decode.mbt
@@ -0,0 +1,179 @@
+// Copyright 2025 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///|
+/// Error raised by `decode` when the input is not well-formed UTF-16.
+///
+/// The payload is the part of the input that had not been consumed when
+/// decoding stopped: it begins at the offending code unit, or at the leftover
+/// bytes that do not form a complete code unit.
+pub suberror Malformed BytesView
+
+///|
+/// The Unicode replacement character (U+FFFD), written by `decode_lossy` in
+/// place of invalid or unrecognized sequences during lossy decoding.
+/// See https://unicode.org/charts/nameslist/n_FFF0.html
+const U_REP = '\u{FFFD}'
+
+///|
+/// Decodes UTF-16 encoded `bytes` into a `String`, rejecting malformed input.
+///
+/// Parameters:
+///
+/// * `bytes` : The UTF-16 encoded input.
+/// * `ignore_bom` : When `true`, a leading byte order mark that agrees with
+///   `endianness` (`FF FE` for `Little`, `FE FF` for `Big`) is dropped before
+///   decoding. When `false` (the default) the input is decoded as-is.
+/// * `endianness` : Byte order of the 16-bit code units; defaults to `Little`.
+///
+/// Returns the decoded string.
+///
+/// Raises `Malformed`, carrying the not-yet-consumed rest of the input, when
+/// an unpaired surrogate is found or when leftover bytes do not form a
+/// complete code unit.
+pub fn decode(
+  bytes : BytesView,
+  ignore_bom? : Bool = false,
+  endianness? : Endian = Little,
+) -> String raise Malformed {
+  // Optionally drop a BOM, but only the one that agrees with `endianness`.
+  let bytes = if ignore_bom {
+    if endianness is Little && bytes is [.. "\xff\xfe", .. rest] {
+      rest
+    } else if endianness is Big && bytes is [.. "\xfe\xff", .. rest] {
+      rest
+    } else {
+      bytes
+    }
+  } else {
+    bytes
+  }
+  if endianness is Little {
+    // Validation-only pass: nothing is copied here, the loop just walks the
+    // code units and raises on the first malformed one.
+    loop bytes {
+      [] => ()
+      // A high surrogate immediately followed by a low surrogate.
+      [
+        u16le(0xD800..=0xDBFF as higher),
+        u16le(0xDC00..=0xDFFF as lower),
+        .. rest,
+      ] as bytes => {
+        // NOTE(review): with higher <= 0xDBFF and lower <= 0xDFFF this sum
+        // tops out at exactly 0x10FFFF, so the guard looks unreachable —
+        // confirm before relying on it.
+        if ((higher.reinterpret_as_int() - 0xD800) << 10) +
+          (lower.reinterpret_as_int() - 0xDC00) +
+          0x10000 >
+          0x10FFFF {
+          raise Malformed(bytes)
+        }
+        continue rest
+      }
+      // Any surrogate reaching this arm was not part of a valid pair above.
+      [u16le(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
+      // Ordinary (non-surrogate) code unit.
+      [u16le(_), .. rest] => continue rest
+      // Non-empty leftover that no arm above matched, e.g. a single trailing
+      // byte.
+      _ as bytes => raise Malformed(bytes)
+    }
+    // The input passed validation, so its bytes are handed over unchanged.
+    // `bytes` here is the (possibly BOM-stripped) view bound at the top of the
+    // function, not one of the per-arm `as bytes` bindings.
+    bytes
+    .data()
+    .to_unchecked_string(offset=bytes.start_offset(), length=bytes.length())
+  } else {
+    // Big-endian input is validated and, in the same pass, re-serialized low
+    // byte first into a fresh buffer of the same length.
+    let string_bytes = FixedArray::make(bytes.length(), b'\x00')
+    // Write cursor into `string_bytes`.
+    let mut i = 0
+    loop bytes {
+      [] => ()
+      // A high surrogate immediately followed by a low surrogate.
+      [
+        u16be(0xD800..=0xDBFF as higher),
+        u16be(0xDC00..=0xDFFF as lower),
+        .. rest,
+      ] as bytes => {
+        // NOTE(review): same as the little-endian branch — the sum cannot
+        // exceed 0x10FFFF for the matched ranges; confirm whether this guard
+        // is intentional.
+        if ((higher.reinterpret_as_int() - 0xD800) << 10) +
+          (lower.reinterpret_as_int() - 0xDC00) +
+          0x10000 >
+          0x10FFFF {
+          raise Malformed(bytes)
+        }
+        // Both code units are written low byte first, high unit before low.
+        string_bytes[i] = (higher & 0xFF).to_byte()
+        string_bytes[i + 1] = (higher >> 8).to_byte()
+        string_bytes[i + 2] = (lower & 0xFF).to_byte()
+        string_bytes[i + 3] = (lower >> 8).to_byte()
+        i += 4
+        continue rest
+      }
+      // Any surrogate reaching this arm was not part of a valid pair above.
+      [u16be(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
+      // Ordinary code unit: store it low byte first.
+      [u16be(code_unit), .. rest] => {
+        string_bytes[i] = (code_unit & 0xFF).to_byte()
+        string_bytes[i + 1] = (code_unit >> 8).to_byte()
+        i += 2
+        continue rest
+      }
+      // Non-empty leftover that no arm above matched, e.g. a single trailing
+      // byte.
+      _ as bytes => raise Malformed(bytes)
+    }
+    // Reached only when every input byte was consumed, so the buffer has been
+    // written in full.
+    string_bytes.unsafe_reinterpret_as_bytes().to_unchecked_string()
+  }
+}
+
+///|
+/// Decodes UTF-16 encoded `bytes` into a `String` without raising: input that
+/// cannot be decoded is replaced by U+FFFD (`U_REP`).
+///
+/// Parameters:
+///
+/// * `bytes` : The UTF-16 encoded input.
+/// * `ignore_bom` : When `true`, a leading byte order mark that agrees with
+///   `endianness` (`FF FE` for `Little`, `FE FF` for `Big`) is dropped before
+///   decoding. When `false` (the default) the input is decoded as-is.
+/// * `endianness` : Byte order of the 16-bit code units; defaults to `Little`.
+///
+/// Returns the decoded string. Each surrogate code unit that is not part of a
+/// high/low pair produces one replacement character and decoding resumes at
+/// the next code unit; leftover bytes that do not form a complete code unit
+/// produce a single final replacement character.
+///
+/// References:
+/// - https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G66453
+pub fn decode_lossy(
+  bytes : BytesView,
+  ignore_bom? : Bool = false,
+  endianness? : Endian = Little,
+) -> String {
+  // Optionally drop a BOM, but only the one that agrees with `endianness`.
+  let bytes = if ignore_bom {
+    if endianness is Little && bytes is [.. "\xff\xfe", .. rest] {
+      rest
+    } else if endianness is Big && bytes is [.. "\xfe\xff", .. rest] {
+      rest
+    } else {
+      bytes
+    }
+  } else {
+    bytes
+  }
+  // The input byte length is passed as the builder's size hint.
+  let builder = StringBuilder::new(size_hint=bytes.length())
+  if endianness is Little {
+    loop bytes {
+      [] => ()
+      // A high surrogate immediately followed by a low surrogate: combine the
+      // two into one code point.
+      [
+        u16le(0xD800..=0xDBFF as higher),
+        u16le(0xDC00..=0xDFFF as lower),
+        .. rest,
+      ] => {
+        let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
+          (lower.reinterpret_as_int() - 0xDC00) +
+          0x10000
+        // NOTE(review): for the matched ranges `ch` cannot exceed 0x10FFFF,
+        // so the replacement branch looks unreachable — confirm.
+        if ch > 0x10FFFF {
+          builder.write_char(U_REP)
+        } else {
+          builder.write_char(ch.unsafe_to_char())
+        }
+        continue rest
+      }
+      // A surrogate that was not part of a valid pair: replace this one code
+      // unit and keep going with whatever follows it.
+      [u16le(0xD800..=0xDFFF), .. rest] => {
+        builder.write_char(U_REP)
+        continue rest
+      }
+      // Ordinary code unit: its value is the code point.
+      [u16le(ch), .. rest] => {
+        builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
+        continue rest
+      }
+      // Non-empty leftover that no arm above matched (e.g. a single trailing
+      // byte): emit one replacement character and stop.
+      _ => builder.write_char(U_REP)
+    }
+  } else {
+    // Same state machine as above, reading code units in big-endian order.
+    loop bytes {
+      [] => ()
+      // A high surrogate immediately followed by a low surrogate: combine the
+      // two into one code point.
+      [
+        u16be(0xD800..=0xDBFF as higher),
+        u16be(0xDC00..=0xDFFF as lower),
+        .. rest,
+      ] => {
+        let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
+          (lower.reinterpret_as_int() - 0xDC00) +
+          0x10000
+        // NOTE(review): for the matched ranges `ch` cannot exceed 0x10FFFF,
+        // so the replacement branch looks unreachable — confirm.
+        if ch > 0x10FFFF {
+          builder.write_char(U_REP)
+        } else {
+          builder.write_char(ch.unsafe_to_char())
+        }
+        continue rest
+      }
+      // A surrogate that was not part of a valid pair: replace this one code
+      // unit and keep going with whatever follows it.
+      [u16be(0xD800..=0xDFFF), .. rest] => {
+        builder.write_char(U_REP)
+        continue rest
+      }
+      // Ordinary code unit: its value is the code point.
+      [u16be(ch), .. rest] => {
+        builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
+        continue rest
+      }
+      // Non-empty leftover that no arm above matched (e.g. a single trailing
+      // byte): emit one replacement character and stop.
+      _ => builder.write_char(U_REP)
+    }
+  }
+  builder.to_string()
+}
diff --git a/encoding/utf16/decode_test.mbt b/encoding/utf16/decode_test.mbt
@@ -0,0 +1,140 @@
+// Copyright 2025 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///|
+/// Happy path for `@utf16.decode`: the same text (ASCII, two BMP characters
+/// and one surrogate pair) is decoded from little-endian input using the
+/// default `endianness`, then from big-endian input with `endianness=Big`.
+test "decoding UTF16 encoded data to String" {
+  let chars = @utf16.decode(
+    b"\x61\x00\x62\x00\x63\x00\x60\x4f\x7d\x59\x3d\xd8\x40\xdc",
+  )
+  inspect(chars, content="abc你好👀")
+  let chars_be = @utf16.decode(
+    b"\x00\x61\x00\x62\x00\x63\x4f\x60\x59\x7d\xd8\x3d\xdc\x40",
+    endianness=Big,
+  )
+  inspect(chars_be, content="abc你好👀")
+}
+
+///|
+/// Decodes BOM-prefixed input in both byte orders, once with the default
+/// `ignore_bom` and once with `ignore_bom=true`.
+///
+/// NOTE(review): without `ignore_bom=true` the BOM bytes are decoded like any
+/// other code unit, so those expected literals presumably contain a leading
+/// U+FEFF that is invisible in most editors — confirm.
+test "decoding UTF16 with bom" {
+  let text = b"\xff\xfe\x61\x00\x62\x00\x63\x00\x60\x4f\x7d\x59\x3d\xd8\x40\xdc"
+  inspect(try! @utf16.decode(text), content="abc你好👀")
+  inspect(try! @utf16.decode(text, ignore_bom=true), content="abc你好👀")
+  let text_be = b"\xfe\xff\x00\x61\x00\x62\x00\x63\x4f\x60\x59\x7d\xd8\x3d\xdc\x40"
+  inspect(
+    try! @utf16.decode(text_be, endianness=Big),
+    content="abc你好👀",
+  )
+  inspect(
+    try! @utf16.decode(text_be, ignore_bom=true, endianness=Big),
+    content="abc你好👀",
+  )
+}
+
+///|
+/// `@utf16.decode_lossy` on malformed input, in both byte orders: a dangling
+/// trailing byte, a lone high surrogate and a lone low surrogate each come out
+/// as a single replacement character.
+test "decoding UTF16 invalid data with replacement" {
+  // Little-endian (default) cases.
+  let unpaired = b"\x61\x00\x00"
+  inspect(@utf16.decode_lossy(unpaired), content="a�")
+  let high_surrogate = b"\x00\xd8"
+  inspect(@utf16.decode_lossy(high_surrogate), content="�")
+  let low_surrogate = b"\x00\xdc"
+  inspect(@utf16.decode_lossy(low_surrogate), content="�")
+  // Big-endian counterparts.
+  let unpaired_be = b"\x00\x61\x00"
+  inspect(@utf16.decode_lossy(unpaired_be, endianness=Big), content="a�")
+  let high_surrogate_be = b"\xd8\x00"
+  inspect(@utf16.decode_lossy(high_surrogate_be, endianness=Big), content="�")
+  let low_surrogate_be = b"\xdc\x00"
+  inspect(@utf16.decode_lossy(low_surrogate_be, endianness=Big), content="�")
+}
+
+///|
+/// `@utf16.decode` on malformed input, in both byte orders. Every case must
+/// raise `Malformed` (reaching `panic()` means no error was raised), and the
+/// error payload is inspected to check that it is the unconsumed tail of the
+/// input starting at the malformed position.
+test "decoding UTF16 invalid data to String" {
+  // Dangling trailing byte after one valid code unit (little-endian).
+  let unpaired = b"\x61\x00\x00"
+  try {
+    let _ = @utf16.decode(unpaired)
+    panic()
+  } catch {
+    Malformed(e) =>
+      inspect(
+        e,
+        content=(
+          #|b"\x00"
+        ),
+      )
+  }
+  // Dangling trailing byte after one valid code unit (big-endian).
+  let unpaired_be = b"\x00\x61\x00"
+  try {
+    let _ = @utf16.decode(unpaired_be, endianness=Big)
+    panic()
+  } catch {
+    Malformed(e) =>
+      inspect(
+        e,
+        content=(
+          #|b"\x00"
+        ),
+      )
+  }
+  // Lone high surrogate (little-endian).
+  let high_surrogate = b"\x00\xd8"
+  try {
+    let _ = @utf16.decode(high_surrogate)
+    panic()
+  } catch {
+    Malformed(e) =>
+      inspect(
+        e,
+        content=(
+          #|b"\x00\xd8"
+        ),
+      )
+  }
+  // Lone high surrogate (big-endian).
+  let high_surrogate_be = b"\xd8\x00"
+  try {
+    let _ = @utf16.decode(high_surrogate_be, endianness=Big)
+    panic()
+  } catch {
+    Malformed(e) =>
+      inspect(
+        e,
+        content=(
+          #|b"\xd8\x00"
+        ),
+      )
+  }
+  // Lone low surrogate (little-endian).
+  let low_surrogate = b"\x00\xdc"
+  try {
+    let _ = @utf16.decode(low_surrogate)
+    panic()
+  } catch {
+    Malformed(e) =>
+      inspect(
+        e,
+        content=(
+          #|b"\x00\xdc"
+        ),
+      )
+  }
+  // Lone low surrogate (big-endian).
+  let low_surrogate_be = b"\xdc\x00"
+  try {
+    let _ = @utf16.decode(low_surrogate_be, endianness=Big)
+    panic()
+  } catch {
+    Malformed(e) =>
+      inspect(
+        e,
+        content=(
+          #|b"\xdc\x00"
+        ),
+      )
+  }
+}