feat: add utf16be support

peter-jerry-ye · peter-jerry-ye · commit 37a44a649c02 · 2025-10-20T15:19:05.000+08:00
diff --git a/encoding/utf16/decode.mbt b/encoding/utf16/decode.mbt
@@ -24,66 +24,156 @@ const U_REP = '\u{FFFD}'
 pub fn decode(
   bytes : BytesView,
   ignore_bom? : Bool = false,
+  endianness? : Endian = Little,
 ) -> String raise Malformed {
-  let bytes = if ignore_bom && bytes is [.. "\xff\xfe", .. rest] {
-    rest
+  let bytes = if ignore_bom {
+    if endianness is Little && bytes is [.. "\xff\xfe", .. rest] {
+      rest
+    } else if endianness is Big && bytes is [.. "\xfe\xff", .. rest] {
+      rest
+    } else {
+      bytes
+    }
   } else {
     bytes
   }
-  // check the string
-  loop bytes {
-    [] => ()
-    [u16le(0xD800..=0xDBFF as higher), u16le(0xDC00..=0xDFFF as lower), .. rest] as bytes => {
-      if ((higher.reinterpret_as_int() - 0xD800) << 10) +
-        (lower.reinterpret_as_int() - 0xDC00) +
-        0x10000 >
-        0x10FFFF {
-        raise Malformed(bytes)
+  if endianness is Little {
+    // check the string
+    loop bytes {
+      [] => ()
+      [
+        u16le(0xD800..=0xDBFF as higher),
+        u16le(0xDC00..=0xDFFF as lower),
+        .. rest,
+      ] as bytes => {
+        if ((higher.reinterpret_as_int() - 0xD800) << 10) +
+          (lower.reinterpret_as_int() - 0xDC00) +
+          0x10000 >
+          0x10FFFF {
+          raise Malformed(bytes)
+        }
+        continue rest
       }
-      continue rest
+      [u16le(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
+      [u16le(_), .. rest] => continue rest
+      _ as bytes => raise Malformed(bytes)
     }
-    [u16le(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
-    [u16le(_), .. rest] => continue rest
-    _ as bytes => raise Malformed(bytes)
+    bytes
+    .data()
+    .to_unchecked_string(offset=bytes.start_offset(), length=bytes.length())
+  } else {
+    let string_bytes = FixedArray::make(bytes.length(), b'\x00')
+    let mut i = 0
+    loop bytes {
+      [] => ()
+      [
+        u16be(0xD800..=0xDBFF as higher),
+        u16be(0xDC00..=0xDFFF as lower),
+        .. rest,
+      ] as bytes => {
+        if ((higher.reinterpret_as_int() - 0xD800) << 10) +
+          (lower.reinterpret_as_int() - 0xDC00) +
+          0x10000 >
+          0x10FFFF {
+          raise Malformed(bytes)
+        }
+        string_bytes[i] = (higher & 0xFF).to_byte()
+        string_bytes[i + 1] = (higher >> 8).to_byte()
+        string_bytes[i + 2] = (lower & 0xFF).to_byte()
+        string_bytes[i + 3] = (lower >> 8).to_byte()
+        i += 4
+        continue rest
+      }
+      [u16be(0xD800..=0xDFFF), ..] as bytes => raise Malformed(bytes)
+      [u16be(code_unit), .. rest] => {
+        string_bytes[i] = (code_unit & 0xFF).to_byte()
+        string_bytes[i + 1] = (code_unit >> 8).to_byte()
+        i += 2
+        continue rest
+      }
+      _ as bytes => raise Malformed(bytes)
+    }
+    string_bytes.unsafe_reinterpret_as_bytes().to_unchecked_string()
   }
-  bytes
-  .data()
-  .to_unchecked_string(offset=bytes.start_offset(), length=bytes.length())
 }
 
 ///|
 /// 
 /// References : 
 /// - https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G66453
-pub fn decode_lossy(bytes : BytesView, ignore_bom? : Bool = false) -> String {
-  let bytes = if ignore_bom && bytes is [.. "\xff\xfe", .. rest] {
-    rest
+pub fn decode_lossy(
+  bytes : BytesView,
+  ignore_bom? : Bool = false,
+  endianness? : Endian = Little,
+) -> String {
+  let bytes = if ignore_bom {
+    if endianness is Little && bytes is [.. "\xff\xfe", .. rest] {
+      rest
+    } else if endianness is Big && bytes is [.. "\xfe\xff", .. rest] {
+      rest
+    } else {
+      bytes
+    }
   } else {
     bytes
   }
   let builder = StringBuilder::new(size_hint=bytes.length())
-  loop bytes {
-    [] => ()
-    [u16le(0xD800..=0xDBFF as higher), u16le(0xDC00..=0xDFFF as lower), .. rest] => {
-      let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
-        (lower.reinterpret_as_int() - 0xDC00) +
-        0x10000
-      if ch > 0x10FFFF {
+  if endianness is Little {
+    loop bytes {
+      [] => ()
+      [
+        u16le(0xD800..=0xDBFF as higher),
+        u16le(0xDC00..=0xDFFF as lower),
+        .. rest,
+      ] => {
+        let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
+          (lower.reinterpret_as_int() - 0xDC00) +
+          0x10000
+        if ch > 0x10FFFF {
+          builder.write_char(U_REP)
+        } else {
+          builder.write_char(ch.unsafe_to_char())
+        }
+        continue rest
+      }
+      [u16le(0xD800..=0xDFFF), .. rest] => {
         builder.write_char(U_REP)
-      } else {
-        builder.write_char(ch.unsafe_to_char())
+        continue rest
       }
-      continue rest
-    }
-    [u16le(0xD800..=0xDFFF), .. rest] => {
-      builder.write_char(U_REP)
-      continue rest
+      [u16le(ch), .. rest] => {
+        builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
+        continue rest
+      }
+      _ => builder.write_char(U_REP)
     }
-    [u16le(ch), .. rest] => {
-      builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
-      continue rest
+  } else {
+    loop bytes {
+      [] => ()
+      [
+        u16be(0xD800..=0xDBFF as higher),
+        u16be(0xDC00..=0xDFFF as lower),
+        .. rest,
+      ] => {
+        let ch = ((higher.reinterpret_as_int() - 0xD800) << 10) +
+          (lower.reinterpret_as_int() - 0xDC00) +
+          0x10000
+        if ch > 0x10FFFF {
+          builder.write_char(U_REP)
+        } else {
+          builder.write_char(ch.unsafe_to_char())
+        }
+        continue rest
+      }
+      [u16be(0xD800..=0xDFFF), .. rest] => {
+        builder.write_char(U_REP)
+        continue rest
+      }
+      [u16be(ch), .. rest] => {
+        builder.write_char(ch.reinterpret_as_int().unsafe_to_char())
+        continue rest
+      }
+      _ => builder.write_char(U_REP)
     }
-    _ => builder.write_char(U_REP)
   }
   builder.to_string()
 }
diff --git a/encoding/utf16/decode_test.mbt b/encoding/utf16/decode_test.mbt
@@ -18,13 +18,27 @@ test "decoding UTF16 encoded data to String" {
     b"\x61\x00\x62\x00\x63\x00\x60\x4f\x7d\x59\x3d\xd8\x40\xdc",
   )
   inspect(chars, content="abc你好👀")
+  let chars_be = @utf16.decode(
+    b"\x00\x61\x00\x62\x00\x63\x4f\x60\x59\x7d\xd8\x3d\xdc\x40",
+    endianness=Big,
+  )
+  inspect(chars_be, content="abc你好👀")
 }
 
 ///|
 test "decoding UTF16 with bom" {
   let text = b"\xff\xfe\x61\x00\x62\x00\x63\x00\x60\x4f\x7d\x59\x3d\xd8\x40\xdc"
   inspect(try! @utf16.decode(text), content="﻿abc你好👀")
   inspect(try! @utf16.decode(text, ignore_bom=true), content="abc你好👀")
+  let text_be = b"\xfe\xff\x00\x61\x00\x62\x00\x63\x4f\x60\x59\x7d\xd8\x3d\xdc\x40"
+  inspect(
+    try! @utf16.decode(text_be, endianness=Big),
+    content="﻿abc你好👀",
+  )
+  inspect(
+    try! @utf16.decode(text_be, ignore_bom=true, endianness=Big),
+    content="abc你好👀",
+  )
 }
 
 ///|
@@ -35,6 +49,12 @@ test "decoding UTF16 invalid data with replacement" {
   inspect(@utf16.decode_lossy(high_surrogate), content="�")
   let low_surrogate = b"\x00\xdc"
   inspect(@utf16.decode_lossy(low_surrogate), content="�")
+  let unpaired_be = b"\x00\x61\x00"
+  inspect(@utf16.decode_lossy(unpaired_be, endianness=Big), content="a�")
+  let high_surrogate_be = b"\xd8\x00"
+  inspect(@utf16.decode_lossy(high_surrogate_be, endianness=Big), content="�")
+  let low_surrogate_be = b"\xdc\x00"
+  inspect(@utf16.decode_lossy(low_surrogate_be, endianness=Big), content="�")
 }
 
 ///|
@@ -52,6 +72,19 @@ test "decoding UTF16 invalid data to String" {
         ),
       )
   }
+  let unpaired_be = b"\x00\x61\x00"
+  try {
+    let _ = @utf16.decode(unpaired_be, endianness=Big)
+    panic()
+  } catch {
+    Malformed(e) =>
+      inspect(
+        e,
+        content=(
+          #|b"\x00"
+        ),
+      )
+  }
   let high_surrogate = b"\x00\xd8"
   try {
     let _ = @utf16.decode(high_surrogate)
@@ -65,6 +98,19 @@ test "decoding UTF16 invalid data to String" {
         ),
       )
   }
+  let high_surrogate_be = b"\xd8\x00"
+  try {
+    let _ = @utf16.decode(high_surrogate_be, endianness=Big)
+    panic()
+  } catch {
+    Malformed(e) =>
+      inspect(
+        e,
+        content=(
+          #|b"\xd8\x00"
+        ),
+      )
+  }
   let low_surrogate = b"\x00\xdc"
   try {
     let _ = @utf16.decode(low_surrogate)
@@ -78,4 +124,17 @@ test "decoding UTF16 invalid data to String" {
         ),
       )
   }
+  let low_surrogate_be = b"\xdc\x00"
+  try {
+    let _ = @utf16.decode(low_surrogate_be, endianness=Big)
+    panic()
+  } catch {
+    Malformed(e) =>
+      inspect(
+        e,
+        content=(
+          #|b"\xdc\x00"
+        ),
+      )
+  }
 }
diff --git a/encoding/utf16/encode.mbt b/encoding/utf16/encode.mbt
@@ -16,16 +16,40 @@
 /// Encodes a string into a UTF-16 byte array.
 /// 
 /// Assuming the string is valid.
-pub fn encode(str : StringView, bom? : Bool = false) -> Bytes {
-  if bom is true {
+pub fn encode(
+  str : StringView,
+  bom? : Bool = false,
+  endianness? : Endian = Little,
+) -> Bytes {
+  if endianness is Little {
+    if bom is true {
+      let arr = FixedArray::make(str.length() * 2 + 2, b'\x00')
+      arr[0] = 0xFF
+      arr[1] = 0xFE
+      arr.blit_from_string(2, str.data(), str.start_offset(), str.length())
+      arr.unsafe_reinterpret_as_bytes()
+    } else {
+      let arr = FixedArray::make(str.length() * 2, b'\x00')
+      arr.blit_from_string(0, str.data(), str.start_offset(), str.length())
+      arr.unsafe_reinterpret_as_bytes()
+    }
+  } else if bom is true {
     let arr = FixedArray::make(str.length() * 2 + 2, b'\x00')
-    arr[0] = 0xFF
-    arr[1] = 0xFE
-    arr.blit_from_string(2, str.data(), str.start_offset(), str.length())
+    arr[0] = 0xFE
+    arr[1] = 0xFF
+    for i in 0..<str.length() {
+      let code_unit = str[i]
+      arr[2 + i * 2] = (code_unit >> 8).to_byte()
+      arr[2 + i * 2 + 1] = (code_unit & 0xFF).to_byte()
+    }
     arr.unsafe_reinterpret_as_bytes()
   } else {
     let arr = FixedArray::make(str.length() * 2, b'\x00')
-    arr.blit_from_string(0, str.data(), str.start_offset(), str.length())
+    for i in 0..<str.length() {
+      let code_unit = str[i]
+      arr[i * 2] = (code_unit >> 8).to_byte()
+      arr[i * 2 + 1] = (code_unit & 0xFF).to_byte()
+    }
     arr.unsafe_reinterpret_as_bytes()
   }
 }
diff --git a/encoding/utf16/encode_test.mbt b/encoding/utf16/encode_test.mbt
@@ -22,11 +22,25 @@ test "encode" {
       #|b"a\x00b\x00c\x00`O}Y=\xd8@\xdc"
     ),
   )
+  let encoded_be = @utf16.encode(s, endianness=Big)
+  inspect(
+    encoded_be,
+    content=(
+      #|b"\x00a\x00b\x00cO`Y}\xd8=\xdc@"
+    ),
+  )
   let encoded_with_bom = @utf16.encode(s, bom=true)
   inspect(
     encoded_with_bom,
     content=(
       #|b"\xff\xfea\x00b\x00c\x00`O}Y=\xd8@\xdc"
     ),
   )
+  let encoded_be_with_bom = @utf16.encode(s, bom=true, endianness=Big)
+  inspect(
+    encoded_be_with_bom,
+    content=(
+      #|b"\xfe\xff\x00a\x00b\x00cO`Y}\xd8=\xdc@"
+    ),
+  )
 }
diff --git a/encoding/utf16/pkg.generated.mbti b/encoding/utf16/pkg.generated.mbti
@@ -1,17 +1,21 @@
 // Generated using `moon info`, DON'T EDIT IT
-package "moonbitlang/core/encoding/utf16le"
+package "moonbitlang/core/encoding/utf16"
 
 // Values
-fn decode(BytesView, ignore_bom? : Bool) -> String raise Malformed
+fn decode(BytesView, ignore_bom? : Bool, endianness? : Endian) -> String raise Malformed
 
-fn decode_lossy(BytesView, ignore_bom? : Bool) -> String
+fn decode_lossy(BytesView, ignore_bom? : Bool, endianness? : Endian) -> String
 
-fn encode(StringView, bom? : Bool) -> Bytes
+fn encode(StringView, bom? : Bool, endianness? : Endian) -> Bytes
 
 // Errors
 pub suberror Malformed BytesView
 
 // Types and methods
+pub(all) enum Endian {
+  Little
+  Big
+}
 
 // Type aliases
 
diff --git a/encoding/utf16/types.mbt b/encoding/utf16/types.mbt