Return error on invalid unicode sequences (#14666)

lukaszsamson · web-flow · commit 23776d9e8f8c · 2025-07-27T19:04:14.000+02:00
diff --git a/lib/elixir/src/elixir_parser.yrl b/lib/elixir/src/elixir_parser.yrl
@@ -1016,7 +1016,13 @@ build_bin_string({bin_string, Location, Args}, ExtraMeta) ->
   {'<<>>', Meta, string_parts(Args)}.
 
 build_list_string({list_string, _Location, [H]} = Token, ExtraMeta) when is_binary(H) ->
-  handle_literal(elixir_utils:characters_to_list(H), Token, ExtraMeta);
+  try
+    List = elixir_utils:characters_to_list(H),
+    handle_literal(List, Token, ExtraMeta)
+  catch
+    error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
+      return_error(?location(Token), elixir_utils:characters_to_list(Message), "'")
+  end;
 build_list_string({list_string, Location, Args}, ExtraMeta) ->
   Meta = meta_from_location(Location),
   MetaWithExtra =
diff --git a/lib/elixir/src/elixir_tokenizer.erl b/lib/elixir/src/elixir_tokenizer.erl
@@ -1023,55 +1023,93 @@ is_unnecessary_quote(_Parts, _Scope) ->
 unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{}) when
     is_binary(Part) andalso byte_size(Part) > 255;
     is_list(Part) andalso length(Part) > 255 ->
-  {error, {?LOC(Line, Column), "atom length must be less than system limit: ", elixir_utils:characters_to_list(Part)}};
+  try
+    PartList = elixir_utils:characters_to_list(Part),
+    {error, {?LOC(Line, Column), "atom length must be less than system limit: ", PartList}}
+  catch
+    error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
+      {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
+  end;
 unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{static_atoms_encoder=StaticAtomsEncoder}) when
     is_function(StaticAtomsEncoder) ->
-  Value = elixir_utils:characters_to_binary(Part),
-  case StaticAtomsEncoder(Value, [{line, Line}, {column, Column}]) of
-    {ok, Term} ->
-      {ok, Term};
-    {error, Reason} when is_binary(Reason) ->
-      {error, {?LOC(Line, Column), elixir_utils:characters_to_list(Reason) ++ ": ", elixir_utils:characters_to_list(Part)}}
+  EncodeResult = try
+    ValueEncBin = elixir_utils:characters_to_binary(Part),
+    ValueEncList = elixir_utils:characters_to_list(Part),
+    {ok, ValueEncBin, ValueEncList}
+  catch
+    error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
+      {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
+  end,
+
+  case EncodeResult of
+    {ok, Value, ValueList} ->
+      case StaticAtomsEncoder(Value, [{line, Line}, {column, Column}]) of
+        {ok, Term} ->
+          {ok, Term};
+        {error, Reason} when is_binary(Reason) ->
+          {error, {?LOC(Line, Column), elixir_utils:characters_to_list(Reason) ++ ": ", ValueList}}
+      end;
+    EncError -> EncError
   end;
 unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_binary(Binary) ->
   try
     {ok, binary_to_existing_atom(Binary, utf8)}
   catch
     error:badarg -> 
       % Check if it's a UTF-8 issue by trying to convert to list
-      elixir_utils:characters_to_list(Binary),
-      % If we get here, it's not a UTF-8 issue
-      {error, {?LOC(Line, Column), "unsafe atom does not exist: ", elixir_utils:characters_to_list(Binary)}}
+      try
+        List = elixir_utils:characters_to_list(Binary),
+        % If we get here, it's not a UTF-8 issue
+        {error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}}
+      catch
+        error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
+          {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
+      end
   end;
 unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{}) when is_binary(Binary) ->
   try
     {ok, binary_to_atom(Binary, utf8)}
   catch
     error:badarg ->
       % Try to convert using elixir_utils to get proper UnicodeConversionError
-      elixir_utils:characters_to_list(Binary),
-      % If we get here, it's not a UTF-8 issue, so it's some other badarg
-      {error, {?LOC(Line, Column), "invalid atom: ", elixir_utils:characters_to_list(Binary)}}
+      try
+        List = elixir_utils:characters_to_list(Binary),
+        % If we get here, it's not a UTF-8 issue, so it's some other badarg
+        {error, {?LOC(Line, Column), "invalid atom: ", List}}
+      catch
+        error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
+          {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
+      end
   end;
 unsafe_to_atom(List, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_list(List) ->
   try
     {ok, list_to_existing_atom(List)}
   catch
     error:badarg ->
       % Try to convert using elixir_utils to get proper UnicodeConversionError
-      elixir_utils:characters_to_binary(List),
-      % If we get here, it's not a UTF-8 issue
-      {error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}}
+      try
+        elixir_utils:characters_to_binary(List),
+        % If we get here, it's not a UTF-8 issue
+        {error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}}
+      catch
+        error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
+          {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
+      end
   end;
 unsafe_to_atom(List, Line, Column, #elixir_tokenizer{}) when is_list(List) ->
   try
     {ok, list_to_atom(List)}
   catch
     error:badarg ->
       % Try to convert using elixir_utils to get proper UnicodeConversionError
-      elixir_utils:characters_to_binary(List),
-      % If we get here, it's not a UTF-8 issue, so it's some other badarg
-      {error, {?LOC(Line, Column), "invalid atom: ", List}}
+      try
+        elixir_utils:characters_to_binary(List),
+        % If we get here, it's not a UTF-8 issue, so it's some other badarg
+        {error, {?LOC(Line, Column), "invalid atom: ", List}}
+      catch
+        error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
+          {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
+      end
   end.
 
 collect_modifiers([H | T], Buffer) when ?is_downcase(H) or ?is_upcase(H) or ?is_digit(H) ->
@@ -1095,7 +1133,12 @@ extract_heredoc_with_interpolation(Line, Column, Scope, Interpol, T, H) ->
           {Parts1, {ShouldWarn, _}} = lists:mapfoldl(Fun, {false, Line}, Parts0),
           Parts2 = extract_heredoc_head(Parts1),
           NewScope = maybe_heredoc_warn(ShouldWarn, Column, InterScope, H),
-          {ok, NewLine, NewColumn, tokens_to_binary(Parts2), Rest, NewScope};
+          try
+            {ok, NewLine, NewColumn, tokens_to_binary(Parts2), Rest, NewScope}
+          catch
+            error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
+              {error, interpolation_format(Message, " (for heredoc starting at line ~B)", [Line], Line, Column, [H, H, H], [H, H, H])}
+          end;
 
         {error, Reason} ->
           {error, interpolation_format(Reason, " (for heredoc starting at line ~B)", [Line], Line, Column, [H, H, H], [H, H, H])}
@@ -1166,8 +1209,13 @@ unescape_tokens(Tokens, Line, Column, #elixir_tokenizer{unescape=true}) ->
     {error, Message, Token} ->
       {error, {?LOC(Line, Column), Message ++ ". Syntax error after: ", Token}}
   end;
-unescape_tokens(Tokens, _Line, _Column, #elixir_tokenizer{unescape=false}) ->
-  {ok, tokens_to_binary(Tokens)}.
+unescape_tokens(Tokens, Line, Column, #elixir_tokenizer{unescape=false}) ->
+  try
+    {ok, tokens_to_binary(Tokens)}
+  catch
+    error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
+      {error, {?LOC(Line, Column), "invalid encoding in tokens: ", elixir_utils:characters_to_list(Message)}}
+  end.
 
 tokens_to_binary(Tokens) ->
   [if is_list(Token) -> elixir_utils:characters_to_binary(Token); true -> Token end
@@ -1671,7 +1719,14 @@ tokenize_sigil_contents([H | T] = Original, [S | _] = SigilName, Line, Column, S
   case elixir_interpolation:extract(Line, Column + 1, Scope, ?is_downcase(S), T, sigil_terminator(H)) of
     {NewLine, NewColumn, Parts, Rest, NewScope} ->
       Indentation = nil,
-      add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, tokens_to_binary(Parts), Rest, NewScope, Tokens, Indentation, <<H>>);
+      try
+        add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, tokens_to_binary(Parts), Rest, NewScope, Tokens, Indentation, <<H>>)
+      catch
+        error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
+          Sigil = [$~, S, H],
+          Message = " (for sigil ~ts starting at line ~B)",
+          interpolation_error(Message, [$~] ++ SigilName ++ Original, Scope, Tokens, Message, [Sigil, Line], Line, Column, [H], [sigil_terminator(H)])
+      end;
 
     {error, Reason} ->
       Sigil = [$~, S, H],
diff --git a/lib/elixir/test/elixir/code_test.exs b/lib/elixir/test/elixir/code_test.exs
@@ -556,24 +556,47 @@ defmodule CodeTest do
     assert token4 == "\\u"
   end
 
-  test "string_to_quoted raises UnicodeConversionError for invalid UTF-8 in quoted atoms and function calls" do
+  test "string_to_quoted returns error for invalid UTF-8 in strings" do
     invalid_utf8_cases = [
+      # charlist
+      "'\\xFF'",
+      # charlist heredoc
+      "'''\n\\xFF\\\n'''"
+    ]
+
+    for code <- invalid_utf8_cases do
+      assert {:error, {_, message, _}} = Code.string_to_quoted(code)
+      assert message =~ "invalid encoding starting at <<255>>"
+    end
+  end
+
+  test "string_to_quoted returns error for invalid UTF-8 in quoted atoms and function calls" do
+    invalid_utf8_cases = [
+      # charlist
+      # ~S{'\xFF'},
+      # charlist heredoc
+      # ~s{'''\n\xFF\n'''},
       # Quoted atom
       ~S{:"\xFF"},
       ~S{:'\xFF'},
+      # Quoted keyword identifier
+      ~S{["\xFF": 1]},
+      ~S{['\xFF': 1]},
       # Quoted function call
       ~S{foo."\xFF"()},
       ~S{foo.'\xFF'()}
     ]
 
     for code <- invalid_utf8_cases do
-      assert_raise UnicodeConversionError, fn ->
-        Code.string_to_quoted!(code)
-      end
+      assert {:error, {_, message, detail}} = Code.string_to_quoted(code)
+      assert message =~ "invalid encoding in atom: "
+      assert detail =~ "invalid encoding starting at <<255>>"
 
-      assert_raise UnicodeConversionError, fn ->
-        Code.string_to_quoted!(code, existing_atoms_only: true)
-      end
+      assert {:error, {_, message, detail}} =
+               Code.string_to_quoted(code, existing_atoms_only: true)
+
+      assert message =~ "invalid encoding in atom: "
+      assert detail =~ "invalid encoding starting at <<255>>"
     end
   end