Skip to content

Return error on invalid unicode sequences #14666

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion lib/elixir/src/elixir_parser.yrl
Original file line number Diff line number Diff line change
Expand Up @@ -1016,7 +1016,13 @@ build_bin_string({bin_string, Location, Args}, ExtraMeta) ->
{'<<>>', Meta, string_parts(Args)}.

build_list_string({list_string, _Location, [H]} = Token, ExtraMeta) when is_binary(H) ->
handle_literal(elixir_utils:characters_to_list(H), Token, ExtraMeta);
try
List = elixir_utils:characters_to_list(H),
handle_literal(List, Token, ExtraMeta)
catch
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
return_error(?location(Token), elixir_utils:characters_to_list(Message), "'")
end;
build_list_string({list_string, Location, Args}, ExtraMeta) ->
Meta = meta_from_location(Location),
MetaWithExtra =
Expand Down
101 changes: 78 additions & 23 deletions lib/elixir/src/elixir_tokenizer.erl
Original file line number Diff line number Diff line change
Expand Up @@ -1023,55 +1023,93 @@ is_unnecessary_quote(_Parts, _Scope) ->
unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{}) when
is_binary(Part) andalso byte_size(Part) > 255;
is_list(Part) andalso length(Part) > 255 ->
{error, {?LOC(Line, Column), "atom length must be less than system limit: ", elixir_utils:characters_to_list(Part)}};
try
PartList = elixir_utils:characters_to_list(Part),
{error, {?LOC(Line, Column), "atom length must be less than system limit: ", PartList}}
catch
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
{error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
end;
unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{static_atoms_encoder=StaticAtomsEncoder}) when
is_function(StaticAtomsEncoder) ->
Value = elixir_utils:characters_to_binary(Part),
case StaticAtomsEncoder(Value, [{line, Line}, {column, Column}]) of
{ok, Term} ->
{ok, Term};
{error, Reason} when is_binary(Reason) ->
{error, {?LOC(Line, Column), elixir_utils:characters_to_list(Reason) ++ ": ", elixir_utils:characters_to_list(Part)}}
EncodeResult = try
ValueEncBin = elixir_utils:characters_to_binary(Part),
ValueEncList = elixir_utils:characters_to_list(Part),
{ok, ValueEncBin, ValueEncList}
catch
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
{error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
end,

case EncodeResult of
{ok, Value, ValueList} ->
case StaticAtomsEncoder(Value, [{line, Line}, {column, Column}]) of
{ok, Term} ->
{ok, Term};
{error, Reason} when is_binary(Reason) ->
{error, {?LOC(Line, Column), elixir_utils:characters_to_list(Reason) ++ ": ", ValueList}}
end;
EncError -> EncError
end;
unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_binary(Binary) ->
try
{ok, binary_to_existing_atom(Binary, utf8)}
catch
error:badarg ->
% Check if it's a UTF-8 issue by trying to convert to list
elixir_utils:characters_to_list(Binary),
% If we get here, it's not a UTF-8 issue
{error, {?LOC(Line, Column), "unsafe atom does not exist: ", elixir_utils:characters_to_list(Binary)}}
try
List = elixir_utils:characters_to_list(Binary),
% If we get here, it's not a UTF-8 issue
{error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}}
catch
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
{error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
end
end;
unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{}) when is_binary(Binary) ->
try
{ok, binary_to_atom(Binary, utf8)}
catch
error:badarg ->
% Try to convert using elixir_utils to get proper UnicodeConversionError
elixir_utils:characters_to_list(Binary),
% If we get here, it's not a UTF-8 issue, so it's some other badarg
{error, {?LOC(Line, Column), "invalid atom: ", elixir_utils:characters_to_list(Binary)}}
try
List = elixir_utils:characters_to_list(Binary),
% If we get here, it's not a UTF-8 issue, so it's some other badarg
{error, {?LOC(Line, Column), "invalid atom: ", List}}
catch
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
{error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
end
end;
unsafe_to_atom(List, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_list(List) ->
try
{ok, list_to_existing_atom(List)}
catch
error:badarg ->
% Try to convert using elixir_utils to get proper UnicodeConversionError
elixir_utils:characters_to_binary(List),
% If we get here, it's not a UTF-8 issue
{error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}}
try
elixir_utils:characters_to_binary(List),
% If we get here, it's not a UTF-8 issue
{error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}}
catch
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
{error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
end
end;
unsafe_to_atom(List, Line, Column, #elixir_tokenizer{}) when is_list(List) ->
try
{ok, list_to_atom(List)}
catch
error:badarg ->
% Try to convert using elixir_utils to get proper UnicodeConversionError
elixir_utils:characters_to_binary(List),
% If we get here, it's not a UTF-8 issue, so it's some other badarg
{error, {?LOC(Line, Column), "invalid atom: ", List}}
try
elixir_utils:characters_to_binary(List),
% If we get here, it's not a UTF-8 issue, so it's some other badarg
{error, {?LOC(Line, Column), "invalid atom: ", List}}
catch
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
{error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}
end
end.

collect_modifiers([H | T], Buffer) when ?is_downcase(H) or ?is_upcase(H) or ?is_digit(H) ->
Expand All @@ -1095,7 +1133,12 @@ extract_heredoc_with_interpolation(Line, Column, Scope, Interpol, T, H) ->
{Parts1, {ShouldWarn, _}} = lists:mapfoldl(Fun, {false, Line}, Parts0),
Parts2 = extract_heredoc_head(Parts1),
NewScope = maybe_heredoc_warn(ShouldWarn, Column, InterScope, H),
{ok, NewLine, NewColumn, tokens_to_binary(Parts2), Rest, NewScope};
try
{ok, NewLine, NewColumn, tokens_to_binary(Parts2), Rest, NewScope}
catch
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
{error, interpolation_format(Message, " (for heredoc starting at line ~B)", [Line], Line, Column, [H, H, H], [H, H, H])}
end;

{error, Reason} ->
{error, interpolation_format(Reason, " (for heredoc starting at line ~B)", [Line], Line, Column, [H, H, H], [H, H, H])}
Expand Down Expand Up @@ -1166,8 +1209,13 @@ unescape_tokens(Tokens, Line, Column, #elixir_tokenizer{unescape=true}) ->
{error, Message, Token} ->
{error, {?LOC(Line, Column), Message ++ ". Syntax error after: ", Token}}
end;
unescape_tokens(Tokens, _Line, _Column, #elixir_tokenizer{unescape=false}) ->
{ok, tokens_to_binary(Tokens)}.
unescape_tokens(Tokens, Line, Column, #elixir_tokenizer{unescape=false}) ->
try
{ok, tokens_to_binary(Tokens)}
catch
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
{error, {?LOC(Line, Column), "invalid encoding in tokens: ", elixir_utils:characters_to_list(Message)}}
end.

tokens_to_binary(Tokens) ->
[if is_list(Token) -> elixir_utils:characters_to_binary(Token); true -> Token end
Expand Down Expand Up @@ -1671,7 +1719,14 @@ tokenize_sigil_contents([H | T] = Original, [S | _] = SigilName, Line, Column, S
case elixir_interpolation:extract(Line, Column + 1, Scope, ?is_downcase(S), T, sigil_terminator(H)) of
{NewLine, NewColumn, Parts, Rest, NewScope} ->
Indentation = nil,
add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, tokens_to_binary(Parts), Rest, NewScope, Tokens, Indentation, <<H>>);
try
add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, tokens_to_binary(Parts), Rest, NewScope, Tokens, Indentation, <<H>>)
catch
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
Sigil = [$~, S, H],
Message = " (for sigil ~ts starting at line ~B)",
interpolation_error(Message, [$~] ++ SigilName ++ Original, Scope, Tokens, Message, [Sigil, Line], Line, Column, [H], [sigil_terminator(H)])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't there be a test added for this change? Dialyzer says this function call will fail: the first argument of interpolation_error should be a 5-element tuple.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, I will revert it for now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Especially because this is wrapping a large chunk of code instead of a small one, similar to others.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The try/catch is not needed here. It calls tokens_to_binary, which may raise on a list string, but here it will only receive binary arguments.

end;

{error, Reason} ->
Sigil = [$~, S, H],
Expand Down
37 changes: 30 additions & 7 deletions lib/elixir/test/elixir/code_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -556,24 +556,47 @@ defmodule CodeTest do
assert token4 == "\\u"
end

test "string_to_quoted raises UnicodeConversionError for invalid UTF-8 in quoted atoms and function calls" do
test "string_to_quoted returns error for invalid UTF-8 in strings" do
invalid_utf8_cases = [
# charlist
"'\\xFF'",
# charlist heredoc
"'''\n\\xFF\\\n'''"
]

for code <- invalid_utf8_cases do
assert {:error, {_, message, _}} = Code.string_to_quoted(code)
assert message =~ "invalid encoding starting at <<255>>"
end
end

test "string_to_quoted returns error for invalid UTF-8 in quoted atoms and function calls" do
invalid_utf8_cases = [
# charlist
# ~S{'\xFF'},
# charlist heredoc
# ~s{'''\n\xFF\n'''},
# Quoted atom
~S{:"\xFF"},
~S{:'\xFF'},
# Quoted keyword identifier
~S{["\xFF": 1]},
~S{['\xFF': 1]},
# Quoted function call
~S{foo."\xFF"()},
~S{foo.'\xFF'()}
]

for code <- invalid_utf8_cases do
assert_raise UnicodeConversionError, fn ->
Code.string_to_quoted!(code)
end
assert {:error, {_, message, detail}} = Code.string_to_quoted(code)
assert message =~ "invalid encoding in atom: "
assert detail =~ "invalid encoding starting at <<255>>"

assert_raise UnicodeConversionError, fn ->
Code.string_to_quoted!(code, existing_atoms_only: true)
end
assert {:error, {_, message, detail}} =
Code.string_to_quoted(code, existing_atoms_only: true)

assert message =~ "invalid encoding in atom: "
assert detail =~ "invalid encoding starting at <<255>>"
end
end

Expand Down
Loading