Skip to content

Commit 23776d9

Browse files
authored
Return error on invalid unicode sequences (#14666)
1 parent 74df710 commit 23776d9

File tree

3 files changed

+115
-31
lines changed

3 files changed

+115
-31
lines changed

lib/elixir/src/elixir_parser.yrl

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1016,7 +1016,13 @@ build_bin_string({bin_string, Location, Args}, ExtraMeta) ->
10161016
{'<<>>', Meta, string_parts(Args)}.
10171017

10181018
%% Builds the literal for a charlist token whose only part is a binary.
%% The binary is converted with elixir_utils:characters_to_list/1 and handed
%% to handle_literal/3. If an Elixir.UnicodeConversionError is raised while
%% doing so, its message is reported through return_error/3 at the token
%% location instead of letting the exception escape.
build_list_string({list_string, _Location, [H]} = Token, ExtraMeta) when is_binary(H) ->
  try
    handle_literal(elixir_utils:characters_to_list(H), Token, ExtraMeta)
  catch
    error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Reason} ->
      return_error(?location(Token), elixir_utils:characters_to_list(Reason), "'")
  end;
10201026
build_list_string({list_string, Location, Args}, ExtraMeta) ->
10211027
Meta = meta_from_location(Location),
10221028
MetaWithExtra =

lib/elixir/src/elixir_tokenizer.erl

Lines changed: 78 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,55 +1023,93 @@ is_unnecessary_quote(_Parts, _Scope) ->
10231023
%% Converts the characters of an atom name into an atom, returning
%% {ok, Term} or {error, {Location, Prefix, Detail}}.
%%
%% Clauses, in order:
%%   * names longer than 255 bytes (binary) or 255 elements (list) are rejected;
%%   * when a static_atoms_encoder function is set, the name is handed to it;
%%   * binaries use binary_to_existing_atom/2 or binary_to_atom/2 depending
%%     on existing_atoms_only;
%%   * lists use list_to_existing_atom/1 or list_to_atom/1 likewise.
%%
%% Whenever a conversion raises Elixir.UnicodeConversionError, the result is
%% an "invalid encoding in atom: " error carrying the exception message.
unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{}) when
    is_binary(Part) andalso byte_size(Part) > 255;
    is_list(Part) andalso length(Part) > 255 ->
  atom_error_from_chars(Part, Line, Column, "atom length must be less than system limit: ");
unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{static_atoms_encoder=StaticAtomsEncoder}) when
    is_function(StaticAtomsEncoder) ->
  % Both representations are computed up front: the binary is given to the
  % encoder and the charlist is used if the encoder returns an error.
  try {elixir_utils:characters_to_binary(Part), elixir_utils:characters_to_list(Part)} of
    {Value, ValueList} ->
      % The encoder runs outside the protected section, so its own
      % exceptions are not intercepted here.
      case StaticAtomsEncoder(Value, [{line, Line}, {column, Column}]) of
        {ok, Term} ->
          {ok, Term};
        {error, Reason} when is_binary(Reason) ->
          {error, {?LOC(Line, Column), elixir_utils:characters_to_list(Reason) ++ ": ", ValueList}}
      end
  catch
    error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
      atom_encoding_error(Message, Line, Column)
  end;
unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_binary(Binary) ->
  try
    {ok, binary_to_existing_atom(Binary, utf8)}
  catch
    error:badarg ->
      % Either the atom does not exist or the binary has an encoding problem
      atom_error_from_chars(Binary, Line, Column, "unsafe atom does not exist: ")
  end;
unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{}) when is_binary(Binary) ->
  try
    {ok, binary_to_atom(Binary, utf8)}
  catch
    error:badarg ->
      % Either an encoding problem or some other badarg
      atom_error_from_chars(Binary, Line, Column, "invalid atom: ")
  end;
unsafe_to_atom(List, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_list(List) ->
  try
    {ok, list_to_existing_atom(List)}
  catch
    error:badarg ->
      % Either the atom does not exist or the list has an encoding problem
      atom_error_from_list(List, Line, Column, "unsafe atom does not exist: ")
  end;
unsafe_to_atom(List, Line, Column, #elixir_tokenizer{}) when is_list(List) ->
  try
    {ok, list_to_atom(List)}
  catch
    error:badarg ->
      % Either an encoding problem or some other badarg
      atom_error_from_list(List, Line, Column, "invalid atom: ")
  end.

%% Returns an error tuple whose detail is Chars converted to a charlist, or
%% the encoding error if that conversion raises UnicodeConversionError.
atom_error_from_chars(Chars, Line, Column, Prefix) ->
  try elixir_utils:characters_to_list(Chars) of
    Charlist ->
      {error, {?LOC(Line, Column), Prefix, Charlist}}
  catch
    error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
      atom_encoding_error(Message, Line, Column)
  end.

%% Returns an error tuple whose detail is List itself, after checking that
%% List converts to a binary; a UnicodeConversionError raised by that check
%% becomes the encoding error.
atom_error_from_list(List, Line, Column, Prefix) ->
  try elixir_utils:characters_to_binary(List) of
    _Binary ->
      {error, {?LOC(Line, Column), Prefix, List}}
  catch
    error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
      atom_encoding_error(Message, Line, Column)
  end.

%% Builds the "invalid encoding in atom: " error from an exception message.
atom_encoding_error(Message, Line, Column) ->
  {error, {?LOC(Line, Column), "invalid encoding in atom: ", elixir_utils:characters_to_list(Message)}}.
10761114

10771115
collect_modifiers([H | T], Buffer) when ?is_downcase(H) or ?is_upcase(H) or ?is_digit(H) ->
@@ -1095,7 +1133,12 @@ extract_heredoc_with_interpolation(Line, Column, Scope, Interpol, T, H) ->
10951133
{Parts1, {ShouldWarn, _}} = lists:mapfoldl(Fun, {false, Line}, Parts0),
10961134
Parts2 = extract_heredoc_head(Parts1),
10971135
NewScope = maybe_heredoc_warn(ShouldWarn, Column, InterScope, H),
1098-
{ok, NewLine, NewColumn, tokens_to_binary(Parts2), Rest, NewScope};
1136+
try
1137+
{ok, NewLine, NewColumn, tokens_to_binary(Parts2), Rest, NewScope}
1138+
catch
1139+
error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Message} ->
1140+
{error, interpolation_format(Message, " (for heredoc starting at line ~B)", [Line], Line, Column, [H, H, H], [H, H, H])}
1141+
end;
10991142

11001143
{error, Reason} ->
11011144
{error, interpolation_format(Reason, " (for heredoc starting at line ~B)", [Line], Line, Column, [H, H, H], [H, H, H])}
@@ -1166,8 +1209,13 @@ unescape_tokens(Tokens, Line, Column, #elixir_tokenizer{unescape=true}) ->
11661209
{error, Message, Token} ->
11671210
{error, {?LOC(Line, Column), Message ++ ". Syntax error after: ", Token}}
11681211
end;
1169-
%% With unescape=false the tokens are only converted with tokens_to_binary/1.
%% An Elixir.UnicodeConversionError raised by that conversion is returned as
%% an "invalid encoding in tokens: " error at the given line and column.
unescape_tokens(Tokens, Line, Column, #elixir_tokenizer{unescape=false}) ->
  try tokens_to_binary(Tokens) of
    Converted ->
      {ok, Converted}
  catch
    error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := Reason} ->
      {error, {?LOC(Line, Column), "invalid encoding in tokens: ", elixir_utils:characters_to_list(Reason)}}
  end.
11711219

11721220
tokens_to_binary(Tokens) ->
11731221
[if is_list(Token) -> elixir_utils:characters_to_binary(Token); true -> Token end
@@ -1671,7 +1719,14 @@ tokenize_sigil_contents([H | T] = Original, [S | _] = SigilName, Line, Column, S
16711719
case elixir_interpolation:extract(Line, Column + 1, Scope, ?is_downcase(S), T, sigil_terminator(H)) of
16721720
{NewLine, NewColumn, Parts, Rest, NewScope} ->
16731721
Indentation = nil,
1674-
add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, tokens_to_binary(Parts), Rest, NewScope, Tokens, Indentation, <<H>>);
1722+
% Only the conversion of the sigil parts is protected. add_sigil_token/11 runs
% in the `of` section, so the rest of the tokenizing is not executed inside the
% try and an encoding error raised elsewhere is not attributed to this sigil.
try tokens_to_binary(Parts) of
  SigilContents ->
    add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, SigilContents, Rest, NewScope, Tokens, Indentation, <<H>>)
catch
  error:#{'__struct__' := 'Elixir.UnicodeConversionError', message := UnicodeMessage} ->
    % The exception message and the format extension must use distinct
    % variables: matching the already-bound message against the extension
    % string always fails with badmatch.
    Sigil = [$~, S, H],
    Extension = " (for sigil ~ts starting at line ~B)",
    % NOTE(review): assumes interpolation_error/10 accepts the exception
    % message as its reason argument - confirm against its definition.
    interpolation_error(UnicodeMessage, [$~] ++ SigilName ++ Original, Scope, Tokens, Extension, [Sigil, Line], Line, Column, [H], [sigil_terminator(H)])
end;
16751730

16761731
{error, Reason} ->
16771732
Sigil = [$~, S, H],

lib/elixir/test/elixir/code_test.exs

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -556,24 +556,47 @@ defmodule CodeTest do
556556
assert token4 == "\\u"
557557
end
558558

559-
test "string_to_quoted returns error for invalid UTF-8 in strings" do
  # A \xFF escape inside a charlist and inside a charlist heredoc
  charlist = "'\\xFF'"
  charlist_heredoc = "'''\n\\xFF\\\n'''"

  # Each case must come back as an error tuple rather than raise
  Enum.each([charlist, charlist_heredoc], fn code ->
    assert {:error, {_, message, _}} = Code.string_to_quoted(code)
    assert message =~ "invalid encoding starting at <<255>>"
  end)
end
572+
573+
test "string_to_quoted returns error for invalid UTF-8 in quoted atoms and function calls" do
  invalid_utf8_cases = [
    # Quoted atom
    ~S{:"\xFF"},
    ~S{:'\xFF'},
    # Quoted keyword identifier
    ~S{["\xFF": 1]},
    ~S{['\xFF': 1]},
    # Quoted function call
    ~S{foo."\xFF"()},
    ~S{foo.'\xFF'()}
  ]

  # Every case is checked with the default options and with
  # existing_atoms_only: true; both must return the same error shape.
  for code <- invalid_utf8_cases, opts <- [[], [existing_atoms_only: true]] do
    assert {:error, {_, message, detail}} = Code.string_to_quoted(code, opts)
    assert message =~ "invalid encoding in atom: "
    assert detail =~ "invalid encoding starting at <<255>>"
  end
end
579602

0 commit comments

Comments
 (0)