Skip to content

Commit 82bce4c

Browse files
committed
Invoke as_json callback for strings with invalid encoding
Fix: #873. This allows users to encode binary strings if they so wish, e.g. they can use Base64 or similar, or choose to replace invalid characters with something else.
1 parent a630388 commit 82bce4c

File tree

2 files changed

+178
-96
lines changed

2 files changed

+178
-96
lines changed

ext/json/ext/generator/generator.c

Lines changed: 146 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ RBIMPL_ATTR_NORETURN()
9090
#endif
9191
static void raise_generator_error_str(VALUE invalid_object, VALUE str)
9292
{
93+
rb_enc_associate_index(str, utf8_encindex);
9394
VALUE exc = rb_exc_new_str(eGeneratorError, str);
9495
rb_ivar_set(exc, rb_intern("@invalid_object"), invalid_object);
9596
rb_exc_raise(exc);
@@ -995,13 +996,12 @@ static inline VALUE vstate_get(struct generate_json_data *data)
995996
return data->vstate;
996997
}
997998

998-
struct hash_foreach_arg {
999-
VALUE hash;
1000-
struct generate_json_data *data;
1001-
int first_key_type;
1002-
bool first;
1003-
bool mixed_keys_encountered;
1004-
};
999+
static VALUE
1000+
json_call_as_json(JSON_Generator_State *state, VALUE object, VALUE is_key)
1001+
{
1002+
VALUE proc_args[2] = {object, is_key};
1003+
return rb_proc_call_with_block(state->as_json, 2, proc_args, Qnil);
1004+
}
10051005

10061006
static VALUE
10071007
convert_string_subclass(VALUE key)
@@ -1018,6 +1018,130 @@ convert_string_subclass(VALUE key)
10181018
return key_to_s;
10191019
}
10201020

1021+
1022+
static bool enc_utf8_compatible_p(int enc_idx)
1023+
{
1024+
if (enc_idx == usascii_encindex) return true;
1025+
if (enc_idx == utf8_encindex) return true;
1026+
return false;
1027+
}
1028+
1029+
/*
 * Body half of the rb_rescue() pair used by ensure_valid_encoding():
 * sends the method identified by `i_encode` to `str` with Encoding_UTF_8
 * as its only argument and returns the result.
 */
static VALUE encode_json_string_try(VALUE str)
{
    VALUE transcoded = rb_funcall(str, i_encode, 1, Encoding_UTF_8);
    return transcoded;
}
1033+
1034+
/*
 * Rescue half of the rb_rescue() pair used by ensure_valid_encoding():
 * re-raises the failure as a generator error that carries the original
 * exception's message and records `str` as the invalid object.
 */
static VALUE encode_json_string_rescue(VALUE str, VALUE exception)
{
    VALUE message = rb_funcall(exception, rb_intern("message"), 0);

    raise_generator_error_str(str, message);

    /* Not reached: raise_generator_error_str() raises. */
    return Qundef;
}
1039+
1040+
/*
 * Decide whether `str` can be written to the output as-is.
 *
 * - 7-bit strings are accepted regardless of their encoding.
 * - Strings whose code range is valid are accepted only when their
 *   encoding is US-ASCII or UTF-8 (see enc_utf8_compatible_p()).
 * - Every other code range is rejected.
 */
static inline bool valid_json_string_p(VALUE str)
{
    switch (rb_enc_str_coderange(str)) {
      case ENC_CODERANGE_7BIT:
        return true;
      case ENC_CODERANGE_VALID:
        return enc_utf8_compatible_p(RB_ENCODING_GET_INLINED(str));
      default:
        return false;
    }
}
1054+
1055+
/*
 * Return an object that can be emitted in place of `str`, or raise a
 * generator error.
 *
 * Steps, in order:
 *  1. A string that already passes valid_json_string_p() is returned
 *     untouched.
 *  2. Otherwise, if `as_json_called` is false, the state is strict and an
 *     `as_json` callback is set, the callback receives the offending string
 *     together with the real `is_key` flag. When it returns a different
 *     object, that object is returned: a String result must itself pass
 *     valid_json_string_p(), and a non-String result is rejected when
 *     `is_key` is true. The return value may therefore not be a String.
 *  3. A BINARY string is duplicated and re-tagged as UTF-8; the copy is
 *     returned if its code range is 7-bit or valid (the latter with a
 *     deprecation warning).
 *  4. Anything else is transcoded through encode_json_string_try(), with
 *     failures converted to generator errors by encode_json_string_rescue().
 *
 * Parameters:
 *  data           - generator context; only data->state is read here.
 *  str            - the string to validate.
 *  as_json_called - true when `str` is already the product of `as_json`,
 *                   in which case the callback is not invoked again.
 *  is_key         - true when `str` is being emitted as an object key.
 */
static inline VALUE ensure_valid_encoding(struct generate_json_data *data, VALUE str, bool as_json_called, bool is_key)
{
    if (RB_LIKELY(valid_json_string_p(str))) {
        return str;
    }

    if (!as_json_called && data->state->strict && RTEST(data->state->as_json)) {
        /* Forward the real key/value context so the callback can tell them apart. */
        VALUE coerced_str = json_call_as_json(data->state, str, is_key ? Qtrue : Qfalse);
        if (coerced_str != str) {
            if (RB_TYPE_P(coerced_str, T_STRING)) {
                if (!valid_json_string_p(coerced_str)) {
                    raise_generator_error(str, "source sequence is illegal/malformed utf-8");
                }
            } else {
                // as_json could return another type than T_STRING
                if (is_key) {
                    raise_generator_error(coerced_str, "%"PRIsVALUE" not allowed as object key in JSON", CLASS_OF(coerced_str));
                }
            }

            return coerced_str;
        }
    }

    if (RB_ENCODING_GET_INLINED(str) == binary_encindex) {
        VALUE utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
        switch (rb_enc_str_coderange(utf8_string)) {
            case ENC_CODERANGE_7BIT:
                return utf8_string;
            case ENC_CODERANGE_VALID:
                // For historical reasons, we silently reinterpret binary strings as UTF-8 if it would work.
                // TODO: Raise in 3.0.0
                rb_warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
                return utf8_string;
            default:
                /* Not valid UTF-8: fall through to the transcoding attempt below. */
                break;
        }
    }

    return rb_rescue(encode_json_string_try, str, encode_json_string_rescue, str);
}
1095+
1096+
/*
 * Append `obj` to `buffer` as a double-quoted JSON string without any
 * encoding conversion.
 *
 * The opening quote is written first. The string's bytes are then run
 * through one of three converters chosen from the state flags:
 *   - ascii_only: convert_UTF8_to_ASCII_only_JSON(), using the script-safe
 *     escape table when script_safe is also set
 *   - script_safe only: convert_UTF8_to_script_safe_JSON()
 *   - neither: convert_UTF8_to_JSON()
 * A code range other than 7-bit or valid raises a generator error instead.
 * The closing quote is written last.
 */
static void raw_generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
{
    long length;
    search_state scan;

    fbuffer_append_char(buffer, '"');

    scan.buffer = buffer;
    RSTRING_GETMEM(obj, scan.ptr, length);
    scan.cursor = scan.ptr;
    scan.end = scan.ptr + length;

#ifdef HAVE_SIMD
    scan.matches_mask = 0;
    scan.has_matches = false;
    scan.chunk_base = NULL;
#endif /* HAVE_SIMD */

    const int coderange = rb_enc_str_coderange(obj);

    if (coderange == ENC_CODERANGE_7BIT || coderange == ENC_CODERANGE_VALID) {
        if (RB_UNLIKELY(data->state->ascii_only)) {
            convert_UTF8_to_ASCII_only_JSON(&scan, data->state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
        } else if (RB_UNLIKELY(data->state->script_safe)) {
            convert_UTF8_to_script_safe_JSON(&scan);
        } else {
            convert_UTF8_to_JSON(&scan);
        }
    } else {
        raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
    }

    fbuffer_append_char(buffer, '"');
}
1130+
1131+
/*
 * Emit `obj` as a JSON string: normalise it with ensure_valid_encoding()
 * (treated as a value, not a key, with as_json not yet called) and hand
 * the result to raw_generate_json_string().
 *
 * NOTE(review): ensure_valid_encoding() can return a non-String when the
 * as_json callback coerces to another type; that result is passed to
 * raw_generate_json_string() unchecked here — confirm callers cannot hit
 * that path.
 */
static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
{
    VALUE checked = ensure_valid_encoding(data, obj, false, false);

    raw_generate_json_string(buffer, data, checked);
}
1136+
1137+
/*
 * Argument bundle used by the hash-generation helpers in this file;
 * json_inspect_hash_with_mixed_keys() receives a pointer to it.
 * NOTE(review): the roles of the individual fields are not visible in this
 * view — presumably `hash` is the Hash being iterated, `data` the generator
 * context, and the remaining fields track the first key's type / whether
 * mixed key types were seen; confirm against json_object_i().
 */
struct hash_foreach_arg {
1138+
VALUE hash;
1139+
struct generate_json_data *data;
1140+
int first_key_type;
1141+
bool first;
1142+
bool mixed_keys_encountered;
1143+
};
1144+
10211145
NOINLINE()
10221146
static void
10231147
json_inspect_hash_with_mixed_keys(struct hash_foreach_arg *arg)
@@ -1034,13 +1158,6 @@ json_inspect_hash_with_mixed_keys(struct hash_foreach_arg *arg)
10341158
}
10351159
}
10361160

1037-
static VALUE
1038-
json_call_as_json(JSON_Generator_State *state, VALUE object, VALUE is_key)
1039-
{
1040-
VALUE proc_args[2] = {object, is_key};
1041-
return rb_proc_call_with_block(state->as_json, 2, proc_args, Qnil);
1042-
}
1043-
10441161
static int
10451162
json_object_i(VALUE key, VALUE val, VALUE _arg)
10461163
{
@@ -1106,8 +1223,10 @@ json_object_i(VALUE key, VALUE val, VALUE _arg)
11061223
break;
11071224
}
11081225

1226+
key_to_s = ensure_valid_encoding(data, key_to_s, as_json_called, true);
1227+
11091228
if (RB_LIKELY(RBASIC_CLASS(key_to_s) == rb_cString)) {
1110-
generate_json_string(buffer, data, key_to_s);
1229+
raw_generate_json_string(buffer, data, key_to_s);
11111230
} else {
11121231
generate_json(buffer, data, key_to_s);
11131232
}
@@ -1190,85 +1309,6 @@ static void generate_json_array(FBuffer *buffer, struct generate_json_data *data
11901309
fbuffer_append_char(buffer, ']');
11911310
}
11921311

1193-
static inline int enc_utf8_compatible_p(int enc_idx)
1194-
{
1195-
if (enc_idx == usascii_encindex) return 1;
1196-
if (enc_idx == utf8_encindex) return 1;
1197-
return 0;
1198-
}
1199-
1200-
static VALUE encode_json_string_try(VALUE str)
1201-
{
1202-
return rb_funcall(str, i_encode, 1, Encoding_UTF_8);
1203-
}
1204-
1205-
static VALUE encode_json_string_rescue(VALUE str, VALUE exception)
1206-
{
1207-
raise_generator_error_str(str, rb_funcall(exception, rb_intern("message"), 0));
1208-
return Qundef;
1209-
}
1210-
1211-
static inline VALUE ensure_valid_encoding(VALUE str)
1212-
{
1213-
int encindex = RB_ENCODING_GET(str);
1214-
VALUE utf8_string;
1215-
if (RB_UNLIKELY(!enc_utf8_compatible_p(encindex))) {
1216-
if (encindex == binary_encindex) {
1217-
utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
1218-
switch (rb_enc_str_coderange(utf8_string)) {
1219-
case ENC_CODERANGE_7BIT:
1220-
return utf8_string;
1221-
case ENC_CODERANGE_VALID:
1222-
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
1223-
// TODO: Raise in 3.0.0
1224-
rb_warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
1225-
return utf8_string;
1226-
break;
1227-
}
1228-
}
1229-
1230-
str = rb_rescue(encode_json_string_try, str, encode_json_string_rescue, str);
1231-
}
1232-
return str;
1233-
}
1234-
1235-
static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
1236-
{
1237-
obj = ensure_valid_encoding(obj);
1238-
1239-
fbuffer_append_char(buffer, '"');
1240-
1241-
long len;
1242-
search_state search;
1243-
search.buffer = buffer;
1244-
RSTRING_GETMEM(obj, search.ptr, len);
1245-
search.cursor = search.ptr;
1246-
search.end = search.ptr + len;
1247-
1248-
#ifdef HAVE_SIMD
1249-
search.matches_mask = 0;
1250-
search.has_matches = false;
1251-
search.chunk_base = NULL;
1252-
#endif /* HAVE_SIMD */
1253-
1254-
switch (rb_enc_str_coderange(obj)) {
1255-
case ENC_CODERANGE_7BIT:
1256-
case ENC_CODERANGE_VALID:
1257-
if (RB_UNLIKELY(data->state->ascii_only)) {
1258-
convert_UTF8_to_ASCII_only_JSON(&search, data->state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
1259-
} else if (RB_UNLIKELY(data->state->script_safe)) {
1260-
convert_UTF8_to_script_safe_JSON(&search);
1261-
} else {
1262-
convert_UTF8_to_JSON(&search);
1263-
}
1264-
break;
1265-
default:
1266-
raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
1267-
break;
1268-
}
1269-
fbuffer_append_char(buffer, '"');
1270-
}
1271-
12721312
static void generate_json_fallback(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
12731313
{
12741314
VALUE tmp;
@@ -1405,10 +1445,20 @@ static void generate_json(FBuffer *buffer, struct generate_json_data *data, VALU
14051445
if (klass != rb_cArray) goto general;
14061446
generate_json_array(buffer, data, obj);
14071447
break;
1408-
case T_STRING:
1448+
case T_STRING: {
14091449
if (klass != rb_cString) goto general;
1410-
generate_json_string(buffer, data, obj);
1450+
1451+
if (RB_LIKELY(valid_json_string_p(obj))) {
1452+
raw_generate_json_string(buffer, data, obj);
1453+
} else if (as_json_called) {
1454+
raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
1455+
} else {
1456+
obj = ensure_valid_encoding(data, obj, false, false);
1457+
as_json_called = true;
1458+
goto start;
1459+
}
14111460
break;
1461+
}
14121462
case T_SYMBOL:
14131463
generate_json_symbol(buffer, data, obj);
14141464
break;

test/json/json_coder_test.rb

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,38 @@ def test_json_coder_dump_NaN_or_Infinity_loop
6767
assert_include error.message, "NaN not allowed in JSON"
6868
end
6969

70+
# Strings with an invalid encoding are handed to the Coder's block, both
# as top-level values and as hash keys.
def test_json_coder_string_invalid_encoding
  invalid = "\xFF"

  # Returning the string unchanged leaves it invalid, so generation fails.
  identity_coder = JSON::Coder.new { |object, _is_key| object }
  [invalid, { invalid => 1 }].each do |payload|
    error = assert_raise(JSON::GeneratorError) { identity_coder.dump(payload) }
    assert_equal "source sequence is illegal/malformed utf-8", error.message
  end

  # A non-String replacement is fine as a value but rejected as a key.
  bytes_coder = JSON::Coder.new { |object, _is_key| object.bytes }
  assert_equal "[255]", bytes_coder.dump(invalid)
  error = assert_raise(JSON::GeneratorError) { bytes_coder.dump({ invalid => 1 }) }
  assert_equal "Array not allowed as object key in JSON", error.message

  # A valid String replacement (Base64 here) works in both positions.
  base64_coder = JSON::Coder.new { |object, _is_key| [object].pack("m") }
  assert_equal '"/w==\\n"', base64_coder.dump(invalid)
  assert_equal '{"/w==\\n":1}', base64_coder.dump({ invalid => 1 })
end
101+
70102
def test_nesting_recovery
71103
coder = JSON::Coder.new
72104
ary = []

0 commit comments

Comments
 (0)