Skip to content

Commit 5a1e04b

Browse files
authored
Merge pull request #877 from byroot/json-coder-encoding
Invoke `as_json` callback for strings with invalid encoding
2 parents 9bb1dc6 + b1b16c4 commit 5a1e04b

File tree

7 files changed

+279
-104
lines changed

7 files changed

+279
-104
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
### Unreleased
44

5+
* `JSON::Coder` now also yields to the block when encountering strings with invalid encoding.
56
* Fix GeneratorError messages to be UTF-8 encoded.
67
* Fix memory leak when `Exception` is raised, or `throw` is used during JSON generation.
78
* Optimized floating point number parsing by integrating the ryu algorithm (thanks to Josef Šimánek).

README.md

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,23 @@ puts MyApp::API_JSON_CODER.dump(Time.now.utc) # => "2025-01-21T08:41:44.286Z"
113113
The provided block is called for all objects that don't have a native JSON equivalent, and
114114
must return a Ruby object that has a native JSON equivalent.
115115

116-
It is also called for objects that do have a JSON equivalent, but are used as Hash keys, for instance `{ 1 => 2}`.
116+
It is also called for objects that do have a JSON equivalent, but are used as Hash keys, for instance `{ 1 => 2}`,
117+
as well as for strings that aren't valid UTF-8:
118+
119+
```ruby
120+
coder = JSON::Coder.new do |object, is_object_key|
121+
case object
122+
when String
123+
if !object.valid_encoding? || object.encoding != Encoding::UTF_8
124+
Base64.encode64(object)
125+
else
126+
object
127+
end
128+
else
129+
object
130+
end
131+
end
132+
```
117133

118134
## Combining JSON fragments
119135

ext/json/ext/generator/generator.c

Lines changed: 142 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -996,13 +996,12 @@ static inline VALUE vstate_get(struct generate_json_data *data)
996996
return data->vstate;
997997
}
998998

999-
struct hash_foreach_arg {
1000-
VALUE hash;
1001-
struct generate_json_data *data;
1002-
int first_key_type;
1003-
bool first;
1004-
bool mixed_keys_encountered;
1005-
};
999+
static VALUE
1000+
json_call_as_json(JSON_Generator_State *state, VALUE object, VALUE is_key)
1001+
{
1002+
VALUE proc_args[2] = {object, is_key};
1003+
return rb_proc_call_with_block(state->as_json, 2, proc_args, Qnil);
1004+
}
10061005

10071006
static VALUE
10081007
convert_string_subclass(VALUE key)
@@ -1019,6 +1018,129 @@ convert_string_subclass(VALUE key)
10191018
return key_to_s;
10201019
}
10211020

1021+
static bool enc_utf8_compatible_p(int enc_idx)
1022+
{
1023+
if (enc_idx == usascii_encindex) return true;
1024+
if (enc_idx == utf8_encindex) return true;
1025+
return false;
1026+
}
1027+
1028+
static VALUE encode_json_string_try(VALUE str)
1029+
{
1030+
return rb_funcall(str, i_encode, 1, Encoding_UTF_8);
1031+
}
1032+
1033+
static VALUE encode_json_string_rescue(VALUE str, VALUE exception)
1034+
{
1035+
raise_generator_error_str(str, rb_funcall(exception, rb_intern("message"), 0));
1036+
return Qundef;
1037+
}
1038+
1039+
static inline bool valid_json_string_p(VALUE str)
1040+
{
1041+
int coderange = rb_enc_str_coderange(str);
1042+
1043+
if (RB_LIKELY(coderange == ENC_CODERANGE_7BIT)) {
1044+
return true;
1045+
}
1046+
1047+
if (RB_LIKELY(coderange == ENC_CODERANGE_VALID)) {
1048+
return enc_utf8_compatible_p(RB_ENCODING_GET_INLINED(str));
1049+
}
1050+
1051+
return false;
1052+
}
1053+
1054+
static inline VALUE ensure_valid_encoding(struct generate_json_data *data, VALUE str, bool as_json_called, bool is_key)
1055+
{
1056+
if (RB_LIKELY(valid_json_string_p(str))) {
1057+
return str;
1058+
}
1059+
1060+
if (!as_json_called && data->state->strict && RTEST(data->state->as_json)) {
1061+
VALUE coerced_str = json_call_as_json(data->state, str, Qfalse);
1062+
if (coerced_str != str) {
1063+
if (RB_TYPE_P(coerced_str, T_STRING)) {
1064+
if (!valid_json_string_p(coerced_str)) {
1065+
raise_generator_error(str, "source sequence is illegal/malformed utf-8");
1066+
}
1067+
} else {
1068+
// as_json could return another type than T_STRING
1069+
if (is_key) {
1070+
raise_generator_error(coerced_str, "%"PRIsVALUE" not allowed as object key in JSON", CLASS_OF(coerced_str));
1071+
}
1072+
}
1073+
1074+
return coerced_str;
1075+
}
1076+
}
1077+
1078+
if (RB_ENCODING_GET_INLINED(str) == binary_encindex) {
1079+
VALUE utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
1080+
switch (rb_enc_str_coderange(utf8_string)) {
1081+
case ENC_CODERANGE_7BIT:
1082+
return utf8_string;
1083+
case ENC_CODERANGE_VALID:
1084+
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
1085+
// TODO: Raise in 3.0.0
1086+
rb_warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
1087+
return utf8_string;
1088+
break;
1089+
}
1090+
}
1091+
1092+
return rb_rescue(encode_json_string_try, str, encode_json_string_rescue, str);
1093+
}
1094+
1095+
static void raw_generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
1096+
{
1097+
fbuffer_append_char(buffer, '"');
1098+
1099+
long len;
1100+
search_state search;
1101+
search.buffer = buffer;
1102+
RSTRING_GETMEM(obj, search.ptr, len);
1103+
search.cursor = search.ptr;
1104+
search.end = search.ptr + len;
1105+
1106+
#ifdef HAVE_SIMD
1107+
search.matches_mask = 0;
1108+
search.has_matches = false;
1109+
search.chunk_base = NULL;
1110+
#endif /* HAVE_SIMD */
1111+
1112+
switch (rb_enc_str_coderange(obj)) {
1113+
case ENC_CODERANGE_7BIT:
1114+
case ENC_CODERANGE_VALID:
1115+
if (RB_UNLIKELY(data->state->ascii_only)) {
1116+
convert_UTF8_to_ASCII_only_JSON(&search, data->state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
1117+
} else if (RB_UNLIKELY(data->state->script_safe)) {
1118+
convert_UTF8_to_script_safe_JSON(&search);
1119+
} else {
1120+
convert_UTF8_to_JSON(&search);
1121+
}
1122+
break;
1123+
default:
1124+
raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
1125+
break;
1126+
}
1127+
fbuffer_append_char(buffer, '"');
1128+
}
1129+
1130+
static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
1131+
{
1132+
obj = ensure_valid_encoding(data, obj, false, false);
1133+
raw_generate_json_string(buffer, data, obj);
1134+
}
1135+
1136+
struct hash_foreach_arg {
1137+
VALUE hash;
1138+
struct generate_json_data *data;
1139+
int first_key_type;
1140+
bool first;
1141+
bool mixed_keys_encountered;
1142+
};
1143+
10221144
NOINLINE()
10231145
static void
10241146
json_inspect_hash_with_mixed_keys(struct hash_foreach_arg *arg)
@@ -1035,13 +1157,6 @@ json_inspect_hash_with_mixed_keys(struct hash_foreach_arg *arg)
10351157
}
10361158
}
10371159

1038-
static VALUE
1039-
json_call_as_json(JSON_Generator_State *state, VALUE object, VALUE is_key)
1040-
{
1041-
VALUE proc_args[2] = {object, is_key};
1042-
return rb_proc_call_with_block(state->as_json, 2, proc_args, Qnil);
1043-
}
1044-
10451160
static int
10461161
json_object_i(VALUE key, VALUE val, VALUE _arg)
10471162
{
@@ -1107,8 +1222,10 @@ json_object_i(VALUE key, VALUE val, VALUE _arg)
11071222
break;
11081223
}
11091224

1225+
key_to_s = ensure_valid_encoding(data, key_to_s, as_json_called, true);
1226+
11101227
if (RB_LIKELY(RBASIC_CLASS(key_to_s) == rb_cString)) {
1111-
generate_json_string(buffer, data, key_to_s);
1228+
raw_generate_json_string(buffer, data, key_to_s);
11121229
} else {
11131230
generate_json(buffer, data, key_to_s);
11141231
}
@@ -1191,85 +1308,6 @@ static void generate_json_array(FBuffer *buffer, struct generate_json_data *data
11911308
fbuffer_append_char(buffer, ']');
11921309
}
11931310

1194-
static inline int enc_utf8_compatible_p(int enc_idx)
1195-
{
1196-
if (enc_idx == usascii_encindex) return 1;
1197-
if (enc_idx == utf8_encindex) return 1;
1198-
return 0;
1199-
}
1200-
1201-
static VALUE encode_json_string_try(VALUE str)
1202-
{
1203-
return rb_funcall(str, i_encode, 1, Encoding_UTF_8);
1204-
}
1205-
1206-
static VALUE encode_json_string_rescue(VALUE str, VALUE exception)
1207-
{
1208-
raise_generator_error_str(str, rb_funcall(exception, rb_intern("message"), 0));
1209-
return Qundef;
1210-
}
1211-
1212-
static inline VALUE ensure_valid_encoding(VALUE str)
1213-
{
1214-
int encindex = RB_ENCODING_GET(str);
1215-
VALUE utf8_string;
1216-
if (RB_UNLIKELY(!enc_utf8_compatible_p(encindex))) {
1217-
if (encindex == binary_encindex) {
1218-
utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
1219-
switch (rb_enc_str_coderange(utf8_string)) {
1220-
case ENC_CODERANGE_7BIT:
1221-
return utf8_string;
1222-
case ENC_CODERANGE_VALID:
1223-
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
1224-
// TODO: Raise in 3.0.0
1225-
rb_warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
1226-
return utf8_string;
1227-
break;
1228-
}
1229-
}
1230-
1231-
str = rb_rescue(encode_json_string_try, str, encode_json_string_rescue, str);
1232-
}
1233-
return str;
1234-
}
1235-
1236-
static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
1237-
{
1238-
obj = ensure_valid_encoding(obj);
1239-
1240-
fbuffer_append_char(buffer, '"');
1241-
1242-
long len;
1243-
search_state search;
1244-
search.buffer = buffer;
1245-
RSTRING_GETMEM(obj, search.ptr, len);
1246-
search.cursor = search.ptr;
1247-
search.end = search.ptr + len;
1248-
1249-
#ifdef HAVE_SIMD
1250-
search.matches_mask = 0;
1251-
search.has_matches = false;
1252-
search.chunk_base = NULL;
1253-
#endif /* HAVE_SIMD */
1254-
1255-
switch (rb_enc_str_coderange(obj)) {
1256-
case ENC_CODERANGE_7BIT:
1257-
case ENC_CODERANGE_VALID:
1258-
if (RB_UNLIKELY(data->state->ascii_only)) {
1259-
convert_UTF8_to_ASCII_only_JSON(&search, data->state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
1260-
} else if (RB_UNLIKELY(data->state->script_safe)) {
1261-
convert_UTF8_to_script_safe_JSON(&search);
1262-
} else {
1263-
convert_UTF8_to_JSON(&search);
1264-
}
1265-
break;
1266-
default:
1267-
raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
1268-
break;
1269-
}
1270-
fbuffer_append_char(buffer, '"');
1271-
}
1272-
12731311
static void generate_json_fallback(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
12741312
{
12751313
VALUE tmp;
@@ -1408,7 +1446,16 @@ static void generate_json(FBuffer *buffer, struct generate_json_data *data, VALU
14081446
break;
14091447
case T_STRING:
14101448
if (klass != rb_cString) goto general;
1411-
generate_json_string(buffer, data, obj);
1449+
1450+
if (RB_LIKELY(valid_json_string_p(obj))) {
1451+
raw_generate_json_string(buffer, data, obj);
1452+
} else if (as_json_called) {
1453+
raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
1454+
} else {
1455+
obj = ensure_valid_encoding(data, obj, false, false);
1456+
as_json_called = true;
1457+
goto start;
1458+
}
14121459
break;
14131460
case T_SYMBOL:
14141461
generate_json_symbol(buffer, data, obj);

java/src/json/ext/Generator.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,8 @@ private static IRubyObject castKey(ThreadContext context, IRubyObject key) {
606606
}
607607

608608
private static void processEntry(ThreadContext context, Session session, OutputStream buffer, RubyHash.RubyHashEntry entry, boolean firstPair, ByteList objectNl, byte[] indent, ByteList spaceBefore, ByteList space) {
609+
StringEncoder encoder = session.getStringEncoder(context);
610+
609611
IRubyObject key = (IRubyObject) entry.getKey();
610612
IRubyObject value = (IRubyObject) entry.getValue();
611613

@@ -619,7 +621,7 @@ private static void processEntry(ThreadContext context, Session session, OutputS
619621
Ruby runtime = context.runtime;
620622

621623
IRubyObject keyStr = castKey(context, key);
622-
if (keyStr == null || !(keyStr instanceof RubyString)) {
624+
if (keyStr == null || !(keyStr instanceof RubyString) || !encoder.hasValidEncoding((RubyString)keyStr)) {
623625
GeneratorState state = session.getState(context);
624626
if (state.strict()) {
625627
if (state.getAsJSON() != null) {
@@ -664,6 +666,19 @@ int guessSize(ThreadContext context, Session session, RubyString object) {
664666

665667
@Override
666668
void generate(ThreadContext context, Session session, RubyString object, OutputStream buffer) throws IOException {
669+
GeneratorState state = session.getState(context);
670+
StringEncoder encoder = session.getStringEncoder(context);
671+
672+
if (state.strict() && !encoder.hasValidEncoding(object) && state.getAsJSON() != null) {
673+
IRubyObject value = state.getAsJSON().call(context, object, context.getRuntime().getFalse());
674+
if (value instanceof RubyString) {
675+
object = (RubyString)value;
676+
} else {
677+
Handler handler = getHandlerFor(context.runtime, value);
678+
handler.generate(context, session, value, buffer);
679+
return;
680+
}
681+
}
667682
generateString(context, session, object, buffer);
668683
}
669684
}

java/src/json/ext/StringEncoder.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,17 @@ void generate(ThreadContext context, RubyString object, OutputStream buffer) thr
208208
append('"');
209209
}
210210

211+
static boolean hasValidEncoding(RubyString str) {
212+
switch (str.scanForCodeRange()) {
213+
case StringSupport.CR_7BIT:
214+
return true;
215+
case StringSupport.CR_VALID:
216+
return str.getEncoding() == UTF8Encoding.INSTANCE || str.getEncoding() == USASCIIEncoding.INSTANCE;
217+
default:
218+
return false;
219+
}
220+
}
221+
211222
static RubyString ensureValidEncoding(ThreadContext context, RubyString str) {
212223
Encoding encoding = str.getEncoding();
213224

0 commit comments

Comments
 (0)