Skip to content

Commit 9c4db31

Browse files
simi and byroot committed
Add ryu float parser.
Co-Authored-By: Jean Boussier <[email protected]>
1 parent 965ba6c commit 9c4db31

File tree

6 files changed

+1091
-57
lines changed

6 files changed

+1091
-57
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
### Unreleased
44

5+
* Optimized floating point number parsing by integrating the ryu algorithm.
6+
57
### 2025-10-25 (2.15.2)
68

79
* Fix `JSON::Coder` to have one dedicated depth counter per invocation.

LEGAL

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,15 @@
66
All the files in this distribution are covered under either the Ruby's
77
license (see the file COPYING) or public-domain except some files
88
mentioned below.
9+
10+
ext/json/ext/vendor/fpconv.h::
11+
This file is adapted from https://github.com/night-shift/fpconv
12+
It is licensed under Boost Software License 1.0.
13+
14+
ext/json/ext/vendor/jeaiii-ltoa.h::
15+
This file is adapted from https://github.com/jeaiii/itoa
16+
It is licensed under the MIT License.
17+
18+
ext/json/ext/vendor/ryu.h::
19+
This file is adapted from the Ryu algorithm by Ulf Adams https://github.com/ulfjack/ryu.
20+
It is dual-licensed under Apache License 2.0 OR Boost Software License 1.0.

benchmark/parser.rb

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,4 @@ def benchmark_parsing(name, json_output)
5757
benchmark_parsing "activitypub.json", File.read("#{__dir__}/data/activitypub.json")
5858
benchmark_parsing "twitter.json", File.read("#{__dir__}/data/twitter.json")
5959
benchmark_parsing "citm_catalog.json", File.read("#{__dir__}/data/citm_catalog.json")
60-
61-
# rapidjson is 8x faster thanks to its much more performant float parser.
62-
# Unfortunately, there aren't many existing fast float parsers in pure C,
63-
# and including C++ is problematic.
64-
# Aside from that, we're close to the alternatives here.
6560
benchmark_parsing "float parsing", File.read("#{__dir__}/data/canada.json")

ext/json/ext/parser/parser.c

Lines changed: 89 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "ruby.h"
22
#include "ruby/encoding.h"
3+
#include "../vendor/ryu.h"
34

45
/* shims */
56
/* This is the fallback definition from Ruby 3.4 */
@@ -20,6 +21,16 @@ typedef unsigned char _Bool;
2021
#endif
2122
#endif
2223

24+
#if SIZEOF_UINT64_T == SIZEOF_LONG_LONG
25+
# define INT64T2NUM(x) LL2NUM(x)
26+
# define UINT64T2NUM(x) ULL2NUM(x)
27+
#elif SIZEOF_UINT64_T == SIZEOF_LONG
28+
# define INT64T2NUM(x) LONG2NUM(x)
29+
# define UINT64T2NUM(x) ULONG2NUM(x)
30+
#else
31+
# error No uint64_t conversion
32+
#endif
33+
2334
#include "../simd/simd.h"
2435

2536
#ifndef RB_UNLIKELY
@@ -755,26 +766,6 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
755766
}
756767

757768
#define MAX_FAST_INTEGER_SIZE 18
758-
static inline VALUE fast_decode_integer(const char *p, const char *pe)
759-
{
760-
bool negative = false;
761-
if (*p == '-') {
762-
negative = true;
763-
p++;
764-
}
765-
766-
long long memo = 0;
767-
while (p < pe) {
768-
memo *= 10;
769-
memo += *p - '0';
770-
p++;
771-
}
772-
773-
if (negative) {
774-
memo = -memo;
775-
}
776-
return LL2NUM(memo);
777-
}
778769

779770
static VALUE json_decode_large_integer(const char *start, long len)
780771
{
@@ -788,17 +779,27 @@ static VALUE json_decode_large_integer(const char *start, long len)
788779
}
789780

790781
static inline VALUE
791-
json_decode_integer(const char *start, const char *end)
782+
json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end)
792783
{
793-
long len = end - start;
794-
if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) {
795-
return fast_decode_integer(start, end);
784+
if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) {
785+
if (negative) {
786+
return INT64T2NUM(-((int64_t)mantissa));
787+
}
788+
return UINT64T2NUM(mantissa);
796789
}
797-
return json_decode_large_integer(start, len);
790+
791+
return json_decode_large_integer(start, end - start);
798792
}
799793

800794
static VALUE json_decode_large_float(const char *start, long len)
801795
{
796+
if (RB_LIKELY(len < 64)) {
797+
char buffer[64];
798+
MEMCPY(buffer, start, char, len);
799+
buffer[len] = '\0';
800+
return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
801+
}
802+
802803
VALUE buffer_v;
803804
char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
804805
MEMCPY(buffer, start, char, len);
@@ -808,21 +809,24 @@ static VALUE json_decode_large_float(const char *start, long len)
808809
return number;
809810
}
810811

811-
static VALUE json_decode_float(JSON_ParserConfig *config, const char *start, const char *end)
812+
/* Ruby JSON optimized float decoder using vendored Ryu algorithm
813+
* Accepts pre-extracted mantissa and exponent from first-pass validation
814+
*/
815+
static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int32_t exponent, bool negative,
816+
const char *start, const char *end)
812817
{
813-
long len = end - start;
814-
815818
if (RB_UNLIKELY(config->decimal_class)) {
816-
VALUE text = rb_str_new(start, len);
819+
VALUE text = rb_str_new(start, end - start);
817820
return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text);
818-
} else if (RB_LIKELY(len < 64)) {
819-
char buffer[64];
820-
MEMCPY(buffer, start, char, len);
821-
buffer[len] = '\0';
822-
return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
823-
} else {
824-
return json_decode_large_float(start, len);
825821
}
822+
823+
// Fall back to rb_cstr_to_dbl for potential subnormals (rare edge case)
824+
// Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308)
825+
if (RB_UNLIKELY(mantissa_digits > 17 || mantissa_digits + exponent < -307)) {
826+
return json_decode_large_float(start, end - start);
827+
}
828+
829+
return DBL2NUM(ryu_s2d_from_parts(mantissa, mantissa_digits, exponent, negative));
826830
}
827831

828832
static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count)
@@ -1082,57 +1086,90 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
10821086
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
10831087
bool integer = true;
10841088

1089+
// Variables for Ryu optimization - extract digits during parsing
1090+
uint64_t mantissa = 0;
1091+
int mantissa_digits = 0;
1092+
int32_t exponent = 0;
1093+
bool negative = false;
1094+
int decimal_point_pos = -1;
1095+
10851096
// /\A-?(0|[1-9]\d*)(\.\d+)?([Ee][-+]?\d+)?/
10861097
const char *start = state->cursor;
1087-
state->cursor++;
10881098

1089-
while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1099+
// Handle optional negative sign
1100+
if (*state->cursor == '-') {
1101+
negative = true;
10901102
state->cursor++;
1103+
if (state->cursor >= state->end || !rb_isdigit(*state->cursor)) {
1104+
raise_parse_error_at("invalid number: %s", state, start);
1105+
}
10911106
}
10921107

1093-
long integer_length = state->cursor - start;
1108+
// Parse integer part and extract mantissa digits
1109+
while ((state->cursor < state->end) && rb_isdigit(*state->cursor)) {
1110+
mantissa = mantissa * 10 + (*state->cursor - '0');
1111+
mantissa_digits++;
1112+
state->cursor++;
1113+
}
10941114

1095-
if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) {
1115+
if (RB_UNLIKELY(start[0] == '0' && mantissa_digits > 1)) {
10961116
raise_parse_error_at("invalid number: %s", state, start);
1097-
} else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) {
1098-
raise_parse_error_at("invalid number: %s", state, start);
1099-
} else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) {
1117+
} else if (RB_UNLIKELY(mantissa_digits > 1 && negative && start[1] == '0')) {
11001118
raise_parse_error_at("invalid number: %s", state, start);
11011119
}
11021120

1121+
// Parse fractional part
11031122
if ((state->cursor < state->end) && (*state->cursor == '.')) {
11041123
integer = false;
1124+
decimal_point_pos = mantissa_digits; // Remember position of decimal point
11051125
state->cursor++;
11061126

1107-
if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
1108-
raise_parse_error("invalid number: %s", state);
1127+
if (state->cursor == state->end || !rb_isdigit(*state->cursor)) {
1128+
raise_parse_error_at("invalid number: %s", state, start);
11091129
}
11101130

1111-
while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1131+
while ((state->cursor < state->end) && rb_isdigit(*state->cursor)) {
1132+
mantissa = mantissa * 10 + (*state->cursor - '0');
1133+
mantissa_digits++;
11121134
state->cursor++;
11131135
}
11141136
}
11151137

1138+
// Parse exponent
11161139
if ((state->cursor < state->end) && ((*state->cursor == 'e') || (*state->cursor == 'E'))) {
11171140
integer = false;
11181141
state->cursor++;
1119-
if ((state->cursor < state->end) && ((*state->cursor == '+') || (*state->cursor == '-'))) {
1142+
1143+
bool negative_exponent = false;
1144+
if ((state->cursor < state->end) && ((*state->cursor == '-') || (*state->cursor == '+'))) {
1145+
negative_exponent = (*state->cursor == '-');
11201146
state->cursor++;
11211147
}
11221148

1123-
if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
1124-
raise_parse_error("invalid number: %s", state);
1149+
if (state->cursor == state->end || !rb_isdigit(*state->cursor)) {
1150+
raise_parse_error_at("invalid number: %s", state, start);
11251151
}
11261152

1127-
while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1153+
while ((state->cursor < state->end) && rb_isdigit(*state->cursor)) {
1154+
exponent = exponent * 10 + (*state->cursor - '0');
11281155
state->cursor++;
11291156
}
1157+
1158+
if (negative_exponent) {
1159+
exponent = -exponent;
1160+
}
11301161
}
11311162

11321163
if (integer) {
1133-
return json_push_value(state, config, json_decode_integer(start, state->cursor));
1164+
return json_push_value(state, config, json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor));
11341165
}
1135-
return json_push_value(state, config, json_decode_float(config, start, state->cursor));
1166+
1167+
// Adjust exponent based on decimal point position
1168+
if (decimal_point_pos >= 0) {
1169+
exponent -= (mantissa_digits - decimal_point_pos);
1170+
}
1171+
1172+
return json_push_value(state, config, json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor));
11361173
}
11371174
case '"': {
11381175
// %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"}

0 commit comments

Comments
 (0)