Skip to content

Commit 03d54f8

Browse files
committed
Use SAX model for stage 2
1 parent 553e6d7 commit 03d54f8

File tree

3 files changed

+246
-160
lines changed

3 files changed

+246
-160
lines changed

src/generic/stage2/logger.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ namespace logger {
2828
if (LOG_ENABLED) {
2929
log_depth = 0;
3030
printf("\n");
31-
printf("| %-*s | %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#", 5, "Tape#");
32-
printf("|%.*s|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES, 5+2, DASHES);
31+
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
32+
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
3333
}
3434
}
3535

@@ -71,7 +71,7 @@ namespace logger {
7171
} else {
7272
printf("| %-*s ", LOG_INDEX_LEN, "");
7373
}
74-
printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
74+
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
7575
printf("| %-s ", detail);
7676
printf("|\n");
7777
}

src/generic/stage2/structural_parser.h

Lines changed: 48 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -3,226 +3,109 @@
33
// We assume the file in which it is included already includes
44
// "simdjson/stage2.h" (this simplifies amalgamation)
55

6-
#include "generic/stage2/tape_writer.h"
76
#include "generic/stage2/logger.h"
8-
#include "generic/stage2/atomparsing.h"
97
#include "generic/stage2/structural_iterator.h"
108

119
namespace { // Make everything here private
1210
namespace SIMDJSON_IMPLEMENTATION {
1311
namespace stage2 {
1412

13+
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
14+
15+
template<typename T>
1516
struct structural_parser : structural_iterator {
16-
/** Lets you append to the tape */
17-
tape_writer tape;
18-
/** Next write location in the string buf for stage 2 parsing */
19-
uint8_t *current_string_buf_loc;
17+
/** Receiver that actually parses the strings and builds the tape */
18+
T builder;
2019
/** Current depth (nested objects and arrays) */
2120
uint32_t depth{0};
2221

2322
// For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
2423
really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
2524
: structural_iterator(_parser, start_structural_index),
26-
tape{parser.doc->tape.get()},
27-
current_string_buf_loc{parser.doc->string_buf.get()} {
28-
}
29-
30-
WARN_UNUSED really_inline error_code start_scope(bool is_array) {
31-
depth++;
32-
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
33-
parser.containing_scope[depth].tape_index = next_tape_index();
34-
parser.containing_scope[depth].count = 0;
35-
tape.skip(); // We don't actually *write* the start element until the end.
36-
parser.is_array[depth] = is_array;
37-
return SUCCESS;
25+
builder{parser.doc->tape.get(), parser.doc->string_buf.get()} {
3826
}
3927

4028
WARN_UNUSED really_inline error_code start_document() {
41-
log_start_value("document");
42-
parser.containing_scope[depth].tape_index = next_tape_index();
43-
parser.containing_scope[depth].count = 0;
44-
tape.skip(); // We don't actually *write* the start element until the end.
29+
builder.start_document(*this);
4530
parser.is_array[depth] = false;
4631
return SUCCESS;
4732
}
48-
4933
WARN_UNUSED really_inline error_code start_object() {
50-
log_start_value("object");
51-
return start_scope(false);
34+
depth++;
35+
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
36+
builder.start_object(*this);
37+
parser.is_array[depth] = false;
38+
return SUCCESS;
5239
}
53-
5440
WARN_UNUSED really_inline error_code start_array() {
55-
log_start_value("array");
56-
return start_scope(true);
57-
}
58-
59-
// this function is responsible for annotating the start of the scope
60-
really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
61-
// SIMDJSON_ASSUME(depth > 0);
62-
// Write the ending tape element, pointing at the start location
63-
const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
64-
tape.append(start_tape_index, end);
65-
// Write the start tape element, pointing at the end location (and including count)
66-
// count can overflow if it exceeds 24 bits... so we saturate
67-
// the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff).
68-
const uint32_t count = parser.containing_scope[depth].count;
69-
const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
70-
tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
71-
depth--;
72-
}
73-
74-
really_inline uint32_t next_tape_index() {
75-
return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
41+
depth++;
42+
if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
43+
builder.start_array(*this);
44+
parser.is_array[depth] = true;
45+
return SUCCESS;
7646
}
77-
7847
really_inline void end_object() {
79-
log_end_value("object");
80-
end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
48+
builder.end_object(*this);
49+
depth--;
8150
}
8251
really_inline void end_array() {
83-
log_end_value("array");
84-
end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
52+
builder.end_array(*this);
53+
depth--;
8554
}
8655
really_inline void end_document() {
87-
log_end_value("document");
88-
constexpr uint32_t start_tape_index = 0;
89-
tape.append(start_tape_index, internal::tape_type::ROOT);
90-
tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index(), internal::tape_type::ROOT);
56+
builder.end_document(*this);
9157
}
9258

93-
really_inline void empty_container(internal::tape_type start, internal::tape_type end) {
94-
auto start_index = next_tape_index();
95-
tape.append(start_index+2, start);
96-
tape.append(start_index, end);
97-
}
9859
WARN_UNUSED really_inline bool empty_object() {
9960
if (peek_next_char() == '}') {
10061
advance_char();
101-
log_value("empty object");
102-
empty_container(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
62+
builder.empty_object(*this);
10363
return true;
10464
}
10565
return false;
10666
}
10767
WARN_UNUSED really_inline bool empty_array() {
10868
if (peek_next_char() == ']') {
10969
advance_char();
110-
log_value("empty array");
111-
empty_container(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
70+
builder.empty_array(*this);
11271
return true;
11372
}
11473
return false;
11574
}
11675

117-
// increment_count increments the count of keys in an object or values in an array.
11876
really_inline void increment_count() {
119-
parser.containing_scope[depth].count++; // we have a key value pair in the object at parser.depth - 1
120-
}
121-
122-
really_inline uint8_t *on_start_string() noexcept {
123-
// we advance the point, accounting for the fact that we have a NULL termination
124-
tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
125-
return current_string_buf_loc + sizeof(uint32_t);
126-
}
127-
128-
really_inline void on_end_string(uint8_t *dst) noexcept {
129-
uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
130-
// TODO check for overflow in case someone has a crazy string (>=4GB?)
131-
// But only add the overflow check when the document itself exceeds 4GB
132-
// Currently unneeded because we refuse to parse docs larger or equal to 4GB.
133-
memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
134-
// NULL termination is still handy if you expect all your strings to
135-
// be NULL terminated? It comes at a small cost
136-
*dst = 0;
137-
current_string_buf_loc = dst + 1;
77+
builder.increment_count(*this);
13878
}
13979

14080
WARN_UNUSED really_inline error_code parse_key(const uint8_t *key) {
141-
return parse_string(key, true);
142-
}
143-
WARN_UNUSED really_inline error_code parse_string(const uint8_t *value, bool key = false) {
144-
log_value(key ? "key" : "string");
145-
uint8_t *dst = on_start_string();
146-
dst = stringparsing::parse_string(value, dst);
147-
if (dst == nullptr) {
148-
log_error("Invalid escape in string");
149-
return STRING_ERROR;
150-
}
151-
on_end_string(dst);
152-
return SUCCESS;
81+
return builder.parse_key(*this, key);
82+
}
83+
WARN_UNUSED really_inline error_code parse_string(const uint8_t *value) {
84+
return builder.parse_string(*this, value);
15385
}
154-
15586
WARN_UNUSED really_inline error_code parse_number(const uint8_t *value) {
156-
log_value("number");
157-
if (!numberparsing::parse_number(value, tape)) { log_error("Invalid number"); return NUMBER_ERROR; }
158-
return SUCCESS;
87+
return builder.parse_number(*this, value);
15988
}
160-
161-
really_inline error_code parse_root_number(const uint8_t *value) {
162-
//
163-
// We need to make a copy to make sure that the string is space terminated.
164-
// This is not about padding the input, which should already be padded up
165-
// to len + SIMDJSON_PADDING. However, we have no control at this stage
166-
// on how the padding was done. What if the input string was padded with nulls?
167-
// It is quite common for an input string to have an extra null character (C string).
168-
// We do not want to allow 9\0 (where \0 is the null character) inside a JSON
169-
// document, but the string "9\0" by itself is fine. So we make a copy and
170-
// pad the input with spaces when we know that there is just one input element.
171-
// This copy is relatively expensive, but it will almost never be called in
172-
// practice unless you are in the strange scenario where you have many JSON
173-
// documents made of single atoms.
174-
//
175-
uint8_t *copy = static_cast<uint8_t *>(malloc(remaining_len() + SIMDJSON_PADDING));
176-
if (copy == nullptr) {
177-
return MEMALLOC;
178-
}
179-
memcpy(copy, value, remaining_len());
180-
memset(copy + remaining_len(), ' ', SIMDJSON_PADDING);
181-
error_code error = parse_number(copy);
182-
free(copy);
183-
return error;
89+
WARN_UNUSED really_inline error_code parse_root_number(const uint8_t *value) {
90+
return builder.parse_root_number(*this, value);
18491
}
185-
18692
WARN_UNUSED really_inline error_code parse_true_atom(const uint8_t *value) {
187-
log_value("true");
188-
if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; }
189-
tape.append(0, internal::tape_type::TRUE_VALUE);
190-
return SUCCESS;
93+
return builder.parse_true_atom(*this, value);
19194
}
192-
19395
WARN_UNUSED really_inline error_code parse_root_true_atom(const uint8_t *value) {
194-
log_value("true");
195-
if (!atomparsing::is_valid_true_atom(value, remaining_len())) { return T_ATOM_ERROR; }
196-
tape.append(0, internal::tape_type::TRUE_VALUE);
197-
return SUCCESS;
96+
return builder.parse_root_true_atom(*this, value);
19897
}
199-
20098
WARN_UNUSED really_inline error_code parse_false_atom(const uint8_t *value) {
201-
log_value("false");
202-
if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; }
203-
tape.append(0, internal::tape_type::FALSE_VALUE);
204-
return SUCCESS;
99+
return builder.parse_false_atom(*this, value);
205100
}
206-
207101
WARN_UNUSED really_inline error_code parse_root_false_atom(const uint8_t *value) {
208-
log_value("false");
209-
if (!atomparsing::is_valid_false_atom(value, remaining_len())) { return F_ATOM_ERROR; }
210-
tape.append(0, internal::tape_type::FALSE_VALUE);
211-
return SUCCESS;
102+
return builder.parse_root_false_atom(*this, value);
212103
}
213-
214104
WARN_UNUSED really_inline error_code parse_null_atom(const uint8_t *value) {
215-
log_value("null");
216-
if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; }
217-
tape.append(0, internal::tape_type::NULL_VALUE);
218-
return SUCCESS;
105+
return builder.parse_null_atom(*this, value);
219106
}
220-
221107
WARN_UNUSED really_inline error_code parse_root_null_atom(const uint8_t *value) {
222-
log_value("null");
223-
if (!atomparsing::is_valid_null_atom(value, remaining_len())) { return N_ATOM_ERROR; }
224-
tape.append(0, internal::tape_type::NULL_VALUE);
225-
return SUCCESS;
108+
return builder.parse_root_null_atom(*this, value);
226109
}
227110

228111
WARN_UNUSED really_inline error_code start() {
@@ -266,12 +149,20 @@ struct structural_parser : structural_iterator {
266149
}
267150
}; // struct structural_parser
268151

269-
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
152+
} // namespace stage2
153+
} // namespace SIMDJSON_IMPLEMENTATION
154+
} // unnamed namespace
155+
156+
#include "generic/stage2/tape_builder.h"
157+
158+
namespace { // Make everything here private
159+
namespace SIMDJSON_IMPLEMENTATION {
160+
namespace stage2 {
270161

271162
template<bool STREAMING>
272163
WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
273164
dom_parser.doc = &doc;
274-
stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
165+
stage2::structural_parser<stage2::tape_builder> parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
275166
SIMDJSON_TRY( parser.start() );
276167

277168
//

0 commit comments

Comments
 (0)