|
3 | 3 | // We assume the file in which it is included already includes
|
4 | 4 | // "simdjson/stage2.h" (this simplifies amalgamation)
|
5 | 5 |
|
6 |
| -#include "generic/stage2/tape_writer.h" |
7 | 6 | #include "generic/stage2/logger.h"
|
8 |
| -#include "generic/stage2/atomparsing.h" |
9 | 7 | #include "generic/stage2/structural_iterator.h"
|
10 | 8 |
|
11 | 9 | namespace { // Make everything here private
|
12 | 10 | namespace SIMDJSON_IMPLEMENTATION {
|
13 | 11 | namespace stage2 {
|
14 | 12 |
|
| 13 | +#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } |
| 14 | + |
| 15 | +template<typename T> |
15 | 16 | struct structural_parser : structural_iterator {
|
16 |
| - /** Lets you append to the tape */ |
17 |
| - tape_writer tape; |
18 |
| - /** Next write location in the string buf for stage 2 parsing */ |
19 |
| - uint8_t *current_string_buf_loc; |
| 17 | + /** Receiver that actually parses the strings and builds the tape */ |
| 18 | + T builder; |
20 | 19 | /** Current depth (nested objects and arrays) */
|
21 | 20 | uint32_t depth{0};
|
22 | 21 |
|
23 | 22 | // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
|
24 | 23 | really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
|
25 | 24 | : structural_iterator(_parser, start_structural_index),
|
26 |
| - tape{parser.doc->tape.get()}, |
27 |
| - current_string_buf_loc{parser.doc->string_buf.get()} { |
28 |
| - } |
29 |
| - |
30 |
| - WARN_UNUSED really_inline error_code start_scope(bool is_array) { |
31 |
| - depth++; |
32 |
| - if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } |
33 |
| - parser.containing_scope[depth].tape_index = next_tape_index(); |
34 |
| - parser.containing_scope[depth].count = 0; |
35 |
| - tape.skip(); // We don't actually *write* the start element until the end. |
36 |
| - parser.is_array[depth] = is_array; |
37 |
| - return SUCCESS; |
| 25 | + builder{parser.doc->tape.get(), parser.doc->string_buf.get()} { |
38 | 26 | }
|
39 | 27 |
|
40 | 28 | WARN_UNUSED really_inline error_code start_document() {
|
41 |
| - log_start_value("document"); |
42 |
| - parser.containing_scope[depth].tape_index = next_tape_index(); |
43 |
| - parser.containing_scope[depth].count = 0; |
44 |
| - tape.skip(); // We don't actually *write* the start element until the end. |
| 29 | + builder.start_document(*this); |
45 | 30 | parser.is_array[depth] = false;
|
46 | 31 | return SUCCESS;
|
47 | 32 | }
|
48 |
| - |
49 | 33 | WARN_UNUSED really_inline error_code start_object() {
|
50 |
| - log_start_value("object"); |
51 |
| - return start_scope(false); |
| 34 | + depth++; |
| 35 | + if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } |
| 36 | + builder.start_object(*this); |
| 37 | + parser.is_array[depth] = false; |
| 38 | + return SUCCESS; |
52 | 39 | }
|
53 |
| - |
54 | 40 | WARN_UNUSED really_inline error_code start_array() {
|
55 |
| - log_start_value("array"); |
56 |
| - return start_scope(true); |
57 |
| - } |
58 |
| - |
59 |
| - // this function is responsible for annotating the start of the scope |
60 |
| - really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { |
61 |
| - // SIMDJSON_ASSUME(depth > 0); |
62 |
| - // Write the ending tape element, pointing at the start location |
63 |
| - const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; |
64 |
| - tape.append(start_tape_index, end); |
65 |
| - // Write the start tape element, pointing at the end location (and including count) |
66 |
| - // count can overflow if it exceeds 24 bits... so we saturate |
67 |
| - // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). |
68 |
| - const uint32_t count = parser.containing_scope[depth].count; |
69 |
| - const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; |
70 |
| - tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); |
71 |
| - depth--; |
72 |
| - } |
73 |
| - |
74 |
| - really_inline uint32_t next_tape_index() { |
75 |
| - return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); |
| 41 | + depth++; |
| 42 | + if (depth >= parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; } |
| 43 | + builder.start_array(*this); |
| 44 | + parser.is_array[depth] = true; |
| 45 | + return SUCCESS; |
76 | 46 | }
|
77 |
| - |
78 | 47 | really_inline void end_object() {
|
79 |
| - log_end_value("object"); |
80 |
| - end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); |
| 48 | + builder.end_object(*this); |
| 49 | + depth--; |
81 | 50 | }
|
82 | 51 | really_inline void end_array() {
|
83 |
| - log_end_value("array"); |
84 |
| - end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); |
| 52 | + builder.end_array(*this); |
| 53 | + depth--; |
85 | 54 | }
|
86 | 55 | really_inline void end_document() {
|
87 |
| - log_end_value("document"); |
88 |
| - constexpr uint32_t start_tape_index = 0; |
89 |
| - tape.append(start_tape_index, internal::tape_type::ROOT); |
90 |
| - tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index(), internal::tape_type::ROOT); |
| 56 | + builder.end_document(*this); |
91 | 57 | }
|
92 | 58 |
|
93 |
| - really_inline void empty_container(internal::tape_type start, internal::tape_type end) { |
94 |
| - auto start_index = next_tape_index(); |
95 |
| - tape.append(start_index+2, start); |
96 |
| - tape.append(start_index, end); |
97 |
| - } |
98 | 59 | WARN_UNUSED really_inline bool empty_object() {
|
99 | 60 | if (peek_next_char() == '}') {
|
100 | 61 | advance_char();
|
101 |
| - log_value("empty object"); |
102 |
| - empty_container(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); |
| 62 | + builder.empty_object(*this); |
103 | 63 | return true;
|
104 | 64 | }
|
105 | 65 | return false;
|
106 | 66 | }
|
107 | 67 | WARN_UNUSED really_inline bool empty_array() {
|
108 | 68 | if (peek_next_char() == ']') {
|
109 | 69 | advance_char();
|
110 |
| - log_value("empty array"); |
111 |
| - empty_container(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); |
| 70 | + builder.empty_array(*this); |
112 | 71 | return true;
|
113 | 72 | }
|
114 | 73 | return false;
|
115 | 74 | }
|
116 | 75 |
|
117 |
| - // increment_count increments the count of keys in an object or values in an array. |
118 | 76 | really_inline void increment_count() {
|
119 |
| - parser.containing_scope[depth].count++; // we have a key value pair in the object at parser.depth - 1 |
120 |
| - } |
121 |
| - |
122 |
| - really_inline uint8_t *on_start_string() noexcept { |
123 |
| - // we advance the point, accounting for the fact that we have a NULL termination |
124 |
| - tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); |
125 |
| - return current_string_buf_loc + sizeof(uint32_t); |
126 |
| - } |
127 |
| - |
128 |
| - really_inline void on_end_string(uint8_t *dst) noexcept { |
129 |
| - uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); |
130 |
| - // TODO check for overflow in case someone has a crazy string (>=4GB?) |
131 |
| - // But only add the overflow check when the document itself exceeds 4GB |
132 |
| - // Currently unneeded because we refuse to parse docs larger or equal to 4GB. |
133 |
| - memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t)); |
134 |
| - // NULL termination is still handy if you expect all your strings to |
135 |
| - // be NULL terminated? It comes at a small cost |
136 |
| - *dst = 0; |
137 |
| - current_string_buf_loc = dst + 1; |
| 77 | + builder.increment_count(*this); |
138 | 78 | }
|
139 | 79 |
|
140 | 80 | WARN_UNUSED really_inline error_code parse_key(const uint8_t *key) {
|
141 |
| - return parse_string(key, true); |
142 |
| - } |
143 |
| - WARN_UNUSED really_inline error_code parse_string(const uint8_t *value, bool key = false) { |
144 |
| - log_value(key ? "key" : "string"); |
145 |
| - uint8_t *dst = on_start_string(); |
146 |
| - dst = stringparsing::parse_string(value, dst); |
147 |
| - if (dst == nullptr) { |
148 |
| - log_error("Invalid escape in string"); |
149 |
| - return STRING_ERROR; |
150 |
| - } |
151 |
| - on_end_string(dst); |
152 |
| - return SUCCESS; |
| 81 | + return builder.parse_key(*this, key); |
| 82 | + } |
| 83 | + WARN_UNUSED really_inline error_code parse_string(const uint8_t *value) { |
| 84 | + return builder.parse_string(*this, value); |
153 | 85 | }
|
154 |
| - |
155 | 86 | WARN_UNUSED really_inline error_code parse_number(const uint8_t *value) {
|
156 |
| - log_value("number"); |
157 |
| - if (!numberparsing::parse_number(value, tape)) { log_error("Invalid number"); return NUMBER_ERROR; } |
158 |
| - return SUCCESS; |
| 87 | + return builder.parse_number(*this, value); |
159 | 88 | }
|
160 |
| - |
161 |
| - really_inline error_code parse_root_number(const uint8_t *value) { |
162 |
| - // |
163 |
| - // We need to make a copy to make sure that the string is space terminated. |
164 |
| - // This is not about padding the input, which should already padded up |
165 |
| - // to len + SIMDJSON_PADDING. However, we have no control at this stage |
166 |
| - // on how the padding was done. What if the input string was padded with nulls? |
167 |
| - // It is quite common for an input string to have an extra null character (C string). |
168 |
| - // We do not want to allow 9\0 (where \0 is the null character) inside a JSON |
169 |
| - // document, but the string "9\0" by itself is fine. So we make a copy and |
170 |
| - // pad the input with spaces when we know that there is just one input element. |
171 |
| - // This copy is relatively expensive, but it will almost never be called in |
172 |
| - // practice unless you are in the strange scenario where you have many JSON |
173 |
| - // documents made of single atoms. |
174 |
| - // |
175 |
| - uint8_t *copy = static_cast<uint8_t *>(malloc(remaining_len() + SIMDJSON_PADDING)); |
176 |
| - if (copy == nullptr) { |
177 |
| - return MEMALLOC; |
178 |
| - } |
179 |
| - memcpy(copy, value, remaining_len()); |
180 |
| - memset(copy + remaining_len(), ' ', SIMDJSON_PADDING); |
181 |
| - error_code error = parse_number(copy); |
182 |
| - free(copy); |
183 |
| - return error; |
| 89 | + WARN_UNUSED really_inline error_code parse_root_number(const uint8_t *value) { |
| 90 | + return builder.parse_root_number(*this, value); |
184 | 91 | }
|
185 |
| - |
186 | 92 | WARN_UNUSED really_inline error_code parse_true_atom(const uint8_t *value) {
|
187 |
| - log_value("true"); |
188 |
| - if (!atomparsing::is_valid_true_atom(value)) { return T_ATOM_ERROR; } |
189 |
| - tape.append(0, internal::tape_type::TRUE_VALUE); |
190 |
| - return SUCCESS; |
| 93 | + return builder.parse_true_atom(*this, value); |
191 | 94 | }
|
192 |
| - |
193 | 95 | WARN_UNUSED really_inline error_code parse_root_true_atom(const uint8_t *value) {
|
194 |
| - log_value("true"); |
195 |
| - if (!atomparsing::is_valid_true_atom(value, remaining_len())) { return T_ATOM_ERROR; } |
196 |
| - tape.append(0, internal::tape_type::TRUE_VALUE); |
197 |
| - return SUCCESS; |
| 96 | + return builder.parse_root_true_atom(*this, value); |
198 | 97 | }
|
199 |
| - |
200 | 98 | WARN_UNUSED really_inline error_code parse_false_atom(const uint8_t *value) {
|
201 |
| - log_value("false"); |
202 |
| - if (!atomparsing::is_valid_false_atom(value)) { return F_ATOM_ERROR; } |
203 |
| - tape.append(0, internal::tape_type::FALSE_VALUE); |
204 |
| - return SUCCESS; |
| 99 | + return builder.parse_false_atom(*this, value); |
205 | 100 | }
|
206 |
| - |
207 | 101 | WARN_UNUSED really_inline error_code parse_root_false_atom(const uint8_t *value) {
|
208 |
| - log_value("false"); |
209 |
| - if (!atomparsing::is_valid_false_atom(value, remaining_len())) { return F_ATOM_ERROR; } |
210 |
| - tape.append(0, internal::tape_type::FALSE_VALUE); |
211 |
| - return SUCCESS; |
| 102 | + return builder.parse_root_false_atom(*this, value); |
212 | 103 | }
|
213 |
| - |
214 | 104 | WARN_UNUSED really_inline error_code parse_null_atom(const uint8_t *value) {
|
215 |
| - log_value("null"); |
216 |
| - if (!atomparsing::is_valid_null_atom(value)) { return N_ATOM_ERROR; } |
217 |
| - tape.append(0, internal::tape_type::NULL_VALUE); |
218 |
| - return SUCCESS; |
| 105 | + return builder.parse_null_atom(*this, value); |
219 | 106 | }
|
220 |
| - |
221 | 107 | WARN_UNUSED really_inline error_code parse_root_null_atom(const uint8_t *value) {
|
222 |
| - log_value("null"); |
223 |
| - if (!atomparsing::is_valid_null_atom(value, remaining_len())) { return N_ATOM_ERROR; } |
224 |
| - tape.append(0, internal::tape_type::NULL_VALUE); |
225 |
| - return SUCCESS; |
| 108 | + return builder.parse_root_null_atom(*this, value); |
226 | 109 | }
|
227 | 110 |
|
228 | 111 | WARN_UNUSED really_inline error_code start() {
|
@@ -266,12 +149,20 @@ struct structural_parser : structural_iterator {
|
266 | 149 | }
|
267 | 150 | }; // struct structural_parser
|
268 | 151 |
|
269 |
| -#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } } |
| 152 | +} // namespace stage2 |
| 153 | +} // namespace SIMDJSON_IMPLEMENTATION |
| 154 | +} // unnamed namespace |
| 155 | + |
| 156 | +#include "generic/stage2/tape_builder.h" |
| 157 | + |
| 158 | +namespace { // Make everything here private |
| 159 | +namespace SIMDJSON_IMPLEMENTATION { |
| 160 | +namespace stage2 { |
270 | 161 |
|
271 | 162 | template<bool STREAMING>
|
272 | 163 | WARN_UNUSED static really_inline error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
|
273 | 164 | dom_parser.doc = &doc;
|
274 |
| - stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); |
| 165 | + stage2::structural_parser<stage2::tape_builder> parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); |
275 | 166 | SIMDJSON_TRY( parser.start() );
|
276 | 167 |
|
277 | 168 | //
|
|
0 commit comments