Skip to content

Commit 51a88cc

Browse files
committed
Simplify tokenize by switching to dynarray2
1 parent 8cb50f6 commit 51a88cc

File tree

5 files changed

+74
-82
lines changed

5 files changed

+74
-82
lines changed

json.c

+6-6
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
#include "tokenize.h"
77
#include "parse.h"
88
#include "serialize.h"
9-
#include "dynarray.h"
9+
#include "dynarray2.h"
1010
#include "read_file.h"
1111

1212
int main(int argc, char** argv)
@@ -26,13 +26,13 @@ int main(int argc, char** argv)
2626

2727
// printf("Input file length = %zu\n", input->buffer_len);
2828

29-
token_list_t* tokens = tokenize(input->buffer);
29+
dynarray2_t* tokens = tokenize(input->buffer);
3030

31-
printf("\n\ntoken list length = %llu\n", tokens->length);
32-
if (tokens->length > 0) {
31+
printf("\n\ntoken list length = %zu\n", tokens->len);
32+
if (tokens->len > 0) {
3333
printf("TOKENS:\n\n");
34-
for (token_length_t i = 0; i < tokens->length; i++) {
35-
DEBUG_print_token(&tokens->tokens[i]);
34+
for (size_t i = 0; i < tokens->len; i++) {
35+
DEBUG_print_token(dynarray2_get(tokens, i));
3636
}
3737
printf("\n");
3838
}

parse.c

+4-4
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
typedef struct {
1212
node_t* root;
1313
dynarray_t* node_stack;
14-
token_list_t* token_list;
14+
dynarray2_t* token_list;
1515
token_length_t token_current;
1616
} parse_state_t;
1717

@@ -49,7 +49,7 @@ static node_t* _allocate_node(node_type_t type, void* value)
4949

5050
static inline token_t* _token_current(parse_state_t* state)
5151
{
52-
return &state->token_list->tokens[state->token_current];
52+
return dynarray2_get(state->token_list, state->token_current);
5353
}
5454

5555
static inline void _inc_token_current(parse_state_t* state)
@@ -122,7 +122,7 @@ static node_t* _consume_value(parse_state_t* state)
122122
return node;
123123
}
124124

125-
node_t* parse(token_list_t* token_list, const char* input_str)
125+
node_t* parse(dynarray2_t* token_list, const char* input_str)
126126
{
127127
parse_state_t state = {
128128
.root = NULL,
@@ -131,7 +131,7 @@ node_t* parse(token_list_t* token_list, const char* input_str)
131131
.token_list = token_list
132132
};
133133

134-
while (state.token_current < state.token_list->length) {
134+
while (state.token_current < state.token_list->len) {
135135

136136
node_t* node_top = NULL;
137137

parse.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ typedef struct {
1313
void* value;
1414
} node_t;
1515

16-
node_t* parse(token_list_t* token_list, const char* input_str);
16+
node_t* parse(dynarray2_t* token_list, const char* input_str);
1717
void free_node_list(node_t* node);
1818

1919
#endif //PARSE_H

tokenize.c

+60-64
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,15 @@
11
#include <stdio.h>
22
#include <string.h>
33
#include <stdlib.h>
4+
#include <stdbool.h>
45

56
#include "tokenize.h"
67
#include "util.h"
78

8-
99
// Internal functions
10-
static token_list_t* allocate_token_list();
11-
static token_t* reallocate_tokens(token_t* tokens, token_length_t length);
12-
13-
static void append_token(token_list_t* token_list, token_type_t token_type, size_t pos_start);
10+
static void append_token(dynarray2_t* token_list, token_type_t token_type, size_t pos_start);
1411

12+
static bool has_escape_seq(char* str, size_t offset);
1513
static void read_string(char** str_ptr, char** read_str_ptr);
1614
static void read_number(char** str_ptr, void** read_number_ptr, token_type_t* type_ptr);
1715
static int* read_bool(char** str_ptr);
@@ -22,48 +20,51 @@ static int read_null(char** str_ptr);
2220
// - 1e25 number notation
2321
// - should the tokenizer know about delimiters (",}]") after values? (falsenull or "string""string" and other things)
2422
// - testcase: fill memory with garbage and check what tokenize returns
25-
token_list_t* tokenize(const char* str)
23+
dynarray2_t* tokenize(const char* str)
2624
{
27-
token_list_t* result = allocate_token_list();
28-
result->tokens = NULL;
29-
result->length = 0;
25+
dynarray2_t* token_list = dynarray2_create(sizeof(token_t));
26+
token_t* current_token = NULL;
3027

3128
char* cur = (char*) str;
3229
size_t tmp_pos;
3330

31+
char err_buf[1024] = {0};
32+
3433
while (*cur) {
3534
int* bool_value_ptr;
3635
int null_result;
3736

3837
switch (*cur) {
3938
case '{':
40-
append_token(result, TokenCurly, cur - str);
39+
append_token(token_list, TokenCurly, cur - str);
4140
break;
4241

4342
case '}':
44-
append_token(result, TokenUncurly, cur - str);
43+
append_token(token_list, TokenUncurly, cur - str);
4544
break;
4645

4746
case '[':
48-
append_token(result, TokenSquare, cur - str);
47+
append_token(token_list, TokenSquare, cur - str);
4948
break;
5049

5150
case ']':
52-
append_token(result, TokenUnsquare, cur - str);
51+
append_token(token_list, TokenUnsquare, cur - str);
5352
break;
5453

5554
case ':':
56-
append_token(result, TokenColon, cur - str);
55+
append_token(token_list, TokenColon, cur - str);
5756
break;
5857

5958
case ',':
60-
append_token(result, TokenComma, cur - str);
59+
append_token(token_list, TokenComma, cur - str);
6160
break;
6261

6362
case '\"':
64-
append_token(result, TokenString, cur - str);
65-
read_string(&cur, (char**) &result->tokens[result->length - 1].value_ptr);
66-
result->tokens[result->length-1]._pos_end = cur - str;
63+
append_token(token_list, TokenString, cur - str);
64+
current_token = dynarray2_get_top(token_list);
65+
read_string(&cur, (char**) &current_token->value_ptr);
66+
current_token->_pos_end = cur - str;
67+
current_token = NULL;
6768
break;
6869

6970
case '.':
@@ -78,9 +79,11 @@ token_list_t* tokenize(const char* str)
7879
case '7':
7980
case '8':
8081
case '9':
81-
append_token(result, TokenLong, cur - str);
82-
read_number(&cur, &result->tokens[result->length - 1].value_ptr, &result->tokens[result->length - 1].type);
83-
result->tokens[result->length-1]._pos_end = cur - str;
82+
append_token(token_list, TokenLong, cur - str);
83+
current_token = dynarray2_get_top(token_list);
84+
read_number(&cur, &current_token->value_ptr, &current_token->type);
85+
current_token->_pos_end = cur - str;
86+
current_token = NULL;
8487
break;
8588

8689
// TODO fix falsenull and other insanities
@@ -93,21 +96,24 @@ token_list_t* tokenize(const char* str)
9396
die("tokenize: expected true or false literal after \"%c\" in input", *cur);
9497
}
9598

96-
append_token(result, TokenBool, tmp_pos);
97-
result->tokens[result->length - 1].value_ptr = bool_value_ptr;
98-
result->tokens[result->length-1]._pos_end = cur - str;
99+
append_token(token_list, TokenBool, tmp_pos);
100+
current_token = dynarray2_get_top(token_list);
101+
current_token->value_ptr = bool_value_ptr;
102+
current_token->_pos_end = cur - str;
103+
current_token = NULL;
99104
break;
100105

101106
case 'n':
102107
tmp_pos = cur;
103108

104109
null_result = read_null(&cur);
105110
if (-1 == null_result) {
106-
die("tokenize: expected null literal after \"%c\" in input", *cur);
111+
die("tokenize: expected null literal after \"%c\" in input in position %d", *cur, cur - str);
107112
}
108113

109-
append_token(result, TokenNull, tmp_pos);
110-
result->tokens[result->length-1]._pos_end = cur - str;
114+
append_token(token_list, TokenNull, tmp_pos);
115+
current_token = dynarray2_get_top(token_list);
116+
current_token->_pos_end = cur - str;
111117
break;
112118

113119
case ' ':
@@ -117,53 +123,43 @@ token_list_t* tokenize(const char* str)
117123
break;
118124

119125
default:
120-
die("tokenize: unexpected char \"%c\" in input in position %d", *cur, cur - str);
126+
strncpy(err_buf, (str + (cur - str - 20)), 41);
127+
die("tokenize: unexpected char \"%c\" in input in position %d\n%s", *cur, cur - str, err_buf);
121128
}
122129

123130
cur++;
124131
}
125132

126-
return result;
133+
return token_list;
127134
}
128135

129-
static token_list_t* allocate_token_list()
130-
{
131-
void *ptr = malloc(sizeof(token_list_t));
132-
if (ptr == NULL) {
133-
die("allocate_token_list: failed to allocated memory");
134-
}
135-
136-
return ptr;
137-
}
138-
139-
void free_token_list(token_list_t* token_list)
140-
{
141-
for (token_length_t i = 0; i < token_list->length; i++) {
142-
free(token_list->tokens[i].value_ptr);
143-
}
144-
free(token_list->tokens);
145-
}
146-
147-
// TODO allocate more in advance (estimate from the input length)
148-
static token_t* reallocate_tokens(token_t* tokens, token_length_t length)
136+
// Internal functions
137+
static void append_token(dynarray2_t* token_list, token_type_t token_type, size_t pos_start)
149138
{
150-
void* ptr = realloc(tokens, length * sizeof(token_t));
151-
152-
if (ptr == NULL) {
153-
die("reallocate_token: failed to allocate memory");
154-
}
155-
156-
return ptr;
139+
token_t value = {
140+
.type = token_type,
141+
.value_ptr = NULL,
142+
._pos_start = pos_start
143+
};
144+
dynarray2_append(token_list, &value);
157145
}
158146

159-
// Internal functions
160-
static void append_token(token_list_t* token_list, token_type_t token_type, size_t pos_start)
147+
// TODO this must be mindful of the string boundaries!
148+
static bool has_escape_seq(char* str, size_t offset)
161149
{
162-
token_list->tokens = reallocate_tokens(token_list->tokens, token_list->length + 1);
163-
token_list->tokens[token_list->length].type = token_type;
164-
token_list->tokens[token_list->length].value_ptr = NULL;
165-
token_list->tokens[token_list->length]._pos_start = pos_start;
166-
token_list->length += 1;
150+
char* start = str + offset;
151+
152+
return (
153+
strncmp(start, "\\\\", 2) == 0 ||
154+
// strncmp(start, "\\\"", 2) == 0 ||
155+
strncmp(start, "\\/", 2) == 0 ||
156+
strncmp(start, "\\b", 2) == 0 ||
157+
strncmp(start, "\\f", 2) == 0 ||
158+
strncmp(start, "\\n", 2) == 0 ||
159+
strncmp(start, "\\r", 2) == 0 ||
160+
strncmp(start, "\\t", 2) == 0
161+
// TODO add \uXXXX
162+
);
167163
}
168164

169165
// Comes null terminated
@@ -179,7 +175,7 @@ static void read_string(char** str_ptr, char** read_str_ptr)
179175
}
180176

181177
// Escaped \" are allowed in strings
182-
if (**str_ptr == '\"' && *(*str_ptr - 1) != '\\') {
178+
if (**str_ptr == '\"' && *(*str_ptr - 1) != '\\' && !has_escape_seq(*str_ptr, -2)) {
183179
end = *str_ptr;
184180
*read_str_ptr = malloc(((end - start) + 1) * sizeof(char));
185181
**read_str_ptr = '\0';

tokenize.h

+3-7
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
#ifndef TOKENIZE_H
22
#define TOKENIZE_H
33

4+
#include "dynarray2.h"
5+
46
typedef enum {
57
TokenCurly, TokenUncurly, TokenSquare, TokenUnsquare,
68
TokenColon, TokenComma,
@@ -18,13 +20,7 @@ typedef struct {
1820

1921
typedef unsigned long long token_length_t;
2022

21-
typedef struct {
22-
token_length_t length;
23-
token_t* tokens;
24-
} token_list_t;
25-
26-
token_list_t* tokenize(const char* str);
27-
void free_token_list(token_list_t* token_list);
23+
dynarray2_t* tokenize(const char* str);
2824

2925
void DEBUG_print_token(token_t* token);
3026

0 commit comments

Comments
 (0)