1
1
#include <stdio.h>
2
2
#include <string.h>
3
3
#include <stdlib.h>
4
+ #include <stdbool.h>
4
5
5
6
#include "tokenize.h"
6
7
#include "util.h"
7
8
8
-
9
9
// Internal functions
10
- static token_list_t * allocate_token_list ();
11
- static token_t * reallocate_tokens (token_t * tokens , token_length_t length );
12
-
13
- static void append_token (token_list_t * token_list , token_type_t token_type , size_t pos_start );
10
+ static void append_token (dynarray2_t * token_list , token_type_t token_type , size_t pos_start );
14
11
12
+ static bool has_escape_seq (char * str , size_t offset );
15
13
static void read_string (char * * str_ptr , char * * read_str_ptr );
16
14
static void read_number (char * * str_ptr , void * * read_number_ptr , token_type_t * type_ptr );
17
15
static int * read_bool (char * * str_ptr );
@@ -22,48 +20,51 @@ static int read_null(char** str_ptr);
22
20
// - 1e25 number notation
23
21
// - should the tokenizer know about delimiters (",}]") after values? (falsenull or "string""string" and other things)
24
22
// - testcase: fill memory with garbage and check what tokenize returns
25
- token_list_t * tokenize (const char * str )
23
+ dynarray2_t * tokenize (const char * str )
26
24
{
27
- token_list_t * result = allocate_token_list ();
28
- result -> tokens = NULL ;
29
- result -> length = 0 ;
25
+ dynarray2_t * token_list = dynarray2_create (sizeof (token_t ));
26
+ token_t * current_token = NULL ;
30
27
31
28
char * cur = (char * ) str ;
32
29
size_t tmp_pos ;
33
30
31
+ char err_buf [1024 ] = {0 };
32
+
34
33
while (* cur ) {
35
34
int * bool_value_ptr ;
36
35
int null_result ;
37
36
38
37
switch (* cur ) {
39
38
case '{' :
40
- append_token (result , TokenCurly , cur - str );
39
+ append_token (token_list , TokenCurly , cur - str );
41
40
break ;
42
41
43
42
case '}' :
44
- append_token (result , TokenUncurly , cur - str );
43
+ append_token (token_list , TokenUncurly , cur - str );
45
44
break ;
46
45
47
46
case '[' :
48
- append_token (result , TokenSquare , cur - str );
47
+ append_token (token_list , TokenSquare , cur - str );
49
48
break ;
50
49
51
50
case ']' :
52
- append_token (result , TokenUnsquare , cur - str );
51
+ append_token (token_list , TokenUnsquare , cur - str );
53
52
break ;
54
53
55
54
case ':' :
56
- append_token (result , TokenColon , cur - str );
55
+ append_token (token_list , TokenColon , cur - str );
57
56
break ;
58
57
59
58
case ',' :
60
- append_token (result , TokenComma , cur - str );
59
+ append_token (token_list , TokenComma , cur - str );
61
60
break ;
62
61
63
62
case '\"' :
64
- append_token (result , TokenString , cur - str );
65
- read_string (& cur , (char * * ) & result -> tokens [result -> length - 1 ].value_ptr );
66
- result -> tokens [result -> length - 1 ]._pos_end = cur - str ;
63
+ append_token (token_list , TokenString , cur - str );
64
+ current_token = dynarray2_get_top (token_list );
65
+ read_string (& cur , (char * * ) & current_token -> value_ptr );
66
+ current_token -> _pos_end = cur - str ;
67
+ current_token = NULL ;
67
68
break ;
68
69
69
70
case '.' :
@@ -78,9 +79,11 @@ token_list_t* tokenize(const char* str)
78
79
case '7' :
79
80
case '8' :
80
81
case '9' :
81
- append_token (result , TokenLong , cur - str );
82
- read_number (& cur , & result -> tokens [result -> length - 1 ].value_ptr , & result -> tokens [result -> length - 1 ].type );
83
- result -> tokens [result -> length - 1 ]._pos_end = cur - str ;
82
+ append_token (token_list , TokenLong , cur - str );
83
+ current_token = dynarray2_get_top (token_list );
84
+ read_number (& cur , & current_token -> value_ptr , & current_token -> type );
85
+ current_token -> _pos_end = cur - str ;
86
+ current_token = NULL ;
84
87
break ;
85
88
86
89
// TODO fix falsenull and other insanities
@@ -93,21 +96,24 @@ token_list_t* tokenize(const char* str)
93
96
die ("tokenize: expected true or false literal after \"%c\" in input" , * cur );
94
97
}
95
98
96
- append_token (result , TokenBool , tmp_pos );
97
- result -> tokens [result -> length - 1 ].value_ptr = bool_value_ptr ;
98
- result -> tokens [result -> length - 1 ]._pos_end = cur - str ;
99
+ append_token (token_list , TokenBool , tmp_pos );
100
+ current_token = dynarray2_get_top (token_list );
101
+ current_token -> value_ptr = bool_value_ptr ;
102
+ current_token -> _pos_end = cur - str ;
103
+ current_token = NULL ;
99
104
break ;
100
105
101
106
case 'n' :
102
107
tmp_pos = cur ;
103
108
104
109
null_result = read_null (& cur );
105
110
if (-1 == null_result ) {
106
- die ("tokenize: expected null literal after \"%c\" in input" , * cur );
111
+ die ("tokenize: expected null literal after \"%c\" in input in position %d " , * cur , cur - str );
107
112
}
108
113
109
- append_token (result , TokenNull , tmp_pos );
110
- result -> tokens [result -> length - 1 ]._pos_end = cur - str ;
114
+ append_token (token_list , TokenNull , tmp_pos );
115
+ current_token = dynarray2_get_top (token_list );
116
+ current_token -> _pos_end = cur - str ;
111
117
break ;
112
118
113
119
case ' ' :
@@ -117,53 +123,43 @@ token_list_t* tokenize(const char* str)
117
123
break ;
118
124
119
125
default :
120
- die ("tokenize: unexpected char \"%c\" in input in position %d" , * cur , cur - str );
126
+ strncpy (err_buf , (str + (cur - str - 20 )), 41 );
127
+ die ("tokenize: unexpected char \"%c\" in input in position %d\n%s" , * cur , cur - str , err_buf );
121
128
}
122
129
123
130
cur ++ ;
124
131
}
125
132
126
- return result ;
133
+ return token_list ;
127
134
}
128
135
129
- static token_list_t * allocate_token_list ()
130
- {
131
- void * ptr = malloc (sizeof (token_list_t ));
132
- if (ptr == NULL ) {
133
- die ("allocate_token_list: failed to allocated memory" );
134
- }
135
-
136
- return ptr ;
137
- }
138
-
139
- void free_token_list (token_list_t * token_list )
140
- {
141
- for (token_length_t i = 0 ; i < token_list -> length ; i ++ ) {
142
- free (token_list -> tokens [i ].value_ptr );
143
- }
144
- free (token_list -> tokens );
145
- }
146
-
147
- // TODO allocate more in advance (estimate from the input length)
148
- static token_t * reallocate_tokens (token_t * tokens , token_length_t length )
136
+ // Internal functions
137
+ static void append_token (dynarray2_t * token_list , token_type_t token_type , size_t pos_start )
149
138
{
150
- void * ptr = realloc (tokens , length * sizeof (token_t ));
151
-
152
- if (ptr == NULL ) {
153
- die ("reallocate_token: failed to allocate memory" );
154
- }
155
-
156
- return ptr ;
139
+ token_t value = {
140
+ .type = token_type ,
141
+ .value_ptr = NULL ,
142
+ ._pos_start = pos_start
143
+ };
144
+ dynarray2_append (token_list , & value );
157
145
}
158
146
159
- // Internal functions
160
- static void append_token ( token_list_t * token_list , token_type_t token_type , size_t pos_start )
147
// Reports whether the two characters located at `str + offset` form one of
// the simple JSON escape sequences: \\  \/  \b  \f  \n  \r  \t
//
// str    - pointer into a NUL-terminated buffer
// offset - displacement from `str`; read_string() passes a negative value
//          (e.g. -2), which arrives here wrapped around as a huge size_t
//
// Returns true when a backslash followed by one of the characters above is
// found at that position, false otherwise.
//
// The escaped quote (\") is deliberately not recognised here: read_string()
// tests for a backslash directly in front of the quote on its own.
//
// TODO this must be mindful of the string boundaries! A negative offset can
//      still point in front of the buffer; the caller has to guarantee that
//      enough characters precede `str`.
// TODO add \uXXXX
static bool has_escape_seq(char *str, size_t offset)
{
    // A value in the upper half of the size_t range is a negative offset that
    // wrapped around during conversion. Adding it to the pointer directly is
    // out-of-range pointer arithmetic, so recover the magnitude and step
    // backwards instead.
    const size_t magnitude = (size_t) 0 - offset;
    const char *start = (offset > (size_t) -1 / 2) ? str - magnitude
                                                   : str + offset;

    if (start[0] != '\\') {
        return false;
    }

    // start[1] may be the terminating NUL; that falls through to `default`.
    switch (start[1]) {
    case '\\':
    case '/':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
        return true;
    default:
        return false;
    }
}
168
164
169
165
// Comes null terminated
@@ -179,7 +175,7 @@ static void read_string(char** str_ptr, char** read_str_ptr)
179
175
}
180
176
181
177
// Escaped \" are allowed in strings
182
- if (* * str_ptr == '\"' && * (* str_ptr - 1 ) != '\\' ) {
178
+ if (* * str_ptr == '\"' && * (* str_ptr - 1 ) != '\\' && ! has_escape_seq ( * str_ptr , -2 ) ) {
183
179
end = * str_ptr ;
184
180
* read_str_ptr = malloc (((end - start ) + 1 ) * sizeof (char ));
185
181
* * read_str_ptr = '\0' ;
0 commit comments