Skip to content

Commit

Permalink
Merge pull request #226 from jow-/lexer-improvements
Browse files (browse the repository at this point in the history)
Enhance lexer functionality and improve token reporting
  • Loading branch information
jow- authored Sep 23, 2024
2 parents fa22732 + 2b2e732 commit 9cf53dd
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 15 deletions.
12 changes: 11 additions & 1 deletion compiler.c
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,17 @@ uc_compiler_parse_advance(uc_compiler_t *compiler)
compiler->parser->prev = compiler->parser->curr;

while (true) {
compiler->parser->curr = *uc_lexer_next_token(&compiler->parser->lex);
uc_token_t *tok = uc_lexer_next_token(&compiler->parser->lex);

if (tok->type == TK_COMMENT || tok->type == TK_LSTM) {
ucv_put(tok->uv);
continue;
}
else if (tok->type == TK_RSTM) {
tok->type = TK_SCOL;
}

compiler->parser->curr = *tok;

if (compiler->parser->curr.type != TK_ERROR)
break;
Expand Down
9 changes: 6 additions & 3 deletions include/ucode/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,14 @@ typedef enum {
TK_EXPORT,

TK_EOF,
TK_COMMENT,
TK_ERROR
} uc_tokentype_t;

typedef enum {
UC_LEX_IDENTIFY_BLOCK,
UC_LEX_BLOCK_EXPRESSION_EMIT_TAG,
UC_LEX_BLOCK_STATEMENT_EMIT_TAG,
UC_LEX_BLOCK_COMMENT,
UC_LEX_IDENTIFY_TOKEN,
UC_LEX_PLACEHOLDER_START,
Expand All @@ -138,6 +140,7 @@ typedef struct {
uc_tokentype_t type;
uc_value_t *uv;
size_t pos;
size_t end;
} uc_token_t;

typedef struct {
Expand Down Expand Up @@ -174,10 +177,10 @@ typedef struct {
} uc_lexer_t;


__hidden void uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source);
__hidden void uc_lexer_free(uc_lexer_t *lex);
void uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source);
void uc_lexer_free(uc_lexer_t *lex);

__hidden uc_token_t *uc_lexer_next_token(uc_lexer_t *lex);
uc_token_t *uc_lexer_next_token(uc_lexer_t *lex);

__hidden bool uc_lexer_is_keyword(uc_value_t *label);

Expand Down
44 changes: 33 additions & 11 deletions lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv)
else
lex->curr.pos = (size_t)pos;

lex->curr.end = lex->source->off;

return &lex->curr;
}

Expand Down Expand Up @@ -172,16 +174,23 @@ emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_c
static uc_token_t *
parse_comment(uc_lexer_t *lex, int kind)
{
size_t off = lex->source->off - 1;
int ch;

uc_vector_push(&lex->buffer, '/');

while (true) {
ch = next_char(lex);

uc_vector_push(&lex->buffer, ch);

if (kind == '/' && (ch == '\n' || ch == EOF))
break;

if (kind == '*' && ch == '*' && check_char(lex, '/'))
if (kind == '*' && ch == '*' && check_char(lex, '/')) {
uc_vector_push(&lex->buffer, '/');
break;
}

if (ch == EOF) {
lex->state = UC_LEX_EOF;
Expand All @@ -190,7 +199,7 @@ parse_comment(uc_lexer_t *lex, int kind)
}
}

return NULL;
return emit_buffer(lex, off, TK_COMMENT, NULL);
}

static void
Expand Down Expand Up @@ -338,7 +347,7 @@ parse_escape(uc_lexer_t *lex, const char *regex_macros)
static uc_token_t *
parse_string(uc_lexer_t *lex, int kind)
{
uc_token_t *err;
uc_token_t *err, *tok;
unsigned type;
int code, ch;
size_t off;
Expand All @@ -359,7 +368,10 @@ parse_string(uc_lexer_t *lex, int kind)
if (type == TK_TEMPLATE && check_char(lex, '{')) {
lex->state = UC_LEX_PLACEHOLDER_START;

return emit_buffer(lex, off, type, NULL);
tok = emit_buffer(lex, off, type, NULL);
tok->end -= 2;

return tok;
}

uc_vector_push(&lex->buffer, '$');
Expand Down Expand Up @@ -952,8 +964,7 @@ lex_step(uc_lexer_t *lex)

/* found start of statement block */
case '%':
lex->state = UC_LEX_IDENTIFY_TOKEN;
lex->block = STATEMENTS;
lex->state = UC_LEX_BLOCK_STATEMENT_EMIT_TAG;

if (check_char(lex, '-'))
strip = " \n\t\v\f\r";
Expand Down Expand Up @@ -987,6 +998,8 @@ lex_step(uc_lexer_t *lex)
if (!tok)
continue;

tok->end -= 2;

return tok;


Expand All @@ -1012,18 +1025,24 @@ lex_step(uc_lexer_t *lex)
return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
}

tok = emit_op(lex, lex->lastoff, TK_COMMENT, NULL);

lex->lastoff = lex->source->off;
lex->state = UC_LEX_IDENTIFY_BLOCK;

continue;

return tok;

case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
lex->state = UC_LEX_IDENTIFY_TOKEN;
lex->block = EXPRESSION;

return emit_op(lex, lex->source->off, TK_LEXP, NULL);
return emit_op(lex, lex->source->off - 2, TK_LEXP, NULL);

case UC_LEX_BLOCK_STATEMENT_EMIT_TAG:
lex->state = UC_LEX_IDENTIFY_TOKEN;
lex->block = STATEMENTS;

return emit_op(lex, lex->source->off - 2, TK_LSTM, NULL);

case UC_LEX_IDENTIFY_TOKEN:
do { tok = lex_find_token(lex); } while (tok == NULL);
Expand All @@ -1042,7 +1061,7 @@ lex_step(uc_lexer_t *lex)
lex->state = UC_LEX_IDENTIFY_BLOCK;
lex->block = NONE;

tok = emit_op(lex, -2, TK_SCOL, NULL);
tok = emit_op(lex, -2, TK_RSTM, NULL);
}

/* found end of expression block */
Expand Down Expand Up @@ -1092,7 +1111,10 @@ lex_step(uc_lexer_t *lex)
case UC_LEX_PLACEHOLDER_END:
lex->state = UC_LEX_IDENTIFY_TOKEN;

return parse_string(lex, '`');
tok = parse_string(lex, '`');
tok->pos++;

return tok;


case UC_LEX_EOF:
Expand Down

0 comments on commit 9cf53dd

Please sign in to comment.