Simplify the lexer
This commit is contained in:
parent
383c9d8fd2
commit
c5cd74d910
111
ell.c
111
ell.c
|
@ -23,7 +23,6 @@
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <assert.h>
|
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
#include <setjmp.h>
|
#include <setjmp.h>
|
||||||
|
|
||||||
|
@ -215,9 +214,7 @@ struct lexer {
|
||||||
/// Input has to be null-terminated anyway
|
/// Input has to be null-terminated anyway
|
||||||
static void
|
static void
|
||||||
lexer_init (struct lexer *self, const char *p, size_t len) {
|
lexer_init (struct lexer *self, const char *p, size_t len) {
|
||||||
memset (self, 0, sizeof *self);
|
*self = (struct lexer) { .p = p, .len = len };
|
||||||
self->p = p;
|
|
||||||
self->len = len;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -225,11 +222,6 @@ lexer_free (struct lexer *self) {
|
||||||
free (self->string.s);
|
free (self->string.s);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool lexer_is_ignored (int c) { return strchr (" \t", c); }
|
|
||||||
static bool lexer_is_word_char (int c) {
|
|
||||||
return !lexer_is_ignored (c) && !strchr ("()[]{}\n;@#'", c);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
lexer_advance (struct lexer *self) {
|
lexer_advance (struct lexer *self) {
|
||||||
int c = *self->p++;
|
int c = *self->p++;
|
||||||
|
@ -245,58 +237,38 @@ lexer_advance (struct lexer *self) {
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
lexer_hexa_escape (struct lexer *self, struct buffer *output) {
|
lexer_hexa_escape (struct lexer *self, struct buffer *output) {
|
||||||
int i;
|
const char *alphabet = "0123456789abcdef", *h, *l;
|
||||||
unsigned char code = 0;
|
if (!self->len || !(h = strchr (alphabet, tolower (lexer_advance (self))))
|
||||||
|
|| !self->len || !(l = strchr (alphabet, tolower (lexer_advance (self)))))
|
||||||
for (i = 0; self->len && i < 2; i++) {
|
|
||||||
unsigned char c = tolower (*self->p);
|
|
||||||
if (c >= '0' && c <= '9')
|
|
||||||
code = (code << 4) | (c - '0');
|
|
||||||
else if (c >= 'a' && c <= 'f')
|
|
||||||
code = (code << 4) | (c - 'a' + 10);
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
|
|
||||||
lexer_advance (self);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!i)
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
buffer_append_c (output, code);
|
buffer_append_c (output, (h - alphabet) << 4 | (l - alphabet));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enum { LEXER_STRING_QUOTE = '\'', LEXER_ESCAPE = '\\', LEXER_COMMENT = '#' };
|
||||||
|
static bool lexer_is_whitespace (int c) { return !c || c == ' ' || c == '\t'; }
|
||||||
|
|
||||||
|
static unsigned char lexer_escapes[256] = {
|
||||||
|
[LEXER_STRING_QUOTE] = LEXER_STRING_QUOTE, [LEXER_ESCAPE] = LEXER_ESCAPE,
|
||||||
|
['a'] = '\a', ['b'] = '\b', ['n'] = '\n', ['r'] = '\r', ['t'] = '\t',
|
||||||
|
};
|
||||||
|
|
||||||
static const char *
|
static const char *
|
||||||
lexer_escape_sequence (struct lexer *self, struct buffer *output) {
|
lexer_escape_sequence (struct lexer *self, struct buffer *output) {
|
||||||
if (!self->len)
|
if (!self->len)
|
||||||
return "premature end of escape sequence";
|
return "premature end of escape sequence";
|
||||||
|
|
||||||
unsigned char c = *self->p;
|
unsigned char c = lexer_advance (self);
|
||||||
switch (c) {
|
if (c == 'x') {
|
||||||
case '"': break;
|
|
||||||
case '\\': break;
|
|
||||||
case 'a': c = '\a'; break;
|
|
||||||
case 'b': c = '\b'; break;
|
|
||||||
case 'f': c = '\f'; break;
|
|
||||||
case 'n': c = '\n'; break;
|
|
||||||
case 'r': c = '\r'; break;
|
|
||||||
case 't': c = '\t'; break;
|
|
||||||
case 'v': c = '\v'; break;
|
|
||||||
|
|
||||||
case 'x':
|
|
||||||
case 'X':
|
|
||||||
lexer_advance (self);
|
|
||||||
if (lexer_hexa_escape (self, output))
|
if (lexer_hexa_escape (self, output))
|
||||||
return NULL;
|
return NULL;
|
||||||
return "invalid hexadecimal escape";
|
return "invalid hexadecimal escape";
|
||||||
|
|
||||||
default:
|
|
||||||
return "unknown escape sequence";
|
|
||||||
}
|
}
|
||||||
|
if (!(c = lexer_escapes[c]))
|
||||||
|
return "unknown escape sequence";
|
||||||
|
|
||||||
buffer_append_c (output, c);
|
buffer_append_c (output, c);
|
||||||
lexer_advance (self);
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -305,9 +277,9 @@ lexer_string (struct lexer *self, struct buffer *output) {
|
||||||
unsigned char c;
|
unsigned char c;
|
||||||
const char *e = NULL;
|
const char *e = NULL;
|
||||||
while (self->len) {
|
while (self->len) {
|
||||||
if ((c = lexer_advance (self)) == '\'')
|
if ((c = lexer_advance (self)) == LEXER_STRING_QUOTE)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (c != '\\')
|
if (c != LEXER_ESCAPE)
|
||||||
buffer_append_c (output, c);
|
buffer_append_c (output, c);
|
||||||
else if ((e = lexer_escape_sequence (self, output)))
|
else if ((e = lexer_escape_sequence (self, output)))
|
||||||
return e;
|
return e;
|
||||||
|
@ -315,10 +287,15 @@ lexer_string (struct lexer *self, struct buffer *output) {
|
||||||
return "premature end of string";
|
return "premature end of string";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static enum token lexer_tokens[256] = {
|
||||||
|
['('] = T_LPAREN, [')'] = T_RPAREN, ['['] = T_LBRACKET, [']'] = T_RBRACKET,
|
||||||
|
['{'] = T_LBRACE, ['}'] = T_RBRACE, [';'] = T_NEWLINE, ['\n'] = T_NEWLINE,
|
||||||
|
['@'] = T_AT, [LEXER_STRING_QUOTE] = T_STRING,
|
||||||
|
};
|
||||||
|
|
||||||
static enum token
|
static enum token
|
||||||
lexer_next (struct lexer *self, const char **e) {
|
lexer_next (struct lexer *self, const char **e) {
|
||||||
// Skip over any whitespace between tokens
|
while (self->len && lexer_is_whitespace (*self->p))
|
||||||
while (self->len && lexer_is_ignored (*self->p))
|
|
||||||
lexer_advance (self);
|
lexer_advance (self);
|
||||||
if (!self->len)
|
if (!self->len)
|
||||||
return T_ABORT;
|
return T_ABORT;
|
||||||
|
@ -326,37 +303,27 @@ lexer_next (struct lexer *self, const char **e) {
|
||||||
free (self->string.s);
|
free (self->string.s);
|
||||||
self->string = (struct buffer) BUFFER_INITIALIZER;
|
self->string = (struct buffer) BUFFER_INITIALIZER;
|
||||||
|
|
||||||
switch (*self->p) {
|
unsigned char c = lexer_advance (self);
|
||||||
case '(': lexer_advance (self); return T_LPAREN;
|
if (c == LEXER_COMMENT) {
|
||||||
case ')': lexer_advance (self); return T_RPAREN;
|
|
||||||
case '[': lexer_advance (self); return T_LBRACKET;
|
|
||||||
case ']': lexer_advance (self); return T_RBRACKET;
|
|
||||||
case '{': lexer_advance (self); return T_LBRACE;
|
|
||||||
case '}': lexer_advance (self); return T_RBRACE;
|
|
||||||
case '\n': lexer_advance (self); return T_NEWLINE;
|
|
||||||
case ';': lexer_advance (self); return T_NEWLINE;
|
|
||||||
case '@': lexer_advance (self); return T_AT;
|
|
||||||
|
|
||||||
case '#':
|
|
||||||
// Comments go until newline
|
|
||||||
while (self->len)
|
while (self->len)
|
||||||
if (lexer_advance (self) == '\n')
|
if (lexer_advance (self) == '\n')
|
||||||
return T_NEWLINE;
|
return T_NEWLINE;
|
||||||
return T_ABORT;
|
return T_ABORT;
|
||||||
|
|
||||||
case '\'':
|
|
||||||
lexer_advance (self);
|
|
||||||
if ((*e = lexer_string (self, &self->string)))
|
|
||||||
return T_ABORT;
|
|
||||||
return T_STRING;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
assert (lexer_is_word_char (*self->p));
|
enum token token = lexer_tokens[c];
|
||||||
do
|
if (!token) {
|
||||||
|
buffer_append_c (&self->string, c);
|
||||||
|
while (self->len && !lexer_is_whitespace (*self->p)
|
||||||
|
&& !lexer_tokens[(unsigned char) *self->p])
|
||||||
buffer_append_c (&self->string, lexer_advance (self));
|
buffer_append_c (&self->string, lexer_advance (self));
|
||||||
while (lexer_is_word_char (*self->p));
|
|
||||||
return T_STRING;
|
return T_STRING;
|
||||||
}
|
}
|
||||||
|
if (token == T_STRING
|
||||||
|
&& (*e = lexer_string (self, &self->string)))
|
||||||
|
return T_ABORT;
|
||||||
|
return token;
|
||||||
|
}
|
||||||
|
|
||||||
static char *lexer_errorf (struct lexer *self, const char *fmt, ...)
|
static char *lexer_errorf (struct lexer *self, const char *fmt, ...)
|
||||||
ATTRIBUTE_PRINTF (2, 3);
|
ATTRIBUTE_PRINTF (2, 3);
|
||||||
|
@ -371,7 +338,7 @@ lexer_errorf (struct lexer *self, const char *fmt, ...) {
|
||||||
if (!description)
|
if (!description)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
char *e = format ("near line %u, column %u: %s",
|
char *e = format ("at or before line %u, column %u: %s",
|
||||||
self->line + 1, self->column + 1, description);
|
self->line + 1, self->column + 1, description);
|
||||||
free (description);
|
free (description);
|
||||||
return e;
|
return e;
|
||||||
|
|
Loading…
Reference in New Issue