Simplify the lexer

This commit is contained in:
Přemysl Eric Janouch 2017-05-24 20:03:17 +02:00
parent 383c9d8fd2
commit c5cd74d910
Signed by: p
GPG Key ID: B715679E3A361BE6
1 changed file with 41 additions and 74 deletions

115
ell.c
View File

@ -23,7 +23,6 @@
#include <ctype.h> #include <ctype.h>
#include <errno.h> #include <errno.h>
#include <stdarg.h> #include <stdarg.h>
#include <assert.h>
#include <stdbool.h> #include <stdbool.h>
#include <setjmp.h> #include <setjmp.h>
@ -215,9 +214,7 @@ struct lexer {
/// Input has to be null-terminated anyway /// Input has to be null-terminated anyway
static void static void
lexer_init (struct lexer *self, const char *p, size_t len) { lexer_init (struct lexer *self, const char *p, size_t len) {
memset (self, 0, sizeof *self); *self = (struct lexer) { .p = p, .len = len };
self->p = p;
self->len = len;
} }
static void static void
@ -225,11 +222,6 @@ lexer_free (struct lexer *self) {
free (self->string.s); free (self->string.s);
} }
/// Tell whether @a c is a space or a tab.  Because strchr() also matches
/// the terminating NUL of the string it searches, '\0' is reported as well.
static bool
lexer_is_ignored (int c) {
	return strchr (" \t", c) != NULL;
}
/// Tell whether @a c may appear in a bare word, i.e. it is neither ignored
/// nor one of the special characters listed below.  A NUL byte is never
/// a word character, since strchr() matches the terminator of its haystack.
static bool
lexer_is_word_char (int c) {
	if (lexer_is_ignored (c))
		return false;
	return strchr ("()[]{}\n;@#'", c) == NULL;
}
static int static int
lexer_advance (struct lexer *self) { lexer_advance (struct lexer *self) {
int c = *self->p++; int c = *self->p++;
@ -245,58 +237,38 @@ lexer_advance (struct lexer *self) {
static bool static bool
lexer_hexa_escape (struct lexer *self, struct buffer *output) { lexer_hexa_escape (struct lexer *self, struct buffer *output) {
int i; const char *alphabet = "0123456789abcdef", *h, *l;
unsigned char code = 0; if (!self->len || !(h = strchr (alphabet, tolower (lexer_advance (self))))
|| !self->len || !(l = strchr (alphabet, tolower (lexer_advance (self)))))
for (i = 0; self->len && i < 2; i++) {
unsigned char c = tolower (*self->p);
if (c >= '0' && c <= '9')
code = (code << 4) | (c - '0');
else if (c >= 'a' && c <= 'f')
code = (code << 4) | (c - 'a' + 10);
else
break;
lexer_advance (self);
}
if (!i)
return false; return false;
buffer_append_c (output, code); buffer_append_c (output, (h - alphabet) << 4 | (l - alphabet));
return true; return true;
} }
/// Characters with a special meaning to the lexer
enum {
	LEXER_STRING_QUOTE = '\'',  ///< Opens and closes a quoted string
	LEXER_ESCAPE       = '\\',  ///< Introduces an escape sequence in a string
	LEXER_COMMENT      = '#'    ///< Starts a comment running until a newline
};
/// Tell whether @a c is skipped between tokens: a NUL byte, a space or a tab.
static bool
lexer_is_whitespace (int c) {
	switch (c) {
	case '\0':
	case ' ':
	case '\t':
		return true;
	default:
		return false;
	}
}
/// Maps the character that follows LEXER_ESCAPE to the byte it stands for.
/// Entries left at zero are not valid escapes; the table is only ever read,
/// hence it is constant.
static const unsigned char lexer_escapes[256] = {
	[LEXER_STRING_QUOTE] = LEXER_STRING_QUOTE,
	[LEXER_ESCAPE] = LEXER_ESCAPE,
	['a'] = '\a', ['b'] = '\b', ['n'] = '\n', ['r'] = '\r', ['t'] = '\t',
};
static const char * static const char *
lexer_escape_sequence (struct lexer *self, struct buffer *output) { lexer_escape_sequence (struct lexer *self, struct buffer *output) {
if (!self->len) if (!self->len)
return "premature end of escape sequence"; return "premature end of escape sequence";
unsigned char c = *self->p; unsigned char c = lexer_advance (self);
switch (c) { if (c == 'x') {
case '"': break;
case '\\': break;
case 'a': c = '\a'; break;
case 'b': c = '\b'; break;
case 'f': c = '\f'; break;
case 'n': c = '\n'; break;
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'v': c = '\v'; break;
case 'x':
case 'X':
lexer_advance (self);
if (lexer_hexa_escape (self, output)) if (lexer_hexa_escape (self, output))
return NULL; return NULL;
return "invalid hexadecimal escape"; return "invalid hexadecimal escape";
default:
return "unknown escape sequence";
} }
if (!(c = lexer_escapes[c]))
return "unknown escape sequence";
buffer_append_c (output, c); buffer_append_c (output, c);
lexer_advance (self);
return NULL; return NULL;
} }
@ -305,9 +277,9 @@ lexer_string (struct lexer *self, struct buffer *output) {
unsigned char c; unsigned char c;
const char *e = NULL; const char *e = NULL;
while (self->len) { while (self->len) {
if ((c = lexer_advance (self)) == '\'') if ((c = lexer_advance (self)) == LEXER_STRING_QUOTE)
return NULL; return NULL;
if (c != '\\') if (c != LEXER_ESCAPE)
buffer_append_c (output, c); buffer_append_c (output, c);
else if ((e = lexer_escape_sequence (self, output))) else if ((e = lexer_escape_sequence (self, output)))
return e; return e;
@ -315,10 +287,15 @@ lexer_string (struct lexer *self, struct buffer *output) {
return "premature end of string"; return "premature end of string";
} }
/// Maps single characters to the token that lexer_next() returns for them.
/// Characters left at zero have no token of their own and end up as part
/// of a bare word instead; the table is only ever read, hence it is constant.
// NOTE(review): the zero test relies on none of the tokens listed here
// having the enumeration value zero -- confirm against enum token.
static const enum token lexer_tokens[256] = {
	['('] = T_LPAREN,   [')'] = T_RPAREN,
	['['] = T_LBRACKET, [']'] = T_RBRACKET,
	['{'] = T_LBRACE,   ['}'] = T_RBRACE,
	[';'] = T_NEWLINE,  ['\n'] = T_NEWLINE,
	['@'] = T_AT,       [LEXER_STRING_QUOTE] = T_STRING,
};
static enum token static enum token
lexer_next (struct lexer *self, const char **e) { lexer_next (struct lexer *self, const char **e) {
// Skip over any whitespace between tokens while (self->len && lexer_is_whitespace (*self->p))
while (self->len && lexer_is_ignored (*self->p))
lexer_advance (self); lexer_advance (self);
if (!self->len) if (!self->len)
return T_ABORT; return T_ABORT;
@ -326,36 +303,26 @@ lexer_next (struct lexer *self, const char **e) {
free (self->string.s); free (self->string.s);
self->string = (struct buffer) BUFFER_INITIALIZER; self->string = (struct buffer) BUFFER_INITIALIZER;
switch (*self->p) { unsigned char c = lexer_advance (self);
case '(': lexer_advance (self); return T_LPAREN; if (c == LEXER_COMMENT) {
case ')': lexer_advance (self); return T_RPAREN;
case '[': lexer_advance (self); return T_LBRACKET;
case ']': lexer_advance (self); return T_RBRACKET;
case '{': lexer_advance (self); return T_LBRACE;
case '}': lexer_advance (self); return T_RBRACE;
case '\n': lexer_advance (self); return T_NEWLINE;
case ';': lexer_advance (self); return T_NEWLINE;
case '@': lexer_advance (self); return T_AT;
case '#':
// Comments go until newline
while (self->len) while (self->len)
if (lexer_advance (self) == '\n') if (lexer_advance (self) == '\n')
return T_NEWLINE; return T_NEWLINE;
return T_ABORT; return T_ABORT;
case '\'':
lexer_advance (self);
if ((*e = lexer_string (self, &self->string)))
return T_ABORT;
return T_STRING;
} }
assert (lexer_is_word_char (*self->p)); enum token token = lexer_tokens[c];
do if (!token) {
buffer_append_c (&self->string, lexer_advance (self)); buffer_append_c (&self->string, c);
while (lexer_is_word_char (*self->p)); while (self->len && !lexer_is_whitespace (*self->p)
return T_STRING; && !lexer_tokens[(unsigned char) *self->p])
buffer_append_c (&self->string, lexer_advance (self));
return T_STRING;
}
if (token == T_STRING
&& (*e = lexer_string (self, &self->string)))
return T_ABORT;
return token;
} }
static char *lexer_errorf (struct lexer *self, const char *fmt, ...) static char *lexer_errorf (struct lexer *self, const char *fmt, ...)
@ -371,7 +338,7 @@ lexer_errorf (struct lexer *self, const char *fmt, ...) {
if (!description) if (!description)
return NULL; return NULL;
char *e = format ("near line %u, column %u: %s", char *e = format ("at or before line %u, column %u: %s",
self->line + 1, self->column + 1, description); self->line + 1, self->column + 1, description);
free (description); free (description);
return e; return e;