Browse Source

Initial commit

master
Přemysl Janouch 3 years ago
commit
128fb157b3
Signed by: Přemysl Janouch <p.janouch@gmail.com> GPG Key ID: B715679E3A361BE6

+ 15
- 0
LICENSE View File

@@ -0,0 +1,15 @@
Copyright (c) 2016, Přemysl Janouch <p.janouch@gmail.com>
All rights reserved.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.


+ 12
- 0
Makefile View File

@@ -0,0 +1,12 @@
# All we need is C99 and POSIX, which this should make available
CFLAGS = -std=gnu99
NAMES = bfc-amd64-linux

all: $(NAMES)

%: %.c
$(CC) $(CPPFLAGS) $(CFLAGS) $< -o $@
clean:
rm -f $(NAMES)

.PHONY: all clean

+ 49
- 0
README.adoc View File

@@ -0,0 +1,49 @@
bfc
===

'bfc' is a small, fast, self-contained, optimizing Brainfuck compiler for Linux
on Intel x86-64.

Also included are several interpreters in various states of sophistication that
document my progress as I was writing this, from the simplest approach to an
optimizing JIT compiler.

It's pretty easy to retarget the compiler, it just means redoing half the work.
The compiler itself is platform agnostic.

Building
--------
Build dependencies: a C99 compiler +
Runtime dependencies: Linux

$ git clone https://github.com/pjanouch/bfc.git
$ cd bfc
$ make

To obtain dumps of the intermediate representation, compile with `-DDEBUG`:

$ make CPPFLAGS=-DDEBUG

Usage
-----

./bfc-amd64-linux [INPUT-FILE] [OUTPUT-FILE]

When no input file is specified, stdin is used. Similarly, the default output
filename is a.out. The resulting file can be run on the target platform.

Contributing and Support
------------------------
Use this project's GitHub to report any bugs, request features, or submit pull
requests. If you want to discuss this project, or maybe just hang out with
the developer, feel free to join me at irc://irc.janouch.name, channel #dev.

License
-------
'bfc' is written by Přemysl Janouch <p.janouch@gmail.com>.

You may use the software under the terms of the ISC license, the text of which
is included within the package, or, at your option, you may relicense the work
under the MIT or the Modified BSD License, as listed at the following site:

http://www.gnu.org/licenses/license-list.html

+ 723
- 0
bfc-amd64-linux.c View File

@@ -0,0 +1,723 @@
// This is an exercise in futility more than anything else
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <assert.h>
#include <errno.h>

#define exit_fatal(...) \
do { \
fprintf (stderr, "fatal: " __VA_ARGS__); \
exit (EXIT_FAILURE); \
} while (0)

// --- Safe memory management --------------------------------------------------

static void *
xcalloc (size_t m, size_t n)
{
void *p = calloc (m, n);
if (!p)
exit_fatal ("calloc: %s\n", strerror (errno));
return p;
}

static void *
xrealloc (void *o, size_t n)
{
void *p = realloc (o, n);
if (!p && n)
exit_fatal ("realloc: %s\n", strerror (errno));
return p;
}

// --- Dynamically allocated strings -------------------------------------------

struct str
{
char *str; ///< String data, null terminated
size_t alloc; ///< How many bytes are allocated
size_t len; ///< How long the string actually is
};

static void
str_init (struct str *self)
{
self->len = 0;
self->str = xcalloc (1, (self->alloc = 16));
}

static void
str_ensure_space (struct str *self, size_t n)
{
// We allocate at least one more byte for the terminating null character
size_t new_alloc = self->alloc;
while (new_alloc <= self->len + n)
new_alloc <<= 1;
if (new_alloc != self->alloc)
self->str = xrealloc (self->str, (self->alloc = new_alloc));
}

static void
str_append_data (struct str *self, const void *data, size_t n)
{
str_ensure_space (self, n);
memcpy (self->str + self->len, data, n);
self->str[self->len += n] = '\0';
}

static void
str_append_c (struct str *self, char c)
{
str_append_data (self, &c, 1);
}

// --- Application -------------------------------------------------------------

enum command
{
RIGHT, LEFT, INC, DEC, IN, OUT, BEGIN, END,
SET, EAT, INCACC, DECACC
};

bool grouped[] = { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
struct instruction { enum command cmd; int offset; size_t arg; };
#define INSTRUCTION(c, o, a) (struct instruction) { (c), (o), (a) }

// - - Debugging - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#ifdef DEBUG
static void
debug_dump_instruction (FILE *fp, const struct instruction *in)
{
const char *name;
switch (in->cmd)
{
case RIGHT: name = "RIGHT "; break;
case LEFT: name = "LEFT "; break;
case INC: name = "INC "; break;
case DEC: name = "DEC "; break;
case OUT: name = "OUT "; break;
case IN: name = "IN "; break;
case BEGIN: name = "BEGIN "; break;
case END: name = "END "; break;
case SET: name = "SET "; break;
case EAT: name = "EAT "; break;
case INCACC: name = "INCACC"; break;
case DECACC: name = "DECACC"; break;
}
fprintf (fp, "%s %zu", name, in->arg);
if (in->offset != 0)
fprintf (fp, " [%d]", in->offset);
fprintf (fp, "\n");
}

static void
debug_dump (const char *filename, struct instruction *in, size_t len)
{
FILE *fp = fopen (filename, "w");
long indent = 0;
for (size_t i = 0; i < len; i++)
{
if (in[i].cmd == END)
indent--;
for (long k = 0; k < indent; k++)
fputs (" ", fp);
debug_dump_instruction (fp, &in[i]);
if (in[i].cmd == BEGIN)
indent++;
}
fclose (fp);
}
#else
#define debug_dump(...)
#endif

// - - Optimization passes - - - - - - - - - - - - - - - - - - - - - - - - - - -

static size_t
optimize_assignment (struct instruction *irb, size_t irb_len)
{
size_t in = 0, out = 0;
for (; in < irb_len; in++, out++)
{
if (in + 2 < irb_len
&& irb[in ].cmd == BEGIN
&& irb[in + 1].cmd == DEC && irb[in + 1].arg == 1
&& irb[in + 2].cmd == END)
{
irb[out] = INSTRUCTION (SET, 0, 0);
in += 2;
}
else if (out && irb[out - 1].cmd == SET && irb[in].cmd == INC)
irb[--out].arg += irb[in].arg;
else if (out != in)
irb[out] = irb[in];
}
return out;
}

// Add offsets to INC/DEC/SET stuck between LEFT/RIGHT
// and compress the LEFT/RIGHT sequences
static size_t
optimize_offseted_inc_dec (struct instruction *irb, size_t irb_len)
{
size_t in = 0, out = 0;
for (in = 0, out = 0; in < irb_len; in++, out++)
{
intptr_t dir = 0;
if (irb[in].cmd == RIGHT)
dir = irb[in].arg;
else if (irb[in].cmd == LEFT)
dir = -(intptr_t) irb[in].arg;
else
{
irb[out] = irb[in];
continue;
}

while (in + 2 < irb_len)
{
// An immediate offset has its limits on x86-64
if (dir < INT8_MIN || dir > INT8_MAX)
break;

intptr_t diff;
if (irb[in + 2].cmd == RIGHT)
diff = irb[in + 2].arg;
else if (irb[in + 2].cmd == LEFT)
diff = -(intptr_t) irb[in + 2].arg;
else
break;

int cmd = irb[in + 1].cmd;
if (cmd != INC && cmd != DEC && cmd != SET)
break;

irb[out] = irb[in + 1];
irb[out].offset = dir;

dir += diff;
out += 1;
in += 2;
}

for (; in + 1 < irb_len; in++)
{
if (irb[in + 1].cmd == RIGHT)
dir += irb[in + 1].arg;
else if (irb[in + 1].cmd == LEFT)
dir -= (intptr_t) irb[in + 1].arg;
else
break;
}

if (!dir)
out--;
else if (dir > 0)
irb[out] = INSTRUCTION (RIGHT, 0, dir);
else
irb[out] = INSTRUCTION (LEFT, 0, -dir);
}
return out;
}

// Try to eliminate loops that eat a cell and add/subtract its value
// to/from some other cell
static size_t
optimize_inc_dec_loops (struct instruction *irb, size_t irb_len)
{
size_t in = 0, out = 0;
for (in = 0, out = 0; in < irb_len; in++, out++)
{
irb[out] = irb[in];
if (irb[in].cmd != BEGIN)
continue;

bool ok = false;
size_t count = 0;
for (size_t k = in + 1; k < irb_len; k++)
{
if (irb[k].cmd == END)
{
ok = true;
break;
}
if (irb[k].cmd != INC
&& irb[k].cmd != DEC)
break;
count++;
}
if (!ok)
continue;

// Stable sort operations by their offsets, put [0] first
bool sorted;
do
{
sorted = true;
for (size_t k = 1; k < count; k++)
{
if (irb[in + k].offset == 0)
continue;
if (irb[in + k + 1].offset != 0
&& irb[in + k].offset <= irb[in + k + 1].offset)
continue;

struct instruction tmp = irb[in + k + 1];
irb[in + k + 1] = irb[in + k];
irb[in + k] = tmp;
sorted = false;
}
}
while (!sorted);

// Abort the optimization on duplicate offsets (complication with [0])
for (size_t k = 1; k < count; k++)
if (irb[in + k].offset == irb[in + k + 1].offset)
ok = false;
// XXX: can't make the code longer either
for (size_t k = 1; k <= count; k++)
if (irb[in + k].arg != 1)
ok = false;
if (!ok
|| irb[in + 1].cmd != DEC
|| irb[in + 1].offset != 0)
continue;

int min_safe_left_offset = 0;
if (in > 1 && irb[in - 1].cmd == RIGHT)
min_safe_left_offset = -irb[in - 1].arg;

bool cond_needed_for_safety = false;
for (size_t k = 0; k < count; k++)
if (irb[in + k + 1].offset < min_safe_left_offset)
{
cond_needed_for_safety = true;
break;
}

in++;
if (cond_needed_for_safety)
out++;

irb[out] = INSTRUCTION (EAT, 0, 0);
for (size_t k = 1; k < count; k++)
irb[out + k] = INSTRUCTION (irb[in + k].cmd == INC
? INCACC : DECACC, irb[in + k].offset, 0);

in += count;
out += count;

if (cond_needed_for_safety)
irb[out] = INSTRUCTION (END, 0, 0);
else
out--;
}
return out;
}

// - - Loop pairing - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

static void
pair_loops (struct instruction *irb, size_t irb_len)
{
size_t nesting = 0;
size_t *stack = xcalloc (sizeof *stack, irb_len);
for (size_t i = 0; i < irb_len; i++)
{
switch (irb[i].cmd)
{
case BEGIN:
stack[nesting++] = i;
break;
case END:
if (nesting <= 0)
exit_fatal ("unbalanced loops\n");

--nesting;
irb[stack[nesting]].arg = i + 1;

// Looping can be disabled by optimizations
if (irb[i].arg)
irb[i].arg = stack[nesting] + 1;
default:
break;
}
}
free (stack);

if (nesting != 0)
exit_fatal ("unbalanced loops\n");
}

// - - Main - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

int
main (int argc, char *argv[])
{
if (argc > 3)
exit_fatal ("usage: %s [INPUT-FILE]\n", argv[0]);

FILE *input_file = stdin;
if (argc > 1 && !(input_file = fopen (argv[1], "r")))
exit_fatal ("fopen: %s: %s\n", argv[1], strerror (errno));

const char *output_path = "a.out";
if (argc > 2)
output_path = argv[2];

struct str buffer;
str_init (&buffer);

int c;
while ((c = fgetc (input_file)) != EOF)
str_append_c (&buffer, c);
if (ferror (input_file))
exit_fatal ("can't read program\n");
fclose (input_file);

// - - Decode, group and optimize - - - - - - - - - - - - - - - - - - - - - - -

// This is our Intermediate Representation Buffer
struct instruction *irb = xcalloc (sizeof *irb, buffer.len);
size_t irb_len = 0;

for (size_t i = 0; i < buffer.len; i++)
{
enum command cmd;
switch (buffer.str[i])
{
case '>': cmd = RIGHT; break;
case '<': cmd = LEFT; break;
case '+': cmd = INC; break;
case '-': cmd = DEC; break;
case '.': cmd = OUT; break;
case ',': cmd = IN; break;
case '[': cmd = BEGIN; break;
case ']': cmd = END; break;
default: continue;
}

// The most basic optimization is to group identical commands together
if (!irb_len || !grouped[cmd] || irb[irb_len - 1].cmd != cmd)
irb_len++;

irb[irb_len - 1].cmd = cmd;
irb[irb_len - 1].arg++;
}

debug_dump ("bf-no-opt.txt", irb, irb_len);
irb_len = optimize_assignment (irb, irb_len);
debug_dump ("bf-pre-offsets.txt", irb, irb_len);
irb_len = optimize_offseted_inc_dec (irb, irb_len);
debug_dump ("bf-pre-incdec-unloop.txt", irb, irb_len);
irb_len = optimize_inc_dec_loops (irb, irb_len);
debug_dump ("bf-optimized.txt", irb, irb_len);
pair_loops (irb, irb_len);
debug_dump ("bf-final.txt", irb, irb_len);

// - - Code generation - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

str_init (&buffer);
size_t *offsets = xcalloc (sizeof *offsets, irb_len + 1);
bool *sets_flags = xcalloc (sizeof *sets_flags, irb_len);

#define CODE(x) { char t[] = x; str_append_data (&buffer, t, sizeof t - 1); }
#define LE(v) (uint8_t[]) { v, v>>8, v>>16, v>>24, v>>32, v>>40, v>>48, v>>56 }
#define DB(x) { uint64_t v = (x); str_append_data (&buffer, LE (v), 1); }
#define DW(x) { uint64_t v = (x); str_append_data (&buffer, LE (v), 2); }
#define DD(x) { uint64_t v = (x); str_append_data (&buffer, LE (v), 4); }
#define DQ(x) { uint64_t v = (x); str_append_data (&buffer, LE (v), 8); }

enum
{
ELF_LOAD_CODE = 0x400000, // where code is loaded (usual)
ELF_LOAD_DATA = 0x800000 // where the tape is placed
};

CODE ("\xB8") DD (ELF_LOAD_DATA) // mov rax, "ELF_LOAD_DATA"
CODE ("\x30\xDB") // xor bl, bl

for (size_t i = 0; i < irb_len; i++)
{
offsets[i] = buffer.len;

size_t arg = irb[i].arg;
assert (arg <= UINT32_MAX);

int offset = irb[i].offset;
assert (offset <= INT8_MAX && offset >= INT8_MIN);

// Don't save what we've just loaded
if (irb[i].cmd == LEFT || irb[i].cmd == RIGHT)
if (i < 2 || i + 1 >= irb_len
|| (irb[i - 2].cmd != LEFT && irb[i - 2].cmd != RIGHT)
|| irb[i - 1].cmd != BEGIN
|| irb[i + 1].cmd != END)
CODE ("\x88\x18") // mov [rax], bl

switch (irb[i].cmd)
{
case RIGHT:
// add rax, "arg" -- optimistic, no boundary checking
if (arg > INT8_MAX) { CODE ("\x48\x05") DD (arg) }
else { CODE ("\x48\x83\xC0") DB (arg) }
break;
case LEFT:
// sub rax, "arg" -- optimistic, no boundary checking
if (arg > INT8_MAX) { CODE ("\x48\x2D") DD (arg) }
else { CODE ("\x48\x83\xE8") DB (arg) }
break;

case EAT:
// NOTE: the kernel destroys rcx and r11 on syscalls,
// there must be no OUT or IN between EAT and INCACC/DECACC
CODE ("\x88\xD9" "\x30\xDB") // mov cl, bl; xor bl, bl
sets_flags[i] = true;
break;
case INCACC:
if (offset)
{
CODE ("\x00\x48") DB (offset) // add [rax+"offset"], cl
}
else
{
CODE ("\x00\xCB") // add bl, cl
sets_flags[i] = true;
}
break;
case DECACC:
if (offset)
{
CODE ("\x28\x48") DB (offset) // sub [rax+"offset"], cl
}
else
{
CODE ("\x28\xCB") // sub bl, cl
sets_flags[i] = true;
}
break;

case INC:
if (offset)
{
CODE ("\x80\x40") DB (offset) // add byte [rax+"offset"], "arg"
}
else
{
CODE ("\x80\xC3") // add bl, "arg"
sets_flags[i] = true;
}
DB (arg)
break;
case DEC:
if (offset)
{
CODE ("\x80\x68") DB (offset) // sub byte [rax+"offset"], "arg"
}
else
{
CODE ("\x80\xEB") // sub bl, "arg"
sets_flags[i] = true;
}
DB (arg)
break;
case SET:
if (offset)
{
CODE ("\xC6\x40") DB (offset) // mov byte [rax+"offset"], "arg"
}
else
CODE ("\xB3") // mov bl, "arg"
DB (arg)
break;

case OUT:
CODE ("\xE8") DD (0) // call "write"
break;
case IN:
CODE ("\xE8") DD (0) // call "read"
break;

case BEGIN:
// Don't test the register when the flag has been set already;
// this doesn't have much of an effect in practice
if (!i || !sets_flags[i - 1])
CODE ("\x84\xDB") // test bl, bl
CODE ("\x0F\x84\x00\x00\x00\x00") // jz "offsets[arg]"
break;
case END:
// We know that the cell is zero, make this an "if", not a "loop";
// this doesn't have much of an effect in practice
if (!arg)
break;

if (!i || !sets_flags[i - 1])
CODE ("\x84\xDB") // test bl, bl
CODE ("\x0F\x85\x00\x00\x00\x00") // jnz "offsets[arg]"
break;
}

// No sense in reading it out when we overwrite it immediately;
// this doesn't have much of an effect in practice
if (irb[i].cmd == LEFT || irb[i].cmd == RIGHT)
if (i + 1 >= irb_len
|| irb[i + 1].cmd != SET
|| irb[i + 1].offset != 0)
CODE ("\x8A\x18") // mov bl, [rax]
}
// When there is a loop at the end we need to be able to jump past it
offsets[irb_len] = buffer.len;

// Write an epilog which handles all the OS interfacing
//
// System V x86-64 ABI:
// rax <-> both syscall number and return value
// args -> rdi, rsi, rdx, r10, r8, r9
// trashed <- rcx, r11

enum { SYS_READ = 0, SYS_WRITE = 1, SYS_EXIT = 60 };

CODE ("\xB8") DD (SYS_EXIT) // mov eax, 0x3c
CODE ("\x48\x31\xFF") // xor rdi, rdi
CODE ("\x0F\x05") // syscall

size_t fatal_offset = buffer.len;
CODE ("\x48\x89\xF7") // mov rdi, rsi -- use the string in rsi
CODE ("\x30\xC0") // xor al, al -- look for the nil byte
CODE ("\x48\x31\xC9") // xor rcx, rcx
CODE ("\x48\xF7\xD1") // not rcx -- start from -1
CODE ("\xFC" "\xF2\xAE") // cld; repne scasb -- decrement until found
CODE ("\x48\xF7\xD1") // not rcx
CODE ("\x48\x8D\x51\xFF") // lea rdx, [rcx-1] -- save length in rdx
CODE ("\xB8") DD (SYS_WRITE) // mov eax, "SYS_WRITE"
CODE ("\xBF") DD (2) // mov edi, "STDERR_FILENO"
CODE ("\x0F\x05") // syscall

CODE ("\xB8") DD (SYS_EXIT) // mov eax, "SYS_EXIT"
CODE ("\xBF") DD (1) // mov edi, "EXIT_FAILURE"
CODE ("\x0F\x05") // syscall

size_t read_offset = buffer.len;
CODE ("\x50") // push rax -- save tape position
CODE ("\xB8") DD (SYS_READ) // mov eax, "SYS_READ"
CODE ("\x48\x89\xC7") // mov rdi, rax -- STDIN_FILENO
CODE ("\x66\x6A\x00") // push word 0 -- the default value for EOF
CODE ("\x48\x89\xE6") // mov rsi, rsp -- the char starts at rsp
CODE ("\xBA") DD (1) // mov edx, 1 -- count
CODE ("\x0F\x05") // syscall
CODE ("\x66\x5B") // pop bx

CODE ("\x48\x83\xF8\x00") // cmp rax, 0
CODE ("\x48\x8D\x35") DD (4) // lea rsi, [rel read_message]
CODE ("\x7C") // jl "fatal_offset" -- write failure message
DB ((intptr_t) fatal_offset - (intptr_t) (buffer.len + 1))
CODE ("\x58") // pop rax -- restore tape position
CODE ("\xC3") // ret
CODE ("fatal: read failed\n\0")

size_t write_offset = buffer.len;
CODE ("\x50") // push rax -- save tape position
CODE ("\xB8") DD (SYS_WRITE) // mov eax, "SYS_WRITE"
CODE ("\x48\x89\xC7") // mov rdi, rax -- STDOUT_FILENO
CODE ("\x66\x53") // push bx
CODE ("\x48\x89\xE6") // mov rsi, rsp -- the char starts at rsp
CODE ("\xBA") DD (1) // mov edx, 1 -- count
CODE ("\x0F\x05") // syscall
CODE ("\x66\x5B") // pop bx

CODE ("\x48\x83\xF8\x00") // cmp rax, 0
CODE ("\x48\x8D\x35") DD (4) // lea rsi, [rel write_message]
CODE ("\x7C") // jl "fatal_offset" -- write failure message
DB ((intptr_t) fatal_offset - (intptr_t) (buffer.len + 1))
CODE ("\x58") // pop rax -- restore tape position
CODE ("\xC3") // ret
CODE ("fatal: write failed\n\0")

// Now that we know where each instruction is, fill in relative jumps
for (size_t i = 0; i < irb_len; i++)
{
if (!irb[i].arg)
continue;

// This must accurately reflect the code generators
intptr_t target, fixup = offsets[i];
if (irb[i].cmd == BEGIN || irb[i].cmd == END)
{
fixup += (i && sets_flags[i - 1]) ? 2 : 4;
target = offsets[irb[i].arg];
}
else if (irb[i].cmd == IN) { fixup++; target = read_offset; }
else if (irb[i].cmd == OUT) { fixup++; target = write_offset; }
else continue;

uint64_t v = target - (fixup + 4);
memcpy (buffer.str + fixup, LE (v), 4);
}
free (offsets);
free (sets_flags);

// - - Output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Now that we know how long the machine code is, we can write the header.
// Note that for PIE we would need to depend on the dynamic linker, so no.
//
// Recommended reading:
// http://www.muppetlabs.com/~breadbox/software/tiny/teensy.html
// man 5 elf

struct str code = buffer;
str_init (&buffer);

enum
{
ELF_HEADER_SIZE = 64, // size of the ELF header
ELF_PROGRAM_ENTRY_SIZE = 56, // size of a program header
ELF_META_SIZE = ELF_HEADER_SIZE + 2 * ELF_PROGRAM_ENTRY_SIZE
};

// ELF header
CODE ("\x7F" "ELF\x02\x01\x01") // ELF, 64-bit, little endian, v1
CODE ("\x00\x00" "\0\0\0\0\0\0\0") // Unix System V ABI, v0, padding
DW (2) DW (62) DD (1) // executable, x86-64, v1
DQ (ELF_LOAD_CODE + ELF_META_SIZE) // entry point address
DQ (ELF_HEADER_SIZE) DQ (0) // program, section header offset
DD (0) // no processor-specific flags
DW (ELF_HEADER_SIZE) // ELF header size
DW (ELF_PROGRAM_ENTRY_SIZE) DW (2) // program hdr tbl entry size, count
DW (0) DW (0) // section hdr tbl entry size, count
DW (0) // no section index for strings

// Program header for code
// The entry point address seems to require alignment, so map start of file
DD (1) DD (5) // PT_LOAD, PF_R | PF_X
DQ (0) // offset within the file
DQ (ELF_LOAD_CODE) // address in virtual memory
DQ (ELF_LOAD_CODE) // address in physical memory
DQ (code.len + ELF_META_SIZE) // length within the file
DQ (code.len + ELF_META_SIZE) // length within memory
DQ (4096) // segment alignment

// Program header for the tape
DD (1) DD (6) // PT_LOAD, PF_R | PF_W
DQ (0) // offset within the file
DQ (ELF_LOAD_DATA) // address in virtual memory
DQ (ELF_LOAD_DATA) // address in physical memory
DQ (0) // length within the file
DQ (1 << 20) // one megabyte of memory
DQ (4096) // segment alignment

// The section header table is optional and we don't need it for anything

FILE *output_file;
if (!(output_file = fopen (output_path, "w")))
exit_fatal ("fopen: %s: %s\n", output_path, strerror (errno));

fwrite (buffer.str, buffer.len, 1, output_file);
fwrite (code.str, code.len, 1, output_file);
fclose (output_file);
return 0;
}

+ 14
- 0
interpreters/Makefile View File

@@ -0,0 +1,14 @@
CC = c99
CFLAGS = -O3

NAMES = bf bf-faster-loops bf-optimizing \
bf-jit bf-jit-opt bf-jit-unsafe bf-jit-unsafe-opt

all: $(NAMES)

%: %.c
$(CC) $(CPPFLAGS) $(CFLAGS) $< -o $@
clean:
rm -f $(NAMES)

.PHONY: all clean

+ 15
- 0
interpreters/README.adoc View File

@@ -0,0 +1,15 @@
This directory contains several Brainfuck interpreters in various states of
sophistication, from the simplest approach to an optimizing JIT compiler:

* `bf.c` is the stupidest one and the oldest by far
* `bf-faster-loops.c` precomputes loop jumps
* `bf-optimizing.c` improves on that by changing `[-]+` loops into assignments
* `bf-jit.c` adds JIT compilation for Intel x86-64
* `bf-jit-opt.c` tries a bit harder to avoid looping on the current value
* `bf-jit-unsafe.c` abolishes all boundary checks when moving across the tape
* `bf-jit-unsafe-opt.c` makes use of immediate offsets to modify values

I recommend using a tool such as 'meld' to view the differences.

Just run `make` in this directory to have them all built, and append
`CPPFLAGS=-DDEBUG` to get dumps of the IR for the more sophisticated JITs.

+ 151
- 0
interpreters/bf-faster-loops.c View File

@@ -0,0 +1,151 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
#include <errno.h>

#define exit_fatal(...) \
do { \
fprintf (stderr, "fatal: " __VA_ARGS__); \
exit (EXIT_FAILURE); \
} while (0)

// --- Safe memory management --------------------------------------------------

static void *
xmalloc (size_t n)
{
void *p = malloc (n);
if (!p)
exit_fatal ("malloc: %s\n", strerror (errno));
return p;
}

static void *
xrealloc (void *o, size_t n)
{
void *p = realloc (o, n);
if (!p && n)
exit_fatal ("realloc: %s\n", strerror (errno));
return p;
}

// --- Dynamically allocated strings -------------------------------------------

struct str
{
char *str; ///< String data, null terminated
size_t alloc; ///< How many bytes are allocated
size_t len; ///< How long the string actually is
};

static void
str_init (struct str *self)
{
self->alloc = 16;
self->len = 0;
self->str = strcpy (xmalloc (self->alloc), "");
}

static void
str_ensure_space (struct str *self, size_t n)
{
// We allocate at least one more byte for the terminating null character
size_t new_alloc = self->alloc;
while (new_alloc <= self->len + n)
new_alloc <<= 1;
if (new_alloc != self->alloc)
self->str = xrealloc (self->str, (self->alloc = new_alloc));
}

static void
str_append_data (struct str *self, const void *data, size_t n)
{
str_ensure_space (self, n);
memcpy (self->str + self->len, data, n);
self->str[self->len += n] = '\0';
}

static void
str_append_c (struct str *self, char c)
{
str_append_data (self, &c, 1);
}

// --- Main --------------------------------------------------------------------

int
main (int argc, char *argv[])
{
struct str program; str_init (&program);
struct str data; str_init (&data);

int c;
while ((c = fgetc (stdin)) != EOF)
str_append_c (&program, c);
if (ferror (stdin))
exit_fatal ("can't read program\n");

FILE *input = fopen ("/dev/tty", "rb");
if (!input)
exit_fatal ("can't open terminal for reading\n");

size_t *pairs = xmalloc (sizeof *pairs * program.len);
size_t *stack = xmalloc (sizeof *stack * program.len);

size_t nesting = 0;
for (size_t i = 0; i < program.len; i++)
{
switch (program.str[i])
{
case '[':
stack[nesting++] = i;
break;
case ']':
assert (nesting > 0);

--nesting;
pairs[stack[nesting]] = i;
pairs[i] = stack[nesting];
}
}
assert (nesting == 0);

size_t dataptr = 0;
str_append_c (&data, 0);

for (size_t i = 0; i < program.len; i++)
{
switch (program.str[i])
{
case '>':
assert (dataptr != SIZE_MAX);
if (++dataptr == data.len)
str_append_c (&data, 0);
break;
case '<':
assert (dataptr != 0);
dataptr--;
break;

case '+': data.str[dataptr]++; break;
case '-': data.str[dataptr]--; break;

case '.':
fputc (data.str[dataptr], stdout);
break;
case ',':
data.str[dataptr] = c = fgetc (input);
assert (c != EOF);
break;

case '[': if (!data.str[dataptr]) i = pairs[i]; break;
case ']': if ( data.str[dataptr]) i = pairs[i]; break;

default:
break;
}
}
return 0;
}

+ 495
- 0
interpreters/bf-jit-opt.c View File

@@ -0,0 +1,495 @@
// This is an exercise in futility more than anything else
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <assert.h>
#include <errno.h>

#if (defined __x86_64__ || defined __amd64__) && defined __unix__
#include <sys/mman.h>
#else
#error Platform not supported
#endif

#define exit_fatal(...) \
do { \
fprintf (stderr, "fatal: " __VA_ARGS__); \
exit (EXIT_FAILURE); \
} while (0)

// --- Safe memory management --------------------------------------------------

static void *
xcalloc (size_t m, size_t n)
{
void *p = calloc (m, n);
if (!p)
exit_fatal ("calloc: %s\n", strerror (errno));
return p;
}

static void *
xrealloc (void *o, size_t n)
{
void *p = realloc (o, n);
if (!p && n)
exit_fatal ("realloc: %s\n", strerror (errno));
return p;
}

// --- Dynamically allocated strings -------------------------------------------

struct str
{
char *str; ///< String data, null terminated
size_t alloc; ///< How many bytes are allocated
size_t len; ///< How long the string actually is
};

static void
str_init (struct str *self)
{
self->len = 0;
self->str = xcalloc (1, (self->alloc = 16));
}

static void
str_ensure_space (struct str *self, size_t n)
{
// We allocate at least one more byte for the terminating null character
size_t new_alloc = self->alloc;
while (new_alloc <= self->len + n)
new_alloc <<= 1;
if (new_alloc != self->alloc)
self->str = xrealloc (self->str, (self->alloc = new_alloc));
}

static void
str_append_data (struct str *self, const void *data, size_t n)
{
str_ensure_space (self, n);
memcpy (self->str + self->len, data, n);
self->str[self->len += n] = '\0';
}

static void
str_append_c (struct str *self, char c)
{
str_append_data (self, &c, 1);
}

// --- Application -------------------------------------------------------------

struct str data; ///< Data tape
volatile size_t dataptr; ///< Current location on the tape
FILE *input; ///< User input

enum command { RIGHT, LEFT, INC, DEC, SET, IN, OUT, BEGIN, END,
EAT, INCACC, DECACC };
bool grouped[] = { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
struct instruction { enum command cmd; size_t arg; };
#define INSTRUCTION(c, a) (struct instruction) { (c), (a) }

// - - Callbacks - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Some things I just really don't want to write in assembly even though it
// is effectively a big performance hit, eliminating the advantage of JIT

static void
right (size_t arg)
{
assert (SIZE_MAX - dataptr > arg);
dataptr += arg;

while (dataptr >= data.len)
str_append_c (&data, 0);
}

static void
left (size_t arg)
{
assert (dataptr >= arg);
dataptr -= arg;
}

static void
cin (void)
{
int c;
data.str[dataptr] = c = fgetc (input);
assert (c != EOF);
}

// - - Main - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#ifdef DEBUG
static void
debug_dump (const char *filename, struct instruction *in, size_t len)
{
FILE *fp = fopen (filename, "w");
long indent = 0;
for (size_t i = 0; i < len; i++)
{
if (in[i].cmd == END)
indent--;
for (long k = 0; k < indent; k++)
fprintf (fp, " ");

switch (in[i].cmd)
{
case RIGHT: fprintf (fp, "RIGHT %zu\n", in[i].arg); break;
case LEFT: fprintf (fp, "LEFT %zu\n", in[i].arg); break;
case INC: fprintf (fp, "INC %zu\n", in[i].arg); break;
case DEC: fprintf (fp, "DEC %zu\n", in[i].arg); break;
case OUT: fprintf (fp, "OUT %zu\n", in[i].arg); break;
case IN: fprintf (fp, "IN %zu\n", in[i].arg); break;
case BEGIN: fprintf (fp, "BEGIN %zu\n", in[i].arg); break;
case END: fprintf (fp, "END %zu\n", in[i].arg); break;
case SET: fprintf (fp, "SET %zu\n", in[i].arg); break;
case EAT: fprintf (fp, "EAT %zu\n", in[i].arg); break;
case INCACC: fprintf (fp, "INCACC %zu\n", in[i].arg); break;
case DECACC: fprintf (fp, "DECACC %zu\n", in[i].arg); break;
}
if (in[i].cmd == BEGIN)
indent++;
}
fclose (fp);
}
#else
#define debug_dump(...)
#endif

int
main (int argc, char *argv[])
{
(void) argc;
(void) argv;

struct str program;
str_init (&program);

int c;
while ((c = fgetc (stdin)) != EOF)
str_append_c (&program, c);
if (ferror (stdin))
exit_fatal ("can't read program\n");
if (!(input = fopen ("/dev/tty", "rb")))
exit_fatal ("can't open terminal for reading\n");

// - - Decode and group - - - - - - - - - - - - - - - - - - - - - - - - - - - -

struct instruction *parsed = xcalloc (sizeof *parsed, program.len);
size_t parsed_len = 0;

for (size_t i = 0; i < program.len; i++)
{
enum command cmd;
switch (program.str[i])
{
case '>': cmd = RIGHT; break;
case '<': cmd = LEFT; break;
case '+': cmd = INC; break;
case '-': cmd = DEC; break;
case '.': cmd = OUT; break;
case ',': cmd = IN; break;
case '[': cmd = BEGIN; break;
case ']': cmd = END; break;
default: continue;
}

// The most basic optimization is to group identical commands together
if (!parsed_len || !grouped[cmd] || parsed[parsed_len - 1].cmd != cmd)
parsed_len++;

parsed[parsed_len - 1].cmd = cmd;
parsed[parsed_len - 1].arg++;
}

// - - Optimization passes - - - - - - - - - - - - - - - - - - - - - - - - - - -

debug_dump ("bf-no-opt.txt", parsed, parsed_len);

size_t in = 0, out = 0;
for (; in < parsed_len; in++, out++)
{
// This shows up in mandelbrot.bf a lot but actually helps hanoi.bf
if (in + 5 < parsed_len
&& parsed[in].cmd == BEGIN && parsed[in + 5].cmd == END
&& parsed[in + 1].cmd == DEC && parsed[in + 1].arg == 1

&& parsed[in + 2].cmd == LEFT && parsed[in + 4].cmd == RIGHT
&& parsed[in + 2].arg == parsed[in + 4].arg

&& (parsed[in + 3].cmd == INC || parsed[in + 3].cmd == DEC)
&& parsed[in + 3].arg == 1)
{
// This mustn't make the move when the cell is zero already
parsed[out] = parsed[in];
parsed[out + 1] = INSTRUCTION (EAT, 0);
parsed[out + 2] = parsed[in + 2];
parsed[out + 3] = INSTRUCTION
(parsed[in + 3].cmd == INC ? INCACC : DECACC, 0);
parsed[out + 4] = parsed[in + 4];
// This disables the looping further in the code;
// this doesn't have much of an effect in practice
parsed[out + 5] = INSTRUCTION (END, 0);
in += 5;
out += 5;
}
// The simpler case that cannot crash and thus can avoid the loop
else if (in + 5 < parsed_len
&& parsed[in].cmd == BEGIN && parsed[in + 5].cmd == END
&& parsed[in + 1].cmd == DEC && parsed[in + 1].arg == 1

&& parsed[in + 2].cmd == RIGHT && parsed[in + 4].cmd == LEFT
&& parsed[in + 2].arg == parsed[in + 4].arg

&& (parsed[in + 3].cmd == INC || parsed[in + 3].cmd == DEC)
&& parsed[in + 3].arg == 1)
{
parsed[out] = INSTRUCTION (EAT, 0);
parsed[out + 1] = parsed[in + 2];
parsed[out + 2] = INSTRUCTION
(parsed[in + 3].cmd == INC ? INCACC : DECACC, 0);
parsed[out + 3] = parsed[in + 4];
in += 5;
out += 3;
}
else if (in + 2 < parsed_len
&& parsed[in ].cmd == BEGIN
&& parsed[in + 1].cmd == DEC && parsed[in + 1].arg == 1
&& parsed[in + 2].cmd == END)
{
parsed[out] = INSTRUCTION (SET, 0);
in += 2;
}
else if (out && parsed[out - 1].cmd == SET && parsed[in].cmd == INC)
parsed[--out].arg += parsed[in].arg;
else if (out != in)
parsed[out] = parsed[in];
}
parsed_len = out;

for (in = 0, out = 0; in < parsed_len; in++, out++)
{
ssize_t dir = 0;
if (parsed[in].cmd == RIGHT)
dir = parsed[in].arg;
else if (parsed[in].cmd == LEFT)
dir = -(ssize_t) parsed[in].arg;
else
{
parsed[out] = parsed[in];
continue;
}

for (; in + 1 < parsed_len; in++)
{
if (parsed[in + 1].cmd == RIGHT)
dir += parsed[in + 1].arg;
else if (parsed[in + 1].cmd == LEFT)
dir -= (ssize_t) parsed[in + 1].arg;
else
break;
}

if (!dir)
out--;
else if (dir > 0)
parsed[out] = INSTRUCTION (RIGHT, dir);
else
parsed[out] = INSTRUCTION (LEFT, -dir);
}
parsed_len = out;

debug_dump ("bf-optimized.txt", parsed, parsed_len);

// - - Loop pairing - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

size_t nesting = 0;
size_t *stack = xcalloc (sizeof *stack, parsed_len);
for (size_t i = 0; i < parsed_len; i++)
{
switch (parsed[i].cmd)
{
case BEGIN:
stack[nesting++] = i;
break;
case END:
assert (nesting > 0);

--nesting;
parsed[stack[nesting]].arg = i + 1;

// Looping can be disabled by optimizations
if (parsed[i].arg)
parsed[i].arg = stack[nesting] + 1;
default:
break;
}
}
free (stack);
assert (nesting == 0);

debug_dump ("bf-final.txt", parsed, parsed_len);

// - - JIT - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Functions preserve the registers rbx, rsp, rbp, r12, r13, r14, and r15;
// while rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 are scratch registers.

str_init (&program);
size_t *offsets = xcalloc (sizeof *offsets, parsed_len + 1);
uint8_t *arith = xcalloc (sizeof *arith, parsed_len);

#define CODE(x) { char t[] = x; str_append_data (&program, t, sizeof t - 1); }
#define WORD(x) { size_t t = (size_t)(x); str_append_data (&program, &t, 8); }
#define DWRD(x) { size_t t = (size_t)(x); str_append_data (&program, &t, 4); }

CODE ("\x49\xBD") WORD (&dataptr) // mov r13, qword "&dataptr"
CODE ("\x49\xBF") WORD (&data.str) // mov r15, qword "&data.str"
CODE ("\x4D\x8B\x37") // mov r14, qword [r15]
CODE ("\x30\xDB") // xor bl, bl

for (size_t i = 0; i < parsed_len; i++)
{
offsets[i] = program.len;

size_t arg = parsed[i].arg;
assert (arg <= UINT32_MAX);
switch (parsed[i].cmd)
{
case RIGHT:
CODE ("\x41\x88\x1E") // mov [r14], bl
CODE ("\xBF") DWRD (arg) // mov edi, "arg"
CODE ("\x48\xB8") WORD (right) // mov rax, "right"
CODE ("\xFF\xD0") // call rax

// The data could get reallocated, so reload the address
CODE ("\x4D\x8B\x37") // mov r14, qword [r15]
CODE ("\x4D\x03\x75\x00") // add r14, [r13]
break;
case LEFT:
CODE ("\x41\x88\x1E") // mov [r14], bl
CODE ("\xBF") DWRD (arg) // mov edi, "arg"
CODE ("\x49\x29\xFE") // sub r14, rdi -- optimistic
CODE ("\x48\xB8") WORD (left) // mov rax, "left"
CODE ("\xFF\xD0") // call rax
break;

case EAT:
CODE ("\x41\x88\xDC") // mov r12b, bl
CODE ("\x30\xDB") // xor bl, bl
arith[i] = 1;
break;
case INCACC:
CODE ("\x44\x00\xE3") // add bl, r12b
arith[i] = 1;
break;
case DECACC:
CODE ("\x44\x28\xE3") // sub bl, r12b
arith[i] = 1;
break;

case INC:
CODE ("\x80\xC3") // add bl, "arg"
str_append_c (&program, arg);
arith[i] = 1;
break;
case DEC:
CODE ("\x80\xEB") // sub bl, "arg"
str_append_c (&program, arg);
arith[i] = 1;
break;
case SET:
CODE ("\xB3") // mov bl, "arg"
str_append_c (&program, arg);
break;

case OUT:
CODE ("\x48\x0F\xB6\xFB") // movzx rdi, bl
CODE ("\x48\xBE") WORD (stdout) // mov rsi, "stdout"
CODE ("\x48\xB8") WORD (fputc) // mov rax, "fputc"
CODE ("\xFF\xD0") // call rax
break;
case IN:
CODE ("\x48\xB8") WORD (cin) // mov rax, "cin"
CODE ("\xFF\xD0") // call rax
CODE ("\x41\x8A\x1E") // mov bl, [r14]
break;

case BEGIN:
// Don't test the register when the flag has been set already;
// this doesn't have much of an effect in practice
if (!i || !arith[i - 1])
CODE ("\x84\xDB") // test bl, bl
CODE ("\x0F\x84\x00\x00\x00\x00") // jz "offsets[i]"
break;
case END:
// We know that the cell is zero, make this an "if", not a "loop";
// this doesn't have much of an effect in practice
if (!arg)
break;

if (!i || !arith[i - 1])
CODE ("\x84\xDB") // test bl, bl
CODE ("\x0F\x85\x00\x00\x00\x00") // jnz "offsets[i]"
break;
}

// No sense in reading it out when we overwrite it immediately;
// this doesn't have much of an effect in practice
if (parsed[i].cmd == LEFT || parsed[i].cmd == RIGHT)
if (i + 1 >= parsed_len
|| parsed[i + 1].cmd != SET)
CODE ("\x41\x8A\x1E") // mov bl, [r14]
}
// When there is a loop at the end we need to be able to jump past it
offsets[parsed_len] = program.len;
str_append_c (&program, '\xC3'); // ret

// Now that we know where each instruction is, fill in relative jumps;
// this must accurately reflect code generators for BEGIN and END
for (size_t i = 0; i < parsed_len; i++)
{
if ((parsed[i].cmd != BEGIN && parsed[i].cmd != END)
|| !parsed[i].arg)
continue;

size_t fixup = offsets[i] + 2;
if (!i || !arith[i - 1])
fixup += 2;

*(int32_t *)(program.str + fixup) =
((intptr_t)(offsets[parsed[i].arg]) - (intptr_t)(fixup + 4));
}
free (offsets);
free (arith);

#ifdef DEBUG
FILE *bin = fopen ("bf-jit.bin", "w");
fwrite (program.str, program.len, 1, bin);
fclose (bin);
#endif

// - - Runtime - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Some systems may have W^X
void *executable = mmap (NULL, program.len, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (!executable)
exit_fatal ("mmap: %s\n", strerror (errno));

memcpy (executable, program.str, program.len);
if (mprotect (executable, program.len, PROT_READ | PROT_EXEC))
exit_fatal ("mprotect: %s\n", strerror (errno));

str_init (&data);
str_append_c (&data, 0);
((void (*) (void)) executable)();
return 0;
}

+ 617
- 0
interpreters/bf-jit-unsafe-opt.c View File

@@ -0,0 +1,617 @@
// This is an exercise in futility more than anything else
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <assert.h>
#include <errno.h>

#if (defined __x86_64__ || defined __amd64__) && defined __unix__
#include <unistd.h>
#include <sys/mman.h>
#else
#error Platform not supported
#endif

#define exit_fatal(...) \
do { \
fprintf (stderr, "fatal: " __VA_ARGS__); \
exit (EXIT_FAILURE); \
} while (0)

// --- Safe memory management --------------------------------------------------

static void *
xcalloc (size_t m, size_t n)
{
void *p = calloc (m, n);
if (!p)
exit_fatal ("calloc: %s\n", strerror (errno));
return p;
}

static void *
xrealloc (void *o, size_t n)
{
void *p = realloc (o, n);
if (!p && n)
exit_fatal ("realloc: %s\n", strerror (errno));
return p;
}

// --- Dynamically allocated strings -------------------------------------------

struct str
{
char *str; ///< String data, null terminated
size_t alloc; ///< How many bytes are allocated
size_t len; ///< How long the string actually is
};

static void
str_init (struct str *self)
{
self->len = 0;
self->str = xcalloc (1, (self->alloc = 16));
}

static void
str_ensure_space (struct str *self, size_t n)
{
// We allocate at least one more byte for the terminating null character
size_t new_alloc = self->alloc;
while (new_alloc <= self->len + n)
new_alloc <<= 1;
if (new_alloc != self->alloc)
self->str = xrealloc (self->str, (self->alloc = new_alloc));
}

static void
str_append_data (struct str *self, const void *data, size_t n)
{
str_ensure_space (self, n);
memcpy (self->str + self->len, data, n);
self->str[self->len += n] = '\0';
}

static void
str_append_c (struct str *self, char c)
{
str_append_data (self, &c, 1);
}

// --- Application -------------------------------------------------------------

enum command { RIGHT, LEFT, INC, DEC, SET, IN, OUT, BEGIN, END,
EAT, INCACC, DECACC };
bool grouped[] = { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
struct instruction { enum command cmd; int offset; size_t arg; };
#define INSTRUCTION(c, o, a) (struct instruction) { (c), (o), (a) }

// - - Callbacks - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

FILE *input; ///< User input

static int
cin (void)
{
int c = fgetc (input);
assert (c != EOF);
return c;
}

// - - Main - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#ifdef DEBUG
static void
debug_dump (const char *filename, struct instruction *in, size_t len)
{
FILE *fp = fopen (filename, "w");
long indent = 0;
for (size_t i = 0; i < len; i++)
{
if (in[i].cmd == END)
indent--;
for (long k = 0; k < indent; k++)
fprintf (fp, " ");

switch (in[i].cmd)
{
case RIGHT: fputs ("RIGHT ", fp); break;
case LEFT: fputs ("LEFT ", fp); break;
case INC: fputs ("INC ", fp); break;
case DEC: fputs ("DEC ", fp); break;
case OUT: fputs ("OUT ", fp); break;
case IN: fputs ("IN ", fp); break;
case BEGIN: fputs ("BEGIN ", fp); break;
case END: fputs ("END ", fp); break;
case SET: fputs ("SET ", fp); break;
case EAT: fputs ("EAT ", fp); break;
case INCACC: fputs ("INCACC", fp); break;
case DECACC: fputs ("DECACC", fp); break;
}
fprintf (fp, " %zu [%d]\n", in[i].arg, in[i].offset);
if (in[i].cmd == BEGIN)
indent++;
}
fclose (fp);
}
#else
#define debug_dump(...)
#endif

int
main (int argc, char *argv[])
{
(void) argc;
(void) argv;

struct str program;
str_init (&program);

int c;
while ((c = fgetc (stdin)) != EOF)
str_append_c (&program, c);
if (ferror (stdin))
exit_fatal ("can't read program\n");
if (!(input = fopen ("/dev/tty", "rb")))
exit_fatal ("can't open terminal for reading\n");

// - - Decode and group - - - - - - - - - - - - - - - - - - - - - - - - - - - -

struct instruction *parsed = xcalloc (sizeof *parsed, program.len);
size_t parsed_len = 0;

for (size_t i = 0; i < program.len; i++)
{
enum command cmd;
switch (program.str[i])
{
case '>': cmd = RIGHT; break;
case '<': cmd = LEFT; break;
case '+': cmd = INC; break;
case '-': cmd = DEC; break;
case '.': cmd = OUT; break;
case ',': cmd = IN; break;
case '[': cmd = BEGIN; break;
case ']': cmd = END; break;
default: continue;
}

// The most basic optimization is to group identical commands together
if (!parsed_len || !grouped[cmd] || parsed[parsed_len - 1].cmd != cmd)
parsed_len++;

parsed[parsed_len - 1].cmd = cmd;
parsed[parsed_len - 1].arg++;
}

// - - Optimization passes - - - - - - - - - - - - - - - - - - - - - - - - - - -

debug_dump ("bf-no-opt.txt", parsed, parsed_len);

size_t in = 0, out = 0;
for (; in < parsed_len; in++, out++)
{
if (in + 2 < parsed_len
&& parsed[in ].cmd == BEGIN
&& parsed[in + 1].cmd == DEC && parsed[in + 1].arg == 1
&& parsed[in + 2].cmd == END)
{
parsed[out] = INSTRUCTION (SET, 0, 0);
in += 2;
}
else if (out && parsed[out - 1].cmd == SET && parsed[in].cmd == INC)
parsed[--out].arg += parsed[in].arg;
else if (out != in)
parsed[out] = parsed[in];
}
parsed_len = out;

debug_dump ("bf-pre-offsets.txt", parsed, parsed_len);

// Add offsets to INC/DEC/SET stuck between LEFT/RIGHT
// and compress the LEFT/RIGHT sequences
for (in = 0, out = 0; in < parsed_len; in++, out++)
{
ssize_t dir = 0;
if (parsed[in].cmd == RIGHT)
dir = parsed[in].arg;
else if (parsed[in].cmd == LEFT)
dir = -(ssize_t) parsed[in].arg;
else
{
parsed[out] = parsed[in];
continue;
}

while (in + 2 < parsed_len)
{
// An immediate offset has its limits
if (dir < INT8_MIN || dir > INT8_MAX)
break;

ssize_t diff;
if (parsed[in + 2].cmd == RIGHT)
diff = parsed[in + 2].arg;
else if (parsed[in + 2].cmd == LEFT)
diff = -(ssize_t) parsed[in + 2].arg;
else
break;

int cmd = parsed[in + 1].cmd;
if (cmd != INC && cmd != DEC && cmd != SET)
break;

parsed[out] = parsed[in + 1];
parsed[out].offset = dir;

dir += diff;
out += 1;
in += 2;
}

for (; in + 1 < parsed_len; in++)
{
if (parsed[in + 1].cmd == RIGHT)
dir += parsed[in + 1].arg;
else if (parsed[in + 1].cmd == LEFT)
dir -= (ssize_t) parsed[in + 1].arg;
else
break;
}

if (!dir)
out--;
else if (dir > 0)
parsed[out] = INSTRUCTION (RIGHT, 0, dir);
else
parsed[out] = INSTRUCTION (LEFT, 0, -dir);
}
parsed_len = out;

debug_dump ("bf-pre-incdec-unloop.txt", parsed, parsed_len);

// Try to eliminate loops that eat a cell and add/subtract its value
// to/from some other cell
for (in = 0, out = 0; in < parsed_len; in++, out++)
{
parsed[out] = parsed[in];
if (parsed[in].cmd != BEGIN)
continue;

bool ok = false;
size_t count = 0;
for (size_t k = in + 1; k < parsed_len; k++)
{
if (parsed[k].cmd == END)
{
ok = true;
break;
}
if (parsed[k].cmd != INC
&& parsed[k].cmd != DEC)
break;
count++;
}
if (!ok)
continue;

// Stable sort operations by their offsets, put [0] first
bool sorted;
do
{
sorted = true;
for (size_t k = 1; k < count; k++)
{
if (parsed[in + k].offset == 0)
continue;
if (parsed[in + k + 1].offset != 0
&& parsed[in + k].offset <= parsed[in + k + 1].offset)
continue;

struct instruction tmp = parsed[in + k + 1];
parsed[in + k + 1] = parsed[in + k];
parsed[in + k] = tmp;
sorted = false;
}
}
while (!sorted);

// Abort the optimization on duplicate offsets (complication with [0])
for (size_t k = 1; k < count; k++)
if (parsed[in + k].offset == parsed[in + k + 1].offset)
ok = false;
// XXX: can't make the code longer either
for (size_t k = 1; k <= count; k++)
if (parsed[in + k].arg != 1)
ok = false;
if (!ok
|| parsed[in + 1].cmd != DEC
|| parsed[in + 1].offset != 0)
continue;

int min_safe_left_offset = 0;
if (in > 1 && parsed[in - 1].cmd == RIGHT)
min_safe_left_offset = -parsed[in - 1].arg;

bool cond_needed_for_safety = false;
for (size_t k = 0; k < count; k++)
if (parsed[in + k + 1].offset < min_safe_left_offset)
{
cond_needed_for_safety = true;
break;
}

in++;
if (cond_needed_for_safety)
out++;

parsed[out] = INSTRUCTION (EAT, 0, 0);
for (size_t k = 1; k < count; k++)
parsed[out + k] = INSTRUCTION (parsed[in + k].cmd == INC
? INCACC : DECACC, parsed[in + k].offset, 0);

in += count;
out += count;

if (cond_needed_for_safety)
parsed[out] = INSTRUCTION (END, 0, 0);
else
out--;
}
parsed_len = out;

debug_dump ("bf-optimized.txt", parsed, parsed_len);

// - - Loop pairing - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

size_t nesting = 0;
size_t *stack = xcalloc (sizeof *stack, parsed_len);
for (size_t i = 0; i < parsed_len; i++)
{
switch (parsed[i].cmd)
{
case BEGIN:
stack[nesting++] = i;
break;
case END:
assert (nesting > 0);

--nesting;
parsed[stack[nesting]].arg = i + 1;

// Looping can be disabled by optimizations
if (parsed[i].arg)
parsed[i].arg = stack[nesting] + 1;
default:
break;
}
}
free (stack);
assert (nesting == 0);

debug_dump ("bf-final.txt", parsed, parsed_len);

// - - JIT - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Functions preserve the registers rbx, rsp, rbp, r12, r13, r14, and r15;
// while rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 are scratch registers.

str_init (&program);
size_t *offsets = xcalloc (sizeof *offsets, parsed_len + 1);
uint8_t *arith = xcalloc (sizeof *arith, parsed_len);

#define CODE(x) { char t[] = x; str_append_data (&program, t, sizeof t - 1); }
#define WORD(x) { size_t t = (size_t)(x); str_append_data (&program, &t, 8); }
#define DWRD(x) { size_t t = (size_t)(x); str_append_data (&program, &t, 4); }

CODE ("\x48\x89\xF8") // mov rax, rdi
CODE ("\x30\xDB") // xor bl, bl

for (size_t i = 0; i < parsed_len; i++)
{
offsets[i] = program.len;

size_t arg = parsed[i].arg;
assert (arg <= UINT32_MAX);

int offset = parsed[i].offset;
assert (offset <= INT8_MAX && offset >= INT8_MIN);

// Don't save what we've just loaded
if (parsed[i].cmd == LEFT || parsed[i].cmd == RIGHT)
if (i < 2 || i + 1 >= parsed_len
|| (parsed[i - 2].cmd != LEFT && parsed[i - 2].cmd != RIGHT)
|| parsed[i - 1].cmd != BEGIN
|| parsed[i + 1].cmd != END)
CODE ("\x88\x18") // mov [rax], bl

switch (parsed[i].cmd)
{
case RIGHT:
// add rax, "arg" -- optimistic, no boundary checking
if (arg > INT8_MAX)
{ CODE ("\x48\x05") DWRD (arg) }
else
{ CODE ("\x48\x83\xC0") str_append_c (&program, arg); }
break;
case LEFT:
// sub rax, "arg" -- optimistic, no boundary checking
if (arg > INT8_MAX)
{ CODE ("\x48\x2D") DWRD (arg) }
else
{ CODE ("\x48\x83\xE8") str_append_c (&program, arg); }
break;

case EAT:
CODE ("\x41\x88\xDC") // mov r12b, bl
CODE ("\x30\xDB") // xor bl, bl
arith[i] = 1;
break;
case INCACC:
if (offset)
{
CODE ("\x44\x00\x60") // add [rax+"offset"], r12b
str_append_c (&program, offset);
}
else
{
CODE ("\x44\x00\xE3") // add bl, r12b
arith[i] = 1;
}
break;
case DECACC:
if (offset)
{
CODE ("\x44\x28\x60") // sub [rax+"offset"], r12b
str_append_c (&program, offset);
}
else
{
CODE ("\x44\x28\xE3") // sub bl, r12b
arith[i] = 1;
}
break;

case INC:
if (offset)
{
CODE ("\x80\x40") // add byte [rax+"offset"], "arg"
str_append_c (&program, offset);
}
else
{
arith[i] = 1;
CODE ("\x80\xC3") // add bl, "arg"
}
str_append_c (&program, arg);
break;
case DEC:
if (offset)
{
CODE ("\x80\x68") // sub byte [rax+"offset"], "arg"
str_append_c (&program, offset);
}
else
{
arith[i] = 1;
CODE ("\x80\xEB") // sub bl, "arg"
}
str_append_c (&program, arg);
break;
case SET:
if (offset)
{
CODE ("\xC6\x40") // mov byte [rax+"offset"], "arg"
str_append_c (&program, offset);
}
else
CODE ("\xB3") // mov bl, "arg"
str_append_c (&program, arg);
break;

case OUT:
CODE ("\x50\x53") // push rax, push rbx
CODE ("\x48\x0F\xB6\xFB") // movzx rdi, bl
CODE ("\x48\xBE") WORD (stdout) // mov rsi, "stdout"
CODE ("\x48\xB8") WORD (fputc) // mov rax, "fputc"
CODE ("\xFF\xD0") // call rax
CODE ("\x5B\x58") // pop rbx, pop rax
break;
case IN:
CODE ("\x50") // push rax
CODE ("\x48\xB8") WORD (cin) // mov rax, "cin"
CODE ("\xFF\xD0") // call rax
CODE ("\x88\xC3") // mov bl, al
CODE ("\x58") // pop rax
break;

case BEGIN:
// Don't test the register when the flag has been set already;
// this doesn't have much of an effect in practice
if (!i || !arith[i - 1])
CODE ("\x84\xDB") // test bl, bl
CODE ("\x0F\x84\x00\x00\x00\x00") // jz "offsets[i]"
break;
case END:
// We know that the cell is zero, make this an "if", not a "loop";
// this doesn't have much of an effect in practice
if (!arg)
break;

if (!i || !arith[i - 1])
CODE ("\x84\xDB") // test bl, bl
CODE ("\x0F\x85\x00\x00\x00\x00") // jnz "offsets[i]"
break;
}

// No sense in reading it out when we overwrite it immediately;
// this doesn't have much of an effect in practice
if (parsed[i].cmd == LEFT || parsed[i].cmd == RIGHT)
if (i + 1 >= parsed_len
|| parsed[i + 1].cmd != SET
|| parsed[i + 1].offset != 0)
CODE ("\x8A\x18") // mov bl, [rax]
}
// When there is a loop at the end we need to be able to jump past it
offsets[parsed_len] = program.len;
str_append_c (&program, '\xC3'); // ret

// Now that we know where each instruction is, fill in relative jumps;
// this must accurately reflect code generators for BEGIN and END
for (size_t i = 0; i < parsed_len; i++)
{
if ((parsed[i].cmd != BEGIN && parsed[i].cmd != END)
|| !parsed[i].arg)
continue;

size_t fixup = offsets[i] + 2;
if (!i || !arith[i - 1])
fixup += 2;

*(int32_t *)(program.str + fixup) =
((intptr_t)(offsets[parsed[i].arg]) - (intptr_t)(fixup + 4));
}
free (offsets);
free (arith);

#ifdef DEBUG
FILE *bin = fopen ("bf-jit.bin", "w");
fwrite (program.str, program.len, 1, bin);
fclose (bin);
#endif

// - - Runtime - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Some systems may have W^X
void *executable = mmap (NULL, program.len, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (!executable)
exit_fatal ("mmap: %s\n", strerror (errno));

memcpy (executable, program.str, program.len);
if (mprotect (executable, program.len, PROT_READ | PROT_EXEC))
exit_fatal ("mprotect: %s\n", strerror (errno));

// We create crash zones on both ends of the tape for some minimum safety
long pagesz = sysconf (_SC_PAGESIZE);
assert (pagesz > 0);

const size_t tape_len = (1 << 20) + 2 * pagesz;
char *tape = mmap (NULL, tape_len, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (!tape)
exit_fatal ("mmap: %s\n", strerror (errno));

memset (tape, 0, tape_len);
if (mprotect (tape, pagesz, PROT_NONE)
|| mprotect (tape + tape_len - pagesz, pagesz, PROT_NONE))
exit_fatal ("mprotect: %s\n", strerror (errno));

((void (*) (char *)) executable)(tape + pagesz);
return 0;
}

+ 495
- 0
interpreters/bf-jit-unsafe.c View File

@@ -0,0 +1,495 @@
// This is an exercise in futility more than anything else
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <assert.h>
#include <errno.h>

#if (defined __x86_64__ || defined __amd64__) && defined __unix__
#include <unistd.h>
#include <sys/mman.h>
#else
#error Platform not supported
#endif

#define exit_fatal(...) \
do { \
fprintf (stderr, "fatal: " __VA_ARGS__); \
exit (EXIT_FAILURE); \
} while (0)

// --- Safe memory management --------------------------------------------------

static void *
xcalloc (size_t m, size_t n)
{
void *p = calloc (m, n);
if (!p)
exit_fatal ("calloc: %s\n", strerror (errno));
return p;
}

static void *
xrealloc (void *o, size_t n)
{
void *p = realloc (o, n);
if (!p && n)
exit_fatal ("realloc: %s\n", strerror (errno));
return p;
}

// --- Dynamically allocated strings -------------------------------------------

struct str
{
char *str; ///< String data, null terminated
size_t alloc; ///< How many bytes are allocated
size_t len; ///< How long the string actually is
};

static void
str_init (struct str *self)
{
self->len = 0;
self->str = xcalloc (1, (self->alloc = 16));
}

static void
str_ensure_space (struct str *self, size_t n)
{
// We allocate at least one more byte for the terminating null character
size_t new_alloc = self->alloc;
while (new_alloc <= self->len + n)
new_alloc <<= 1;
if (new_alloc != self->alloc)
self->str = xrealloc (self->str, (self->alloc = new_alloc));
}

static void
str_append_data (struct str *self, const void *data, size_t n)
{
str_ensure_space (self, n);
memcpy (self->str + self->len, data, n);
self->str[self->len += n] = '\0';
}

static void
str_append_c (struct str *self, char c)
{
str_append_data (self, &c, 1);
}

// --- Application -------------------------------------------------------------

enum command { RIGHT, LEFT, INC, DEC, SET, IN, OUT, BEGIN, END,
EAT, INCACC, DECACC };
bool grouped[] = { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
struct instruction { enum command cmd; size_t arg; };