
742 lines
21 KiB

// This is an exercise in futility more than anything else
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <assert.h>
#include <errno.h>
#ifdef __unix__
#include <fcntl.h>
#endif

// Print a printf-style message prefixed with "fatal: " to stderr and
// terminate the process with EXIT_FAILURE.
// The first argument has to be a string literal, since it is pasted onto the
// prefix at compile time.  The do/while wrapper makes the macro behave as
// a single statement, e.g. within an unbraced if/else.
#define exit_fatal(...) \
	do { \
		fprintf (stderr, "fatal: " __VA_ARGS__); \
		exit (EXIT_FAILURE); \
	} while (0)
// --- Safe memory management --------------------------------------------------
// Allocate a zero-initialized array of `m` elements, `n` bytes each.
// Never returns on allocation failure: the program is terminated through
// exit_fatal().  A zero-sized request may legitimately produce NULL, which is
// passed on to the caller rather than treated as an error (the result may
// still be handed to free() or xrealloc()).
static void *
xcalloc (size_t m, size_t n)
{
	void *p = calloc (m, n);
	if (!p && m && n)
		exit_fatal ("calloc: %s\n", strerror (errno));
	return p;
}
// Resize the allocation `o` to `n` bytes and return the new pointer.
// Never returns on allocation failure: the program is terminated through
// exit_fatal().  A NULL result for a zero-sized request is not an error.
static void *
xrealloc (void *o, size_t n)
{
	void *p = realloc (o, n);
	if (!p && n)
		exit_fatal ("realloc: %s\n", strerror (errno));
	return p;
}
// --- Dynamically allocated strings -------------------------------------------
// A growable byte buffer that is always kept null-terminated
struct str
{
	char *str;                          ///< String data, null terminated
	size_t alloc;                       ///< How many bytes are allocated
	size_t len;                         ///< How long the string actually is
};
static void
str_init (struct str *self)
self->len = 0;
self->str = xcalloc (1, (self->alloc = 16));
static void
str_ensure_space (struct str *self, size_t n)
// We allocate at least one more byte for the terminating null character
size_t new_alloc = self->alloc;
while (new_alloc <= self->len + n)
new_alloc <<= 1;
if (new_alloc != self->alloc)
self->str = xrealloc (self->str, (self->alloc = new_alloc));
static void
str_append_data (struct str *self, const void *data, size_t n)
str_ensure_space (self, n);
memcpy (self->str + self->len, data, n);
self->str[self->len += n] = '\0';
// Append the single byte `c` to `self`
static void
str_append_c (struct str *self, char c)
{
	str_append_data (self, &c, 1);
}
// --- Application -------------------------------------------------------------
// Opcodes of the intermediate representation.
// NOTE(review): the enumerator list of this enum is not present in this
// listing -- presumably lost in extraction; restore it from the upstream file.
// The rest of the file refers to RIGHT, LEFT, INC, DEC, OUT, IN, BEGIN, END,
// SET, EAT, INCACC and DECACC; their declaration order cannot be told from
// here, but it has to agree with the `grouped` table below.
enum command
// Indexed by command; consulted by the decoder in main to decide whether
// a repeated source command may be folded into the preceding instruction
bool grouped[] = { 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
// One IR instruction: `offset` is emitted by the code generator as a signed
// 8-bit displacement from the tape pointer, `arg` is an operand whose meaning
// depends on the command (repeat count, immediate value, or, after
// pair_loops, an index into the instruction array)
struct instruction { enum command cmd; int offset; size_t arg; };
// Build an instruction value from a command, an offset and an argument
#define INSTRUCTION(c, o, a) (struct instruction) { (c), (o), (a) }
// - - Debugging - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#ifdef DEBUG
static void
debug_dump_instruction (FILE *fp, const struct instruction *in)
const char *name;
switch (in->cmd)
case RIGHT: name = "RIGHT "; break;
case LEFT: name = "LEFT "; break;
case INC: name = "INC "; break;
case DEC: name = "DEC "; break;
case OUT: name = "OUT "; break;
case IN: name = "IN "; break;
case BEGIN: name = "BEGIN "; break;
case END: name = "END "; break;
case SET: name = "SET "; break;
case EAT: name = "EAT "; break;
case INCACC: name = "INCACC"; break;
case DECACC: name = "DECACC"; break;
fprintf (fp, "%s %zu", name, in->arg);
if (in->offset != 0)
fprintf (fp, " [%d]", in->offset);
fprintf (fp, "\n");
static void
debug_dump (const char *filename, struct instruction *in, size_t len)
FILE *fp = fopen (filename, "w");
long indent = 0;
for (size_t i = 0; i < len; i++)
if (in[i].cmd == END)
for (long k = 0; k < indent; k++)
fputs (" ", fp);
debug_dump_instruction (fp, &in[i]);
if (in[i].cmd == BEGIN)
fclose (fp);
#define debug_dump(...)
// - - Optimization passes - - - - - - - - - - - - - - - - - - - - - - - - - - -
static size_t
optimize_assignment (struct instruction *irb, size_t irb_len)
size_t in = 0, out = 0;
for (; in < irb_len; in++, out++)
if (in + 2 < irb_len
&& irb[in ].cmd == BEGIN
&& irb[in + 1].cmd == DEC && irb[in + 1].arg == 1
&& irb[in + 2].cmd == END)
irb[out] = INSTRUCTION (SET, 0, 0);
in += 2;
else if (out && irb[out - 1].cmd == SET && irb[in].cmd == INC)
irb[--out].arg += irb[in].arg;
else if (out != in)
irb[out] = irb[in];
return out;
// Add the offset of the LEFT/RIGHT instruction to the accumulator
static bool
add_direction_offset (struct instruction *irb, intptr_t *acc)
if (irb->cmd == RIGHT)
*acc += irb->arg;
else if (irb->cmd == LEFT)
*acc -= (intptr_t) irb->arg;
return false;
return true;
// Add offsets to INC/DEC/SET stuck between LEFT/RIGHT
// and compress the LEFT/RIGHT sequences
static size_t
optimize_offseted_inc_dec (struct instruction *irb, size_t irb_len)
size_t in = 0, out = 0;
for (in = 0, out = 0; in < irb_len; in++, out++)
intptr_t dir = 0;
if (!add_direction_offset (&irb[in], &dir))
irb[out] = irb[in];
while (in + 2 < irb_len)
// An immediate offset has its limits on x86-64
if (dir < INT8_MIN || dir > INT8_MAX)
intptr_t diff = 0;
if (!add_direction_offset (&irb[in + 2], &diff))
int cmd = irb[in + 1].cmd;
if (cmd != INC && cmd != DEC && cmd != SET)
irb[out] = irb[in + 1];
irb[out].offset = dir;
dir += diff;
out += 1;
in += 2;
for (; in + 1 < irb_len; in++)
if (!add_direction_offset (&irb[in + 1], &dir))
if (!dir)
else if (dir > 0)
irb[out] = INSTRUCTION (RIGHT, 0, dir);
irb[out] = INSTRUCTION (LEFT, 0, -dir);
return out;
// Try to eliminate loops that eat a cell and add/subtract its value
// to/from some other cell
//
// Rewrites the instruction buffer in place and returns its new length.
// A candidate is a BEGIN followed by nothing but INC/DEC instructions up to
// an END; it gets replaced with EAT followed by INCACC/DECACC instructions
// carrying the original offsets.
//
// NOTE(review): this listing has no braces and appears to be missing
// single-token lines (`continue;`, `break;`, `do`, `else`, counter
// increments/decrements).  The statements below can't be read literally as
// they stand; verify the control flow against the upstream file before
// relying on the comments added here.
static size_t
optimize_inc_dec_loops (struct instruction *irb, size_t irb_len)
size_t in = 0, out = 0;
for (in = 0, out = 0; in < irb_len; in++, out++)
// Everything is copied first; only a loop opener is examined any further
// (presumably the non-BEGIN case skips to the next instruction)
irb[out] = irb[in];
if (irb[in].cmd != BEGIN)
// Scan the loop body: `ok` is set once an END is reached, and any
// instruction other than INC/DEC disqualifies the loop;
// `count` is presumably the number of body instructions seen
bool ok = false;
size_t count = 0;
for (size_t k = in + 1; k < irb_len; k++)
if (irb[k].cmd == END)
ok = true;
if (irb[k].cmd != INC
&& irb[k].cmd != DEC)
if (!ok)
// Stable sort operations by their offsets, put [0] first
bool sorted;
sorted = true;
for (size_t k = 1; k < count; k++)
if (irb[in + k].offset == 0)
if (irb[in + k + 1].offset != 0
&& irb[in + k].offset <= irb[in + k + 1].offset)
// Adjacent elements are swapped, a pass that swapped anything is repeated
struct instruction tmp = irb[in + k + 1];
irb[in + k + 1] = irb[in + k];
irb[in + k] = tmp;
sorted = false;
while (!sorted);
// Abort the optimization on duplicate offsets (complication with [0])
for (size_t k = 1; k < count; k++)
if (irb[in + k].offset == irb[in + k + 1].offset)
ok = false;
// XXX: can't make the code longer either
for (size_t k = 1; k <= count; k++)
if (irb[in + k].arg != 1)
ok = false;
// After sorting, the first operation has to be a decrement of the current
// cell, i.e. the loop has to count the cell down by one per iteration
if (!ok
|| irb[in + 1].cmd != DEC
|| irb[in + 1].offset != 0)
// A RIGHT immediately preceding the loop bounds how far to the left
// an offset may reach -- presumably because those cells are known to exist
int min_safe_left_offset = 0;
if (in > 1 && irb[in - 1].cmd == RIGHT)
min_safe_left_offset = -irb[in - 1].arg;
// An offset reaching further left than that sets the flag, in which case
// an END gets emitted after the replacement (see the end of this function)
bool cond_needed_for_safety = false;
for (size_t k = 0; k < count; k++)
if (irb[in + k + 1].offset < min_safe_left_offset)
cond_needed_for_safety = true;
if (cond_needed_for_safety)
// EAT takes the place of the counting decrement, each remaining operation
// becomes an accumulator addition/subtraction at the same offset
irb[out] = INSTRUCTION (EAT, 0, 0);
for (size_t k = 1; k < count; k++)
irb[out + k] = INSTRUCTION (irb[in + k].cmd == INC
? INCACC : DECACC, irb[in + k].offset, 0);
in += count;
out += count;
// This END has a zero argument, which pair_loops and the code generator
// take to mean that it doesn't jump back
if (cond_needed_for_safety)
irb[out] = INSTRUCTION (END, 0, 0);
return out;
// - - Loop pairing - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
static void
pair_loops (struct instruction *irb, size_t irb_len)
size_t nesting = 0;
size_t *stack = xcalloc (sizeof *stack, irb_len);
for (size_t i = 0; i < irb_len; i++)
switch (irb[i].cmd)
case BEGIN:
stack[nesting++] = i;
case END:
if (nesting <= 0)
exit_fatal ("unbalanced loops\n");
irb[stack[nesting]].arg = i + 1;
// Looping can be disabled by optimizations
if (irb[i].arg)
irb[i].arg = stack[nesting] + 1;
free (stack);
if (nesting != 0)
exit_fatal ("unbalanced loops\n");
// - - Main - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Compile a Brainfuck program (the commands > < + - . , [ ]) into a static
// x86-64 ELF executable.
// argv[1] is the optional input file (stdin by default), argv[2] the optional
// output path ("a.out" by default).
// Stages: read the source, decode it into the IR, run the optimization
// passes, pair the loops, generate machine code with an I/O epilog, patch
// relative jumps and calls, and write the ELF headers followed by the code.
//
// NOTE(review): this listing has no braces and appears to be missing
// single-token lines (`int`, `else`, `break;`, `enum`, `#else`, `#endif`, ...)
// as well as some definitions (ELF_META_SIZE is used but not defined here);
// verify against the upstream file before relying on the comments added here.
main (int argc, char *argv[])
if (argc > 3)
exit_fatal ("usage: %s [INPUT-FILE] [OUTPUT-FILE]\n", argv[0]);
FILE *input_file = stdin;
if (argc > 1 && !(input_file = fopen (argv[1], "r")))
exit_fatal ("fopen: %s: %s\n", argv[1], strerror (errno));
const char *output_path = "a.out";
if (argc > 2)
output_path = argv[2];
// Read the whole program text into memory
struct str buffer;
str_init (&buffer);
int c;
while ((c = fgetc (input_file)) != EOF)
str_append_c (&buffer, c);
if (ferror (input_file))
exit_fatal ("can't read program\n");
fclose (input_file);
// - - Decode, group and optimize - - - - - - - - - - - - - - - - - - - - - - -
// This is our Intermediate Representation Buffer
// It has one slot per input byte, so decoding can never overflow it
struct instruction *irb = xcalloc (sizeof *irb, buffer.len);
size_t irb_len = 0;
for (size_t i = 0; i < buffer.len; i++)
enum command cmd;
switch (buffer.str[i])
case '>': cmd = RIGHT; break;
case '<': cmd = LEFT; break;
case '+': cmd = INC; break;
case '-': cmd = DEC; break;
case '.': cmd = OUT; break;
case ',': cmd = IN; break;
case '[': cmd = BEGIN; break;
case ']': cmd = END; break;
// Any other character is ignored
default: continue;
// The most basic optimization is to group identical commands together
// NOTE(review): a statement starting a new instruction (advancing irb_len)
// seems to be missing after the following condition
if (!irb_len || !grouped[cmd] || irb[irb_len - 1].cmd != cmd)
irb[irb_len - 1].cmd = cmd;
irb[irb_len - 1].arg++;
debug_dump ("bf-no-opt.txt", irb, irb_len);
irb_len = optimize_assignment (irb, irb_len);
debug_dump ("bf-pre-offsets.txt", irb, irb_len);
irb_len = optimize_offseted_inc_dec (irb, irb_len);
debug_dump ("bf-pre-incdec-unloop.txt", irb, irb_len);
irb_len = optimize_inc_dec_loops (irb, irb_len);
debug_dump ("bf-optimized.txt", irb, irb_len);
pair_loops (irb, irb_len);
debug_dump ("bf-final.txt", irb, irb_len);
// - - Code generation - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// The buffer is reused for machine code from here on
// NOTE(review): the storage holding the program text is not freed first
str_init (&buffer);
// offsets[i] is where the code for instruction i starts within the buffer;
// sets_flags[i] lets a directly following BEGIN/END skip its "test bl, bl"
size_t *offsets = xcalloc (sizeof *offsets, irb_len + 1);
bool *sets_flags = xcalloc (sizeof *sets_flags, irb_len);
// CODE appends the bytes of a string literal without its terminator,
// DB/DW/DD/DQ append the low 1/2/4/8 bytes of a value in little endian
#define CODE(x) { char t[] = x; str_append_data (&buffer, t, sizeof t - 1); }
#define LE(v) (uint8_t[]) { v, v>>8, v>>16, v>>24, v>>32, v>>40, v>>48, v>>56 }
#define DB(x) { uint64_t v = (x); str_append_data (&buffer, LE (v), 1); }
#define DW(x) { uint64_t v = (x); str_append_data (&buffer, LE (v), 2); }
#define DD(x) { uint64_t v = (x); str_append_data (&buffer, LE (v), 4); }
#define DQ(x) { uint64_t v = (x); str_append_data (&buffer, LE (v), 8); }
ELF_LOAD_CODE = 0x400000, // where code is loaded (usual)
ELF_LOAD_DATA = 0x800000 // where the tape is placed
// Prolog: rax is the tape pointer, bl caches the value of the current cell
CODE ("\xB8") DD (ELF_LOAD_DATA) // mov rax, "ELF_LOAD_DATA"
CODE ("\x30\xDB") // xor bl, bl
for (size_t i = 0; i < irb_len; i++)
offsets[i] = buffer.len;
// The encodings below only have room for 32-bit arguments and 8-bit offsets
size_t arg = irb[i].arg;
assert (arg <= UINT32_MAX);
int offset = irb[i].offset;
assert (offset <= INT8_MAX && offset >= INT8_MIN);
// Don't save what we've just loaded
if (irb[i].cmd == LEFT || irb[i].cmd == RIGHT)
if (i < 2 || i + 1 >= irb_len
|| (irb[i - 2].cmd != LEFT && irb[i - 2].cmd != RIGHT)
|| irb[i - 1].cmd != BEGIN
|| irb[i + 1].cmd != END)
CODE ("\x88\x18") // mov [rax], bl
switch (irb[i].cmd)
case RIGHT:
// add rax, "arg" -- optimistic, no boundary checking
if (arg > INT8_MAX) { CODE ("\x48\x05") DD (arg) }
else { CODE ("\x48\x83\xC0") DB (arg) }
case LEFT:
// sub rax, "arg" -- optimistic, no boundary checking
if (arg > INT8_MAX) { CODE ("\x48\x2D") DD (arg) }
else { CODE ("\x48\x83\xE8") DB (arg) }
case EAT:
// NOTE: the kernel destroys rcx and r11 on syscalls,
// there must be no OUT or IN between EAT and INCACC/DECACC
CODE ("\x88\xD9" "\x30\xDB") // mov cl, bl; xor bl, bl
sets_flags[i] = true;
case INCACC:
if (offset)
CODE ("\x00\x48") DB (offset) // add [rax+"offset"], cl
CODE ("\x00\xCB") // add bl, cl
sets_flags[i] = true;
case DECACC:
if (offset)
CODE ("\x28\x48") DB (offset) // sub [rax+"offset"], cl
CODE ("\x28\xCB") // sub bl, cl
sets_flags[i] = true;
case INC:
if (offset)
CODE ("\x80\x40") DB (offset) // add byte [rax+"offset"], "arg"
CODE ("\x80\xC3") // add bl, "arg"
sets_flags[i] = true;
DB (arg)
case DEC:
if (offset)
CODE ("\x80\x68") DB (offset) // sub byte [rax+"offset"], "arg"
CODE ("\x80\xEB") // sub bl, "arg"
sets_flags[i] = true;
DB (arg)
case SET:
if (offset)
CODE ("\xC6\x40") DB (offset) // mov byte [rax+"offset"], "arg"
CODE ("\xB3") // mov bl, "arg"
DB (arg)
// The zero displacements of calls and jumps below are placeholders,
// they get patched once all code positions are known
case OUT:
CODE ("\xE8") DD (0) // call "write"
case IN:
CODE ("\xE8") DD (0) // call "read"
case BEGIN:
// Don't test the register when the flag has been set already;
// this doesn't have much of an effect in practice
if (!i || !sets_flags[i - 1])
CODE ("\x84\xDB") // test bl, bl
CODE ("\x0F\x84\x00\x00\x00\x00") // jz "offsets[arg]"
case END:
// We know that the cell is zero, make this an "if", not a "loop";
// this doesn't have much of an effect in practice
if (!arg)
if (!i || !sets_flags[i - 1])
CODE ("\x84\xDB") // test bl, bl
CODE ("\x0F\x85\x00\x00\x00\x00") // jnz "offsets[arg]"
// No sense in reading it out when we overwrite it immediately;
// this doesn't have much of an effect in practice
if (irb[i].cmd == LEFT || irb[i].cmd == RIGHT)
if (i + 1 >= irb_len
|| irb[i + 1].cmd != SET
|| irb[i + 1].offset != 0)
CODE ("\x8A\x18") // mov bl, [rax]
// When there is a loop at the end we need to be able to jump past it
offsets[irb_len] = buffer.len;
// Write an epilog which handles all the OS interfacing
// System V x86-64 ABI:
// rax <-> both syscall number and return value
// args -> rdi, rsi, rdx, r10, r8, r9
// trashed <- rcx, r11
// NOTE(review): the conditional opening this group of syscall numbers
// is not present in this listing
enum { SYS_READ = 3, SYS_WRITE = 4, SYS_EXIT = 1 };
#elif defined TARGET_LINUX
enum { SYS_READ = 0, SYS_WRITE = 1, SYS_EXIT = 60 };
#error Target not supported
// Falling off the end of the program exits with a zero status
CODE ("\xB8") DD (SYS_EXIT) // mov eax, "SYS_EXIT"
CODE ("\x48\x31\xFF") // xor rdi, rdi
CODE ("\x0F\x05") // syscall
// Failure path: print the null-terminated message in rsi and exit with 1
size_t fatal_offset = buffer.len;
CODE ("\x48\x89\xF7") // mov rdi, rsi -- use the string in rsi
CODE ("\x30\xC0") // xor al, al -- look for the nil byte
CODE ("\x48\x31\xC9") // xor rcx, rcx
CODE ("\x48\xF7\xD1") // not rcx -- start from -1
CODE ("\xFC" "\xF2\xAE") // cld; repne scasb -- decrement until found
CODE ("\x48\xF7\xD1") // not rcx
CODE ("\x48\x8D\x51\xFF") // lea rdx, [rcx-1] -- save length in rdx
CODE ("\xB8") DD (SYS_WRITE) // mov eax, "SYS_WRITE"
CODE ("\xBF") DD (2) // mov edi, "STDERR_FILENO"
CODE ("\x0F\x05") // syscall
CODE ("\xB8") DD (SYS_EXIT) // mov eax, "SYS_EXIT"
CODE ("\xBF") DD (1) // mov edi, "EXIT_FAILURE"
CODE ("\x0F\x05") // syscall
// Common tail of the read and write routines: one byte is transferred
// to/from the top of the stack, r12 points to the message for failures
size_t io_offset = buffer.len;
CODE ("\x48\x89\xE6") // mov rsi, rsp -- the char starts at rsp
CODE ("\xBA") DD (1) // mov edx, 1 -- count
CODE ("\x0F\x05") // syscall
CODE ("\x48\x83\xF8\x00") // cmp rax, 0
CODE ("\x4C\x89\xE6") // mov rsi, r12
CODE ("\x7C") // jl "fatal_offset" -- write failure message
DB ((intptr_t) fatal_offset - (intptr_t) (buffer.len + 1))
CODE ("\x66\x5B") // pop bx
CODE ("\x58") // pop rax -- restore tape position
CODE ("\xC3") // ret
// The routine called by IN
size_t read_offset = buffer.len;
CODE ("\x50") // push rax -- save tape position
CODE ("\xB8") DD (SYS_READ) // mov eax, "SYS_READ"
CODE ("\xBF") DD (0) // mov edi, "STDIN_FILENO"
CODE ("\x66\x6A\x00") // push word 0 -- the default value for EOF
// The displacement of 2 skips the two-byte jump to reach the message
CODE ("\x4C\x8D\x25") DD (2) // lea r12, [rel read_message]
CODE ("\xEB") // jmp "io_offset"
DB ((intptr_t) io_offset - (intptr_t) (buffer.len + 1))
CODE ("fatal: read failed\n\0")
// The routine called by OUT
size_t write_offset = buffer.len;
CODE ("\x50") // push rax -- save tape position
CODE ("\xB8") DD (SYS_WRITE) // mov eax, "SYS_WRITE"
CODE ("\xBF") DD (1) // mov edi, "STDOUT_FILENO"
CODE ("\x66\x53") // push bx
CODE ("\x4C\x8D\x25") DD (2) // lea r12, [rel write_message]
CODE ("\xEB") // jmp "io_offset"
DB ((intptr_t) io_offset - (intptr_t) (buffer.len + 1))
CODE ("fatal: write failed\n\0")
// Now that we know where each instruction is, fill in relative jumps
for (size_t i = 0; i < irb_len; i++)
// Instructions with a zero argument carry nothing to patch
// (presumably they are skipped here)
if (!irb[i].arg)
// This must accurately reflect the code generators
intptr_t target, fixup = offsets[i];
if (irb[i].cmd == BEGIN || irb[i].cmd == END)
// Skip the two opcode bytes of the jump, and the "test" if it was emitted
fixup += (i && sets_flags[i - 1]) ? 2 : 4;
target = offsets[irb[i].arg];
// Calls have a single opcode byte
else if (irb[i].cmd == IN) { fixup++; target = read_offset; }
else if (irb[i].cmd == OUT) { fixup++; target = write_offset; }
else continue;
// Displacements are relative to the end of the 4-byte field
uint64_t v = target - (fixup + 4);
memcpy (buffer.str + fixup, LE (v), 4);
free (offsets);
free (sets_flags);
// - - Output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Now that we know how long the machine code is, we can write the header.
// Note that for PIE we would need to depend on the dynamic linker, so no.
// Recommended reading:
// man 5 elf
// Set the machine code aside, the headers are built in a fresh buffer
struct str code = buffer;
str_init (&buffer);
ELF_HEADER_SIZE = 64, // size of the ELF header
ELF_PROGRAM_ENTRY_SIZE = 56, // size of a program header
ELF_SECTION_ENTRY_SIZE = 64, // size of a section header
// NOTE(review): ELF_META_SIZE, used below, is not defined anywhere in this
// listing; presumably it is the combined size of the ELF header and the two
// program headers -- confirm against the upstream file
// ELF header
CODE ("\x7F" "ELF\x02\x01\x01") // ELF, 64-bit, little endian, v1
// OpenBSD either requires its ABI or a PT_NOTE with "OpenBSD" in it
// NOTE(review): the next two lines look like alternatives of a target
// conditional whose directives are not present in this listing
CODE ("\x0C\x00" "\0\0\0\0\0\0\0") // OpenBSD ABI, v0, padding
CODE ("\x00\x00" "\0\0\0\0\0\0\0") // Unix System V ABI, v0, padding
DW (2) DW (62) DD (1) // executable, x86-64, v1
DQ (ELF_LOAD_CODE + ELF_META_SIZE) // entry point address
DQ (ELF_HEADER_SIZE) DQ (0) // program, section header offset
DD (0) // no processor-specific flags
DW (ELF_HEADER_SIZE) // ELF header size
DW (ELF_PROGRAM_ENTRY_SIZE) DW (2) // program hdr tbl entry size, count
DW (ELF_SECTION_ENTRY_SIZE) DW (0) // section hdr tbl entry size, count
DW (0) // no section index for strings
// Program header for code
// The entry point address seems to require alignment, so map start of file
DD (1) DD (5) // PT_LOAD, PF_R | PF_X
DQ (0) // offset within the file
DQ (ELF_LOAD_CODE) // address in virtual memory
DQ (ELF_LOAD_CODE) // address in physical memory
DQ (ELF_META_SIZE + code.len) // length within the file
DQ (ELF_META_SIZE + code.len) // length within memory
DQ (4096) // segment alignment
// Program header for the tape
DD (1) DD (6) // PT_LOAD, PF_R | PF_W
DQ (0) // offset within the file
DQ (ELF_LOAD_DATA) // address in virtual memory
DQ (ELF_LOAD_DATA) // address in physical memory
DQ (0) // length within the file
DQ (1 << 20) // one megabyte of memory
DQ (4096) // segment alignment
// The section header table is optional and we don't need it for anything
FILE *output_file;
// On Unix the file is created with mode 0777 (less the umask), so that
// the result can be executed right away; elsewhere plain fopen is used
// NOTE(review): the `#else` and `#endif` of this conditional are not present
// in this listing
#ifdef __unix__
int output_fd;
if ((output_fd = open (output_path, O_CREAT|O_WRONLY|O_TRUNC, 0777)) < 0)
exit_fatal ("open: %s: %s\n", output_path, strerror (errno));
if (!(output_file = fdopen (output_fd, "w")))
exit_fatal ("fdopen: %s\n", strerror (errno));
if (!(output_file = fopen (output_path, "w")))
exit_fatal ("fopen: %s: %s\n", output_path, strerror (errno));
// Headers first, machine code right after them
// NOTE(review): the results of fwrite and fclose are not checked
fwrite (buffer.str, buffer.len, 1, output_file);
fwrite (code.str, code.len, 1, output_file);
fclose (output_file);
return 0;