// lmc/assembler.go

package main

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"strconv"
	"strings"
)

// Token kinds. The tokenizer classifies every token it emits as one of these.
const (
	WORD        = iota // [A-Za-z_][A-Za-z0-9_]* that is not in the instructions table
	INSTRUCTION        // Word that was found in the instructions table
	NUMBER             // [0-9]+
	NEWLINE            // \n
	ERROR              // Tokenizer failure; the token value holds the error message
)

// location is a position in the source text. tokenize starts counting at
// line 1, column 1, and eat resets the column to 1 after every newline.
type location struct {
	line   int
	column int
}

// token is a single lexical element passed from the tokenizer to the
// assembler over a channel.
type token struct {
	location    location // Position of the token
	value       string   // Text content of the token (upper-cased by send), or the message for ERROR tokens
	instruction int      // Value from the instructions table; only set when kind is INSTRUCTION
	kind        int      // Kind of the token (WORD, INSTRUCTION, NUMBER, NEWLINE or ERROR)
}

// tokenizer holds the state needed to split an input stream into tokens.
type tokenizer struct {
	location location      // Current position (of the next unread byte)
	value    []byte        // Current token string, accumulated while scanning
	reader   *bufio.Reader // Reader the input bytes are peeked and read from
	tokens   chan<- token  // Output token channel
}

// -----------------------------------------------------------------------------

// isSpace reports whether c is horizontal whitespace: a space, a tab or a
// carriage return. A line feed is deliberately not included because it is
// a token of its own.
func isSpace(c byte) bool {
	switch c {
	case ' ', '\t', '\r':
		return true
	}
	return false
}

// isNumber reports whether c is an ASCII decimal digit.
func isNumber(c byte) bool {
	// Bytes below '0' wrap around to a large unsigned value, so a single
	// comparison covers both ends of the range.
	return c-'0' <= 9
}

// isWordHead reports whether c may start a word: an ASCII letter of either
// case, or an underscore.
func isWordHead(c byte) bool {
	if c == '_' {
		return true
	}
	// Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z'; no other byte lands in
	// that range after the fold.
	lower := c | 0x20
	return 'a' <= lower && lower <= 'z'
}

func isWordTail(c byte) bool {
	return isWordHead(c) || isNumber(c)
}

// -----------------------------------------------------------------------------

// Instruction opcodes. Every value is a multiple of 100 (0, 100, 200, ...),
// which leaves the two low decimal digits free: Assemble adds the operand
// (a number or a resolved label address) to these values.
const (
	IHALT = iota * 100
	IADD
	ISUBTRACT
	ISTORE
	// 400 is skipped; no mnemonic in the instructions table maps to it.
	_
	ILOAD
	IBRANCH
	IBRANCH_IF_ZERO
	IBRANCH_IF_POSITIVE
	// IIO is combined with one of the IO_* sub-codes in the instructions table.
	IIO
)

// Sub-codes added to IIO to select the I/O operation. Zero is skipped, so
// IO_INPUT is 1 and IO_OUTPUT is 2.
const (
	_ = iota
	IO_INPUT
	IO_OUTPUT
)

// instructions maps each mnemonic to the value stored in the instruction
// field of its token. Keys are upper-case; send upper-cases the token text
// before looking it up, which makes mnemonics case-insensitive.
var instructions = map[string]int{
	"HLT": IHALT,
	"COB": IHALT, // Alternative mnemonic mapped to the same value as HLT
	"ADD": IADD,
	"SUB": ISUBTRACT,
	"STA": ISTORE,
	"LDA": ILOAD,
	"BRA": IBRANCH,
	"BRZ": IBRANCH_IF_ZERO,
	"BRP": IBRANCH_IF_POSITIVE,
	"INP": IIO + IO_INPUT,
	"OUT": IIO + IO_OUTPUT,
	"DAT": 0, // Base of zero: Assemble stores just the operand
}

// -----------------------------------------------------------------------------

// send emits the text accumulated in t.value as a token of the given kind,
// positioned at start, and then clears t.value. The text is upper-cased
// first. A WORD whose upper-cased text is a known mnemonic is emitted as an
// INSTRUCTION token carrying the value from the instructions table.
func (t *tokenizer) send(start location, kind int) {
	text := strings.ToUpper(string(t.value))
	out := token{location: start, value: text, kind: kind}

	if instr, known := instructions[text]; known && kind == WORD {
		out.kind = INSTRUCTION
		out.instruction = instr
	}

	t.tokens <- out
	t.value = []byte{}
}

// peek returns the next input byte without consuming it. When the
// underlying reader cannot supply a byte, the reader's error is returned
// together with a placeholder byte ('?').
//
// XXX: the handling could probably be simplified by extending the "byte"
//   to also include a special out-of-band value for errors
func (t *tokenizer) peek() (byte, error) {
	buf, err := t.reader.Peek(1)
	if err != nil {
		return '?', err
	}
	return buf[0], nil
}

// eat consumes and returns the next input byte, advancing the tracked
// position: a line feed moves to column 1 of the following line, any other
// byte moves one column to the right. On a read error the position is left
// untouched and a zero byte is returned with the error.
func (t *tokenizer) eat() (byte, error) {
	b, err := t.reader.ReadByte()
	switch {
	case err != nil:
		return 0, err
	case b == '\n':
		t.location.line++
		t.location.column = 1
	default:
		t.location.column++
	}
	return b, nil
}

// -----------------------------------------------------------------------------

// step consumes one lexical element from the input and, when that element
// is a token, sends it on the output channel:
//
//   - a space, tab or carriage return is skipped;
//   - a line feed produces a NEWLINE token;
//   - a run of digits produces a NUMBER token;
//   - a letter or underscore followed by letters, digits or underscores
//     produces a WORD token (send turns mnemonics into INSTRUCTION);
//   - "//" starts a comment that is skipped up to, but not including, the
//     next line feed, so the line still ends with a NEWLINE token.
//
// It returns io.EOF (from peek) when the input is exhausted before anything
// is read, the reader's error on a read failure, and a descriptive error
// for input that fits none of the rules above.
func (t *tokenizer) step() error {
	start := t.location
	t.value = []byte{}

	c, err := t.peek()
	if err != nil {
		return err
	}

	switch {
	case isSpace(c):
		_, err = t.eat()
		return err
	case c == '\n':
		if _, err = t.eat(); err != nil {
			return err
		}
		t.value = append(t.value, c)
		t.send(start, NEWLINE)
	case isNumber(c):
		if err = t.consumeWhile(isNumber, true); err != nil {
			return err
		}
		t.send(start, NUMBER)
	case isWordHead(c):
		// Every byte accepted by isWordHead is also accepted by isWordTail,
		// so the first character is consumed by the same loop.
		if err = t.consumeWhile(isWordTail, true); err != nil {
			return err
		}
		t.send(start, WORD)
	case c == '/':
		if _, err = t.eat(); err != nil {
			return err
		}
		c, err = t.peek()
		if err == io.EOF {
			return errors.New("unexpected EOF")
		}
		if err != nil {
			return err
		}
		if c != '/' {
			return fmt.Errorf("unrecognized input: '%c'", c)
		}
		// Skip the rest of the comment. The line feed is left in the input
		// so that the next step reports it as a NEWLINE token.
		return t.consumeWhile(func(b byte) bool { return b != '\n' }, false)
	default:
		return fmt.Errorf("unrecognized input: '%c'", c)
	}
	return nil
}

// consumeWhile eats input bytes for as long as accept approves the next one,
// appending them to t.value when keep is true. Reaching the end of the input
// ends the run without an error; any other read error is returned.
func (t *tokenizer) consumeWhile(accept func(byte) bool, keep bool) error {
	for {
		c, err := t.peek()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		if !accept(c) {
			return nil
		}
		if _, err = t.eat(); err != nil {
			return err
		}
		if keep {
			t.value = append(t.value, c)
		}
	}
}

// tokenize reads r to the end and sends the tokens it finds on tokens,
// closing the channel when it is done. Scanning stops at the end of the
// input or at the first error; an error other than io.EOF is reported to
// the receiver as a final ERROR token whose value is the error message and
// whose location is the tokenizer's position at that moment.
func tokenize(r io.Reader, tokens chan<- token) {
	defer close(tokens)

	t := tokenizer{
		location: location{line: 1, column: 1},
		tokens:   tokens,
		reader:   bufio.NewReader(r),
	}
	for {
		err := t.step()
		if err == nil {
			continue
		}
		if err != io.EOF {
			t.tokens <- token{location: t.location, value: err.Error(), kind: ERROR}
		}
		return
	}
}

// -----------------------------------------------------------------------------

// instruction is one assembled line before label resolution. Assemble adds
// either the address of target (when target is non-empty) or number to id
// to form the final machine word.
type instruction struct {
	id     int    // What instruction this is (a value from the instructions table)
	target string // Label name; empty when the operand is not a label
	number int    // Immediate value; zero when no number was given
}

// assembler collects the instructions and label addresses of a program while
// consuming tokens from the tokenizer.
type assembler struct {
	tokens chan token     // Where tokens come from
	output []instruction  // The assembled program, in source order
	labels map[string]int // Addresses of labels (index into output of the labelled instruction)
}

// step assembles one source line: an optional label, an instruction, an
// optional operand (a label name or a number) and the end of the line.
// Empty lines are skipped.
//
// It returns true when a line was processed and more input may follow,
// false with a nil error when the token stream is exhausted, and false with
// an error when the line is malformed or the tokenizer reported an error.
// Errors are prefixed with the line and column of the offending token.
func (a *assembler) step() (bool, error) {
	cur, ok := <-a.tokens
	if !ok {
		return false, nil
	}

	// next replaces cur with the following token and reports whether there
	// was one. At the end of the stream cur is left unchanged, so an error
	// raised afterwards still points at the last token that was read.
	next := func() bool {
		tok, more := <-a.tokens
		if more {
			cur = tok
		}
		return more
	}
	// fail builds an error positioned at cur.
	fail := func(format string, args ...interface{}) error {
		return fmt.Errorf("line %d, column %d: %s",
			cur.location.line, cur.location.column, fmt.Sprintf(format, args...))
	}

	switch cur.kind {
	case NEWLINE:
		// Ignore empty lines
		return true, nil
	case NUMBER:
		return false, fail("unexpected number")
	case ERROR:
		return false, fail("%s", cur.value)
	case WORD:
		// A word at the start of a line is a label for the instruction
		// that follows it.
		if _, dup := a.labels[cur.value]; dup {
			return false, fail("duplicate label: %s", cur.value)
		}
		a.labels[cur.value] = len(a.output)

		if !next() {
			return false, fail("unexpected end of file")
		}
		if cur.kind == ERROR {
			// Keep the tokenizer's own message instead of masking it.
			return false, fail("%s", cur.value)
		}
		if cur.kind != INSTRUCTION {
			return false, fail("expected instruction name after label")
		}
	case INSTRUCTION:
		// Handled below.
	default:
		return true, nil
	}

	instr := instruction{id: cur.instruction}

	// The operand is optional; a missing one is assumed to be zero. The end
	// of the stream is tested first because a closed channel yields a zero
	// token, whose kind is indistinguishable from WORD.
	if !next() || cur.kind == NEWLINE {
		a.output = append(a.output, instr)
		return true, nil
	}
	switch cur.kind {
	case WORD:
		instr.target = strings.ToUpper(cur.value)
	case NUMBER:
		// The operand is added to the opcode, so it must fit in the two low
		// decimal digits. Entries with a base of zero (DAT, and HLT/COB,
		// which share it) hold a whole three-digit word instead.
		limit := 99
		if instr.id == 0 {
			limit = 999
		}
		n, err := strconv.Atoi(cur.value)
		if err != nil || n > limit {
			return false, fail("number out of range: %s", cur.value)
		}
		instr.number = n
	case ERROR:
		return false, fail("%s", cur.value)
	default:
		// For example a mnemonic used where an operand is expected.
		return false, fail("expected label or number, got %s", cur.value)
	}
	a.output = append(a.output, instr)

	// Nothing but the end of the line may follow the operand.
	if !next() || cur.kind == NEWLINE {
		return true, nil
	}
	if cur.kind == ERROR {
		return false, fail("%s", cur.value)
	}
	return false, fail("expected end of line")
}

// Assemble reads assembly source from r and translates it into a memory
// image of 100 words. Instructions occupy consecutive cells starting at
// address 0 and the remaining cells are zero.
//
// Each word is the instruction's base value plus its operand: the address of
// the referenced label, or the literal number. Instructions whose base value
// is not a multiple of 100 (INP and OUT) take no operand, and any operand
// given to them is ignored.
//
// An error is returned when the source is malformed, when it contains more
// than 100 instructions, or when an operand names a label that is never
// defined. In that case the returned code is nil.
func Assemble(r io.Reader) (code []int16, err error) {
	a := assembler{tokens: make(chan token), labels: make(map[string]int)}
	go tokenize(r, a.tokens)

	for {
		more, stepErr := a.step()
		if stepErr != nil {
			// The tokenizer may be blocked sending its next token on the
			// unbuffered channel. Drain it in the background so that its
			// goroutine can run to completion instead of leaking.
			go func() {
				for range a.tokens {
				}
			}()
			return nil, stepErr
		}
		if !more {
			break
		}
	}

	code = make([]int16, 100)
	if len(a.output) > len(code) {
		return nil, errors.New("program too long")
	}
	for i, x := range a.output {
		n := x.id
		switch {
		case x.id%100 != 0:
			// TODO: we could complain that arguments aren't allowed
		case len(x.target) != 0:
			// Resolve targets to code locations
			resolved, ok := a.labels[x.target]
			if !ok {
				return nil, fmt.Errorf("unknown label: %s", x.target)
			}
			n += resolved
		default:
			n += x.number
		}
		code[i] = int16(n)
	}
	return code, nil
}