lmc/assembler.go

package main

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"strconv"
	"strings"
)

// Token kinds produced by the tokenizer.
const (
	WORD    = iota // Identifier: [A-Za-z_][A-Za-z0-9_]*
	NUMBER         // Unsigned decimal literal: [0-9]+
	NEWLINE        // A single '\n'
	ERROR          // Tokenizer failure; the token value carries the message
)

// location is a position within the source text being tokenized.
type location struct {
	line   int // Line number; tokenize starts counting at 1
	column int // Bytes consumed on the current line before this position
}

// token is a single lexical unit handed from the tokenizer to the assembler.
type token struct {
	location location // Position of the token
	value    string   // Text content of the token (the error message for ERROR)
	kind     int      // Kind of the token: WORD, NUMBER, NEWLINE or ERROR
}

// tokenizer holds the state needed to split an input stream into tokens.
type tokenizer struct {
	location location      // Current position, advanced by eat
	value    []byte        // Current token string, reset after every send
	reader   *bufio.Reader // Reader, buffered so that peek can look one byte ahead
	tokens   chan<- token  // Output token channel
}

// -----------------------------------------------------------------------------

// isSpace reports whether c is a blank, a tab or a carriage return.
// A newline does not count as a space here.
func isSpace(c byte) bool {
	switch c {
	case ' ', '\t', '\r':
		return true
	}
	return false
}

// isNumber reports whether c is an ASCII decimal digit.
func isNumber(c byte) bool {
	// Byte arithmetic wraps around, so anything below '0' ends up far above 9.
	return c-'0' < 10
}

// isWordHead reports whether c may start a word: an ASCII letter of either
// case, or an underscore.
func isWordHead(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', c == '_':
		return true
	default:
		return false
	}
}

func isWordTail(c byte) bool {
	return isWordHead(c) || isNumber(c)
}

// -----------------------------------------------------------------------------

// send emits the bytes accumulated in t.value as a token of the given kind
// positioned at start, then empties the accumulator.
func (t *tokenizer) send(start location, kind int) {
	tok := token{
		location: start,
		value:    string(t.value),
		kind:     kind,
	}
	t.tokens <- tok
	t.value = []byte{}
}

// peek returns the next input byte without consuming it. When the reader
// fails, the placeholder byte '?' is returned together with the error.
//
// XXX: the handling could probably be simplified by extending the "byte"
//   to also include a special out-of-band value for errors
func (t *tokenizer) peek() (byte, error) {
	next, err := t.reader.Peek(1)
	if err == nil {
		return next[0], nil
	}
	return '?', err
}

// eat consumes one input byte and moves t.location past it: a newline starts
// the next line at column 0, any other byte advances the column by one.
// On a read error the position is left untouched.
func (t *tokenizer) eat() (byte, error) {
	b, err := t.reader.ReadByte()
	switch {
	case err != nil:
		return 0, err
	case b == '\n':
		t.location = location{line: t.location.line + 1, column: 0}
	default:
		t.location.column++
	}
	return b, nil
}

// -----------------------------------------------------------------------------

// step consumes one lexical element from the input.
//
// A newline, a number or a word is sent to the token channel; a single
// whitespace byte or a "//" comment (up to, but not including, the newline)
// is consumed without producing a token.
//
// It returns io.EOF when the input is exhausted before a new element starts,
// any other reader error as is, and a descriptive error for input that does
// not belong to the language. A token that is cut short by EOF is still sent.
func (t *tokenizer) step() error {
	start := t.location
	t.value = []byte{}

	c, err := t.peek()
	if err != nil {
		return err
	}

	// scan moves the longest run of bytes approved by accept into t.value.
	// Reaching EOF merely ends the run, other read errors are passed on.
	scan := func(accept func(byte) bool) error {
		for {
			b, err := t.peek()
			if err == io.EOF {
				return nil
			}
			if err != nil {
				return err
			}
			if !accept(b) {
				return nil
			}
			if _, err := t.eat(); err != nil {
				return err
			}
			t.value = append(t.value, b)
		}
	}

	switch {
	case isSpace(c):
		if _, err := t.eat(); err != nil {
			return err
		}
	case c == '\n':
		if _, err := t.eat(); err != nil {
			return err
		}
		t.value = append(t.value, c)
		t.send(start, NEWLINE)
	case isNumber(c):
		if err := scan(isNumber); err != nil {
			return err
		}
		t.send(start, NUMBER)
	case isWordHead(c):
		if err := scan(isWordTail); err != nil {
			return err
		}
		t.send(start, WORD)
	case c == '/':
		// A comment needs a second slash right after the first one.
		if _, err := t.eat(); err != nil {
			return err
		}
		c, err = t.peek()
		if err == io.EOF {
			return errors.New("unexpected EOF")
		}
		if err != nil {
			return err
		}
		if c != '/' {
			return fmt.Errorf("unrecognized input: '%c'", c)
		}

		// Discard the rest of the line; the newline itself is left in the
		// input so that the next step reports it as a NEWLINE token.
		for c != '\n' {
			if _, err := t.eat(); err != nil {
				return err
			}
			c, err = t.peek()
			if err == io.EOF {
				break
			}
			if err != nil {
				return err
			}
		}
	default:
		return fmt.Errorf("unrecognized input: '%c'", c)
	}
	return nil
}

// tokenize splits everything readable from r into tokens and sends them to
// the tokens channel, which it closes when done. Position counting starts at
// line 1, column 0. The first failure other than io.EOF is reported as a
// final ERROR token whose value is prefixed with the current position.
func tokenize(r io.Reader, tokens chan<- token) {
	defer close(tokens)

	t := tokenizer{
		reader:   bufio.NewReader(r),
		tokens:   tokens,
		location: location{line: 1, column: 0},
	}
	for {
		err := t.step()
		if err == nil {
			continue
		}
		if err != io.EOF {
			message := fmt.Sprintf("line %d, column %d: %s",
				t.location.line, t.location.column, err.Error())
			t.tokens <- token{t.location, message, ERROR}
		}
		return
	}
}

// -----------------------------------------------------------------------------

// Instruction identifiers. For most instructions Assemble encodes the machine
// word as id*100 plus the operand, so the order of this list is significant.
const (
	IHALT = iota
	IADD
	ISUBTRACT
	ISTORE
	ILOAD
	_ // The value 5 is skipped so that IBRANCH ends up being 6
	IBRANCH
	IBRANCH_IF_ZERO
	IBRANCH_IF_POSITIVE
	IINPUT  // Assemble always emits 901 for this one
	IOUTPUT // Assemble always emits 902 for this one
	IDATA   // Not an opcode: Assemble emits the bare operand
)

// instructions maps mnemonics to instruction identifiers. The keys are in
// upper case; the assembler upper-cases words before looking them up.
// HLT and COB are two names for the same instruction.
var instructions = map[string]int{
	"HLT": IHALT,
	"COB": IHALT,
	"ADD": IADD,
	"SUB": ISUBTRACT,
	"STA": ISTORE,
	"LDA": ILOAD,
	"BRA": IBRANCH,
	"BRZ": IBRANCH_IF_ZERO,
	"BRP": IBRANCH_IF_POSITIVE,
	"INP": IINPUT,
	"OUT": IOUTPUT,
	"DAT": IDATA,
}

// instruction is one parsed source line whose operand has not been resolved
// yet. The assembler sets at most one of target and number.
type instruction struct {
	id     int    // What instruction this is (one of the I* constants)
	target string // Label name, upper-cased; empty if the operand is not a label
	number int    // Immediate value; zero if no number was given
}

// -----------------------------------------------------------------------------

// assembler turns a stream of tokens into a list of instructions and records
// where labels were defined.
type assembler struct {
	tokens chan token     // Where tokens come from
	output []instruction  // The assembled program, in source order
	labels map[string]int // Addresses of labels: upper-cased name -> index into output
}

// step parses one source line from the token channel.
//
// A line is either empty or has the form
//
//	[label] mnemonic [label | number]
//
// Words are matched case-insensitively. A leading word that is not a known
// mnemonic defines a label for the address of the instruction that follows
// it; a missing operand leaves both target and number at their zero values.
//
// The boolean result tells whether more input may follow: it is false once
// the token channel has been closed and whenever an error is returned.
// ERROR tokens from the tokenizer are returned as errors with their message.
func (a *assembler) step() (bool, error) {
	tok, ok := <-a.tokens
	if !ok {
		return false, nil
	}

	// TODO: add token location information to returned errors

	switch tok.kind {
	case WORD:
		canonical := strings.ToUpper(tok.value)
		instr, found := instructions[canonical]

		// Not found in the instruction list
		// Assume it is a label
		if !found {
			if _, dup := a.labels[canonical]; dup {
				return false, fmt.Errorf("Duplicate label: %s", canonical)
			}
			a.labels[canonical] = len(a.output)

			tok, ok = <-a.tokens
			switch {
			case !ok:
				return false, errors.New("Unexpected end of file")
			case tok.kind == ERROR:
				// Pass on the tokenizer's own message rather than hiding it
				// behind "Expected word".
				return false, errors.New(tok.value)
			case tok.kind != WORD:
				return false, errors.New("Expected word")
			}

			// XXX: it might be better to classify this in the lexer
			canonical = strings.ToUpper(tok.value)
			instr, found = instructions[canonical]
			if !found {
				return false, fmt.Errorf("Unknown instruction: %s", canonical)
			}
		}

		instrHolder := instruction{id: instr}

		// The closed-channel case has to be tested first: the zero token
		// received from a closed channel has kind 0, which equals WORD.
		tok, ok = <-a.tokens
		eol := false
		switch {
		case !ok, tok.kind == NEWLINE:
			// This is fine, just assume zero
			eol = true
		case tok.kind == ERROR:
			return false, errors.New(tok.value)
		case tok.kind == WORD:
			instrHolder.target = strings.ToUpper(tok.value)
		case tok.kind == NUMBER:
			// The token consists of digits only, so the conversion can only
			// fail because the value does not fit into an int.
			number, err := strconv.Atoi(tok.value)
			if err != nil {
				return false, fmt.Errorf("Invalid number: %s", tok.value)
			}
			instrHolder.number = number
		}
		a.output = append(a.output, instrHolder)

		// An operand must be followed by the end of the line or of the input.
		if !eol {
			tok, ok = <-a.tokens
			switch {
			case !ok, tok.kind == NEWLINE:
				// Properly terminated
			case tok.kind == ERROR:
				return false, errors.New(tok.value)
			default:
				return false, errors.New("Expected end of line")
			}
		}
	case NEWLINE:
		// Ignore empty lines
	case NUMBER:
		return false, errors.New("Unexpected number")
	case ERROR:
		return false, errors.New(tok.value)
	}
	return true, nil
}

// Assemble reads assembly source from r and translates it into a memory
// image of 100 cells. Instructions occupy consecutive cells starting at
// address 0; the remaining cells are zero.
//
// Every instruction except INP (901), OUT (902) and DAT is encoded as its
// identifier times 100 plus the operand, where the operand is either the
// given number or the address of the given label. DAT stores its operand
// unchanged.
//
// An error is returned for input that cannot be tokenized or parsed, for
// programs of more than 100 instructions, for references to undefined
// labels, and for operands that do not fit: addresses must stay below 100
// and DAT values must be representable as int16.
//
// When parsing fails, the remaining input may still be read from r in the
// background after Assemble has returned.
func Assemble(r io.Reader) (code []int16, err error) {
	a := assembler{tokens: make(chan token), labels: make(map[string]int)}
	go tokenize(r, a.tokens)

	for {
		cont, stepErr := a.step()
		if stepErr != nil {
			// The tokenizer may be blocked sending its next token. Keep
			// receiving until it closes the channel so that its goroutine
			// can finish instead of leaking.
			go func() {
				for range a.tokens {
				}
			}()
			return nil, stepErr
		}
		if !cont {
			break
		}
	}

	code = make([]int16, 100)
	if len(a.output) > len(code) {
		return nil, errors.New("Program too long")
	}
	for i, x := range a.output {
		var n int
		// XXX: we should be able to handle the strange INP and OUT better
		switch x.id {
		case IINPUT:
			n = 901
		case IOUTPUT:
			n = 902
		default:
			operand := x.number
			if len(x.target) != 0 {
				// Resolve targets to code locations
				resolved, ok := a.labels[x.target]
				if !ok {
					return nil, errors.New("Unknown label")
				}
				operand = resolved
			}

			if x.id == IDATA {
				// Data is stored verbatim, it only has to fit into a cell.
				if int(int16(operand)) != operand {
					return nil, fmt.Errorf("Value out of range: %d", operand)
				}
				n = operand
			} else {
				// A larger operand would spill into the opcode digit and
				// silently turn this into a different instruction.
				if operand < 0 || operand >= len(code) {
					return nil, fmt.Errorf("Operand out of range: %d", operand)
				}
				n = x.id*100 + operand
			}
		}
		code[i] = int16(n)
	}
	return code, nil
}
Initial commit 2016-07-10 14:35:33 +02:00			`package main`

			`import (`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`"bufio"`
Initial commit 2016-07-10 14:35:33 +02:00			`"errors"`
			`"fmt"`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`"io"`
Initial commit 2016-07-10 14:35:33 +02:00			`"strconv"`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`"strings"`
Initial commit 2016-07-10 14:35:33 +02:00			`)`

			`const (`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`WORD = iota // [A-Za-z_-]+`
			`NUMBER // [0-9]+`
			`NEWLINE // \n`
			`ERROR // Error`
Initial commit 2016-07-10 14:35:33 +02:00			`)`

			`type location struct {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`line int`
Initial commit 2016-07-10 14:35:33 +02:00			`column int`
			`}`

			`type token struct {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`location location // Position of the token`
			`value string // Text content of the token`
			`kind int // Kind of the token`
Initial commit 2016-07-10 14:35:33 +02:00			`}`

			`type tokenizer struct {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`location location // Current position`
			`value []byte // Current token string`
			`reader *bufio.Reader // Reader`
			`tokens chan<- token // Output token channel`
Initial commit 2016-07-10 14:35:33 +02:00			`}`

			`// -----------------------------------------------------------------------------`

			`func isSpace(c byte) bool {`
			`return c == ' ' \|\| c == '\r' \|\| c == '\t'`
			`}`

			`func isNumber(c byte) bool {`
			`return c >= '0' && c <= '9'`
			`}`

			`func isWordHead(c byte) bool {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if c >= 'a' && c <= 'z' {`
			`c -= 32`
			`}`
Get it working 2016-10-20 00:12:24 +02:00			`return c >= 'A' && c <= 'Z' \|\| c == '_'`
Initial commit 2016-07-10 14:35:33 +02:00			`}`

			`func isWordTail(c byte) bool {`
			`return isWordHead(c) \|\| isNumber(c)`
			`}`

			`// -----------------------------------------------------------------------------`

Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`func (t *tokenizer) send(start location, kind int) {`
			`t.tokens <- token{start, string(t.value), kind}`
Initial commit 2016-07-10 14:35:33 +02:00			`t.value = []byte{}`
			`}`

Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`// XXX: the handling could probably be simplified by extending the "byte"`
Get it working 2016-10-20 00:12:24 +02:00			`// to also include a special out-of-band value for errors`
Initial commit 2016-07-10 14:35:33 +02:00			`func (t *tokenizer) peek() (byte, error) {`
			`buf, err := t.reader.Peek(1)`
Get it working 2016-10-20 00:12:24 +02:00			`if err != nil {`
			`return '?', err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00			`return buf[0], err`
			`}`

			`func (t *tokenizer) eat() (byte, error) {`
			`c, err := t.reader.ReadByte()`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err != nil {`
			`return 0, err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
			`if c == '\n' {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.location.line++`
			`t.location.column = 0`
Initial commit 2016-07-10 14:35:33 +02:00			`} else {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.location.column++`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
			`return c, nil`
			`}`

			`// -----------------------------------------------------------------------------`

			`func (t *tokenizer) step() error {`
Get it working 2016-10-20 00:12:24 +02:00			`start := t.location`
Initial commit 2016-07-10 14:35:33 +02:00			`t.value = []byte{}`

Get it working 2016-10-20 00:12:24 +02:00			`c, err := t.peek()`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err != nil {`
			`return err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
			`switch {`
			`case isSpace(c):`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`c, err = t.eat()`
Initial commit 2016-07-10 14:35:33 +02:00			`case c == '\n':`
			`c, err = t.eat()`
			`t.value = append(t.value, c)`

Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.send(start, NEWLINE)`
Initial commit 2016-07-10 14:35:33 +02:00			`case isNumber(c):`
Get it working 2016-10-20 00:12:24 +02:00			`for isNumber(c) {`
			`c, err = t.eat()`
			`t.value = append(t.value, c)`
Initial commit 2016-07-10 14:35:33 +02:00
			`c, err = t.peek()`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err == io.EOF {`
			`break`
			`}`
			`if err != nil {`
			`return err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.send(start, NUMBER)`
Initial commit 2016-07-10 14:35:33 +02:00			`case isWordHead(c):`
Get it working 2016-10-20 00:12:24 +02:00			`for isWordTail(c) {`
			`c, err = t.eat()`
			`t.value = append(t.value, c)`
Initial commit 2016-07-10 14:35:33 +02:00
			`c, err = t.peek()`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err == io.EOF {`
			`break`
			`}`
			`if err != nil {`
			`return err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.send(start, WORD)`
Initial commit 2016-07-10 14:35:33 +02:00			`case c == '/':`
			`c, err = t.eat()`
			`c, err = t.peek()`
Get it working 2016-10-20 00:12:24 +02:00			`if err == io.EOF {`
			`return errors.New("unexpected EOF")`
			`}`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err != nil {`
			`return err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
			`if c != '/' {`
Get it working 2016-10-20 00:12:24 +02:00			`return errors.New(fmt.Sprintf("unrecognized input: '%c'", c))`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
Get it working 2016-10-20 00:12:24 +02:00			`for c != '\n' {`
			`c, err = t.eat()`
Initial commit 2016-07-10 14:35:33 +02:00			`c, err = t.peek()`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err == io.EOF {`
			`break`
			`}`
			`if err != nil {`
			`return err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
			`default:`
Get it working 2016-10-20 00:12:24 +02:00			`return errors.New(fmt.Sprintf("unrecognized input: '%c'", c))`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
			`return nil`
			`}`

			`func tokenize(r io.Reader, tokens chan<- token) {`
			`t := tokenizer{`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`location: location{line: 1, column: 0},`
			`tokens: tokens,`
			`reader: bufio.NewReader(r),`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
			`for {`
			`err := t.step()`
			`if err == io.EOF {`
			`break`
			`}`
			`if err != nil {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.tokens <- token{t.location, fmt.Sprintf("line %d, column %d: %s",`
			`t.location.line, t.location.column, err.Error()), ERROR}`
Initial commit 2016-07-10 14:35:33 +02:00			`break`
			`}`
			`}`
			`close(tokens)`
			`}`

			`// -----------------------------------------------------------------------------`

			`const (`
			`IHALT = iota`
			`IADD`
			`ISUBTRACT`
			`ISTORE`
			`ILOAD`
Get it working 2016-10-20 00:12:24 +02:00			`_`
Initial commit 2016-07-10 14:35:33 +02:00			`IBRANCH`
			`IBRANCH_IF_ZERO`
			`IBRANCH_IF_POSITIVE`
			`IINPUT`
Get it working 2016-10-20 00:12:24 +02:00			`IOUTPUT`
Initial commit 2016-07-10 14:35:33 +02:00			`IDATA`
			`)`

Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`var instructions = map[string]int{`
Initial commit 2016-07-10 14:35:33 +02:00			`"HLT": IHALT,`
			`"COB": IHALT,`
			`"ADD": IADD,`
			`"SUB": ISUBTRACT,`
			`"STA": ISTORE,`
			`"LDA": ILOAD,`
			`"BRA": IBRANCH,`
			`"BRZ": IBRANCH_IF_ZERO,`
			`"BRP": IBRANCH_IF_POSITIVE,`
			`"INP": IINPUT,`
Get it working 2016-10-20 00:12:24 +02:00			`"OUT": IOUTPUT,`
Initial commit 2016-07-10 14:35:33 +02:00			`"DAT": IDATA,`
			`}`

			`type instruction struct {`
Get it working 2016-10-20 00:12:24 +02:00			`id int // What instruction this is`
			`target string // Label name`
			`number int // Immediate value`
Initial commit 2016-07-10 14:35:33 +02:00			`}`

			`// -----------------------------------------------------------------------------`

			`type assembler struct {`
Get it working 2016-10-20 00:12:24 +02:00			`tokens chan token // Where tokens come from`
			`output []instruction // The assembled program`
			`labels map[string]int // Addresses of labels`
Initial commit 2016-07-10 14:35:33 +02:00			`}`

			`func (a *assembler) step() (bool, error) {`
			`token, ok := <-a.tokens`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if !ok {`
			`return false, nil`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
			`// TODO: add token location information to returned errors`

			`switch token.kind {`
			`case WORD:`
			`canonical := strings.ToUpper(token.value)`
			`instr, found := instructions[canonical]`

			`// Not found in the instruction list`
			`// Assume it is a label`
			`if !found {`
			`if _, dup := a.labels[canonical]; dup {`
			`return false, fmt.Errorf("Duplicate label: %s", canonical)`
			`}`
			`a.labels[canonical] = len(a.output)`

			`token, ok = <-a.tokens`
			`if !ok {`
			`return false, errors.New("Unexpected end of file")`
			`}`
			`if token.kind != WORD {`
			`return false, errors.New("Expected word")`
			`}`

			`// XXX: it might be better to classify this in the lexer`
			`canonical = strings.ToUpper(token.value)`
			`instr, found = instructions[canonical]`
			`}`
			`if !found {`
			`return false, fmt.Errorf("Unknown instruction: %s", canonical)`
			`}`

			`instrHolder := instruction{id: instr}`

Get it working 2016-10-20 00:12:24 +02:00			`token, ok := <-a.tokens`
			`eol := false`
			`switch {`
			`case token.kind == WORD:`
Initial commit 2016-07-10 14:35:33 +02:00			`instrHolder.target = strings.ToUpper(token.value)`
Get it working 2016-10-20 00:12:24 +02:00			`case token.kind == NUMBER:`
Initial commit 2016-07-10 14:35:33 +02:00			`instrHolder.number, _ = strconv.Atoi(token.value)`
Get it working 2016-10-20 00:12:24 +02:00			`case token.kind == ERROR:`
Initial commit 2016-07-10 14:35:33 +02:00			`return false, errors.New(token.value)`
Get it working 2016-10-20 00:12:24 +02:00			`case !ok:`
			`fallthrough`
			`case token.kind == NEWLINE:`
			`// This is fine, just assume zero`
			`eol = true`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
			`a.output = append(a.output, instrHolder)`
Get it working 2016-10-20 00:12:24 +02:00
			`if !eol {`
			`token, ok := <-a.tokens`
			`switch {`
			`case !ok:`
			`break`
			`case token.kind == NEWLINE:`
			`break`
			`case token.kind == ERROR:`
			`return false, errors.New(token.value)`
			`default:`
			`return false, errors.New("Expected end of line")`
			`}`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00			`case NEWLINE:`
			`// Ignore empty lines`
			`case NUMBER:`
			`return false, errors.New("Unexpected number")`
			`case ERROR:`
			`return false, errors.New(token.value)`
			`}`
			`return true, nil`
			`}`

			`func Assemble(r io.Reader) (code []int16, err error) {`
Get it working 2016-10-20 00:12:24 +02:00			`a := assembler{tokens: make(chan token), labels: make(map[string]int)}`
Initial commit 2016-07-10 14:35:33 +02:00			`go tokenize(r, a.tokens)`

			`for {`
			`cont, err := a.step()`
			`if err != nil {`
			`return nil, err`
			`}`
			`if !cont {`
			`break`
			`}`
			`}`

Get it working 2016-10-20 00:12:24 +02:00			`code = make([]int16, 100)`
			`for i, x := range a.output {`
			`if i >= len(code) {`
			`return nil, errors.New("Program too long")`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00			`n := x.id * 100`
Get it working 2016-10-20 00:12:24 +02:00			`// XXX: this also stinks`
			`if x.id == IDATA {`
			`n = 0`
			`}`
			`// XXX: we should be able to handle the strange INP and OUT better`
			`switch {`
			`case x.id == IINPUT:`
			`n = 901`
			`case x.id == IOUTPUT:`
			`n = 902`
			`case len(x.target) != 0:`
			`// Resolve targets to code locations`
Initial commit 2016-07-10 14:35:33 +02:00			`if resolved, ok := a.labels[x.target]; !ok {`
			`return nil, errors.New("Unknown label")`
			`} else {`
			`n += resolved`
			`}`
Get it working 2016-10-20 00:12:24 +02:00			`default:`
Initial commit 2016-07-10 14:35:33 +02:00			`n += x.number`
			`}`
Get it working 2016-10-20 00:12:24 +02:00			`code[i] = int16(n)`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
			`return code, nil`
			`}`