lmc/assembler.go

package main

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"strconv"
	"strings"
)

const (
	WORD    = iota // [A-Za-z_-]+
	NUMBER         // [0-9]+
	NEWLINE        // \n
	ERROR          // Error
)

type location struct {
	line   int
	column int
}

type token struct {
	location location // Position of the token
	value    string   // Text content of the token
	kind     int      // Kind of the token
}

type tokenizer struct {
	location location      // Current position
	value    []byte        // Current token string
	reader   *bufio.Reader // Reader
	tokens   chan<- token  // Output token channel
}

// -----------------------------------------------------------------------------

func isSpace(c byte) bool {
	return c == ' ' || c == '\r' || c == '\t'
}

func isNumber(c byte) bool {
	return c >= '0' && c <= '9'
}

func isWordHead(c byte) bool {
	if c >= 'a' && c <= 'z' {
		c -= 32
	}
	return c >= 'a' && c <= 'z' || c == '_'
}

func isWordTail(c byte) bool {
	return isWordHead(c) || isNumber(c)
}

// -----------------------------------------------------------------------------

func (t *tokenizer) send(start location, kind int) {
	t.tokens <- token{start, string(t.value), kind}
	t.value = []byte{}
}

// XXX: the handling could probably be simplified by extending the "byte"
//   to also include a special value for io.EOF and other errors
func (t *tokenizer) peek() (byte, error) {
	buf, err := t.reader.Peek(1)
	return buf[0], err
}

func (t *tokenizer) eat() (byte, error) {
	c, err := t.reader.ReadByte()
	if err != nil {
		return 0, err
	}

	if c == '\n' {
		t.location.line++
		t.location.column = 0
	} else {
		t.location.column++
	}
	return c, nil
}

// -----------------------------------------------------------------------------

func (t *tokenizer) step() error {
	t.value = []byte{}
	c, err := t.peek()

	if err == io.EOF {
		return nil
	}
	if err != nil {
		return err
	}

	start := t.location
	switch {
	case isSpace(c):
		c, err = t.eat()
	case c == '\n':
		c, err = t.eat()
		t.value = append(t.value, c)

		t.send(start, NEWLINE)
	case isNumber(c):
		c, err = t.eat()
		t.value = append(t.value, c)

		for {
			c, err = t.peek()
			if err == io.EOF {
				break
			}
			if err != nil {
				return err
			}

			if !isNumber(c) {
				break
			}

			c, err = t.eat()
			t.value = append(t.value, c)
		}
		t.send(start, NUMBER)
	case isWordHead(c):
		c, err = t.eat()
		t.value = append(t.value, c)

		for {
			c, err = t.peek()
			if err == io.EOF {
				break
			}
			if err != nil {
				return err
			}

			if !isWordTail(c) {
				break
			}

			c, err = t.eat()
			t.value = append(t.value, c)
		}
		t.send(start, WORD)
	case c == '/':
		c, err = t.eat()
		t.value = append(t.value, c)

		c, err = t.peek()
		if err != nil {
			return err
		}

		if c != '/' {
			return errors.New("unrecognized input")
		}
		for {
			c, err = t.peek()
			if err == io.EOF {
				break
			}
			if err != nil {
				return err
			}

			if c == '\n' {
				break
			}
			t.eat()
		}
	default:
		return errors.New("unrecognized input")
	}
	return nil
}

func tokenize(r io.Reader, tokens chan<- token) {
	t := tokenizer{
		location: location{line: 1, column: 0},
		tokens:   tokens,
		reader:   bufio.NewReader(r),
	}
	for {
		err := t.step()
		if err == io.EOF {
			break
		}
		if err != nil {
			t.tokens <- token{t.location, fmt.Sprintf("line %d, column %d: %s",
				t.location.line, t.location.column, err.Error()), ERROR}
			break
		}
	}
	close(tokens)
}

// -----------------------------------------------------------------------------

const (
	IHALT = iota
	IADD
	ISUBTRACT
	ISTORE
	ILOAD
	IBRANCH
	IBRANCH_IF_ZERO
	IBRANCH_IF_POSITIVE
	IINPUT
	IOUTUT
	IDATA
)

var instructions = map[string]int{
	"HLT": IHALT,
	"COB": IHALT,
	"ADD": IADD,
	"SUB": ISUBTRACT,
	"STA": ISTORE,
	"LDA": ILOAD,
	"BRA": IBRANCH,
	"BRZ": IBRANCH_IF_ZERO,
	"BRP": IBRANCH_IF_POSITIVE,
	"INP": IINPUT,
	"OUT": IOUTUT,
	"DAT": IDATA,
}

type instruction struct {
	id     int
	target string
	number int
}

// -----------------------------------------------------------------------------

type assembler struct {
	tokens chan token
	output []instruction
	labels map[string]int
}

func (a *assembler) step() (bool, error) {
	token, ok := <-a.tokens
	if !ok {
		return false, nil
	}

	// TODO: add token location information to returned errors

	switch token.kind {
	case WORD:
		canonical := strings.ToUpper(token.value)
		instr, found := instructions[canonical]

		// Not found in the instruction list
		// Assume it is a label
		if !found {
			if _, dup := a.labels[canonical]; dup {
				return false, fmt.Errorf("Duplicate label: %s", canonical)
			}
			a.labels[canonical] = len(a.output)

			token, ok = <-a.tokens
			if !ok {
				return false, errors.New("Unexpected end of file")
			}
			if token.kind != WORD {
				return false, errors.New("Expected word")
			}

			// XXX: it might be better to classify this in the lexer
			canonical = strings.ToUpper(token.value)
			instr, found = instructions[canonical]
		}

		if !found {
			return false, fmt.Errorf("Unknown instruction: %s", canonical)
		}

		instrHolder := instruction{id: instr}
		token, ok := <-a.tokens
		if !ok {
			// This is fine, just assume zero
			break
		}

		switch token.kind {
		case WORD:
			instrHolder.target = strings.ToUpper(token.value)
		case NEWLINE:
			// This is fine, just assume zero
		case NUMBER:
			instrHolder.number, _ = strconv.Atoi(token.value)
		case ERROR:
			return false, errors.New(token.value)
		}
		a.output = append(a.output, instrHolder)
	case NEWLINE:
		// Ignore empty lines
	case NUMBER:
		return false, errors.New("Unexpected number")
	case ERROR:
		return false, errors.New(token.value)
	}
	return true, nil
}

func Assemble(r io.Reader) (code []int16, err error) {
	a := assembler{tokens: make(chan token)}
	go tokenize(r, a.tokens)

	for {
		cont, err := a.step()
		if err != nil {
			return nil, err
		}
		if !cont {
			break
		}
	}

	for _, x := range a.output {
		n := x.id * 100
		if len(x.target) != 0 {
			if resolved, ok := a.labels[x.target]; !ok {
				return nil, errors.New("Unknown label")
			} else {
				n += resolved
			}
		} else {
			n += x.number
		}
		code = append(code, int16(n))
	}
	return code, nil
}
Initial commit 2016-07-10 14:35:33 +02:00			`package main`

			`import (`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`"bufio"`
Initial commit 2016-07-10 14:35:33 +02:00			`"errors"`
			`"fmt"`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`"io"`
Initial commit 2016-07-10 14:35:33 +02:00			`"strconv"`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`"strings"`
Initial commit 2016-07-10 14:35:33 +02:00			`)`

			`const (`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`WORD = iota // [A-Za-z_-]+`
			`NUMBER // [0-9]+`
			`NEWLINE // \n`
			`ERROR // Error`
Initial commit 2016-07-10 14:35:33 +02:00			`)`

			`type location struct {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`line int`
Initial commit 2016-07-10 14:35:33 +02:00			`column int`
			`}`

			`type token struct {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`location location // Position of the token`
			`value string // Text content of the token`
			`kind int // Kind of the token`
Initial commit 2016-07-10 14:35:33 +02:00			`}`

			`type tokenizer struct {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`location location // Current position`
			`value []byte // Current token string`
			`reader *bufio.Reader // Reader`
			`tokens chan<- token // Output token channel`
Initial commit 2016-07-10 14:35:33 +02:00			`}`

			`// -----------------------------------------------------------------------------`

			`func isSpace(c byte) bool {`
			`return c == ' ' \|\| c == '\r' \|\| c == '\t'`
			`}`

			`func isNumber(c byte) bool {`
			`return c >= '0' && c <= '9'`
			`}`

			`func isWordHead(c byte) bool {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if c >= 'a' && c <= 'z' {`
			`c -= 32`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00			`return c >= 'a' && c <= 'z' \|\| c == '_'`
			`}`

			`func isWordTail(c byte) bool {`
			`return isWordHead(c) \|\| isNumber(c)`
			`}`

			`// -----------------------------------------------------------------------------`

Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`func (t *tokenizer) send(start location, kind int) {`
			`t.tokens <- token{start, string(t.value), kind}`
Initial commit 2016-07-10 14:35:33 +02:00			`t.value = []byte{}`
			`}`

Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`// XXX: the handling could probably be simplified by extending the "byte"`
			`// to also include a special value for io.EOF and other errors`
Initial commit 2016-07-10 14:35:33 +02:00			`func (t *tokenizer) peek() (byte, error) {`
			`buf, err := t.reader.Peek(1)`
			`return buf[0], err`
			`}`

			`func (t *tokenizer) eat() (byte, error) {`
			`c, err := t.reader.ReadByte()`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err != nil {`
			`return 0, err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
			`if c == '\n' {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.location.line++`
			`t.location.column = 0`
Initial commit 2016-07-10 14:35:33 +02:00			`} else {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.location.column++`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
			`return c, nil`
			`}`

			`// -----------------------------------------------------------------------------`

			`func (t *tokenizer) step() error {`
			`t.value = []byte{}`
			`c, err := t.peek()`

Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err == io.EOF {`
			`return nil`
			`}`
			`if err != nil {`
			`return err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`start := t.location`
Initial commit 2016-07-10 14:35:33 +02:00			`switch {`
			`case isSpace(c):`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`c, err = t.eat()`
Initial commit 2016-07-10 14:35:33 +02:00			`case c == '\n':`
			`c, err = t.eat()`
			`t.value = append(t.value, c)`

Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.send(start, NEWLINE)`
Initial commit 2016-07-10 14:35:33 +02:00			`case isNumber(c):`
			`c, err = t.eat()`
			`t.value = append(t.value, c)`

			`for {`
			`c, err = t.peek()`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err == io.EOF {`
			`break`
			`}`
			`if err != nil {`
			`return err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if !isNumber(c) {`
			`break`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`c, err = t.eat()`
Initial commit 2016-07-10 14:35:33 +02:00			`t.value = append(t.value, c)`
			`}`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.send(start, NUMBER)`
Initial commit 2016-07-10 14:35:33 +02:00			`case isWordHead(c):`
			`c, err = t.eat()`
			`t.value = append(t.value, c)`

			`for {`
			`c, err = t.peek()`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err == io.EOF {`
			`break`
			`}`
			`if err != nil {`
			`return err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if !isWordTail(c) {`
			`break`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`c, err = t.eat()`
Initial commit 2016-07-10 14:35:33 +02:00			`t.value = append(t.value, c)`
			`}`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.send(start, WORD)`
Initial commit 2016-07-10 14:35:33 +02:00			`case c == '/':`
			`c, err = t.eat()`
			`t.value = append(t.value, c)`

			`c, err = t.peek()`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err != nil {`
			`return err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
			`if c != '/' {`
			`return errors.New("unrecognized input")`
			`}`
			`for {`
			`c, err = t.peek()`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if err == io.EOF {`
			`break`
			`}`
			`if err != nil {`
			`return err`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if c == '\n' {`
			`break`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00			`t.eat()`
			`}`
			`default:`
			`return errors.New("unrecognized input")`
			`}`
			`return nil`
			`}`

			`func tokenize(r io.Reader, tokens chan<- token) {`
			`t := tokenizer{`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`location: location{line: 1, column: 0},`
			`tokens: tokens,`
			`reader: bufio.NewReader(r),`
Initial commit 2016-07-10 14:35:33 +02:00			`}`
			`for {`
			`err := t.step()`
			`if err == io.EOF {`
			`break`
			`}`
			`if err != nil {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`t.tokens <- token{t.location, fmt.Sprintf("line %d, column %d: %s",`
			`t.location.line, t.location.column, err.Error()), ERROR}`
Initial commit 2016-07-10 14:35:33 +02:00			`break`
			`}`
			`}`
			`close(tokens)`
			`}`

			`// -----------------------------------------------------------------------------`

			`const (`
			`IHALT = iota`
			`IADD`
			`ISUBTRACT`
			`ISTORE`
			`ILOAD`
			`IBRANCH`
			`IBRANCH_IF_ZERO`
			`IBRANCH_IF_POSITIVE`
			`IINPUT`
			`IOUTUT`
			`IDATA`
			`)`

Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`var instructions = map[string]int{`
Initial commit 2016-07-10 14:35:33 +02:00			`"HLT": IHALT,`
			`"COB": IHALT,`
			`"ADD": IADD,`
			`"SUB": ISUBTRACT,`
			`"STA": ISTORE,`
			`"LDA": ILOAD,`
			`"BRA": IBRANCH,`
			`"BRZ": IBRANCH_IF_ZERO,`
			`"BRP": IBRANCH_IF_POSITIVE,`
			`"INP": IINPUT,`
			`"OUT": IOUTUT,`
			`"DAT": IDATA,`
			`}`

			`type instruction struct {`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`id int`
Initial commit 2016-07-10 14:35:33 +02:00			`target string`
			`number int`
			`}`

			`// -----------------------------------------------------------------------------`

			`type assembler struct {`
			`tokens chan token`
			`output []instruction`
			`labels map[string]int`
			`}`

			`func (a *assembler) step() (bool, error) {`
			`token, ok := <-a.tokens`
Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`if !ok {`
			`return false, nil`
			`}`
Initial commit 2016-07-10 14:35:33 +02:00
			`// TODO: add token location information to returned errors`

			`switch token.kind {`
			`case WORD:`
			`canonical := strings.ToUpper(token.value)`
			`instr, found := instructions[canonical]`

			`// Not found in the instruction list`
			`// Assume it is a label`
			`if !found {`
			`if _, dup := a.labels[canonical]; dup {`
			`return false, fmt.Errorf("Duplicate label: %s", canonical)`
			`}`
			`a.labels[canonical] = len(a.output)`

			`token, ok = <-a.tokens`
			`if !ok {`
			`return false, errors.New("Unexpected end of file")`
			`}`
			`if token.kind != WORD {`
			`return false, errors.New("Expected word")`
			`}`

			`// XXX: it might be better to classify this in the lexer`
			`canonical = strings.ToUpper(token.value)`
			`instr, found = instructions[canonical]`
			`}`

			`if !found {`
			`return false, fmt.Errorf("Unknown instruction: %s", canonical)`
			`}`

			`instrHolder := instruction{id: instr}`
			`token, ok := <-a.tokens`
			`if !ok {`
			`// This is fine, just assume zero`
			`break`
			`}`

			`switch token.kind {`
			`case WORD:`
			`instrHolder.target = strings.ToUpper(token.value)`
			`case NEWLINE:`
			`// This is fine, just assume zero`
			`case NUMBER:`
			`instrHolder.number, _ = strconv.Atoi(token.value)`
			`case ERROR:`
			`return false, errors.New(token.value)`
			`}`
			`a.output = append(a.output, instrHolder)`
			`case NEWLINE:`
			`// Ignore empty lines`
			`case NUMBER:`
			`return false, errors.New("Unexpected number")`
			`case ERROR:`
			`return false, errors.New(token.value)`
			`}`
			`return true, nil`
			`}`

			`func Assemble(r io.Reader) (code []int16, err error) {`
			`a := assembler{tokens: make(chan token)}`
			`go tokenize(r, a.tokens)`

			`for {`
			`cont, err := a.step()`
			`if err != nil {`
			`return nil, err`
			`}`
			`if !cont {`
			`break`
			`}`
			`}`

Arbitrary checkpoint Apparently a gofmt happened in the meantime. 2016-10-19 20:20:43 +02:00			`for _, x := range a.output {`
Initial commit 2016-07-10 14:35:33 +02:00			`n := x.id * 100`
			`if len(x.target) != 0 {`
			`if resolved, ok := a.labels[x.target]; !ok {`
			`return nil, errors.New("Unknown label")`
			`} else {`
			`n += resolved`
			`}`
			`} else {`
			`n += x.number`
			`}`
			`code = append(code, int16(n))`
			`}`
			`return code, nil`
			`}`