# fx92-interpreter/lexer.py

# fx-92 Scientifique Collège+ language interpreter: Lexical analysis

import math
import re
import enum

#---
# Token description
#---

@enum.unique
class T(enum.IntEnum):
    """
    Token types of the fx-92 SC+ language.

    Non-negative members are opcode values as ByteLexer reads them from the
    program: members of the form 0xF9xx / 0xFBxx are 2-byte opcodes (a prefix
    byte 0xF9 or 0xFB followed by a second byte), the other non-negative
    members are single bytes. Negative members are not opcodes; the lexer
    uses them for tokens that carry arguments.
    """

    # Basic commands
    EOL = 0xF901
    # END is also what the lexer returns once the input is exhausted
    END = 0xF902
    NOP = 0xF903

    # Basic statements
    FORWARD = 0xF905
    ROTATE  = 0xF906
    ORIENT  = 0xF907
    GOTO    = 0xF908
    PENDOWN = 0xF909
    PENUP   = 0xF90A
    SETVAR  = 0xF90B
    INPUT   = 0xF90C
    MESSAGE = 0xF90D
    PRINT   = 0xF90E
    STYLE   = 0xF90F
    WAIT    = 0xF910

    # Flow control
    REPEAT     = 0xF911
    REPEAT_END = 0xF912
    WHILE      = 0xF913
    WHILE_END  = 0xF914
    IF         = 0xF915
    IF_END     = 0xF916
    IFELSE     = 0xF917
    ELSE       = 0xF918
    IFELSE_END = 0xF919

    # Variable assignment (0xFB-prefixed opcodes)
    SETM = 0xFB10
    SETA = 0xFB12
    SETB = 0xFB13
    SETC = 0xFB14
    SETD = 0xFB15
    SETE = 0xFB16
    SETF = 0xFB17
    SETX = 0xFB18
    SETY = 0xFB19

    # Miscellaneous (single-byte opcodes)
    PARAM  = 0x00
    COLON  = 0x23
    QUEST  = 0x25
    LPAR   = 0x60
    RPAR   = 0xD0
    EQUAL  = 0xA5
    PLUS   = 0xA6
    # The lexer also reports the unary minus byte 0xC0 as MINUS
    MINUS  = 0xA7
    STAR   = 0xA8
    SLASH  = 0xA9
    BANG   = 0xD8

    # Tokens with parameters (built by the lexer, never looked up by value):
    # CONST carries a float, VAR a variable name, REL a comparison operator
    CONST = -1
    VAR   = -2
    REL   = -3

class Token:
    """A lexed token: a type (normally a member of T) plus optional arguments."""

    def __init__(self, type, *args):
        """Create a token of the given type carrying the given arguments."""
        self.args = args
        self.type = type

    def __repr__(self):
        """Unambiguous representation: the type name, then any arguments."""
        # Types that are not members of T are shown by their numeric value
        try:
            label = T(self.type).name
        except ValueError:
            label = f"<Token:{hex(self.type)}>"

        if not self.args:
            return label

        rendered = ",".join(map(repr, self.args))
        return f"{label}({rendered})"

#---
# Lexer
#---

class ByteLexer:
    """
    fx-92 SC+ language lexer with bytes() bytecode input.

    Call lex() repeatedly to pull tokens; it returns Token(T.END) once the
    input is exhausted. Opcodes that cannot be decoded are reported on
    standard output, counted in `errors` and skipped.

    Attributes:
        hex:    the program being lexed, as passed to the constructor
        pos:    offset in `hex` of the next byte to decode
        errors: number of unknown opcodes skipped since the last rewind()
    """

    # Bytes that introduce a 2-byte opcode
    _WIDE_PREFIXES = (0xF9, 0xFB)

    # Comparison operators: 0xFB-prefixed opcodes that have no member in T
    _RELATIONS = {0x01: "<", 0x02: ">", 0x03: "!=", 0x04: "<=", 0x05: ">="}

    # Numeric literal: digits, optional fraction introduced by 0x2E, optional
    # exponent digits introduced by 0x2D, then an optional 0x25 suffix.
    # Compiled once so that literals are matched in place at an offset
    # instead of on a fresh copy of the remaining input.
    _RE_CONST = re.compile(rb'([0-9]+(?:\x2E[0-9]*)?(?:\x2D[0-9]+)?)(%?)')

    def __init__(self, hex):
        """Initialize the lexer with input data (a bytes-like object)."""
        self.hex = hex
        self.rewind()

    def rewind(self):
        """Restart lexing the same input and reset the error count."""
        self.pos = 0
        self.errors = 0

    def lex(self):
        """
        Return the next token in the stream.

        Unknown opcodes are reported, counted in `errors` and skipped. A
        2-byte prefix that is the very last byte of the input is reported
        (but not counted in `errors`) and ends the stream. Once the input is
        exhausted, every call returns Token(T.END).
        """
        h = self.hex

        # Loop rather than recurse after an unknown opcode, so that a long
        # run of garbage cannot exhaust the interpreter's recursion limit.
        while not self.at_end():
            p = self.pos

            if h[p] in self._WIDE_PREFIXES:
                # Stop if there is no trailing byte
                if p >= len(h) - 1:
                    print(f"[lexer] Invalid trailing byte {hex(h[p])}")
                    # Consume the dangling prefix so that at_end() becomes
                    # true and the error is not reported again on later calls
                    self.pos = len(h)
                    return Token(T.END)
                token = self._lex_wide(p)
            else:
                token = self._lex_narrow(p)

            if token is not None:
                return token

        return Token(T.END)

    def _lex_wide(self, p):
        """Decode the 2-byte opcode at offset p; return None if unknown."""
        h = self.hex
        code = (h[p] << 8) | h[p+1]
        self.pos = p + 2

        # Return any value that is defined in the T enumeration
        try:
            return Token(T(code))
        except ValueError:
            pass

        # Also a few more values not in the T enumeration
        if h[p] == 0xFB and h[p+1] in self._RELATIONS:
            return Token(T.REL, self._RELATIONS[h[p+1]])

        self._unknown(code)
        return None

    def _lex_narrow(self, p):
        """Decode the token starting with the single byte at offset p.

        Returns None if the byte is not a known opcode. Numeric literals
        consume as many bytes as the literal spans.
        """
        h = self.hex
        code = h[p]
        self.pos = p + 1

        # Translate unary minus to normal minus
        if code == 0xC0:
            code = 0xA7

        try:
            return Token(T(code))
        except ValueError:
            pass

        if code == 0x21:
            return Token(T.CONST, math.e)
        if code == 0x22:
            return Token(T.CONST, math.pi)

        # Constants
        if 0x30 <= code <= 0x39:
            match = self._RE_CONST.match(h, p)

            if match is not None:
                # 0x2E is already '.', only the exponent marker 0x2D has to
                # be rewritten into the 'e' that float() understands
                text = match[1].replace(b'\x2D', b'e')
                self.pos = match.end(1)

                # NOTE(review): the pattern also captures an optional 0x25
                # suffix. The previous code compared that bytes group with
                # the str "%", which is never equal, so the suffix has never
                # scaled the constant nor been consumed; that behaviour is
                # kept here. 0x25 is also T.QUEST -- confirm the intended
                # meaning before enabling percent handling.
                return Token(T.CONST, float(text.decode('utf-8')))

        # Variables: 0x42..0x47 are named A..F
        if 0x42 <= code <= 0x47:
            return Token(T.VAR, chr(code - 1))
        if code == 0x40:
            return Token(T.VAR, "M")
        if code == 0x48:
            return Token(T.VAR, "x")
        if code == 0x49:
            return Token(T.VAR, "y")

        self._unknown(code)
        return None

    def _unknown(self, code):
        """Report an opcode that could not be decoded and count it."""
        print(f"[lexer] Unknown opcode {hex(code)}")
        self.errors += 1

    def at_end(self):
        """Check whether the whole input has been read."""
        return self.pos >= len(self.hex)