# fx92-interpreter/lexer.py
#
# 201 lines
# 4.5 KiB
# Python

# fx-92 Scientifique Collège+ language interpreter: Lexical analysis
import math
import re
import enum
#---
# Token description
#---
@enum.unique
class T(enum.IntEnum):
# Basic commands
EOL = 0xF901
END = 0xF902
NOP = 0xF903
# Basic statements
FORWARD = 0xF905
ROTATE = 0xF906
ORIENT = 0xF907
GOTO = 0xF908
PENDOWN = 0xF909
PENUP = 0xF90A
SETVAR = 0xF90B
INPUT = 0xF90C
MESSAGE = 0xF90D
PRINT = 0xF90E
STYLE = 0xF90F
WAIT = 0xF910
# Flow control
REPEAT = 0xF911
REPEAT_END = 0xF912
WHILE = 0xF913
WHILE_END = 0xF914
IF = 0xF915
IF_END = 0xF916
IFELSE = 0xF917
ELSE = 0xF918
IFELSE_END = 0xF919
# Variable assignment
SETM = 0xFB10
SETA = 0xFB12
SETB = 0xFB13
SETC = 0xFB14
SETD = 0xFB15
SETE = 0xFB16
SETF = 0xFB17
SETX = 0xFB18
SETY = 0xFB19
# Miscellaneous
PARAM = 0x00
COLON = 0x23
QUEST = 0x25
LPAR = 0x60
RPAR = 0xD0
EQUAL = 0xA5
PLUS = 0xA6
MINUS = 0xA7
STAR = 0xA8
SLASH = 0xA9
BANG = 0xD8
# Tokens with parameters
CONST = -1
VAR = -2
REL = -3
class Token:
def __init__(self, type, *args):
"""Instanciate a new token."""
self.type = type
self.args = args
def __repr__(self):
"""Inambiguous token representation."""
try:
base = T(self.type).name
except ValueError:
base = f"<Token:{hex(self.type)}>"
if self.args:
args = "(" + ",".join(repr(arg) for arg in self.args) + ")"
else:
args = ""
return base + args
#---
# Lexer
#---
class ByteLexer:
"""
fx-92 SC+ language lexer with bytes() bitcode input.
"""
def __init__(self, hex):
"""Initialize the lexer with input data."""
self.hex = hex
self.rewind()
def rewind(self):
"""Restart lexing the same input."""
self.pos = 0
self.errors = 0
def lex(self):
"""Return the next token in the stream."""
h, p = self.hex, self.pos
if self.at_end():
return Token(T.END)
# 2-byte commands
if h[p] in [0xF9, 0xFB]:
# Stop if there is no trailing byte
if p >= len(h) - 1:
print(f"[lexer] Invalid trailing byte {hex(h[p])}")
p = len(h)
return Token(T.END)
# Return any value that is defined in the Token class
code = (h[p] << 8) | h[p+1]
self.pos += 2
try:
return Token(T(code))
except:
pass
# Also a few more values not in the Token class
rels = { 0x01: "<", 0x02: ">", 0x03: "!=", 0x04: "<=", 0x05: ">=" }
if h[p] == 0xFB and h[p+1] in rels:
return Token(T.REL, rels[h[p+1]])
print(f"[lexer] Unknown opcode {hex(code)}")
self.errors += 1
# Try to read another token
return self.lex()
# Single-byte characters
self.pos += 1
code = h[p]
# Translate unary minus to normal minus
if code == 0xC0:
code = 0xA7
try:
return Token(T(code))
except:
pass
if code == 0x21:
return Token(T.CONST, math.e)
if code == 0x22:
return Token(T.CONST, math.pi)
# Constants
if code in range(0x30, 0x39+1):
# Never thought pointer arithmetic would beat Python. Grr!
re_const = rb'([0-9]+(?:\x2E[0-9]*)?(?:\x2D[0-9]+)?)(%?)'
match = re.match(re_const, h[p:])
if match is not None:
text = match[1].replace(b'\x2E', b'.').replace(b'\x2D', b'e')
self.pos += len(text) - 1
f = float(text.decode('utf-8'))
if match[2] == "%":
f /= 100
return Token(T.CONST, f)
# Variables
if code in range(0x42, 0x47+1):
return Token(T.VAR, chr(h[p]-1))
if code == 0x40:
return Token(T.VAR, "M")
if code == 0x48:
return Token(T.VAR, "x")
if code == 0x49:
return Token(T.VAR, "y")
print(f"[lexer] Unknown opcode {hex(code)}")
self.errors += 1
# Try to read another token
return self.lex()
def at_end(self):
"""Check whether the whole input has been read."""
return self.pos >= len(self.hex)