Lephe 6151144d0a
initial commit: most of the lexing and parsing work
The program is currently able to lex most useful tokens, and parse
constructs associated with them on simple examples.

Unit tests are still missing to formally ensure everything's right.
2019-09-30 11:29:05 +02:00

201 lines
4.5 KiB

# fx-92 Scientifique Collège+ language interpreter: Lexical analysis
import math
import re
import enum
# Token description
@enum.unique
class T(enum.IntEnum):
    """
    Token types of the fx-92 SC+ language.

    Non-negative members carry the numeric opcode that ByteLexer looks up
    directly: values built from a 0xF9/0xFB lead byte plus a second byte for
    commands, and single-byte values for punctuation and operators. Negative
    members do not correspond to one opcode; they tag tokens that carry
    arguments (see the "Tokens with parameters" group below).

    The enumeration is marked unique so that a duplicated opcode raises at
    import time instead of silently turning one name into an alias.
    """

    # Basic commands
    EOL = 0xF901
    END = 0xF902
    NOP = 0xF903

    # Basic statements
    FORWARD = 0xF905
    ROTATE = 0xF906
    ORIENT = 0xF907
    GOTO = 0xF908
    PENDOWN = 0xF909
    PENUP = 0xF90A
    INPUT = 0xF90C
    PRINT = 0xF90E
    STYLE = 0xF90F
    WAIT = 0xF910

    # Flow control
    REPEAT = 0xF911
    WHILE = 0xF913
    WHILE_END = 0xF914
    IF = 0xF915
    IF_END = 0xF916
    IFELSE = 0xF917
    ELSE = 0xF918

    # Variable assignment
    SETM = 0xFB10
    SETA = 0xFB12
    SETB = 0xFB13
    SETC = 0xFB14
    SETD = 0xFB15
    SETE = 0xFB16
    SETF = 0xFB17
    SETX = 0xFB18
    SETY = 0xFB19

    # Miscellaneous
    PARAM = 0x00
    COLON = 0x23
    QUEST = 0x25
    LPAR = 0x60
    RPAR = 0xD0
    EQUAL = 0xA5
    PLUS = 0xA6
    MINUS = 0xA7
    STAR = 0xA8
    SLASH = 0xA9
    BANG = 0xD8

    # Tokens with parameters
    CONST = -1
    VAR = -2
    REL = -3
class Token:
    """
    A lexed token: a type plus an optional tuple of arguments.

    Attributes:
      type  Token type; normally a member of T or an integer opcode.
      args  Tuple of extra values attached to the token (may be empty).
    """

    def __init__(self, type, *args):
        """Instantiate a new token of the given type with optional arguments."""
        self.type = type
        self.args = args

    def __repr__(self):
        """Unambiguous token representation."""
        # Use the symbolic name when the type is a known member of T, and
        # fall back to the raw hexadecimal opcode otherwise.
        try:
            base = T(self.type).name
        except ValueError:
            base = f"<Token:{hex(self.type)}>"

        # Only show the parenthesized argument list when there are arguments.
        if self.args:
            args = "(" + ",".join(repr(arg) for arg in self.args) + ")"
        else:
            args = ""

        return base + args
# Lexer
class ByteLexer:
    """
    fx-92 SC+ language lexer with bytes() bitcode input.

    Attributes:
      hex     Input program; indexing it must yield integers (e.g. bytes).
      pos     Offset of the next unread byte in `hex`.
      errors  Number of lexing errors reported so far.
    """

    # Relational operators: encoded as 0xFB followed by one of these bytes.
    # They are not members of T and are emitted as T.REL tokens instead.
    _RELS = {0x01: "<", 0x02: ">", 0x03: "!=", 0x04: "<=", 0x05: ">="}

    # Numeric literal: digits, an optional fractional part introduced by byte
    # 0x2E, an optional exponent introduced by byte 0x2D, and an optional
    # trailing byte 0x25. Compiled once rather than on every constant.
    _RE_CONST = re.compile(rb'([0-9]+(?:\x2E[0-9]*)?(?:\x2D[0-9]+)?)(%?)')

    def __init__(self, hex):
        """Initialize the lexer with input data."""
        self.hex = hex
        # Set up pos/errors right away so that lex() and at_end() are usable
        # without an explicit rewind() call.
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.pos = 0
        self.errors = 0

    def lex(self):
        """
        Return the next token in the stream.

        Unknown opcodes are reported on standard output, counted in
        `self.errors` and skipped. Once the input is exhausted, every call
        returns a T.END token.
        """
        # Iterate instead of recursing so that a long run of unknown bytes
        # cannot exhaust the interpreter's recursion limit.
        while True:
            if self.at_end():
                return Token(T.END)

            if self.hex[self.pos] in (0xF9, 0xFB):
                token = self._lex_command()
            else:
                token = self._lex_byte()

            if token is not None:
                return token
            # Otherwise an unknown opcode was skipped: try to read another.

    def _lex_command(self):
        """Lex a 2-byte command at the current position; None if unknown."""
        h, p = self.hex, self.pos

        # Stop if there is no trailing byte. The position is moved to the end
        # of the input so the dangling byte is reported only once.
        if p >= len(h) - 1:
            print(f"[lexer] Invalid trailing byte {hex(h[p])}")
            self.pos = len(h)
            self.errors += 1
            return Token(T.END)

        code = (h[p] << 8) | h[p+1]
        self.pos += 2

        # Return any value that is defined in the Token class
        try:
            return Token(T(code))
        except ValueError:
            pass

        # Also a few more values not in the Token class
        if h[p] == 0xFB and h[p+1] in self._RELS:
            return Token(T.REL, self._RELS[h[p+1]])

        print(f"[lexer] Unknown opcode {hex(code)}")
        self.errors += 1
        return None

    def _lex_byte(self):
        """Lex a single-byte token at the current position; None if unknown."""
        h, p = self.hex, self.pos
        self.pos += 1
        code = h[p]

        # Translate unary minus to normal minus
        if code == 0xC0:
            code = 0xA7

        # Punctuation and operators are direct members of T
        try:
            return Token(T(code))
        except ValueError:
            pass

        # Named mathematical constants
        if code == 0x21:
            return Token(T.CONST, math.e)
        if code == 0x22:
            return Token(T.CONST, math.pi)

        # Numeric constants
        if 0x30 <= code <= 0x39:
            # Match in place from offset p; no copy of the remaining input.
            match = self._RE_CONST.match(h, p)
            if match is not None:
                # Rewrite the literal into Python float syntax: byte 0x2D
                # introduces the exponent and becomes "e".
                text = match[1].replace(b'\x2E', b'.').replace(b'\x2D', b'e')
                # The first digit has already been consumed above.
                self.pos += len(text) - 1

                f = float(text.decode('utf-8'))
                # NOTE(review): match[2] is bytes, so this comparison with a
                # str is never true and the division never happens. Byte 0x25
                # is also T.QUEST and is not consumed here, so the intended
                # percent handling is unclear; behavior is left unchanged.
                if match[2] == "%":
                    f /= 100

                return Token(T.CONST, f)

        # Variables: bytes 0x42..0x47 map to the letters "A".."F"
        if 0x42 <= code <= 0x47:
            return Token(T.VAR, chr(code - 1))
        if code == 0x40:
            return Token(T.VAR, "M")
        if code == 0x48:
            return Token(T.VAR, "x")
        if code == 0x49:
            return Token(T.VAR, "y")

        print(f"[lexer] Unknown opcode {hex(code)}")
        self.errors += 1
        return None

    def at_end(self):
        """Check whether the whole input has been read."""
        return self.pos >= len(self.hex)