Lephe
6151144d0a
The program is currently able to lex most useful tokens, and parse constructs associated with them on simple examples. Unit tests are still missing to formally ensure everything's right.
201 lines
4.5 KiB
Python
201 lines
4.5 KiB
Python
# fx-92 Scientifique Collège+ language interpreter: Lexical analysis
|
|
|
|
import math
|
|
import re
|
|
import enum
|
|
|
|
#---
|
|
# Token description
|
|
#---
|
|
|
|
@enum.unique
class T(enum.IntEnum):
    """
    Token types of the fx-92 SC+ language.

    Members whose value is a 16-bit number starting with 0xF9 or 0xFB stand
    for two-byte opcodes (first byte in the high half), members with a value
    that fits in one byte stand for single-byte opcodes, and the negative
    members are types for tokens that carry arguments and therefore have no
    fixed opcode of their own.
    """

    # --- Basic commands ---
    EOL = 0xF901
    END = 0xF902
    NOP = 0xF903

    # --- Basic statements ---
    FORWARD = 0xF905
    ROTATE = 0xF906
    ORIENT = 0xF907
    GOTO = 0xF908
    PENDOWN = 0xF909
    PENUP = 0xF90A
    SETVAR = 0xF90B
    INPUT = 0xF90C
    MESSAGE = 0xF90D
    PRINT = 0xF90E
    STYLE = 0xF90F
    WAIT = 0xF910

    # --- Flow control ---
    REPEAT = 0xF911
    REPEAT_END = 0xF912
    WHILE = 0xF913
    WHILE_END = 0xF914
    IF = 0xF915
    IF_END = 0xF916
    IFELSE = 0xF917
    ELSE = 0xF918
    IFELSE_END = 0xF919

    # --- Variable assignment (one opcode per target variable) ---
    SETM = 0xFB10
    SETA = 0xFB12
    SETB = 0xFB13
    SETC = 0xFB14
    SETD = 0xFB15
    SETE = 0xFB16
    SETF = 0xFB17
    SETX = 0xFB18
    SETY = 0xFB19

    # --- Miscellaneous single-byte tokens ---
    PARAM = 0x00
    COLON = 0x23
    QUEST = 0x25
    LPAR = 0x60
    RPAR = 0xD0
    EQUAL = 0xA5
    PLUS = 0xA6
    MINUS = 0xA7
    STAR = 0xA8
    SLASH = 0xA9
    BANG = 0xD8

    # --- Tokens with parameters (negative: never equal to an opcode) ---
    CONST = -1
    VAR = -2
    REL = -3
|
|
|
|
class Token:
    """A lexed token: a type code plus any arguments attached to it."""

    def __init__(self, type, *args):
        """Instantiate a new token of the given type with optional arguments."""
        self.type = type
        self.args = args

    def __repr__(self):
        """Unambiguous token representation.

        Uses the name of the matching T member when the type is a known
        value, and a hexadecimal placeholder otherwise; the arguments, if
        any, follow in parentheses.
        """
        try:
            label = T(self.type).name
        except ValueError:
            label = f"<Token:{hex(self.type)}>"

        if not self.args:
            return label

        rendered = ",".join(map(repr, self.args))
        return f"{label}({rendered})"
|
|
|
|
#---
|
|
# Lexer
|
|
#---
|
|
|
|
class ByteLexer:
    """
    fx-92 SC+ language lexer with bytes() bitcode input.

    The lexer walks the input from `pos`, producing one Token per call to
    lex(). Unrecognized opcodes are reported on standard output, counted in
    `errors`, and skipped.
    """

    # First bytes that introduce a two-byte opcode.
    _PREFIXES = (0xF9, 0xFB)

    # Relational operators: 0xFB followed by one of these bytes. They have no
    # member of their own in T and are returned as T.REL tokens instead.
    _RELATIONS = {0x01: "<", 0x02: ">", 0x03: "!=", 0x04: "<=", 0x05: ">="}

    # Numeric literal: digits, optional fractional part introduced by 0x2E,
    # optional exponent introduced by 0x2D, optional trailing 0x25.
    _RE_CONST = re.compile(rb'([0-9]+(?:\x2E[0-9]*)?(?:\x2D[0-9]+)?)(%?)')

    def __init__(self, hex):
        """Initialize the lexer with input data.

        `hex` is the program to lex; it is indexed byte by byte, so it must
        be a bytes-like object whose items are integers.
        """
        self.hex = hex
        self.rewind()

    def rewind(self):
        """Restart lexing the same input and reset the error counter."""
        self.pos = 0
        self.errors = 0

    def lex(self):
        """Return the next token in the stream.

        Unknown opcodes are skipped (after being reported and counted) until
        a valid token is found. Once the input is exhausted, every call
        returns a T.END token.
        """
        # Iterate rather than recurse when skipping unknown opcodes, so that
        # a long run of garbage bytes cannot exhaust the call stack.
        while not self.at_end():
            if self.hex[self.pos] in self._PREFIXES:
                token = self._lex_two_bytes()
            else:
                token = self._lex_single_byte()

            if token is not None:
                return token

        return Token(T.END)

    def _lex_two_bytes(self):
        """Lex a two-byte opcode at `pos`; return None if it is unknown."""
        h, p = self.hex, self.pos

        # Stop if there is no trailing byte. The position must be moved to
        # the end of the input, otherwise at_end() would never become true
        # and the same error would be reported on every subsequent call.
        if p >= len(h) - 1:
            print(f"[lexer] Invalid trailing byte {hex(h[p])}")
            self.errors += 1
            self.pos = len(h)
            return Token(T.END)

        lead, trail = h[p], h[p + 1]
        code = (lead << 8) | trail
        self.pos += 2

        # Return any value that is defined in the T enumeration
        try:
            return Token(T(code))
        except ValueError:
            pass

        # Also a few more values not in the T enumeration
        if lead == 0xFB and trail in self._RELATIONS:
            return Token(T.REL, self._RELATIONS[trail])

        return self._unknown(code)

    def _lex_single_byte(self):
        """Lex a single-byte token at `pos`; return None if it is unknown."""
        h, p = self.hex, self.pos
        self.pos += 1
        code = h[p]

        # Translate unary minus to normal minus
        if code == 0xC0:
            code = 0xA7

        try:
            return Token(T(code))
        except ValueError:
            pass

        # Named mathematical constants
        if code == 0x21:
            return Token(T.CONST, math.e)
        if code == 0x22:
            return Token(T.CONST, math.pi)

        # Numeric constants
        if 0x30 <= code <= 0x39:
            match = self._RE_CONST.match(h, p)

            if match is not None:
                # 0x2D marks the exponent; turn it into Python's "e". The
                # replacement keeps the length, so len(text) is the number
                # of input bytes used by the literal.
                text = match[1].replace(b'\x2D', b'e')
                # The first digit has already been consumed above.
                self.pos += len(text) - 1

                f = float(text.decode('utf-8'))
                # NOTE(review): this compares bytes with str, so the branch
                # never runs, and the optional trailing 0x25 is not consumed
                # (it is lexed next as T.QUEST). Left unchanged on purpose:
                # 0x25 is also the QUEST opcode, so confirm how percent is
                # really encoded before enabling this.
                if match[2] == "%":
                    f /= 100
                return Token(T.CONST, f)

        # Variables: 0x42..0x47 map to "A".."F"
        if 0x42 <= code <= 0x47:
            return Token(T.VAR, chr(code - 1))
        if code == 0x40:
            return Token(T.VAR, "M")
        if code == 0x48:
            return Token(T.VAR, "x")
        if code == 0x49:
            return Token(T.VAR, "y")

        return self._unknown(code)

    def _unknown(self, code):
        """Report and count an unknown opcode; return None (no token)."""
        print(f"[lexer] Unknown opcode {hex(code)}")
        self.errors += 1
        return None

    def at_end(self):
        """Check whether the whole input has been read."""
        return self.pos >= len(self.hex)
|
|
|