fx92-interpreter/fx92/lexer.py

465 lines
12 KiB
Python

# fx-92 Scientifique Collège+ language interpreter: Lexical analysis
import math
import re
import enum
from decimal import Decimal
#---
# Token description
#---
@enum.unique
class T(enum.IntEnum):
    """
    Token types produced by the lexers.

    Members with a non-negative value are looked up directly from their
    integer code (two-byte codes starting with 0xF9/0xFB, or single-byte
    codes). Members with a negative value identify tokens that carry
    arguments and are never looked up by code.
    """

    # Basic commands
    EOL         = 0xF901
    END         = 0xF902
    NOP         = 0xF903

    # Basic statements
    FORWARD     = 0xF905
    ROTATE      = 0xF906
    ORIENT      = 0xF907
    GOTO        = 0xF908
    PENDOWN     = 0xF909
    PENUP       = 0xF90A
    SETVAR      = 0xF90B
    INPUT       = 0xF90C
    MESSAGE     = 0xF90D
    PRINT       = 0xF90E
    STYLE       = 0xF90F
    WAIT        = 0xF910

    # Flow control
    REPEAT      = 0xF911
    REPEAT_END  = 0xF912
    UNTIL       = 0xF913
    UNTIL_END   = 0xF914
    IF          = 0xF915
    IF_END      = 0xF916
    IFELSE      = 0xF917
    ELSE        = 0xF918
    IFELSE_END  = 0xF919

    # Variable assignment
    SETM        = 0xFB10
    SETA        = 0xFB12
    SETB        = 0xFB13
    SETC        = 0xFB14
    SETD        = 0xFB15
    SETE        = 0xFB16
    SETF        = 0xFB17
    SETX        = 0xFB18
    SETY        = 0xFB19

    # Miscellaneous (single-byte codes)
    PARAM       = 0x00
    COLON       = 0x23
    QUEST       = 0x25
    SEMI        = 0x2C
    LPAR        = 0x60
    RPAR        = 0xD0
    PLUS        = 0xA6
    MINUS       = 0xA7
    STAR        = 0xA8
    SLASH       = 0xA9
    EXP         = 0xC9
    BANG        = 0xD8

    # Tokens with parameters
    CONST       = -1
    VAR         = -2
    REL         = -3
    FUN         = -4
class Token:
    """A lexed token: a type (normally a member of T) plus its arguments."""

    def __init__(self, type, *args):
        """Create a new token of the given type carrying *args."""
        self.args = args
        self.type = type

    def __repr__(self):
        """Return an unambiguous textual representation of the token."""
        # Types that are not members of T are shown by their hexadecimal code
        try:
            label = T(self.type).name
        except ValueError:
            label = "<Token:{}>".format(hex(self.type))

        # Constants carry two arguments: the value and the text it came from
        if self.type == T.CONST:
            return label + "({}) [typed as {}]".format(*self.args)
        if not self.args:
            return label
        return label + "(" + ",".join(map(repr, self.args)) + ")"
#---
# Utilities
#---
def str2float(integer, decimal, exponent, percent):
    """
    Build a Decimal from the textual pieces of a numeric literal.

    integer  -- digits before the decimal point; empty/None means "0"
    decimal  -- decimal point and following digits; a lone "." counts as ".0"
    exponent -- exponent suffix such as "e-3"; empty/None means no exponent
    percent  -- "%" to divide the resulting value by 100

    Returns the value as a decimal.Decimal.
    """
    whole = integer if integer else "0"

    if decimal == ".":
        fraction = ".0"
    elif decimal:
        fraction = decimal
    else:
        fraction = ""

    power = exponent if exponent else ""
    value = Decimal("".join((whole, fraction, power)))

    if percent == "%":
        value = value / 100
    return value
#---
# Lexer base
#---
class LexerBase:
    """
    Common base for the lexers. It only holds shared helpers and cannot
    analyse a program by itself: subclasses supply rewind(), at_end(), lex()
    and the `position` counter that dump() relies on.
    """

    def dump(self):
        """Rewind, then print every remaining token with its position."""
        self.rewind()
        while True:
            if self.at_end():
                break
            # lex() must run first: the position printed is the one reached
            # after reading the token.
            token = self.lex()
            print("{:5d}: {}".format(self.position, token))
#---
# Bitcode lexer
#---
class BitcodeLexer(LexerBase):
    """
    fx-92 SC+ language lexer with bytes() bitcode input.

    Attributes:
      hex      -- the input bitcode
      pos      -- index of the next byte to read in `hex`
      position -- number of tokens returned since the last rewind()
      errors   -- number of unknown opcodes skipped since the last rewind()
    """

    # Relations encoded as 0xFB-prefixed opcodes that have no member in T
    _RELS = {0x01: "<", 0x02: ">", 0x03: "!=", 0x04: "<=", 0x05: ">="}

    # Single-byte variable codes
    _VARS = {
        0x40: "M",
        0x42: "A",
        0x43: "B",
        0x44: "C",
        0x45: "D",
        0x46: "E",
        0x47: "F",
        0x48: "x",
        0x49: "y",
        0x4C: "theta",
    }

    # Single-byte function codes.
    # NOTE: 0x72 ("exp") and 0x73 ("exp10") are not handled.
    _FUNS = {
        0x68: "Abs",
        0x69: "Rnd",
        0x6C: "sinh",
        0x6D: "cosh",
        0x6E: "tanh",
        0x6F: "asinh",
        0x70: "acosh",
        0x71: "atanh",
        0x74: "sqrt",
        0x75: "log",
        0x76: "cbrt",
        0x77: "sin",
        0x78: "cos",
        0x79: "tan",
        0x7A: "asin",
        0x7B: "acos",
        0x7C: "atan",
        0x7D: "log10",
        0x83: "Ent",
        0x84: "EntEx",
        0x87: "RanInt",
        0x88: "GCD",
        0x89: "LCM",
        0x8A: "Arond",
    }

    # Numeric literal: digits, optional fraction, optional exponent introduced
    # by byte 0x2D with an optional sign byte, optional percent byte.
    _RE_CONST = re.compile(
        rb'([0-9]*)(\x2E[0-9]*)?(\x2D[\xA6\xA7\xC0]?[0-9]+)?(%)?')

    # Maps the exponent marker and sign bytes to their textual form
    _EXP_TRANS = bytes.maketrans(b'\x2D\xA6\xA7\xC0', b'e+--')

    def __init__(self, hex):
        """Initialize the lexer with input data (a bytes-like object)."""
        self.hex = hex
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.pos = 0
        self.position = 0
        self.errors = 0

    def lex(self):
        """
        Return the next token in the stream.

        Unknown opcodes are reported, counted in `errors` and skipped.
        Token(T.END) is returned once the input is exhausted. `position` is
        incremented exactly once per call, however many opcodes are skipped.
        """
        self.position += 1

        # Loop rather than recurse so that a long run of unknown opcodes
        # cannot exhaust the call stack.
        while not self.at_end():
            token = self._lex_one()
            if token is not None:
                return token

        return Token(T.END)

    def _lex_one(self):
        """Read one opcode at `pos`; return its token, or None if unknown."""
        h, p = self.hex, self.pos
        lead = h[p]

        # 2-byte commands
        if lead in (0xF9, 0xFB):
            # Stop if there is no trailing byte
            if p >= len(h) - 1:
                print("[lexer] Invalid trailing byte {}".format(hex(lead)))
                # Consume the dangling byte so that at_end() becomes true
                self.pos = len(h)
                return Token(T.END)

            low = h[p + 1]
            code = (lead << 8) | low
            self.pos += 2

            # Return any value that is defined in the T enumeration
            try:
                return Token(T(code))
            except ValueError:
                pass

            # Also a few more values not in the T enumeration
            if lead == 0xFB and low in self._RELS:
                return Token(T.REL, self._RELS[low])

            print("[lexer] Unknown opcode {}".format(hex(code)))
            self.errors += 1
            return None

        # Single-byte characters
        self.pos += 1

        # Equal symbol
        if lead == 0xA5:
            return Token(T.REL, "=")

        # Translate unary minus to normal minus
        code = 0xA7 if lead == 0xC0 else lead

        try:
            return Token(T(code))
        except ValueError:
            pass

        # Named constants
        if code == 0x21:
            return Token(T.CONST, Decimal(math.e), "[e]")
        if code == 0x22:
            return Token(T.CONST, Decimal(math.pi), "[pi]")

        # Numeric constants (start with a digit or a decimal point)
        if 0x30 <= code <= 0x39 or code == 0x2E:
            return self._lex_const(p)

        if code in self._VARS:
            return Token(T.VAR, self._VARS[code])
        if code in self._FUNS:
            return Token(T.FUN, self._FUNS[code])

        print("[lexer] Unknown opcode {}".format(hex(code)))
        self.errors += 1
        return None

    def _lex_const(self, start):
        """Read the numeric literal beginning at index `start`."""
        # Every group is optional, so the pattern always matches; the caller
        # guarantees that at least one byte is consumed.
        m = self._RE_CONST.match(self.hex, start)

        integer = (m[1] or b"").decode()
        decimal = (m[2] or b".").decode()
        exp = (m[3] or b"").translate(self._EXP_TRANS).decode()
        percent = (m[4] or b"").decode()

        self.pos = m.end()
        value = str2float(integer, decimal, exp, percent)
        # The raw matched bytes are kept as the "typed as" representation
        return Token(T.CONST, value, m[0])

    def at_end(self):
        """Check whether the whole input has been read."""
        return self.pos >= len(self.hex)
#---
# Url lexer
#---
class UrlLexer(BitcodeLexer):
    """
    fx-92 SC+ language lexer with a wes.casio.com URL or hexadecimal input.

    The URLs are typically in this form:
      http://wes.casio.com/math/index.php?q=I-295A+U-000000000000+M-0E0000
      0000+S-000410110000100E0010B300D365+E-{code...}

    The program can also be provided in text hexadecimal form, which is
    everything following the "+E-" in the URL.
    """

    def __init__(self, url):
        """
        Extract the hexadecimal program from `url` and start lexing it.

        Raises Exception("Cannot decode URL") when `url` looks like a full
        URL but does not contain the "+E-" marker.
        """
        if url.startswith(("http://", "https://", "wes.casio.com")):
            print("[urlparser] URL includes protocol, will start after '+E-'")
            offset = url.find("+E-")
            if offset < 0:
                print("[urlparser] '+E-' not found, cannot decode URL")
                raise Exception("Cannot decode URL")
            url = url[offset+3:]

        if not re.fullmatch(r'(?:[0-9a-fA-F]{2})+', url):
            print("[urlparser] URL is not strict hexa, noise will be skipped")
            # bytes.fromhex() rejects anything but hex digits and whitespace,
            # so the noise has to be removed here, as the message announces.
            url = re.sub(r'[^0-9a-fA-F]', '', url)
            # A dangling half-byte cannot be decoded either; drop it
            if len(url) % 2:
                url = url[:-1]

        super().__init__(bytes.fromhex(url))
#---
# Plain text lexer
#---
class TextLexer(LexerBase):
    """
    fx-92 SC+ language lexer with Basic-like input.

    This thing is very naive and extremely inefficient: the remaining input
    is re-sliced after every token.

    Attributes:
      base_code     -- the full program text
      code          -- the text that remains to be lexed
      position      -- number of tokens returned since the last rewind()
      errors        -- error counter, reset by rewind()
      pending_param -- whether a PARAM token must be emitted at the next
                       end of line or end of input
    """

    # Longer keywords come before their prefixes (IF_END before IF, ...)
    RE_STMTS = re.compile(
        r"NOP|FORWARD|ROTATE|ORIENT|GOTO|PENDOWN|PENUP|SETVAR|INPUT|MESSAGE|"
        r"PRINT|STYLE|WAIT|REPEAT_END|REPEAT|UNTIL_END|UNTIL|IF_END|ELSE|"
        r"IFELSE_END|IFELSE|IF",
        re.IGNORECASE)
    RE_CONST = re.compile(
        r"([0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?(%)?")
    RE_FUN = re.compile(
        r"([a-zA-Z]+)\(")

    _RE_THETA = re.compile(r"theta\b")

    # Two-character relations first so that ">=" is not read as ">"
    _RELS = (">=", "<=", "!=", ">", "<", "=")

    _PUNCT = {
        ",": T.PARAM,
        ":": T.COLON,
        ";": T.SEMI,
        "?": T.QUEST,
        "(": T.LPAR,
        ")": T.RPAR,
        "+": T.PLUS,
        "-": T.MINUS,
        "*": T.STAR,
        "/": T.SLASH,
        "!": T.BANG,
    }

    def __init__(self, code):
        """Initialize the lexer with text code."""
        self.base_code = code
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.code = self.base_code
        self.position = 0
        self.errors = 0
        self.pending_param = False

    def lex(self):
        """
        Return the next token in the stream.

        Raises Exception("Lexical error near '...'") when the input cannot
        be recognized; the offending word is consumed first so that lexing
        can be resumed after it.
        """
        self.position += 1

        # Skip blanks and comments (iteratively, so that many consecutive
        # comment lines cannot exhaust the call stack).
        while True:
            c = self.code.lstrip(" \t")

            # Special case of newlines. If a non-statement has been identified
            # and no comma has followed, emit a PARAM token manually.
            if (not c or c[0] == "\n") and self.pending_param:
                self.pending_param = False
                self.code = c.lstrip("\n")
                return Token(T.PARAM)

            c = self.code.lstrip(" \t\n")

            # End of file
            if not c:
                self.code = ""
                return Token(T.END)

            if c[0] != "#":
                break

            # Comment: drop everything up to, but not including, the newline
            # so that a pending PARAM is still emitted at the end of the line.
            eol = c.find("\n")
            self.code = c[eol:] if eol >= 0 else ""

        # Statements
        m = self.RE_STMTS.match(c)
        if m is not None:
            self.code = c[len(m[0]):]
            return Token(getattr(T, m[0].upper()))

        # Relations
        for rel in self._RELS:
            if c.startswith(rel):
                self.code = c[len(rel):]
                self.pending_param = True
                return Token(T.REL, rel)

        # Punctuation
        first = c[0]
        if first in self._PUNCT:
            self.code = c[1:]
            # An explicit comma already is the parameter separator
            self.pending_param = (first != ",")
            return Token(self._PUNCT[first])

        # Constants; the pattern cannot fail since every group is optional
        if first in "0123456789.":
            m = self.RE_CONST.match(c)
            value = str2float(m[1], m[2], m[3], m[4])
            self.code = c[len(m[0]):]
            self.pending_param = True
            return Token(T.CONST, value, m[0])

        # Functions
        m = self.RE_FUN.match(c)
        if m is not None:
            self.code = c[len(m[0]):]
            self.pending_param = True
            return Token(T.FUN, m[1])

        # Variables; x and y are normalized to lowercase, which is the
        # spelling BitcodeLexer emits.
        if first in "MABCDEFxXyY":
            var = first.lower() if first in "xXyY" else first
            self.code = c[1:]
            self.pending_param = True
            return Token(T.VAR, var)

        m = self._RE_THETA.match(c)
        if m is not None:
            self.code = c[len(m[0]):]
            self.pending_param = True
            return Token(T.VAR, "theta")

        # If nothing can be found, raise an exception
        words = c.split(maxsplit=1)
        if words:
            err = words[0]
            self.code = words[1] if len(words) > 1 else ""
        else:
            # Only whitespace that is not stripped above (e.g. "\r") is left
            err = c
            self.code = ""
        raise Exception("Lexical error near '{}'".format(err))

    def at_end(self):
        """Check whether the whole input has been read."""
        return not self.code and not self.pending_param
#
# Names exported by "from <module> import *"; LexerBase and str2float are
# deliberately left out.
__all__ = ["T", "Token", "BitcodeLexer", "UrlLexer", "TextLexer"]