416 lines
10 KiB
Python
416 lines
10 KiB
Python
# fx-92 Scientifique Collège+ language interpreter: Lexical analysis
|
|
|
|
import math
|
|
import re
|
|
import enum
|
|
|
|
#---
|
|
# Token description
|
|
#---
|
|
|
|
@enum.unique
class T(enum.IntEnum):
    """
    Token types of the fx-92 SC+ language.

    Members are grouped by role. Commands, statements, flow control and
    variable assignments carry 16-bit values; miscellaneous symbols carry
    8-bit values; token types that come with a parameter use negative values.
    """

    # Basic commands
    EOL         = 0xF901
    END         = 0xF902
    NOP         = 0xF903

    # Basic statements
    FORWARD     = 0xF905
    ROTATE      = 0xF906
    ORIENT      = 0xF907
    GOTO        = 0xF908
    PENDOWN     = 0xF909
    PENUP       = 0xF90A
    SETVAR      = 0xF90B
    INPUT       = 0xF90C
    MESSAGE     = 0xF90D
    PRINT       = 0xF90E
    STYLE       = 0xF90F
    WAIT        = 0xF910

    # Flow control
    REPEAT      = 0xF911
    REPEAT_END  = 0xF912
    WHILE       = 0xF913
    WHILE_END   = 0xF914
    IF          = 0xF915
    IF_END      = 0xF916
    IFELSE      = 0xF917
    ELSE        = 0xF918
    IFELSE_END  = 0xF919

    # Variable assignment
    SETM        = 0xFB10
    SETA        = 0xFB12
    SETB        = 0xFB13
    SETC        = 0xFB14
    SETD        = 0xFB15
    SETE        = 0xFB16
    SETF        = 0xFB17
    SETX        = 0xFB18
    SETY        = 0xFB19

    # Miscellaneous
    PARAM       = 0x00
    COLON       = 0x23
    QUEST       = 0x25
    LPAR        = 0x60
    RPAR        = 0xD0
    EQUAL       = 0xA5
    PLUS        = 0xA6
    MINUS       = 0xA7
    STAR        = 0xA8
    SLASH       = 0xA9
    BANG        = 0xD8

    # Tokens with parameters
    CONST       = -1
    VAR         = -2
    REL         = -3
    FUN         = -4
|
|
|
|
class Token:
    """A lexed token: a type tag together with its optional arguments."""

    def __init__(self, type, *args):
        """Instantiate a new token of the given type with positional args."""
        self.type, self.args = type, args

    def __repr__(self):
        """Unambiguous token representation."""
        try:
            # Known types are shown by their enumeration name
            name = T(self.type).name
        except ValueError:
            # Types outside the enumeration are shown by numeric value
            name = "<Token:{}>".format(hex(self.type))

        if not self.args:
            return name
        return name + "(" + ",".join(map(repr, self.args)) + ")"
|
|
|
|
#---
|
|
# Lexer base
|
|
#---
|
|
|
|
class LexerBase:
    """
    Lexer base class. This class only provides common methods and cannot be
    used to analyse a program.

    dump() relies on rewind(), at_end(), lex() and the position attribute,
    none of which are defined here.
    """

    def dump(self):
        """Rewind, then print every token preceded by the lexer position."""
        self.rewind()

        while True:
            if self.at_end():
                break
            token = self.lex()
            # The position is read after lex() has run
            print("{:5d}: {}".format(self.position, token))
|
|
|
|
#---
|
|
# Bitcode lexer
|
|
#---
|
|
|
|
class BitcodeLexer(LexerBase):
    """
    fx-92 SC+ language lexer with bytes() bitcode input.

    Attributes:
      hex       Input bitcode (indexable sequence of byte values)
      pos       Offset in the input of the next byte to read
      position  Counter incremented once for each token returned by lex()
      errors    Number of unknown opcodes met since the last rewind()
    """

    # Relations, encoded as 0xFB followed by one of these bytes
    _RELS = {0x01: "<", 0x02: ">", 0x03: "!=", 0x04: "<=", 0x05: ">="}

    # Numeric constant: digits, optional fraction introduced by 0x2E,
    # optional exponent introduced by 0x2D, optional trailing 0x25
    _RE_CONST = re.compile(rb'([0-9]+(?:\x2E[0-9]*)?(?:\x2D[0-9]+)?)(%?)')

    # Single-byte variable names besides A..F
    _VARS = {0x40: "M", 0x48: "x", 0x49: "y"}

    # Single-byte function opcodes
    _FUNCTIONS = {
        0x68: "Abs",
        0x69: "Rnd",
        0x6C: "sinh",
        0x6D: "cosh",
        0x6E: "tanh",
        0x6F: "asinh",
        0x70: "acosh",
        0x71: "atanh",
        # 0x72: "exp",
        # 0x73: "exp10",
        0x74: "sqrt",
        0x75: "log",
        0x76: "cbrt",
        0x77: "sin",
        0x78: "cos",
        0x79: "tan",
        0x7A: "asin",
        0x7B: "acos",
        0x7C: "atan",
        0x7D: "log10",
        0x83: "Ent",
        0x84: "EntEx",
        0x87: "RanInt",
        # 0x88: "GCD",
        # 0x89: "LCM",
        0x8A: "Arond",
    }

    def __init__(self, hex):
        """Initialize the lexer with input data."""
        self.hex = hex
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.pos = 0
        self.position = 0
        self.errors = 0

    def lex(self):
        """
        Return the next token in the stream.

        Unknown opcodes are reported on standard output, counted in
        self.errors and skipped. Token(T.END) is returned once the input is
        exhausted.
        """
        self.position += 1

        # Loop (rather than recurse) over unknown opcodes so that a long run
        # of garbage bytes cannot exhaust the interpreter's recursion limit.
        while not self.at_end():
            if self.hex[self.pos] in (0xF9, 0xFB):
                token = self._lex_command()
            else:
                token = self._lex_byte()

            if token is not None:
                return token

        return Token(T.END)

    def _lex_command(self):
        """Lex a 2-byte command at self.pos; return None if it is unknown."""
        h, p = self.hex, self.pos

        # Stop if there is no trailing byte. The lone prefix byte must be
        # consumed, otherwise at_end() never becomes true.
        if p >= len(h) - 1:
            print("[lexer] Invalid trailing byte {}".format(hex(h[p])))
            self.pos = len(h)
            return Token(T.END)

        code = (h[p] << 8) | h[p+1]
        self.pos += 2

        # Return any value that is defined in the T enumeration
        try:
            return Token(T(code))
        except ValueError:
            pass

        # Also a few more values not in the T enumeration
        if h[p] == 0xFB and h[p+1] in self._RELS:
            return Token(T.REL, self._RELS[h[p+1]])

        print("[lexer] Unknown opcode {}".format(hex(code)))
        self.errors += 1
        return None

    def _lex_byte(self):
        """Lex a single-byte token at self.pos; return None if unknown."""
        h, p = self.hex, self.pos
        self.pos += 1
        code = h[p]

        # Translate unary minus to normal minus
        if code == 0xC0:
            code = 0xA7

        try:
            return Token(T(code))
        except ValueError:
            pass

        if code == 0x21:
            return Token(T.CONST, math.e)
        if code == 0x22:
            return Token(T.CONST, math.pi)

        # Constants
        if 0x30 <= code <= 0x39:
            match = self._RE_CONST.match(h, p)

            if match is not None:
                text = match.group(1)
                text = text.replace(b'\x2E', b'.').replace(b'\x2D', b'e')
                # One byte has already been consumed above
                self.pos += len(text) - 1

                # NOTE(review): the original compared the trailing 0x25
                # group against the str "%", which never equals a bytes
                # object, so the value was never divided by 100 and the byte
                # was left in the stream (where it lexes as T.QUEST). That
                # behavior is kept here; confirm what 0x25 means after a
                # constant before enabling percent handling.
                return Token(T.CONST, float(text.decode('utf-8')))

        # Variables: 0x42..0x47 are A..F
        if 0x42 <= code <= 0x47:
            return Token(T.VAR, chr(code - 1))
        if code in self._VARS:
            return Token(T.VAR, self._VARS[code])

        # Functions
        if code in self._FUNCTIONS:
            return Token(T.FUN, self._FUNCTIONS[code])

        print("[lexer] Unknown opcode {}".format(hex(code)))
        self.errors += 1
        return None

    def at_end(self):
        """Check whether the whole input has been read."""
        return self.pos >= len(self.hex)
|
|
|
|
#---
|
|
# Url lexer
|
|
#---
|
|
|
|
class UrlLexer(BitcodeLexer):
    """
    fx-92 SC+ language lexer with a wes.casio.com URL or hexadecimal input.

    The URLs are typically in this form:

      http://wes.casio.com/math/index.php?q=I-295A+U-000000000000+M-0E0000
      0000+S-000410110000100E0010B300D365+E-{code...}

    The program can also be provided in text hexadecimal form, which is
    everything following the "+E-" in the URL.
    """

    def __init__(self, url):
        """
        Extract the hexadecimal program from url and start lexing it.

        Raises Exception if url looks like a URL but has no "+E-" marker.
        The text is decoded by bytes.fromhex().
        NOTE(review): bytes.fromhex() is documented to ignore ASCII
        whitespace only, so other "noise" presumably raises ValueError
        despite the message printed below - confirm the intent.
        """
        url_prefixes = ("http://", "https://", "wes.casio.com")

        if url.startswith(url_prefixes):
            print("[urlparser] URL includes protocol, will start after '+E-'")
            marker = url.find("+E-")

            if marker == -1:
                print("[urlparser] '+E-' not found, cannot decode URL")
                raise Exception("Cannot decode URL")

            # Keep only what follows the 3-character marker
            url = url[marker + 3:]

        strict_hex = re.fullmatch(r'(?:[0-9a-fA-F]{2})+', url)
        if strict_hex is None:
            print("[urlparser] URL is not strict hexa, noise will be skipped")

        super().__init__(bytes.fromhex(url))
|
|
|
|
#---
|
|
# Plain text lexer
|
|
#---
|
|
|
|
class TextLexer(LexerBase):
    """
    fx-92 SC+ language lexer with Basic-like input.

    This thing is very naive and extremely inefficient.

    Attributes:
      base_code      Full program text, with ";" replaced by newlines
      code           Text that remains to be lexed
      position       Counter incremented once for each token returned
      errors         Error counter, reset by rewind()
      pending_param  Whether a PARAM token must be emitted at the next
                     end of line or end of input
    """

    RE_STMTS = re.compile(
        r"NOP|FORWARD|ROTATE|ORIENT|GOTO|PENDOWN|PENUP|SETVAR|INPUT|MESSAGE|"
        r"PRINT|STYLE|WAIT|REPEAT_END|REPEAT|WHILE_END|WHILE|IF_END|ELSE|"
        r"IFELSE_END|IFELSE|IF",
        re.IGNORECASE)

    RE_CONST = re.compile(
        r"([0-9]+(?:\.[0-9]+)?(?:[eE][0-9]+)?)(%?)")

    # Two-character relations come first so that ">=" is not read as ">"
    _RELS = (">=", "<=", "!=", ">", "<")

    _PUNCT = {
        ",": T.PARAM,
        ":": T.COLON,
        "?": T.QUEST,
        "(": T.LPAR,
        ")": T.RPAR,
        "=": T.EQUAL,
        "+": T.PLUS,
        "-": T.MINUS,
        "*": T.STAR,
        "/": T.SLASH,
        "!": T.BANG,
    }

    _VARS = "MABCDEFxXyY"

    def __init__(self, code):
        """Initialize the lexer with text code."""
        self.base_code = code.replace(";", "\n")
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.code = self.base_code
        self.position = 0
        self.errors = 0
        self.pending_param = False

    def lex(self):
        """
        Return the next token in the stream.

        Comments, which run from "#" to the end of the line, are skipped.
        Raises Exception when the input cannot be tokenized; the offending
        word is dropped from the input first.
        """
        self.position += 1

        # Loop (rather than recurse) over comments so that a long run of
        # comment lines cannot exhaust the interpreter's recursion limit.
        while True:
            c = self.code.lstrip(" \t")

            # Special case of newlines. If a non-statement has been
            # identified and no comma has followed, emit a PARAM token
            # manually.
            if (not c or c[0] == "\n") and self.pending_param:
                self.pending_param = False
                self.code = c.lstrip("\n")
                return Token(T.PARAM)

            c = self.code.lstrip(" \t\n")

            # End of file
            if not c:
                self.code = ""
                return Token(T.END)

            # Comments: drop everything up to (not including) the newline,
            # so that a pending PARAM is still emitted for this line
            if c[0] == "#":
                eol = c.find("\n")
                self.code = c[eol:] if eol >= 0 else ""
                continue

            break

        # Statements
        m = self.RE_STMTS.match(c)
        if m is not None:
            self.code = c[len(m[0]):]
            return Token(getattr(T, m[0].upper()))

        # Relations
        for r in self._RELS:
            if c.startswith(r):
                self.code = c[len(r):]
                self.pending_param = True
                return Token(T.REL, r)

        # Punctuation; an explicit comma cancels the pending PARAM
        if c[0] in self._PUNCT:
            self.code = c[1:]
            self.pending_param = (c[0] != ",")
            return Token(self._PUNCT[c[0]])

        # Constants
        m = self.RE_CONST.match(c)
        if m is not None:
            f = float(m[1])
            if m[2] == "%":
                f /= 100

            self.code = c[len(m[0]):]
            self.pending_param = True
            return Token(T.CONST, f)

        # Variables; x and y are normalized to lowercase
        if c[0] in self._VARS:
            var = c[0].lower() if c[0] in "xXyY" else c[0]
            self.code = c[1:]
            self.pending_param = True
            return Token(T.VAR, var)

        # If nothing can be found, skip the offending word and raise an
        # exception. split() yields nothing when c holds only whitespace
        # other than the characters stripped above (for instance "\r").
        s = c.split(maxsplit=1)
        err = s[0] if s else c
        self.code = s[1] if len(s) > 1 else ""

        raise Exception("Lexical error near '{}'".format(err))

    def at_end(self):
        """Check whether the whole input has been read."""
        return not self.code and not self.pending_param
|
|
|
|
#
|
|
|
|
# Names exported by a star-import of this module; LexerBase is not listed.
__all__ = ["T", "Token", "BitcodeLexer", "UrlLexer", "TextLexer"]
|