# fx-92 Scientifique Collège+ language interpreter: Lexical analysis

import math
import re
import enum
from decimal import Decimal

#---
# Token description
#---

@enum.unique
class T(enum.IntEnum):
    """Token types of the fx-92 SC+ language.

    Positive values are the opcodes used by the bitcode encoding; negative
    values are pseudo-types for tokens that carry a parameter (constant,
    variable, relation, function) and have no single opcode.
    """

    # Basic commands
    EOL        = 0xF901
    END        = 0xF902
    NOP        = 0xF903

    # Basic statements
    FORWARD    = 0xF905
    ROTATE     = 0xF906
    ORIENT     = 0xF907
    GOTO       = 0xF908
    PENDOWN    = 0xF909
    PENUP      = 0xF90A
    SETVAR     = 0xF90B
    INPUT      = 0xF90C
    MESSAGE    = 0xF90D
    PRINT      = 0xF90E
    STYLE      = 0xF90F
    WAIT       = 0xF910

    # Flow control
    REPEAT     = 0xF911
    REPEAT_END = 0xF912
    UNTIL      = 0xF913
    UNTIL_END  = 0xF914
    IF         = 0xF915
    IF_END     = 0xF916
    IFELSE     = 0xF917
    ELSE       = 0xF918
    IFELSE_END = 0xF919

    # Variable assignment (0xFB11 is unused in the original encoding)
    SETM       = 0xFB10
    SETA       = 0xFB12
    SETB       = 0xFB13
    SETC       = 0xFB14
    SETD       = 0xFB15
    SETE       = 0xFB16
    SETF       = 0xFB17
    SETX       = 0xFB18
    SETY       = 0xFB19

    # Miscellaneous single-byte symbols
    PARAM      = 0x00
    COLON      = 0x23
    QUEST      = 0x25
    SEMI       = 0x2C
    LPAR       = 0x60
    RPAR       = 0xD0
    PLUS       = 0xA6
    MINUS      = 0xA7
    STAR       = 0xA8
    SLASH      = 0xA9
    EXP        = 0xC9
    BANG       = 0xD8

    # Tokens with parameters
    CONST      = -1
    VAR        = -2
    REL        = -3
    FUN        = -4


class Token:
    """A lexical token: a type from T plus optional parameters (args)."""

    def __init__(self, type, *args):
        """Instantiate a new token."""
        self.type = type
        self.args = args

    def __repr__(self):
        """Unambiguous token representation."""
        try:
            base = T(self.type).name
        except ValueError:
            # BUG FIX: was "".format(hex(self.type)), which always produced
            # the empty string; show the hex opcode for unknown types.
            base = "{}".format(hex(self.type))

        if self.type == T.CONST:
            # CONST args are (Decimal value, raw source text)
            args = "({}) [typed as {}]".format(*self.args)
        elif self.args:
            args = "(" + ",".join(repr(arg) for arg in self.args) + ")"
        else:
            args = ""
        return base + args

#---
# Utilities
#---

def str2float(integer, decimal, exponent, percent):
    """Build a Decimal from the pieces of a numeric literal.

    integer  -- integer part digits, or a falsy value (treated as "0")
    decimal  -- decimal part including the dot; a lone "." means ".0"
    exponent -- exponent part such as "e+2", or a falsy value
    percent  -- "%" to divide the result by 100, anything else otherwise
    """
    m1 = integer or "0"
    m2 = ".0" if decimal == "." else (decimal or "")
    m3 = exponent or ""
    f = Decimal(m1 + m2 + m3)
    if percent == "%":
        f /= 100
    return f

#---
# Lexer base
#---

class LexerBase:
    """
    Lexer base class.

    This class only provides common methods and cannot be used to analyse a
    program.  Subclasses must provide lex(), rewind(), at_end(), and maintain
    a `position` token counter and an `errors` counter.
    """

    def dump(self):
        """Lex the whole input and print every token with its position."""
        self.rewind()
        while not self.at_end():
            x = self.lex()
            print("{:5d}: {}".format(self.position, x))

#---
# Bitcode lexer
#---

class BitcodeLexer(LexerBase):
    """
    fx-92 SC+ language lexer with bytes() bitcode input.
    """

    def __init__(self, hex):
        """Initialize the lexer with input data (a bytes-like object)."""
        self.hex = hex
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.pos = 0        # byte offset in self.hex
        self.position = 0   # token counter
        self.errors = 0     # number of lexical errors seen

    def lex(self):
        """Return the next token in the stream."""
        h, p = self.hex, self.pos
        self.position += 1

        if self.at_end():
            return Token(T.END)

        # 2-byte commands (prefix 0xF9 or 0xFB)
        if h[p] in [0xF9, 0xFB]:
            # Stop if there is no trailing byte
            if p >= len(h) - 1:
                print("[lexer] Invalid trailing byte {}".format(hex(h[p])))
                # BUG FIX: was `p = len(h)`, which only rebound the local and
                # left self.pos unchanged, so at_end() never became true and
                # dump() looped forever on a truncated input.
                self.pos = len(h)
                return Token(T.END)

            # Return any value that is defined in the Token class
            code = (h[p] << 8) | h[p+1]
            self.pos += 2
            try:
                return Token(T(code))
            except ValueError:
                pass

            # Also a few more relation values not in the Token class
            rels = { 0x01: "<", 0x02: ">", 0x03: "!=", 0x04: "<=",
                     0x05: ">=" }
            if h[p] == 0xFB and h[p+1] in rels:
                return Token(T.REL, rels[h[p+1]])

            print("[lexer] Unknown opcode {}".format(hex(code)))
            self.errors += 1
            # Try to read another token; undo the position increment so the
            # skipped opcode is not counted (consistent with the 1-byte path)
            self.position -= 1
            return self.lex()

        # Single-byte characters
        self.pos += 1
        code = h[p]

        # Translate unary minus to normal minus
        if code == 0xC0:
            code = 0xA7
        # Equal symbol
        if h[p] == 0xA5:
            return Token(T.REL, "=")

        try:
            return Token(T(code))
        except ValueError:
            pass

        if code == 0x21:
            return Token(T.CONST, Decimal(math.e), "[e]")
        if code == 0x22:
            return Token(T.CONST, Decimal(math.pi), "[pi]")

        # Constants: digits/dot, then optional exponent (0x2D) whose sign may
        # be encoded as 0xA6 (+), 0xA7 (-) or 0xC0 (unary -), then optional %
        if code in range(0x30, 0x39+1) or code == 0x2E:
            re_const = rb'([0-9]*)(\x2E[0-9]*)?(\x2D[\xA6\xA7\xC0]?[0-9]+)?(%)?'
            m = re.match(re_const, h[p:])
            if m is not None:
                integer = (m[1] or b"").decode()
                decimal = (m[2] or b".").replace(b'\x2E', b'.').decode()
                exp = (m[3] or b"")
                percent = (m[4] or b"").decode()
                exp = exp.replace(b'\x2D', b'e')
                exp = exp.replace(b'\xA6', b'+')
                exp = exp.replace(b'\xA7', b'-')
                exp = exp.replace(b'\xC0', b'-')
                exp = exp.decode()
                # -1 because self.pos was already advanced by one byte
                self.pos += len(m[0]) - 1
                f = str2float(integer, decimal, exp, percent)
                return Token(T.CONST, f, m[0])

        # Variables: 0x42..0x47 encode A..F shifted by one
        if code in range(0x42, 0x47+1):
            return Token(T.VAR, chr(h[p]-1))
        if code == 0x40:
            return Token(T.VAR, "M")
        if code == 0x48:
            return Token(T.VAR, "x")
        if code == 0x49:
            return Token(T.VAR, "y")
        if code == 0x4C:
            return Token(T.VAR, "theta")

        # Functions
        fun = {
            0x68: "Abs",    0x69: "Rnd",
            0x6C: "sinh",   0x6D: "cosh",  0x6E: "tanh",
            0x6F: "asinh",  0x70: "acosh", 0x71: "atanh",
            # 0x72: "exp",
            # 0x73: "exp10",
            0x74: "sqrt",   0x75: "log",   0x76: "cbrt",
            0x77: "sin",    0x78: "cos",   0x79: "tan",
            0x7A: "asin",   0x7B: "acos",  0x7C: "atan",
            0x7D: "log10",
            0x83: "Ent",    0x84: "EntEx",
            0x87: "RanInt", 0x88: "GCD",   0x89: "LCM",
            0x8A: "Arond",
        }
        if code in fun:
            return Token(T.FUN, fun[code])

        print("[lexer] Unknown opcode {}".format(hex(code)))
        self.errors += 1
        # Try to read another token after skipping one byte
        self.position -= 1
        return self.lex()

    def at_end(self):
        """Check whether the whole input has been read."""
        return self.pos >= len(self.hex)

#---
# Url lexer
#---

class UrlLexer(BitcodeLexer):
    """
    fx-92 SC+ language lexer with a wes.casio.com URL or hexadecimal input.

    The URLs are typically in this form:

        http://wes.casio.com/math/index.php?q=I-295A+U-000000000000+M-0E0000
        0000+S-000410110000100E0010B300D365+E-{code...}

    The program can also be provided in text hexadecimal form, which is
    everything following the "+E-" in the URL.
    """

    def __init__(self, url):
        if url.startswith("http://") \
        or url.startswith("https://") \
        or url.startswith("wes.casio.com"):
            print("[urlparser] URL includes protocol, will start after '+E-'")
            offset = url.find("+E-")
            if offset < 0:
                print("[urlparser] '+E-' not found, cannot decode URL")
                raise Exception("Cannot decode URL")
            url = url[offset+3:]

        if not re.fullmatch(r'(?:[0-9a-fA-F]{2})+', url):
            print("[urlparser] URL is not strict hexa, noise will be skipped")

        super().__init__(bytes.fromhex(url))

#---
# Plain text lexer
#---

class TextLexer(LexerBase):
    """
    fx-92 SC+ language lexer with Basic-like input.

    This thing is very naive and extremely inefficient.
    """

    # Statement keywords; longer alternatives first so e.g. REPEAT_END is
    # tried before REPEAT
    RE_STMTS = re.compile(
        r"NOP|FORWARD|ROTATE|ORIENT|GOTO|PENDOWN|PENUP|SETVAR|INPUT|MESSAGE|"
        r"PRINT|STYLE|WAIT|REPEAT_END|REPEAT|UNTIL_END|UNTIL|IF_END|ELSE|"
        r"IFELSE_END|IFELSE|IF",
        re.IGNORECASE)
    # Numeric literal: integer part, decimal part, exponent, percent
    RE_CONST = re.compile(
        r"([0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?(%)?")
    # Function call: name immediately followed by an opening parenthesis
    RE_FUN = re.compile(
        r"([a-zA-Z]+)\(")

    def __init__(self, code):
        """Initialize the lexer with text code."""
        self.base_code = code
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.code = self.base_code
        self.position = 0
        self.errors = 0
        self.pending_param = False

    def lex(self):
        """Return the next token in the stream."""
        self.position += 1
        c = self.code.lstrip(" \t")

        # Special case of newlines. If a non-statement has been identified
        # and no comma has followed, emit a PARAM token manually.
        if (not c or c[0] == "\n") and self.pending_param:
            self.pending_param = False
            self.code = c.lstrip("\n")
            return Token(T.PARAM)

        c = self.code.lstrip(" \t\n")

        # End of file
        if not c:
            self.code = ""
            return Token(T.END)

        # Statements
        m = re.match(self.RE_STMTS, c)
        if m is not None:
            t = Token(getattr(T, m[0].upper()))
            self.code = c[len(m[0]):]
            return t

        # Relations (two-character ones first so ">=" wins over ">")
        rels = [ ">=", "<=", "!=", ">", "<" ]
        for r in rels:
            if c.startswith(r):
                self.code = c[len(r):]
                self.pending_param = True
                return Token(T.REL, r)
        if c[0] == "=":
            self.code = c[1:]
            self.pending_param = True
            return Token(T.REL, "=")

        # Punctuation
        punct = {
            ",": T.PARAM, ":": T.COLON, ";": T.SEMI,  "?": T.QUEST,
            "(": T.LPAR,  ")": T.RPAR,  "+": T.PLUS,  "-": T.MINUS,
            "*": T.STAR,  "/": T.SLASH, "!": T.BANG,
        }
        if c[0] in punct:
            self.code = c[1:]
            # A comma is itself the parameter separator
            self.pending_param = (c[0] != ",")
            return Token(punct[c[0]])

        # Constants
        if c[0] in "0123456789.":
            m = re.match(self.RE_CONST, c)
            if m is not None:
                f = str2float(m[1], m[2], m[3], m[4])
                self.code = c[len(m[0]):]
                self.pending_param = True
                return Token(T.CONST, f, m[0])

        # Functions (consumes the opening parenthesis as well)
        m = re.match(self.RE_FUN, c)
        if m is not None:
            self.code = c[len(m[0]):]
            self.pending_param = True
            return Token(T.FUN, m[1])

        # Variables; x and y are normalized to lowercase like the bitcode
        # lexer emits them
        if c[0] in "MABCDEFxXyY":
            var = c[0].lower() if c[0] in "xXyY" else c[0]
            self.code = c[1:]
            self.pending_param = True
            # BUG FIX: was Token(T.VAR, c[0]), discarding the normalization
            return Token(T.VAR, var)
        m = re.match(r"theta\b", c)
        if m is not None:
            self.code = c[len(m[0]):]
            self.pending_param = True
            return Token(T.VAR, "theta")

        # Comments: skip to the end of the line (debug print() removed)
        if c[0] == "#":
            splits = c.split('\n', maxsplit=1)
            self.code = c[len(splits[0]):]
            self.position -= 1
            return self.lex()

        # If nothing can be found, raise an exception
        s = c.split(maxsplit=1)
        err = s[0]
        self.code = s[1] if len(s) > 1 else ""
        raise Exception("Lexical error near '{}'".format(err))

    def at_end(self):
        """Check whether the whole input has been read."""
        return not self.code and not self.pending_param

# __all__ = ["T", "Token", "BitcodeLexer", "UrlLexer", "TextLexer"]