# fx-92 Scientifique Collège+ language interpreter: Lexical analysis

import math
import re
import enum

#---
# Token description
#---

@enum.unique
class T(enum.IntEnum):
    """
    Token types.

    Non-negative values are the opcodes that BitcodeLexer maps directly to a
    token (two-byte 0xF9xx/0xFBxx codes or single bytes). Negative values are
    token types that carry parameters and have no opcode of their own.
    """

    # Basic commands
    EOL         = 0xF901
    END         = 0xF902
    NOP         = 0xF903

    # Basic statements
    FORWARD     = 0xF905
    ROTATE      = 0xF906
    ORIENT      = 0xF907
    GOTO        = 0xF908
    PENDOWN     = 0xF909
    PENUP       = 0xF90A
    SETVAR      = 0xF90B
    INPUT       = 0xF90C
    MESSAGE     = 0xF90D
    PRINT       = 0xF90E
    STYLE       = 0xF90F
    WAIT        = 0xF910

    # Flow control
    REPEAT      = 0xF911
    REPEAT_END  = 0xF912
    WHILE       = 0xF913
    WHILE_END   = 0xF914
    IF          = 0xF915
    IF_END      = 0xF916
    IFELSE      = 0xF917
    ELSE        = 0xF918
    IFELSE_END  = 0xF919

    # Variable assignment
    SETM        = 0xFB10
    SETA        = 0xFB12
    SETB        = 0xFB13
    SETC        = 0xFB14
    SETD        = 0xFB15
    SETE        = 0xFB16
    SETF        = 0xFB17
    SETX        = 0xFB18
    SETY        = 0xFB19

    # Miscellaneous
    PARAM       = 0x00
    COLON       = 0x23
    QUEST       = 0x25
    LPAR        = 0x60
    RPAR        = 0xD0
    EQUAL       = 0xA5
    PLUS        = 0xA6
    MINUS       = 0xA7
    STAR        = 0xA8
    SLASH       = 0xA9
    BANG        = 0xD8

    # Tokens with parameters
    CONST       = -1
    VAR         = -2
    REL         = -3
    FUN         = -4


class Token:
    """A lexed token: a type (normally a member of T) plus optional arguments."""

    def __init__(self, type, *args):
        """Instantiate a new token of the given type with its arguments."""
        self.type = type
        self.args = args

    def __repr__(self):
        """Unambiguous token representation, e.g. ``CONST(1.5)`` or ``END``."""
        try:
            base = T(self.type).name
        except ValueError:
            # Not a known token type: show the raw value instead of a name.
            base = "<{}>".format(hex(self.type))

        if self.args:
            args = "(" + ",".join(repr(arg) for arg in self.args) + ")"
        else:
            args = ""

        return base + args

#---
# Lexer base
#---

class LexerBase:
    """
    Lexer base class. This class only provides common methods and cannot be
    used to analyse a program.

    Subclasses must provide rewind(), at_end(), lex() and a `position`
    attribute, all of which dump() relies on.
    """

    def dump(self):
        """Rewind the lexer and print every token along with its index."""
        self.rewind()

        while not self.at_end():
            x = self.lex()
            print("{:5d}: {}".format(self.position, x))

#---
# Bitcode lexer
#---

class BitcodeLexer(LexerBase):
    """
    fx-92 SC+ language lexer with bytes() bitcode input.
    """

    # Numeric literal: digits, optional fractional part introduced by 0x2E
    # (which is already ASCII '.'), optional exponent introduced by 0x2D.
    # Byte 0x25 is the QUEST opcode (see T), so it is never consumed here.
    RE_CONST = re.compile(rb'[0-9]+(?:\x2E[0-9]*)?(?:\x2D[0-9]+)?')

    # Second byte of 0xFB-prefixed relation opcodes (not members of T)
    RELS = {
        0x01: "<",
        0x02: ">",
        0x03: "!=",
        0x04: "<=",
        0x05: ">=",
    }

    # Single-byte function opcodes
    FUNS = {
        0x68: "Abs",
        0x69: "Rnd",
        0x6C: "sinh",
        0x6D: "cosh",
        0x6E: "tanh",
        0x6F: "asinh",
        0x70: "acosh",
        0x71: "atanh",
        # 0x72: "exp",
        # 0x73: "exp10",
        0x74: "sqrt",
        0x75: "log",
        0x76: "cbrt",
        0x77: "sin",
        0x78: "cos",
        0x79: "tan",
        0x7A: "asin",
        0x7B: "acos",
        0x7C: "atan",
        0x7D: "log10",
        0x83: "Ent",
        0x84: "EntEx",
        0x87: "RanInt",
        # 0x88: "GCD",
        # 0x89: "LCM",
        0x8A: "Arond",
    }

    def __init__(self, hex):
        """Initialize the lexer with input data (a bytes object)."""
        self.hex = hex
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.pos = 0        # Byte offset of the next unread byte
        self.position = 0   # Number of tokens returned so far
        self.errors = 0     # Number of unknown opcodes skipped

    def lex(self):
        """
        Return the next token in the stream.

        Unknown opcodes are reported on stdout, counted in self.errors and
        skipped. Token(T.END) is returned once the input is exhausted.
        """
        h = self.hex
        self.position += 1

        # Loop (rather than recurse) so that long runs of unknown bytes
        # cannot exhaust the interpreter's recursion limit.
        while not self.at_end():
            p = self.pos
            lead = h[p]

            # 2-byte commands
            if lead in (0xF9, 0xFB):
                # Stop if there is no trailing byte
                if p >= len(h) - 1:
                    print("[lexer] Invalid trailing byte {}".format(hex(lead)))
                    # Consume the dangling byte so that at_end() holds
                    self.pos = len(h)
                    return Token(T.END)

                second = h[p+1]
                code = (lead << 8) | second
                self.pos += 2

                # Return any value that is defined in the T enumeration
                try:
                    return Token(T(code))
                except ValueError:
                    pass

                # Also a few more values not in the T enumeration
                if lead == 0xFB and second in self.RELS:
                    return Token(T.REL, self.RELS[second])

            # Single-byte characters
            else:
                self.pos += 1
                code = lead

                # Translate unary minus to normal minus
                if code == 0xC0:
                    code = 0xA7

                try:
                    return Token(T(code))
                except ValueError:
                    pass

                if code == 0x21:
                    return Token(T.CONST, math.e)
                if code == 0x22:
                    return Token(T.CONST, math.pi)

                # Constants
                if 0x30 <= code <= 0x39:
                    match = self.RE_CONST.match(h, p)
                    if match is not None:
                        text = match.group(0).replace(b'\x2D', b'e')
                        self.pos = match.end()
                        return Token(T.CONST, float(text.decode('ascii')))

                # Variables (0x42..0x47 map to the letters A..F)
                if 0x42 <= code <= 0x47:
                    return Token(T.VAR, chr(code - 1))
                if code == 0x40:
                    return Token(T.VAR, "M")
                if code == 0x48:
                    return Token(T.VAR, "x")
                if code == 0x49:
                    return Token(T.VAR, "y")

                # Functions
                if code in self.FUNS:
                    return Token(T.FUN, self.FUNS[code])

            # Nothing matched: report, then try to read another token after
            # the skipped opcode. self.position is left untouched so that a
            # skipped opcode never counts as a token.
            print("[lexer] Unknown opcode {}".format(hex(code)))
            self.errors += 1

        return Token(T.END)

    def at_end(self):
        """Check whether the whole input has been read."""
        return self.pos >= len(self.hex)

#---
# Url lexer
#---

class UrlLexer(BitcodeLexer):
    """
    fx-92 SC+ language lexer with a wes.casio.com URL or hexadecimal input.

    The URLs are typically in this form:

      http://wes.casio.com/math/index.php?q=I-295A+U-000000000000+M-0E0000
      0000+S-000410110000100E0010B300D365+E-{code...}

    The program can also be provided in text hexadecimal form, which is
    everything following the "+E-" in the URL.
    """

    def __init__(self, url):
        """
        Initialize the lexer from a URL or a hexadecimal string.

        Raises Exception if the input looks like a URL but has no "+E-"
        section. Characters that are not hexadecimal digits are skipped.
        """
        if url.startswith(("http://", "https://", "wes.casio.com")):
            print("[urlparser] URL includes protocol, will start after '+E-'")
            offset = url.find("+E-")

            if offset < 0:
                print("[urlparser] '+E-' not found, cannot decode URL")
                raise Exception("Cannot decode URL")

            url = url[offset+3:]

        if not re.fullmatch(r'(?:[0-9a-fA-F]{2})+', url):
            print("[urlparser] URL is not strict hexa, noise will be skipped")
            # bytes.fromhex() only tolerates whitespace, so actually remove
            # the noise here instead of letting it raise.
            url = re.sub(r'[^0-9a-fA-F]', '', url)

            if len(url) % 2:
                print("[urlparser] Odd number of hexa digits, last one dropped")
                url = url[:-1]

        super().__init__(bytes.fromhex(url))

#---
# Plain text lexer
#---

class TextLexer(LexerBase):
    """
    fx-92 SC+ language lexer with Basic-like input. This thing is very naive
    and extremely inefficient.
    """

    # Longer keywords come before their prefixes (REPEAT_END before REPEAT,
    # IFELSE before IF, ...) because alternation takes the first match.
    RE_STMTS = re.compile(
        r"NOP|FORWARD|ROTATE|ORIENT|GOTO|PENDOWN|PENUP|SETVAR|INPUT|MESSAGE|"
        r"PRINT|STYLE|WAIT|REPEAT_END|REPEAT|WHILE_END|WHILE|IF_END|ELSE|"
        r"IFELSE_END|IFELSE|IF", re.IGNORECASE)

    RE_CONST = re.compile(
        r"([0-9]+(?:\.[0-9]+)?(?:[eE][0-9]+)?)(%?)")

    RE_FUN = re.compile(
        r"([a-zA-Z]+)\(")

    # Longer relations first so that ">=" is not read as ">" then "="
    RELS = (">=", "<=", "!=", ">", "<")

    PUNCT = {
        ",": T.PARAM,
        ":": T.COLON,
        "?": T.QUEST,
        "(": T.LPAR,
        ")": T.RPAR,
        "=": T.EQUAL,
        "+": T.PLUS,
        "-": T.MINUS,
        "*": T.STAR,
        "/": T.SLASH,
        "!": T.BANG,
    }

    def __init__(self, code):
        """
        Initialize the lexer with text code.

        Semicolons are treated as newlines; CRLF and CR line endings are
        normalized to LF so that they do not trigger lexical errors.
        """
        code = code.replace("\r\n", "\n").replace("\r", "\n")
        self.base_code = code.replace(";", "\n")
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.code = self.base_code      # Text that remains to be lexed
        self.position = 0               # Number of tokens returned so far
        self.errors = 0
        self.pending_param = False      # A PARAM is owed at the next newline

    def lex(self):
        """
        Return the next token in the stream.

        Raises Exception("Lexical error near '...'") when the input cannot
        be tokenized; the offending word is skipped before raising.
        """
        self.position += 1

        # Loop so that comments can be skipped without recursing.
        while True:
            c = self.code.lstrip(" \t")

            # Special case of newlines. If a non-statement has been
            # identified and no comma has followed, emit a PARAM token
            # manually.
            if (not c or c[0] == "\n") and self.pending_param:
                self.pending_param = False
                self.code = c.lstrip("\n")
                return Token(T.PARAM)

            c = self.code.lstrip(" \t\n")

            # End of file
            if not c:
                self.code = ""
                return Token(T.END)

            # Statements
            m = self.RE_STMTS.match(c)
            if m is not None:
                self.code = c[len(m[0]):]
                return Token(getattr(T, m[0].upper()))

            # Relations
            for r in self.RELS:
                if c.startswith(r):
                    self.code = c[len(r):]
                    self.pending_param = True
                    return Token(T.REL, r)

            first = c[0]

            # Punctuation
            if first in self.PUNCT:
                self.code = c[1:]
                # A comma is itself the PARAM separator: nothing is owed
                self.pending_param = (first != ",")
                return Token(self.PUNCT[first])

            # Constants
            m = self.RE_CONST.match(c)
            if m is not None:
                f = float(m[1])
                if m[2] == "%":
                    f /= 100

                self.code = c[len(m[0]):]
                self.pending_param = True
                return Token(T.CONST, f)

            # Functions
            m = self.RE_FUN.match(c)
            if m is not None:
                self.code = c[len(m[0]):]
                self.pending_param = True
                return Token(T.FUN, m[1])

            # Variables: x and y are lowercase (as BitcodeLexer emits them),
            # every other variable keeps its uppercase letter.
            if first in "MABCDEFxXyY":
                var = first.lower() if first in "xXyY" else first
                self.code = c[1:]
                self.pending_param = True
                return Token(T.VAR, var)

            # Comments: drop everything up to (not including) the newline,
            # which must remain so that a pending PARAM can still be emitted.
            if first == "#":
                _, newline, rest = c.partition("\n")
                self.code = newline + rest
                continue

            # If nothing can be found, raise an exception
            s = c.split(maxsplit=1)
            err = s[0] if s else c
            self.code = s[1] if len(s) > 1 else ""
            raise Exception("Lexical error near '{}'".format(err))

    def at_end(self):
        """Check whether the whole input has been read."""
        return not self.code and not self.pending_param

# __all__ = ["T", "Token", "BitcodeLexer", "UrlLexer", "TextLexer"]