# fx-92 Scientifique Collège+ language interpreter: Lexical analysis

import math
import re
import enum

#---
# Token description
#---

@enum.unique
class T(enum.IntEnum):
    """Token types produced by the lexer.

    Values in the 0xF9xx / 0xFBxx ranges are two-byte opcodes: the lexer
    builds them as (first byte << 8) | second byte. Small non-negative
    values are single-byte opcodes. Negative values never appear in the
    input; they tag tokens that carry a parameter in Token.args.
    """

    # Basic commands
    EOL         = 0xF901
    END         = 0xF902
    NOP         = 0xF903

    # Basic statements
    FORWARD     = 0xF905
    ROTATE      = 0xF906
    ORIENT      = 0xF907
    GOTO        = 0xF908
    PENDOWN     = 0xF909
    PENUP       = 0xF90A
    SETVAR      = 0xF90B
    INPUT       = 0xF90C
    MESSAGE     = 0xF90D
    PRINT       = 0xF90E
    STYLE       = 0xF90F
    WAIT        = 0xF910

    # Flow control
    REPEAT      = 0xF911
    REPEAT_END  = 0xF912
    WHILE       = 0xF913
    WHILE_END   = 0xF914
    IF          = 0xF915
    IF_END      = 0xF916
    IFELSE      = 0xF917
    ELSE        = 0xF918
    IFELSE_END  = 0xF919

    # Variable assignment
    SETM        = 0xFB10
    SETA        = 0xFB12
    SETB        = 0xFB13
    SETC        = 0xFB14
    SETD        = 0xFB15
    SETE        = 0xFB16
    SETF        = 0xFB17
    SETX        = 0xFB18
    SETY        = 0xFB19

    # Miscellaneous
    PARAM       = 0x00
    COLON       = 0x23
    QUEST       = 0x25
    LPAR        = 0x60
    RPAR        = 0xD0
    EQUAL       = 0xA5
    PLUS        = 0xA6
    MINUS       = 0xA7
    STAR        = 0xA8
    SLASH       = 0xA9
    BANG        = 0xD8

    # Tokens with parameters
    CONST       = -1
    VAR         = -2
    REL         = -3


class Token:
    """A lexed token: a type (normally a member of T) plus optional arguments."""

    def __init__(self, type, *args):
        """Instantiate a new token.

        type -- token type, normally a member of T
        args -- parameters attached to the token (constant value, variable
                name, relation symbol); stored as a tuple
        """
        self.type = type
        self.args = args

    def __repr__(self):
        """Unambiguous token representation.

        Known types are shown by their T name; anything else is shown as
        "<Token:...>" so that an unexpected type is still visible.
        """
        try:
            base = T(self.type).name
        except ValueError:
            # Opcodes are easier to recognise in hexadecimal
            if isinstance(self.type, int):
                shown = hex(self.type)
            else:
                shown = repr(self.type)
            base = f"<Token:{shown}>"

        if self.args:
            args = "(" + ",".join(repr(arg) for arg in self.args) + ")"
        else:
            args = ""

        return base + args

#---
# Lexer
#---

class ByteLexer:
    """
    fx-92 SC+ language lexer with bytes() bitcode input.

    Call lex() repeatedly to obtain tokens; it returns a T.END token once
    the input is exhausted. Unknown opcodes are reported on standard
    output, counted in self.errors, and skipped.
    """

    # First byte of every two-byte opcode
    _PREFIXES = (0xF9, 0xFB)

    # Relational operators: second byte of 0xFB-prefixed opcodes that have
    # no member in T
    _RELS = {
        0x01: "<",
        0x02: ">",
        0x03: "!=",
        0x04: "<=",
        0x05: ">=",
    }

    # Numeric literal: digits, optional 0x2E-separated fraction, optional
    # 0x2D-introduced exponent, optional trailing 0x25
    _RE_CONST = re.compile(
        rb'([0-9]+(?:\x2E[0-9]*)?(?:\x2D[0-9]+)?)(%?)')

    def __init__(self, hex):
        """Initialize the lexer with input data.

        hex -- the program as a bytes-like object (indexing must yield ints)
        """
        self.hex = hex
        self.rewind()

    def rewind(self):
        """Restart lexing the same input and reset the error counter."""
        self.pos = 0
        self.errors = 0

    def lex(self):
        """Return the next token in the stream.

        Unknown opcodes are skipped iteratively (not recursively), so an
        arbitrarily long run of invalid bytes cannot exhaust the call
        stack. Returns Token(T.END) when the input is exhausted.
        """
        while not self.at_end():
            token = self._lex_one()
            if token is not None:
                return token

        return Token(T.END)

    def _lex_one(self):
        """Lex one opcode at self.pos; return None if it was unknown."""
        h, p = self.hex, self.pos

        # 2-byte commands
        if h[p] in self._PREFIXES:
            # Stop if there is no trailing byte
            if p >= len(h) - 1:
                print(f"[lexer] Invalid trailing byte {hex(h[p])}")
                # Consume the stray byte so that at_end() becomes true
                self.pos = len(h)
                return Token(T.END)

            # Return any value that is defined in the Token class
            code = (h[p] << 8) | h[p + 1]
            self.pos += 2

            try:
                return Token(T(code))
            except ValueError:
                pass

            # Also a few more values not in the Token class
            if h[p] == 0xFB and h[p + 1] in self._RELS:
                return Token(T.REL, self._RELS[h[p + 1]])

            return self._unknown(code)

        # Single-byte characters
        self.pos += 1
        code = h[p]

        # Translate unary minus to normal minus
        if code == 0xC0:
            code = 0xA7

        try:
            return Token(T(code))
        except ValueError:
            pass

        if code == 0x21:
            return Token(T.CONST, math.e)
        if code == 0x22:
            return Token(T.CONST, math.pi)

        # Constants
        if 0x30 <= code <= 0x39:
            # Match in place from offset p instead of slicing the input
            match = self._RE_CONST.match(h, p)
            if match is not None:
                text = match[1].replace(b'\x2E', b'.').replace(b'\x2D', b'e')
                # The first digit has already been consumed above
                self.pos += len(text) - 1

                f = float(text.decode('utf-8'))
                # NOTE(review): match[2] is bytes, so this comparison with
                # a str is never true and the division never happens. The
                # 0x25 byte is not consumed either and is lexed next as
                # T.QUEST. Left as-is; confirm how "%" is encoded before
                # changing it.
                if match[2] == "%":
                    f /= 100

                return Token(T.CONST, f)

        # Variables
        if 0x42 <= code <= 0x47:
            # 0x42..0x47 map to the letters "A".."F"
            return Token(T.VAR, chr(h[p] - 1))
        if code == 0x40:
            return Token(T.VAR, "M")
        if code == 0x48:
            return Token(T.VAR, "x")
        if code == 0x49:
            return Token(T.VAR, "y")

        return self._unknown(code)

    def _unknown(self, code):
        """Report and count an unknown opcode; return None to skip it."""
        print(f"[lexer] Unknown opcode {hex(code)}")
        self.errors += 1
        return None

    def at_end(self):
        """Check whether the whole input has been read."""
        return self.pos >= len(self.hex)