initial commit: most of the lexing and parsing work

The program is currently able to lex most useful tokens, and parse constructs associated with them on simple examples. Unit tests are still missing to formally ensure everything's right.
2019-09-30 11:29:05 +02:00 · 2019-09-30 11:29:05 +02:00 · 6151144d0a
commit 6151144d0a
7 changed files with 780 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+# Python bytecodes
+__pycache__
+
+# Semantic exclusion
+exclude
--- a/ast.py
+++ b/ast.py
@ -0,0 +1,74 @@
+# fx-92 Scientifique Collège+ language interpreter: AST definition
+
+import enum
+
+#---
+# Internal AST node representation
+#---
+
+@enum.unique
+class N(enum.IntEnum):
+    """
+    Kinds of AST nodes, used as the type tag of Node objects.
+
+    The numeric values are assigned sequentially by enum.auto() and carry no
+    meaning of their own; only the member identity matters.
+    """
+
+    # Core nodes
+    PROGRAM = enum.auto()
+
+    # Basic statements
+    FORWARD = enum.auto()
+    ROTATE  = enum.auto()
+    ORIENT  = enum.auto()
+    GOTO    = enum.auto()
+    PENDOWN = enum.auto()
+    PENUP   = enum.auto()
+    ASSIGN  = enum.auto()
+    INPUT   = enum.auto()
+    MESSAGE = enum.auto()
+    PRINT   = enum.auto()
+    STYLE   = enum.auto()
+    WAIT    = enum.auto()
+
+    # Flow control
+    REPEAT  = enum.auto()
+    WHILE   = enum.auto()
+    IFELSE  = enum.auto()
+
+    # Expressions (CONST and VAR are leaves holding a plain value)
+    ADD     = enum.auto()
+    SUB     = enum.auto()
+    MUL     = enum.auto()
+    DIV     = enum.auto()
+    MINUS   = enum.auto()
+    EXP     = enum.auto()
+    VAR     = enum.auto()
+    CONST   = enum.auto()
+
+#---
+# AST nodes
+#---
+
+class Node:
+    """
+    AST node: a type tag (normally a member of N) plus its arguments.
+
+    Arguments are either nested Node objects or plain values, such as the
+    number held by a CONST node or the name held by a VAR node. They are
+    always stored as a tuple in self.args.
+    """
+
+    def __init__(self, type, *args):
+        """Instantiate a new AST node."""
+        self.type = type
+        self.args = args
+
+    def __str__(self):
+        """Return "<Node:NAME>", or "<Node:0x..>" if the type is not in N."""
+        try:
+            name = N(self.type).name
+            return f"<Node:{name}>"
+        except ValueError:
+            return f"<Node:{hex(self.type)}>"
+
+    def simplify(self):
+        """
+        Simplify this subtree and return the node that should replace it.
+
+        Children are simplified first and stored back into self.args. Then
+        degenerate sums and products are collapsed: an ADD or MUL without
+        arguments becomes its neutral element (CONST 0 or CONST 1), and an
+        ADD or MUL with a single argument is replaced by that argument.
+        Every other node is returned as is.
+        """
+        # Keep args a tuple, as set by __init__, instead of turning it into
+        # a list on the first simplification
+        self.args = tuple(
+            arg.simplify() if isinstance(arg, Node) else arg
+            for arg in self.args)
+        arity = len(self.args)
+
+        if self.type in (N.ADD, N.MUL):
+            if arity == 0:
+                return Node(N.CONST, 0 if self.type == N.ADD else 1)
+            if arity == 1:
+                return self.args[0]
+
+        return self
--- a/doc/tokens.txt
+++ b/doc/tokens.txt
@ -0,0 +1,137 @@
+21 : e
+22 : π
+23 : :
+25 : ?
+2C : ;
+2D : ×10
+2E : ,
+30 : 0
+31 : 1
+32 : 2
+33 : 3
+34 : 4
+35 : 5
+36 : 6
+37 : 7
+38 : 8
+39 : 9
+40 : M
+41 : Rép (Ans)
+42 : A
+43 : B
+44 : C
+45 : D
+46 : E
+47 : F
+48 : x
+49 : y
+4A : Pré-Rép
+4C : θ
+60 : (
+68 : Abs(
+69 : Rnd(
+6C : sinh(
+6D : cosh(
+6E : tanh(
+6F : sinh⁻¹(
+70 : cosh⁻¹(
+71 : tanh⁻¹(
+72 : e^
+73 : 10^
+74 : √(
+75 : ln(
+76 : ³√(
+77 : sin(
+78 : cos(
+79 : tan(
+7A : Arcsin(
+7B : Arccos(
+7C : Arctan(
+7D : log(
+7E : Pol
+7F : Rec
+83 : Ent(
+84 : EntEx(
+87 : RanInt#(
+88 : PGCD(
+89 : PPCM(
+8A : Arond(
+A5 : =
+A6 : +
+A7 : -
+A8 : ×
+A9 : ÷
+AA : ⊢
+AD : P
+AE : C
+C0 : -
+C8 : ⌋
+C9 : ^(
+CA : [x]√(
+D0 : )
+D4 : ⁻¹
+D5 : ²
+D6 : ³
+D7 : %
+D8 : !
+D9 : °
+DA : ʳ
+DB : ᵍ
+DC : °
+DD : E
+DE : P
+DF : T
+E0 : G
+E1 : M
+E2 : k
+E3 : m
+E4 : μ
+E5 : n
+E6 : p
+E7 : f
+E9 : ▶Simp
+F901 : end of line
+F902 : end of program
+F903 : nop / empty line
+F905 ... 00 : Avancer de ...
+F906 ... 00 : Tourner de ↺ ...
+F907 ... 00 : S'orienter à ...
+F908 ... 00 ... 00 : Aller à x=... ; y=...
+F909 : Stylo écrit
+F90A : Stylo relevé
+F90B ... 00 ... 00 : ... → ... (mettre var à)
+F90C ... 00 : ? → ... (Demander valeur)
+F90D3100 : "Oui"
+F90D3200 : "Non"
+F90D3300 : "Nombre?"
+F90D3400 : "Résultat:"
+F90E ... 00 : Afficher résult ...
+F90F3100 : Style Flèche
+F90F3200 : Style Croix
+F910 : Attendre
+F911 ... 00 : Répéter ...
+F912 : ⤴ (end of Répéter)
+F913 ... 00 : Répéter jusqu'à ...
+F914 : ⤴ (end of Répéter jusqu'à)
+F915 ... 00 : Si ... Alors [... Fin]
+F916 : Fin (end of Si/Alors)
+F917 ... 00 : Si ... Alors [... Sinon ... Fin]
+F918 : Sinon
+F919 : Fin (end of Si/Alors/Sinon)
+FB01 : <
+FB02 : >
+FB03 : ≠
+FB04 : ≤
+FB05 : ≥
+FB10 : →M
+FB12 : →A
+FB13 : →B
+FB14 : →C
+FB15 : →D
+FB16 : →E
+FB17 : →F
+FB18 : →x
+FB19 : →y
+FB1A : M+
+FB1B : M-
+FD18 : Ran#
--- a/fx92.py
+++ b/fx92.py
@ -0,0 +1,43 @@
+#! /usr/bin/python3
+
+import sys
+
+from parser import UrlParser
+from printer import print_ast
+
+#---
+# fx-92 SC+ interpreter
+#---
+
+pass
+
+#---
+# Main program
+#---
+
+# Usage text; the program name is taken from the command line
+usage_string = f"usage: {sys.argv[0]} <wes.casio.com URL>"
+
+def usage(exitcode=None):
+    """Print the usage text on stderr, then exit unless exitcode is None."""
+    print(usage_string, file=sys.stderr)
+
+    if exitcode is None:
+        return
+    sys.exit(exitcode)
+
+def main(argv):
+    """
+    Parse the program given as the only command-line argument, simplify
+    its AST and print it. argv is the full argument vector, program name
+    included.
+    """
+    args = argv[1:]
+    help_requested = any(flag in args for flag in ("-h", "--help", "-?"))
+
+    # usage() exits the program when given an exit code
+    if not args or help_requested:
+        usage(0)
+    if len(args) != 1:
+        usage(1)
+
+    tree = UrlParser(args[0]).parse_program().simplify()
+    print_ast(tree, lang="fr")
+
+if __name__ == "__main__":
+    main(sys.argv)
--- a/lexer.py
+++ b/lexer.py
@ -0,0 +1,200 @@
+# fx-92 Scientifique Collège+ language interpreter: Lexical analysis
+
+import math
+import re
+import enum
+
+#---
+# Token description
+#---
+
+@enum.unique
+class T(enum.IntEnum):
+    """
+    Token types produced by the lexer.
+
+    Non-negative members use the opcode of the token in the program
+    bytecode as their value (one byte, or two bytes for the 0xF9 and 0xFB
+    prefixed commands), which lets the lexer build them with T(code).
+    Negative members do not correspond to a single opcode.
+    """
+
+    # Basic commands
+    EOL = 0xF901
+    END = 0xF902
+    NOP = 0xF903
+
+    # Basic statements
+    FORWARD = 0xF905
+    ROTATE  = 0xF906
+    ORIENT  = 0xF907
+    GOTO    = 0xF908
+    PENDOWN = 0xF909
+    PENUP   = 0xF90A
+    SETVAR  = 0xF90B
+    INPUT   = 0xF90C
+    MESSAGE = 0xF90D
+    PRINT   = 0xF90E
+    STYLE   = 0xF90F
+    WAIT    = 0xF910
+
+    # Flow control
+    REPEAT     = 0xF911
+    REPEAT_END = 0xF912
+    WHILE      = 0xF913
+    WHILE_END  = 0xF914
+    IF         = 0xF915
+    IF_END     = 0xF916
+    IFELSE     = 0xF917
+    ELSE       = 0xF918
+    IFELSE_END = 0xF919
+
+    # Variable assignment
+    SETM = 0xFB10
+    SETA = 0xFB12
+    SETB = 0xFB13
+    SETC = 0xFB14
+    SETD = 0xFB15
+    SETE = 0xFB16
+    SETF = 0xFB17
+    SETX = 0xFB18
+    SETY = 0xFB19
+
+    # Miscellaneous
+    # PARAM is the 0x00 byte that terminates each statement argument
+    PARAM  = 0x00
+    COLON  = 0x23
+    QUEST  = 0x25
+    LPAR   = 0x60
+    RPAR   = 0xD0
+    EQUAL  = 0xA5
+    PLUS   = 0xA6
+    MINUS  = 0xA7
+    STAR   = 0xA8
+    SLASH  = 0xA9
+    BANG   = 0xD8
+
+    # Tokens with parameters; these are built by the lexer together with a
+    # value (number, variable name or relation symbol) stored in Token.args
+    CONST = -1
+    VAR   = -2
+    REL   = -3
+
+class Token:
+    """A lexed token: a type (normally a member of T) plus optional values."""
+
+    def __init__(self, type, *args):
+        """Instantiate a new token."""
+        self.type = type
+        self.args = args
+
+    def __repr__(self):
+        """Unambiguous token representation."""
+        try:
+            base = T(self.type).name
+        except ValueError:
+            # Not a known token type: fall back to the raw value
+            base = f"<Token:{hex(self.type)}>"
+
+        if self.args:
+            args = "(" + ",".join(repr(arg) for arg in self.args) + ")"
+        else:
+            args = ""
+
+        return base + args
+
+#---
+# Lexer
+#---
+
+class ByteLexer:
+    """
+    fx-92 SC+ language lexer with bytes() bytecode input.
+
+    The lexer keeps a read position (self.pos) in the input and counts the
+    unknown opcodes it had to skip (self.errors).
+    """
+
+    # Relational operators, encoded as 0xFB followed by one of these bytes
+    _RELS = { 0x01: "<", 0x02: ">", 0x03: "!=", 0x04: "<=", 0x05: ">=" }
+
+    # Numeric literal: digits, optional decimal part (0x2E), optional
+    # "x10" exponent (0x2D), then an optional percent sign. The percent
+    # sign is the byte 0xD7; ASCII '%' (0x25) is the "?" token.
+    _RE_CONST = re.compile(
+        rb'([0-9]+(?:\x2E[0-9]*)?(?:\x2D[0-9]+)?)(\xD7?)')
+
+    def __init__(self, hex):
+        """Initialize the lexer with input data (a bytes object)."""
+        self.hex = hex
+        self.rewind()
+
+    def rewind(self):
+        """Restart lexing the same input."""
+        self.pos = 0
+        self.errors = 0
+
+    def lex(self):
+        """
+        Return the next token in the stream.
+
+        Unknown opcodes are reported on stdout, counted in self.errors and
+        skipped. Once the input is exhausted, every call returns an END
+        token.
+        """
+        # Loop instead of recursing, so that a long run of unknown bytes
+        # cannot exhaust the recursion limit
+        while not self.at_end():
+            token = self._lex_one()
+            if token is not None:
+                return token
+
+        return Token(T.END)
+
+    def _lex_one(self):
+        """Lex the token at self.pos; return None for an unknown opcode."""
+        h, p = self.hex, self.pos
+
+        # 2-byte commands
+
+        if h[p] in (0xF9, 0xFB):
+            # Stop if there is no trailing byte; the position must be moved
+            # to the end, otherwise at_end() would never become true
+            if p >= len(h) - 1:
+                print(f"[lexer] Invalid trailing byte {hex(h[p])}")
+                self.pos = len(h)
+                return Token(T.END)
+
+            # Return any value that is defined in the Token class
+            code = (h[p] << 8) | h[p+1]
+            self.pos += 2
+
+            try:
+                return Token(T(code))
+            except ValueError:
+                pass
+
+            # Also a few more values not in the Token class
+            if h[p] == 0xFB and h[p+1] in self._RELS:
+                return Token(T.REL, self._RELS[h[p+1]])
+
+            print(f"[lexer] Unknown opcode {hex(code)}")
+            self.errors += 1
+            return None
+
+        # Single-byte characters
+
+        self.pos += 1
+        code = h[p]
+
+        # Translate unary minus to normal minus
+        if code == 0xC0:
+            code = 0xA7
+
+        try:
+            return Token(T(code))
+        except ValueError:
+            pass
+
+        if code == 0x21:
+            return Token(T.CONST, math.e)
+        if code == 0x22:
+            return Token(T.CONST, math.pi)
+
+        # Constants
+        if 0x30 <= code <= 0x39:
+            # The byte at p is a digit, so the pattern always matches
+            match = self._RE_CONST.match(h, p)
+            digits, percent = match.group(1), match.group(2)
+
+            # Consume the whole literal, percent sign included
+            self.pos = p + len(match.group(0))
+
+            # 0x2E already is an ASCII '.', only the exponent mark (0x2D)
+            # needs to be rewritten for float()
+            text = digits.replace(b'\x2D', b'e')
+            f = float(text.decode('ascii'))
+            if percent:
+                f /= 100
+            return Token(T.CONST, f)
+
+        # Variables; opcodes 0x42..0x47 are the letters A..F shifted by one
+        if 0x42 <= code <= 0x47:
+            return Token(T.VAR, chr(code - 1))
+        if code == 0x40:
+            return Token(T.VAR, "M")
+        if code == 0x48:
+            return Token(T.VAR, "x")
+        if code == 0x49:
+            return Token(T.VAR, "y")
+
+        print(f"[lexer] Unknown opcode {hex(code)}")
+        self.errors += 1
+        return None
+
+    def at_end(self):
+        """Check whether the whole input has been read."""
+        return self.pos >= len(self.hex)
+
--- a/parser.py
+++ b/parser.py
@ -0,0 +1,283 @@
+# fx-92 Scientifique Collège+ language interpreter: Syntactic analysis
+
+import re
+from lexer import T, Token, ByteLexer
+from ast import N, Node
+
+#---
+# LL(1) parser
+#---
+
+class UrlParser:
+    """
+    fx-92 SC+ language parser with a wes.casio.com URL or hexadecimal input.
+    The URLs are typically in this form:
+
+        http://wes.casio.com/math/index.php?q=I-295A+U-000000000000+M-0E0000
+        0000+S-000410110000100E0010B300D365+E-{code...}
+
+    The program can also be provided in text hexadecimal form, which is
+    everything following the "+E-" in the URL.
+
+    This is your everyday LL(1) top-down parser. It implements the following
+    formal grammar:
+
+    program -> stmt*
+    stmt    -> stmt2 EOL?
+    stmt2   -> FORWARD arg | ROTATE arg | ORIENT arg | GOTO arg arg |
+               PENDOWN | PENUP | SETVAR arg argvar | INPUT argvar |
+               MESSAGE arg | PRINT arg | STYLE style | WAIT arg |
+               REPEAT arg EOL program REPEAT_END |
+               WHILE arg EOL program WHILE_END |
+               IF arg EOL program IF_END |
+               IFELSE arg EOL program ELSE EOL? program IFELSE_END
+    arg     -> expr PARAM
+    argvar  -> var PARAM
+
+    expr    -> factor ((+|-) factor)*
+    factor  -> atom ((*|/) atom)*
+    atom    -> const (var | "(" expr ")")* | (var | "(" expr ")")+
+    const   -> (+|-)? CONST
+    var     -> VAR
+
+    Binary operators are left-associative: a-b-c is parsed as (a-b)-c.
+
+    # Unused or TODO
+    style   -> (TODO)
+    setvar  -> SETM | SETA | SETB | SETC | SETD | SETE | SETF | SETX | SETY
+    """
+
+    # Statements made of an opcode followed by a single expression argument
+    _ONE_ARG_STMTS = {
+        T.FORWARD: N.FORWARD, T.ROTATE: N.ROTATE, T.ORIENT: N.ORIENT,
+        T.MESSAGE: N.MESSAGE, T.PRINT:  N.PRINT,  T.WAIT:   N.WAIT,
+    }
+
+    def __init__(self, url):
+        """
+        Create a UrlParser from a wes.casio.com URL or hexadecimal code.
+
+        Raises an Exception if a URL is given that has no "+E-" section.
+        """
+
+        if url.startswith(("http://", "https://", "wes.casio.com")):
+            print("[urlparser] URL includes protocol, will start after '+E-'")
+            offset = url.find("+E-")
+
+            if offset < 0:
+                print("[urlparser] '+E-' not found, cannot decode URL")
+                raise Exception("Cannot decode URL")
+
+            url = url[offset+3:]
+
+        if not re.fullmatch(r'(?:[0-9a-fA-F]{2})+', url):
+            print("[urlparser] URL is not strict hexa, noise will be skipped")
+            # Actually skip the noise: bytes.fromhex() raises ValueError on
+            # anything but hexadecimal digit pairs and whitespace
+            url = re.sub(r'[^0-9a-fA-F]', '', url)
+            # A dangling half-byte cannot be decoded either
+            url = url[:len(url) - len(url) % 2]
+
+        # Create the lexer and initialize the lookahead token
+        self.lexer = ByteLexer(bytes.fromhex(url))
+        self.la = None
+
+    def lex_program(self):
+        """Lex the whole input and print every token (debugging helper)."""
+        self.lexer.rewind()
+
+        while not self.lexer.at_end():
+            x = self.lexer.lex()
+            print(x)
+
+    def parse_program(self):
+        """Parse the whole input from the start; return a PROGRAM node."""
+        self.lexer.rewind()
+        self.la = None
+        self.advance()
+        return self.program()
+
+    #---
+    # LL parsing helpers
+    #---
+
+    def advance(self):
+        """Return the next token and feed the lookahead."""
+        current = self.la
+        self.la = self.lexer.lex()
+        return current
+
+    def expect(self, *types, optional=False):
+        """
+        Read the next token, expecting a type specified in *types.
+
+        Returns the token if its type is in *types. Otherwise, returns None
+        without consuming anything if optional is set, and raises an
+        Exception describing the syntax error if not.
+        """
+
+        if self.la.type in types:
+            return self.advance()
+        if optional:
+            return None
+
+        expected = [T(t).name for t in types]
+        got = T(self.la.type).name
+        err = f"Expected one of {expected}, got {got}"
+        print("[urlparser] " + err)
+        raise Exception("Syntax error: " + err)
+
+    #---
+    # LL parsing rules
+    #---
+
+    # program -> stmt*
+    def program(self):
+        stmts = []
+
+        while 1:
+            stmt = self.stmt(optional=True)
+            if stmt is None:
+                break
+            stmts.append(stmt)
+
+        return Node(N.PROGRAM, *stmts)
+
+    # stmt -> stmt2 EOL?
+    def stmt(self, optional=False):
+        st = self.stmt2(optional=optional)
+        self.expect(T.EOL, optional=True)
+        return st
+
+    # stmt2 -> (lots of cases)
+    def stmt2(self, optional=False):
+        valid = [
+            T.FORWARD, T.ROTATE, T.ORIENT,  T.GOTO,  T.PENDOWN, T.PENUP,
+            T.SETVAR,  T.INPUT,  T.MESSAGE, T.PRINT, T.STYLE,   T.WAIT,
+            T.REPEAT,  T.WHILE,  T.IF,      T.IFELSE,
+        ]
+        op = self.expect(*valid, optional=optional)
+
+        if op is None:
+            return None
+
+        # Basic statements
+        if op.type in self._ONE_ARG_STMTS:
+            return Node(self._ONE_ARG_STMTS[op.type], self.arg())
+        if op.type == T.GOTO:
+            x = self.arg()
+            y = self.arg()
+            return Node(N.GOTO, x, y)
+        if op.type == T.PENDOWN:
+            return Node(N.PENDOWN)
+        if op.type == T.PENUP:
+            return Node(N.PENUP)
+        if op.type == T.SETVAR:
+            value = self.arg()
+            target = self.argvar()
+            return Node(N.ASSIGN, value, target)
+        if op.type == T.INPUT:
+            return Node(N.INPUT, self.argvar())
+        if op.type == T.STYLE:
+            return Node(N.STYLE, self.style())
+
+        # Flow control
+
+        if op.type == T.REPEAT:
+            arg = self.arg()
+            self.expect(T.EOL)
+            prg = self.program()
+            self.expect(T.REPEAT_END)
+            return Node(N.REPEAT, arg, prg)
+
+        if op.type == T.WHILE:
+            arg = self.arg()
+            self.expect(T.EOL)
+            prg = self.program()
+            self.expect(T.WHILE_END)
+            return Node(N.WHILE, arg, prg)
+
+        # Both conditional forms produce an IFELSE node (the AST has no
+        # separate IF node); the else-branch is None for a plain IF
+        if op.type == T.IF:
+            arg = self.arg()
+            self.expect(T.EOL)
+            prg = self.program()
+            self.expect(T.IF_END)
+            return Node(N.IFELSE, arg, prg, None)
+
+        if op.type == T.IFELSE:
+            arg = self.arg()
+            self.expect(T.EOL)
+            p1 = self.program()
+            self.expect(T.ELSE)
+            # An EOL right after ELSE would otherwise end the else-branch
+            # before its first statement
+            self.expect(T.EOL, optional=True)
+            p2 = self.program()
+            self.expect(T.IFELSE_END)
+            return Node(N.IFELSE, arg, p1, p2)
+
+    # arg -> expr PARAM
+    def arg(self):
+        e = self.expr()
+        self.expect(T.PARAM)
+        return e
+
+    # expr -> factor ((+|-) factor)*
+    def expr(self):
+        # Fold to the left so that a-b-c means (a-b)-c
+        node = self.factor()
+
+        while 1:
+            t = self.expect(T.PLUS, T.MINUS, optional=True)
+            if t is None:
+                return node
+
+            kind = N.ADD if t.type == T.PLUS else N.SUB
+            node = Node(kind, node, self.factor())
+
+    # factor -> atom ((*|/) atom)*
+    def factor(self):
+        # Fold to the left so that a/b/c means (a/b)/c
+        node = self.atom()
+
+        while 1:
+            t = self.expect(T.STAR, T.SLASH, optional=True)
+            if t is None:
+                return node
+
+            kind = N.MUL if t.type == T.STAR else N.DIV
+            node = Node(kind, node, self.atom())
+
+    # atom -> const (VAR | "(" expr ")")* | (VAR | "(" expr ")")+
+    def atom(self):
+        # Juxtaposed operands are an implicit product; Node.simplify()
+        # later collapses products of zero or one operand
+        factors = []
+        lat = self.la.type
+
+        # Case of constants
+        if lat == T.PLUS or lat == T.MINUS or lat == T.CONST:
+            factors.append(self.const())
+
+        while 1:
+            lat = self.la.type
+
+            if lat == T.VAR:
+                factors.append(self.var())
+            elif lat == T.LPAR:
+                self.expect(T.LPAR)
+                factors.append(self.expr())
+                self.expect(T.RPAR)
+            else:
+                break
+
+        return Node(N.MUL, *factors)
+
+    # const -> (+|-)? CONST
+    def const(self):
+        t = self.expect(T.PLUS, T.MINUS, optional=True)
+        const = self.expect(T.CONST).args[0]
+
+        const = Node(N.CONST, const)
+        if t and t.type == T.MINUS:
+            const = Node(N.MINUS, const)
+
+        return const
+
+    # argvar -> var PARAM
+    def argvar(self):
+        n = self.var()
+        self.expect(T.PARAM)
+        return n
+
+    # var -> VAR
+    def var(self):
+        t = self.expect(T.VAR)
+        return Node(N.VAR, t.args[0])
+
+    # setvar -> SETM | SETA | ... | SETF | SETX | SETY
+    def setvar(self):
+        raise Exception("SetVar not supported yet x_x")
+
+    # style -> (TODO)
+    def style(self):
+        raise Exception("Style not supported yet x_x")
+
--- a/printer.py
+++ b/printer.py
@ -0,0 +1,38 @@
+# fx-92 Scientifique Collège+ language interpreter: AST printer
+
+from ast import N, Node
+__all__ = ["print_ast"]
+
+#---
+# Message definitions
+#---
+
+class MessageFrench:
+    """
+    Format strings for French output. print_ast() selects this class for
+    lang="fr" but does not read any of its attributes yet.
+    """
+    multiply = "mul({})"
+    goto = "goto {}, {}"
+
+class MessageEnglish:
+    """Placeholder for English output; no message is defined yet."""
+    pass
+
+#---
+# Printer
+#---
+
+def print_ast(n, lang="en", indent=0):
+    """
+    Print the tree rooted at n on stdout, one node per line, with children
+    indented two more columns than their parent.
+
+    lang is "fr", "en" or a message class; it is resolved to a message
+    class and handed down to the recursive calls. indent is the number of
+    spaces printed before the node.
+    """
+    if lang == "fr":
+        lang = MessageFrench
+    elif lang == "en":
+        lang = MessageEnglish
+
+    print(" " * indent, end="")
+
+    if not isinstance(n, Node):
+        # Plain value stored directly in a node: show its type as well
+        print(f"{type(n)}({n})")
+    elif n.type == N.CONST:
+        print(n.args[0])
+    elif n.type == N.VAR:
+        print(f"VAR({n.args[0]})")
+    else:
+        print(f"{n.type.name}")
+        for child in n.args:
+            print_ast(child, lang=lang, indent=indent + 2)