initial commit: most of the lexing and parsing work

The program is currently able to lex most useful tokens, and parse
constructs associated with them on simple examples.

Unit tests are still missing to formally ensure everything's right.
This commit is contained in:
Lephe 2019-09-30 11:29:05 +02:00
commit 6151144d0a
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
7 changed files with 780 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
# Python bytecodes
__pycache__
# Semantic exclusion
exclude

74
ast.py Normal file
View File

@ -0,0 +1,74 @@
# fx-92 Scientifique Collège+ language interpreter: AST definition
import enum
#---
# Internal AST node representation
#---
@enum.unique
class N(enum.IntEnum):
    """Kinds of AST nodes.

    Values are handed out by enum.auto() in declaration order, so only the
    member names are meaningful; do not rely on the numbers.
    """

    # Core nodes: a sequence of statements
    PROGRAM = enum.auto()

    # Basic statements
    FORWARD = enum.auto()
    ROTATE = enum.auto()
    ORIENT = enum.auto()
    GOTO = enum.auto()
    PENDOWN = enum.auto()
    PENUP = enum.auto()
    ASSIGN = enum.auto()
    INPUT = enum.auto()
    MESSAGE = enum.auto()
    PRINT = enum.auto()
    STYLE = enum.auto()
    WAIT = enum.auto()

    # Flow control
    REPEAT = enum.auto()
    WHILE = enum.auto()
    IFELSE = enum.auto()

    # Expressions
    ADD = enum.auto()
    SUB = enum.auto()
    MUL = enum.auto()
    DIV = enum.auto()
    MINUS = enum.auto()
    EXP = enum.auto()
    VAR = enum.auto()
    CONST = enum.auto()
#---
# AST nodes
#---
class Node:
    """A node of the abstract syntax tree: a type tag plus its arguments."""

    def __init__(self, type, *args):
        """Instantiate a new AST node.

        type -- kind of node, normally a member of N
        args -- arguments of the node; either nested Node objects or raw
                values (numbers, variable names, None...)
        """
        self.type = type
        self.args = args

    def __str__(self):
        """Return "<Node:NAME>", or the hexadecimal type if it is not in N."""
        try:
            name = N(self.type).name
            return f"<Node:{name}>"
        except ValueError:
            return f"<Node:{hex(self.type)}>"

    def simplify(self):
        """Simplify the subtree in place and return its new root.

        Arguments that are nodes are simplified recursively. Then degenerate
        sums and products are collapsed: with no operand they become the
        neutral constant (0 for ADD, 1 for MUL), with a single operand they
        are replaced by that operand. Any other node is returned as is.

        The result may be a different object than self, so callers must use
        the return value.
        """
        # Keep args a tuple, as set by __init__, so that its type does not
        # depend on whether the node went through simplify().
        self.args = tuple(
            arg.simplify() if isinstance(arg, Node) else arg
            for arg in self.args
        )
        arity = len(self.args)

        if self.type == N.MUL and arity == 0:
            return Node(N.CONST, 1)
        if self.type == N.MUL and arity == 1:
            return self.args[0]

        if self.type == N.ADD and arity == 0:
            return Node(N.CONST, 0)
        if self.type == N.ADD and arity == 1:
            return self.args[0]

        return self

137
doc/tokens.txt Normal file
View File

@ -0,0 +1,137 @@
21 : e
22 : π
23 : :
25 : ?
2C : ;
2D : ×10
2E : ,
30 : 0
31 : 1
32 : 2
33 : 3
34 : 4
35 : 5
36 : 6
37 : 7
38 : 8
39 : 9
40 : M
41 : Rép (Ans)
42 : A
43 : B
44 : C
45 : D
46 : E
47 : F
48 : x
49 : y
4A : Pré-Rép
4C : θ
60 : (
68 : Abs(
69 : Rnd(
6C : sinh(
6D : cosh(
6E : tanh(
6F : sinh⁻¹(
70 : cosh⁻¹(
71 : tanh⁻¹(
72 : e^
73 : 10^
74 : √(
75 : ln(
76 : ³√(
77 : sin(
78 : cos(
79 : tan(
7A : Arcsin(
7B : Arccos(
7C : Arctan(
7D : log(
7E : Pol
7F : Rec
83 : Ent(
84 : EntEx(
87 : RanInt#(
88 : PGCD(
89 : PPCM(
8A : Arond(
A5 : =
A6 : +
A7 : -
A8 : ×
A9 : ÷
AA : ⊢
AD : P
AE : C
C0 : -
C8 : ⌋
C9 : ^(
CA : [x]√(
D0 : )
D4 : ⁻¹
D5 : ²
D6 : ³
D7 : %
D8 : !
D9 : °
DA : ʳ
DB : ᵍ
DC : °
DD : E
DE : P
DF : T
E0 : G
E1 : M
E2 : k
E3 : m
E4 : μ
E5 : n
E6 : p
E7 : f
E9 : ▶Simp
F901 : end of line
F902 : end of program
F903 : nop / empty line
F905 ... 00 : Avancer de ...
F906 ... 00 : Tourner de ↺ ...
F907 ... 00 : S'orienter à ...
F908 ... 00 ... 00 : Aller à x=... ; y=...
F909 : Stylo écrit
F90A : Stylo relevé
F90B ... 00 ... 00 : ... → ... (mettre var à)
F90C ... 00 : ? → ... (Demander valeur)
F90D3100 : "Oui"
F90D3200 : "Non"
F90D3300 : "Nombre?"
F90D3400 : "Résultat:"
F90E ... 00 : Afficher résult ...
F90F3100 : Style Flèche
F90F3200 : Style Croix
F910 : Attendre
F911 ... 00 : Répéter ...
F912 : ⤴ (end of Répéter)
F913 ... 00 : Répéter jusqu'à ...
F914 : ⤴ (end of Répéter jusqu'à)
F915 ... 00 : Si ... Alors [... Fin]
F916 : Fin (end of Si/Alors)
F917 ... 00 : Si ... Alors [... Sinon ... Fin]
F918 : Sinon
F919 : Fin (end of Si/Alors/Sinon)
FB01 : <
FB02 : >
FB03 : ≠
FB04 : ≤
FB05 : ≥
FB10 : →M
FB12 : →A
FB13 : →B
FB14 : →C
FB15 : →D
FB16 : →E
FB17 : →F
FB18 : →x
FB19 : →y
FB1A : M+
FB1B : M-
FD18 : Ran#

43
fx92.py Executable file
View File

@ -0,0 +1,43 @@
#! /usr/bin/python3
import sys
from parser import UrlParser
from printer import print_ast
#---
# fx-92 SC+ interpreter
#---
pass
#---
# Main program
#---
# One-line usage summary; the program name is read from sys.argv at import time
usage_string = f"usage: {sys.argv[0]} <wes.casio.com URL>"


def usage(exitcode=None):
    """Print the usage summary on stderr.

    When exitcode is not None, the process then exits with that status;
    otherwise the function simply returns.
    """
    print(usage_string, file=sys.stderr)
    if exitcode is None:
        return
    sys.exit(exitcode)
def main(argv):
    """Parse the program given on the command line and print its AST.

    argv -- full argument vector, program name included. Exactly one
            argument (URL or hexadecimal code) is expected after the name.

    With no argument or with a help flag, the usage is printed and the
    process exits with status 0; with more than one argument it exits with
    status 1.
    """
    args = argv[1:]

    help_requested = any(flag in args for flag in ("-h", "--help", "-?"))
    if not args or help_requested:
        usage(0)
    if len(args) != 1:
        usage(1)

    tree = UrlParser(args[0]).parse_program()
    tree = tree.simplify()
    print_ast(tree, lang="fr")


if __name__ == "__main__":
    main(sys.argv)

200
lexer.py Normal file
View File

@ -0,0 +1,200 @@
# fx-92 Scientifique Collège+ language interpreter: Lexical analysis
import math
import re
import enum
#---
# Token description
#---
@enum.unique
class T(enum.IntEnum):
    """Token types.

    Positive values are the raw opcodes found in the byte stream (two bytes
    for commands, one byte for characters), so a token type can be built
    directly from the bytes with T(code). Negative values are synthetic
    types for tokens that carry a parameter.
    """

    # Basic commands
    EOL = 0xF901
    END = 0xF902
    NOP = 0xF903

    # Basic statements
    FORWARD = 0xF905
    ROTATE = 0xF906
    ORIENT = 0xF907
    GOTO = 0xF908
    PENDOWN = 0xF909
    PENUP = 0xF90A
    SETVAR = 0xF90B
    INPUT = 0xF90C
    MESSAGE = 0xF90D
    PRINT = 0xF90E
    STYLE = 0xF90F
    WAIT = 0xF910

    # Flow control
    REPEAT = 0xF911
    REPEAT_END = 0xF912
    WHILE = 0xF913
    WHILE_END = 0xF914
    IF = 0xF915
    IF_END = 0xF916
    IFELSE = 0xF917
    ELSE = 0xF918
    IFELSE_END = 0xF919

    # Variable assignment
    SETM = 0xFB10
    SETA = 0xFB12
    SETB = 0xFB13
    SETC = 0xFB14
    SETD = 0xFB15
    SETE = 0xFB16
    SETF = 0xFB17
    SETX = 0xFB18
    SETY = 0xFB19

    # Miscellaneous single-byte characters
    PARAM = 0x00
    COLON = 0x23
    QUEST = 0x25
    LPAR = 0x60
    RPAR = 0xD0
    EQUAL = 0xA5
    PLUS = 0xA6
    MINUS = 0xA7
    STAR = 0xA8
    SLASH = 0xA9
    BANG = 0xD8

    # Tokens with parameters (not opcodes)
    CONST = -1
    VAR = -2
    REL = -3
class Token:
    """A lexical token: a type tag plus optional parameter values."""

    def __init__(self, type, *args):
        """Instantiate a new token."""
        self.type, self.args = type, args

    def __repr__(self):
        """Unambiguous token representation."""
        try:
            label = T(self.type).name
        except ValueError:
            # Not a known token type: fall back to the raw value
            label = f"<Token:{hex(self.type)}>"

        if not self.args:
            return label + ""
        return label + "(" + ",".join(map(repr, self.args)) + ")"
#---
# Lexer
#---
class ByteLexer:
    """
    fx-92 SC+ language lexer with bytes() bytecode input.

    Tokens are read one at a time with lex(). Unknown opcodes are reported on
    stdout, counted in self.errors, and skipped.
    """

    # Relational operators: byte following the 0xFB prefix -> operator text
    _RELS = {0x01: "<", 0x02: ">", 0x03: "!=", 0x04: "<=", 0x05: ">="}

    # Numeric literal: digits, optional decimal separator (0x2E) and
    # fraction, optional "×10" (0x2D) exponent, optional percent sign (0xD7)
    _RE_CONST = re.compile(
        rb'([0-9]+(?:\x2E[0-9]*)?(?:\x2D[0-9]+)?)(\xD7?)')

    def __init__(self, hex):
        """Initialize the lexer with input data (a bytes object)."""
        self.hex = hex
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.pos = 0
        self.errors = 0

    def lex(self):
        """Return the next token in the stream.

        Returns a Token(T.END) once the input is exhausted. Bytes that do not
        form a known token are skipped; a loop is used rather than recursion
        so that long runs of garbage cannot exhaust the call stack.
        """
        while not self.at_end():
            if self.hex[self.pos] in (0xF9, 0xFB):
                token = self._lex_command()
            else:
                token = self._lex_char()
            if token is not None:
                return token
        return Token(T.END)

    def at_end(self):
        """Check whether the whole input has been read."""
        return self.pos >= len(self.hex)

    def _unknown(self, code):
        """Report an unknown opcode; return None so that lex() moves on."""
        print(f"[lexer] Unknown opcode {hex(code)}")
        self.errors += 1
        return None

    def _lex_command(self):
        """Lex a 2-byte command starting with 0xF9 or 0xFB."""
        h, p = self.hex, self.pos

        # Stop if there is no trailing byte. The position must be moved to
        # the end, otherwise at_end() never becomes true and callers that
        # loop until the end of input would spin forever.
        if p >= len(h) - 1:
            print(f"[lexer] Invalid trailing byte {hex(h[p])}")
            self.errors += 1
            self.pos = len(h)
            return Token(T.END)

        prefix, suffix = h[p], h[p+1]
        code = (prefix << 8) | suffix
        self.pos += 2

        # Return any value that is defined in the T enumeration
        try:
            return Token(T(code))
        except ValueError:
            pass

        # Also a few more values not in the T enumeration
        if prefix == 0xFB and suffix in self._RELS:
            return Token(T.REL, self._RELS[suffix])

        return self._unknown(code)

    def _lex_char(self):
        """Lex a single-byte character, or a numeric literal."""
        h, p = self.hex, self.pos
        self.pos += 1
        code = h[p]

        # Translate unary minus to normal minus
        if code == 0xC0:
            code = 0xA7

        try:
            return Token(T(code))
        except ValueError:
            pass

        if code == 0x21:
            return Token(T.CONST, math.e)
        if code == 0x22:
            return Token(T.CONST, math.pi)

        # Constants
        if 0x30 <= code <= 0x39:
            # Cannot fail: the byte at position p is a digit
            match = self._RE_CONST.match(h, p)
            text = match[1].replace(b'\x2E', b'.').replace(b'\x2D', b'e')
            # Consume the whole literal, percent sign included
            self.pos = match.end()
            value = float(text.decode('ascii'))
            if match[2]:
                value /= 100
            return Token(T.CONST, value)

        # Variables: opcodes 0x42..0x47 are A..F, one above their ASCII code
        if 0x42 <= code <= 0x47:
            return Token(T.VAR, chr(code - 1))
        if code == 0x40:
            return Token(T.VAR, "M")
        if code == 0x48:
            return Token(T.VAR, "x")
        if code == 0x49:
            return Token(T.VAR, "y")

        return self._unknown(code)

283
parser.py Normal file
View File

@ -0,0 +1,283 @@
# fx-92 Scientifique Collège+ language interpreter: Syntactic analysis
import re
from lexer import T, Token, ByteLexer
from ast import N, Node
#---
# LL(1) parser
#---
class UrlParser:
    """
    fx-92 SC+ language parser with a wes.casio.com URL or hexadecimal input.

    The URLs are typically in this form:

      http://wes.casio.com/math/index.php?q=I-295A+U-000000000000+M-0E0000
      0000+S-000410110000100E0010B300D365+E-{code...}

    The program can also be provided in text hexadecimal form, which is
    everything following the "+E-" in the URL.

    This is your everyday LL(1) top-down parser. It implements the following
    formal grammar:

      program -> stmt*
      stmt    -> stmt2 EOL?
      stmt2   -> FORWARD arg | ROTATE arg | ORIENT arg | GOTO arg arg |
                 PENDOWN | PENUP | SETVAR arg argvar | INPUT argvar |
                 MESSAGE arg | PRINT arg | STYLE style | WAIT arg |
                 REPEAT arg EOL program REPEAT_END |
                 WHILE arg EOL program WHILE_END |
                 IF arg EOL program IF_END |
                 IFELSE arg EOL program ELSE program IFELSE_END
      arg     -> expr PARAM
      argvar  -> var PARAM
      expr    -> factor ((+|-) factor)*
      factor  -> atom ((*|/) atom)*
      atom    -> const (var | "(" expr ")")* | (var | "(" expr ")")+
      const   -> (+|-)? CONST
      var     -> VAR

    Binary operators are left-associative: a-b-c is parsed as (a-b)-c.

    # Unused or TODO
      style   -> (TODO)
      setvar  -> SETM | SETA | SETB | SETC | SETD | SETE | SETF | SETX | SETY
    """

    def __init__(self, url):
        """
        Create a UrlParser from a wes.casio.com URL or hexadecimal code.

        Raises an Exception if a URL is given but has no "+E-" section.
        """
        if url.startswith(("http://", "https://", "wes.casio.com")):
            print("[urlparser] URL includes protocol, will start after '+E-'")
            offset = url.find("+E-")
            if offset < 0:
                print("[urlparser] '+E-' not found, cannot decode URL")
                raise Exception("Cannot decode URL")
            url = url[offset+3:]

        if not re.fullmatch(r'(?:[0-9a-fA-F]{2})+', url):
            print("[urlparser] URL is not strict hexa, noise will be skipped")
            # Actually skip the noise, since bytes.fromhex() rejects it:
            # drop whitespace, then keep the longest leading run of
            # hexadecimal byte pairs; whatever follows is ignored.
            compact = "".join(url.split())
            url = re.match(r'(?:[0-9a-fA-F]{2})*', compact)[0]

        # Create the lexer and initialize the lookahead token
        self.lexer = ByteLexer(bytes.fromhex(url))
        self.la = None

    def lex_program(self):
        """Print every token of the program, without parsing."""
        self.lexer.rewind()
        while not self.lexer.at_end():
            print(self.lexer.lex())

    def parse_program(self):
        """Parse the whole input and return its PROGRAM node."""
        self.lexer.rewind()
        self.la = None
        self.advance()
        return self.program()

    #---
    # LL parsing helpers
    #---

    def advance(self):
        """Return the current lookahead token and read the next one."""
        current = self.la
        self.la = self.lexer.lex()
        return current

    def expect(self, *types, optional=False):
        """Read the next token, expecting a type specified in *types.

        If the lookahead does not match, return None without consuming
        anything when optional is true, and raise a syntax error otherwise.
        """
        if self.la.type in types:
            return self.advance()
        if optional:
            return None

        expected = [T(t).name for t in types]
        got = T(self.la.type).name
        err = f"Expected one of {expected}, got {got}"
        print("[urlparser] " + err)
        raise Exception("Syntax error: " + err)

    #---
    # LL parsing rules
    #---

    # program -> stmt*
    def program(self):
        stmts = []
        while True:
            stmt = self.stmt(optional=True)
            if stmt is None:
                break
            stmts.append(stmt)
        return Node(N.PROGRAM, *stmts)

    # stmt -> stmt2 EOL?
    def stmt(self, optional=False):
        st = self.stmt2(optional=optional)
        # The optional EOL is consumed even when no statement was found
        self.expect(T.EOL, optional=True)
        return st

    # stmt2 -> (lots of cases)
    def stmt2(self, optional=False):
        valid = [
            T.FORWARD, T.ROTATE, T.ORIENT, T.GOTO, T.PENDOWN, T.PENUP,
            T.SETVAR, T.INPUT, T.MESSAGE, T.PRINT, T.STYLE, T.WAIT,
            T.REPEAT, T.WHILE, T.IF, T.IFELSE,
        ]
        op = self.expect(*valid, optional=optional)
        if op is None:
            return None

        # Basic statements
        if op.type == T.FORWARD:
            return Node(N.FORWARD, self.arg())
        if op.type == T.ROTATE:
            return Node(N.ROTATE, self.arg())
        if op.type == T.ORIENT:
            return Node(N.ORIENT, self.arg())
        if op.type == T.GOTO:
            x = self.arg()
            y = self.arg()
            return Node(N.GOTO, x, y)
        if op.type == T.PENDOWN:
            return Node(N.PENDOWN)
        if op.type == T.PENUP:
            return Node(N.PENUP)
        if op.type == T.SETVAR:
            value = self.arg()
            target = self.argvar()
            return Node(N.ASSIGN, value, target)
        if op.type == T.INPUT:
            return Node(N.INPUT, self.argvar())
        if op.type == T.MESSAGE:
            return Node(N.MESSAGE, self.arg())
        if op.type == T.PRINT:
            return Node(N.PRINT, self.arg())
        if op.type == T.STYLE:
            return Node(N.STYLE, self.style())
        if op.type == T.WAIT:
            return Node(N.WAIT, self.arg())

        # Flow control
        if op.type == T.REPEAT:
            arg, prg = self._block_head()
            self.expect(T.REPEAT_END)
            return Node(N.REPEAT, arg, prg)
        if op.type == T.WHILE:
            arg, prg = self._block_head()
            self.expect(T.WHILE_END)
            return Node(N.WHILE, arg, prg)

        # Both conditional forms produce an IFELSE node with three
        # arguments; the else branch is None when there is no "else".
        if op.type == T.IF:
            arg, prg = self._block_head()
            self.expect(T.IF_END)
            return Node(N.IFELSE, arg, prg, None)
        if op.type == T.IFELSE:
            arg, p1 = self._block_head()
            self.expect(T.ELSE)
            p2 = self.program()
            self.expect(T.IFELSE_END)
            return Node(N.IFELSE, arg, p1, p2)

    def _block_head(self):
        """Parse `arg EOL program`, the common start of flow-control forms."""
        arg = self.arg()
        self.expect(T.EOL)
        return arg, self.program()

    # arg -> expr PARAM
    def arg(self):
        e = self.expr()
        self.expect(T.PARAM)
        return e

    # expr -> factor ((+|-) factor)*
    def expr(self):
        # Fold operands left to right so that a-b-c means (a-b)-c
        node = self.factor()
        while True:
            t = self.expect(T.PLUS, T.MINUS, optional=True)
            if t is None:
                return node
            kind = N.ADD if t.type == T.PLUS else N.SUB
            node = Node(kind, node, self.factor())

    # factor -> atom ((*|/) atom)*
    def factor(self):
        # Fold operands left to right so that a/b/c means (a/b)/c
        node = self.atom()
        while True:
            t = self.expect(T.STAR, T.SLASH, optional=True)
            if t is None:
                return node
            kind = N.MUL if t.type == T.STAR else N.DIV
            node = Node(kind, node, self.atom())

    # atom -> const (VAR | "(" expr ")")* | (VAR | "(" expr ")")+
    def atom(self):
        # Juxtaposed operands form an implicit product, returned as one MUL
        # node holding all of them (possibly just one, or none at all).
        factors = []
        lat = self.la.type

        # Case of constants
        if lat == T.PLUS or lat == T.MINUS or lat == T.CONST:
            factors.append(self.const())

        while True:
            lat = self.la.type
            if lat == T.VAR:
                factors.append(self.var())
            elif lat == T.LPAR:
                self.expect(T.LPAR)
                factors.append(self.expr())
                self.expect(T.RPAR)
            else:
                break

        return Node(N.MUL, *factors)

    # const -> (+|-)? CONST
    def const(self):
        sign = self.expect(T.PLUS, T.MINUS, optional=True)
        value = self.expect(T.CONST).args[0]
        const = Node(N.CONST, value)
        if sign is not None and sign.type == T.MINUS:
            const = Node(N.MINUS, const)
        return const

    # argvar -> var PARAM
    def argvar(self):
        n = self.var()
        self.expect(T.PARAM)
        return n

    # var -> VAR
    def var(self):
        t = self.expect(T.VAR)
        return Node(N.VAR, t.args[0])

    # setvar -> SETM | SETA | ... | SETF | SETX | SETY
    def setvar(self):
        raise Exception("SetVar not supported yet x_x")

    # style -> (TODO)
    def style(self):
        raise Exception("Style not supported yet x_x")

38
printer.py Normal file
View File

@ -0,0 +1,38 @@
# fx-92 Scientifique Collège+ language interpreter: AST printer
from ast import N, Node
__all__ = ["print_ast"]
#---
# Message definitions
#---
class MessageFrench:
    """Format templates that print_ast() selects for lang="fr"."""

    goto = "goto {}, {}"
    multiply = "mul({})"
class MessageEnglish:
    """Format templates that print_ast() selects for lang="en" (none yet)."""
#---
# Printer
#---
def print_ast(n, lang="en", indent=0):
    """Print the tree rooted at n on stdout, one node per line.

    n      -- Node to print; any other value is printed with its type
    lang   -- "fr" or "en", or an already-resolved message class
    indent -- number of leading spaces; children get two more
    """
    # Resolve language codes to their message class
    if lang == "fr":
        lang = MessageFrench
    if lang == "en":
        lang = MessageEnglish

    print(" " * indent, end="")

    # Raw values stored directly in a node's arguments
    if not isinstance(n, Node):
        print(f"{type(n)}({n})")
        return

    # Leaves are printed on their own, without recursion
    if n.type == N.CONST:
        print(n.args[0])
        return
    if n.type == N.VAR:
        print(f"VAR({n.args[0]})")
        return

    # Any other node: its name, then every argument one level deeper
    print(f"{n.type.name}")
    child_indent = indent + 2
    for child in n.args:
        print_ast(child, lang=lang, indent=child_indent)