fx92-interpreter/fx92/lexer.py

# fx-92 Scientifique Collège+ language interpreter: Lexical analysis

import math
import re
import enum
from decimal import Decimal

#---
# Token description
#---

@enum.unique
class T(enum.IntEnum):
    """
    Token types produced by the lexers.

    Non-negative values are the numeric codes that BitcodeLexer looks up
    directly: values of the form 0xF9xx/0xFBxx are built from a prefix byte
    followed by a second byte, the others are single-byte codes. Negative
    values cannot come out of such a lookup; the lexers create these token
    types explicitly, together with arguments.
    """

    # Basic commands
    EOL = 0xF901
    END = 0xF902
    NOP = 0xF903

    # Basic statements
    FORWARD = 0xF905
    ROTATE  = 0xF906
    ORIENT  = 0xF907
    GOTO    = 0xF908
    PENDOWN = 0xF909
    PENUP   = 0xF90A
    SETVAR  = 0xF90B
    INPUT   = 0xF90C
    MESSAGE = 0xF90D
    PRINT   = 0xF90E
    STYLE   = 0xF90F
    WAIT    = 0xF910

    # Flow control
    REPEAT     = 0xF911
    REPEAT_END = 0xF912
    UNTIL      = 0xF913
    UNTIL_END  = 0xF914
    IF         = 0xF915
    IF_END     = 0xF916
    IFELSE     = 0xF917
    ELSE       = 0xF918
    IFELSE_END = 0xF919

    # Variable assignment (0xFB-prefixed opcodes)
    SETM = 0xFB10
    SETA = 0xFB12
    SETB = 0xFB13
    SETC = 0xFB14
    SETD = 0xFB15
    SETE = 0xFB16
    SETF = 0xFB17
    SETX = 0xFB18
    SETY = 0xFB19

    # Miscellaneous (single-byte codes)
    PARAM  = 0x00
    COLON  = 0x23
    QUEST  = 0x25
    SEMI   = 0x2C
    LPAR   = 0x60
    RPAR   = 0xD0
    PLUS   = 0xA6
    MINUS  = 0xA7
    STAR   = 0xA8
    SLASH  = 0xA9
    EXP    = 0xC9
    BANG   = 0xD8

    # Tokens with parameters; as built by the lexers in this module:
    #   CONST carries the value and the text it was read from,
    #   VAR and FUN carry a name, REL carries the relation as a string.
    CONST = -1
    VAR   = -2
    REL   = -3
    FUN   = -4

class Token:
    """
    A lexical token: a type (normally a member of T) and optional arguments.
    """

    def __init__(self, type, *args):
        """
        Instantiate a new token.

        type -- token type, normally a member of T
        args -- token parameters, kept as a tuple in self.args
        """
        self.type = type
        self.args = args

    def __repr__(self):
        """Unambiguous token representation."""
        try:
            base = T(self.type).name
        except ValueError:
            # Not a known token type: show the raw code instead of a name
            base = "<Token:{}>".format(hex(self.type))

        # Constants built by the lexers carry (value, text as typed). Only
        # use the dedicated layout when both are present, so that repr()
        # never raises on a constant created with fewer arguments.
        if self.type == T.CONST and len(self.args) >= 2:
            args = "({}) [typed as {}]".format(*self.args)
        elif self.args:
            args = "(" + ",".join(repr(arg) for arg in self.args) + ")"
        else:
            args = ""

        return base + args

#---
# Utilities
#---

def str2float(integer, decimal, exponent, percent):
    """
    Assemble a Decimal from the textual pieces of a numeric literal.

    integer  -- digits before the decimal point (empty or None means "0")
    decimal  -- decimal point and following digits; a lone "." counts as ".0"
    exponent -- exponent suffix such as "e-3" (may be empty or None)
    percent  -- "%" to divide the result by 100, anything else to leave it
    """
    if decimal == ".":
        fraction = ".0"
    elif decimal:
        fraction = decimal
    else:
        fraction = ""

    literal = "".join([integer or "0", fraction, exponent or ""])
    value = Decimal(literal)

    if percent == "%":
        return value / 100
    return value

#---
# Lexer base
#---

class LexerBase:
    """
    Common ancestor of the lexers. It only holds shared helpers and cannot
    analyse a program by itself: rewind(), at_end(), lex() and the position
    attribute must be supplied by the subclass.
    """

    def dump(self):
        """Rewind the lexer, then print every token with its position."""
        self.rewind()

        while True:
            if self.at_end():
                break

            token = self.lex()
            # The position is read after lex(), so it belongs to this token
            print(f"{self.position:5d}: {token}")

#---
# Bitcode lexer
#---

class BitcodeLexer(LexerBase):
    """
    fx-92 SC+ language lexer with bytes() bitcode input.

    Attributes:
      hex      -- the input bytes
      pos      -- index of the next byte to read
      position -- number of tokens returned since the last rewind()
      errors   -- number of unknown opcodes skipped since the last rewind()
    """

    # Relations encoded as 2-byte opcodes 0xFB01..0xFB05 (second byte -> text)
    _RELS = { 0x01: "<", 0x02: ">", 0x03: "!=", 0x04: "<=", 0x05: ">=" }

    # Numeric literal: integer part, decimal part introduced by 0x2E,
    # exponent introduced by 0x2D with an optional sign byte, percent sign.
    # NOTE(review): the percent byte 0x25 is also the value of T.QUEST;
    # confirm that a number directly followed by 0x25 is a percentage.
    _RE_CONST = rb'([0-9]*)(\x2E[0-9]*)?(\x2D[\xA6\xA7\xC0]?[0-9]+)?(%)?'

    # Single-byte function opcodes
    _FUNCTIONS = {
        0x68: "Abs",
        0x69: "Rnd",
        0x6C: "sinh",
        0x6D: "cosh",
        0x6E: "tanh",
        0x6F: "asinh",
        0x70: "acosh",
        0x71: "atanh",
#        0x72: "exp",
#        0x73: "exp10",
        0x74: "sqrt",
        0x75: "log",
        0x76: "cbrt",
        0x77: "sin",
        0x78: "cos",
        0x79: "tan",
        0x7A: "asin",
        0x7B: "acos",
        0x7C: "atan",
        0x7D: "log10",
        0x83: "Ent",
        0x84: "EntEx",
        0x87: "RanInt",
        0x88: "GCD",
        0x89: "LCM",
        0x8A: "Arond",
    }

    def __init__(self, hex):
        """Initialize the lexer with input data."""
        self.hex = hex
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""
        self.pos = 0
        self.position = 0
        self.errors = 0

    def lex(self):
        """
        Return the next token in the stream.

        Unknown opcodes are reported on stdout, counted in self.errors and
        skipped. Token(T.END) is returned once the input is exhausted.
        """
        # One returned token counts for one position, no matter how many
        # unknown opcodes are skipped on the way.
        self.position += 1

        # Loop rather than recurse, so that a long run of unknown bytes
        # cannot exhaust the call stack.
        while not self.at_end():
            if self.hex[self.pos] in (0xF9, 0xFB):
                token = self._lex_command()
            else:
                token = self._lex_byte()

            if token is not None:
                return token

        return Token(T.END)

    def _lex_command(self):
        """Lex the 2-byte opcode at self.pos; return None if it is unknown."""
        h, p = self.hex, self.pos

        # Stop if there is no trailing byte
        if p >= len(h) - 1:
            print("[lexer] Invalid trailing byte {}".format(hex(h[p])))
            # Consume the dangling prefix so that at_end() becomes true;
            # otherwise a caller looping on at_end() would never terminate.
            self.pos = len(h)
            return Token(T.END)

        # Return any value that is defined in the Token class
        code = (h[p] << 8) | h[p+1]
        self.pos += 2

        try:
            return Token(T(code))
        except ValueError:
            pass

        # Also a few more values not in the Token class
        if h[p] == 0xFB and h[p+1] in self._RELS:
            return Token(T.REL, self._RELS[h[p+1]])

        print("[lexer] Unknown opcode {}".format(hex(code)))
        self.errors += 1
        return None

    def _lex_byte(self):
        """Lex the single-byte code at self.pos; return None if unknown."""
        h, p = self.hex, self.pos

        self.pos += 1
        code = h[p]

        # Translate unary minus to normal minus
        if code == 0xC0:
            code = 0xA7

        # Equal symbol
        if h[p] == 0xA5:
            return Token(T.REL, "=")

        try:
            return Token(T(code))
        except ValueError:
            pass

        if code == 0x21:
            return Token(T.CONST, Decimal(math.e), "[e]")
        if code == 0x22:
            return Token(T.CONST, Decimal(math.pi), "[pi]")

        # Constants
        if code in range(0x30, 0x39+1) or code == 0x2E:
            m = re.match(self._RE_CONST, h[p:])

            if m is not None:
                integer = (m[1] or b"").decode()
                decimal = (m[2] or b".").replace(b'\x2E', b'.').decode()
                exp     = (m[3] or b"")
                percent = (m[4] or b"").decode()

                # Rewrite the exponent in the notation Decimal() understands
                exp = exp.replace(b'\x2D', b'e')
                exp = exp.replace(b'\xA6', b'+')
                exp = exp.replace(b'\xA7', b'-')
                exp = exp.replace(b'\xC0', b'-')
                exp = exp.decode()

                # The first byte of the literal has already been consumed
                self.pos += len(m[0]) - 1
                f = str2float(integer, decimal, exp, percent)
                return Token(T.CONST, f, m[0])

        # Variables
        if code in range(0x42, 0x47+1):
            # 0x42..0x47 map to the letters "A".."F"
            return Token(T.VAR, chr(h[p]-1))
        if code == 0x40:
            return Token(T.VAR, "M")
        if code == 0x48:
            return Token(T.VAR, "x")
        if code == 0x49:
            return Token(T.VAR, "y")
        if code == 0x4C:
            return Token(T.VAR, "theta")

        # Functions
        if code in self._FUNCTIONS:
            return Token(T.FUN, self._FUNCTIONS[code])

        print("[lexer] Unknown opcode {}".format(hex(code)))
        self.errors += 1
        return None

    def at_end(self):
        """Check whether the whole input has been read."""
        return self.pos >= len(self.hex)

#---
# Url lexer
#---

class UrlLexer(BitcodeLexer):
    """
    fx-92 SC+ language lexer with a wes.casio.com URL or hexadecimal input.
    The URLs are typically in this form:

        http://wes.casio.com/math/index.php?q=I-295A+U-000000000000+M-0E0000
        0000+S-000410110000100E0010B300D365+E-{code...}

    The program can also be provided in text hexadecimal form, which is
    everything following the "+E-" in the URL.
    """

    def __init__(self, url):
        """
        Initialize the lexer with a URL or a hexadecimal string.

        Characters that are not hexadecimal digits are reported once and
        dropped. Raises Exception if a URL does not contain "+E-".
        """
        if url.startswith(("http://", "https://", "wes.casio.com")):
            print("[urlparser] URL includes protocol, will start after '+E-'")
            offset = url.find("+E-")

            if offset < 0:
                print("[urlparser] '+E-' not found, cannot decode URL")
                raise Exception("Cannot decode URL")

            url = url[offset+3:]

        if not re.fullmatch(r'(?:[0-9a-fA-F]{2})+', url):
            print("[urlparser] URL is not strict hexa, noise will be skipped")

            # bytes.fromhex() only tolerates whitespace and raises on any
            # other stray character, so remove the noise explicitly.
            url = re.sub(r'[^0-9a-fA-F]', '', url)

            # A lone trailing digit cannot form a byte; drop it as well
            if len(url) % 2:
                print("[urlparser] Dangling half-byte at end of input ignored")
                url = url[:-1]

        super().__init__(bytes.fromhex(url))

#---
# Plain text lexer
#---

class TextLexer(LexerBase):
    """
    fx-92 SC+ language lexer with Basic-like input.

    This thing is very naive and extremely inefficient.

    Attributes:
      base_code     -- the full program text
      code          -- the part of the text that has not been lexed yet
      position      -- number of tokens returned since the last rewind()
      errors        -- error counter, reset by rewind()
      pending_param -- whether a PARAM token must be emitted at the next
                       end of line or end of input
    """

    RE_STMTS = re.compile(
        r"NOP|FORWARD|ROTATE|ORIENT|GOTO|PENDOWN|PENUP|SETVAR|INPUT|MESSAGE|"
        r"PRINT|STYLE|WAIT|REPEAT_END|REPEAT|UNTIL_END|UNTIL|IF_END|ELSE|"
        r"IFELSE_END|IFELSE|IF",
        re.IGNORECASE)
    RE_CONST = re.compile(
        r"([0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?(%)?")
    RE_FUN = re.compile(
        r"([a-zA-Z]+)\(")

    # Relations; two-character operators come first so that ">=" is not
    # lexed as ">" followed by "="
    _RELS = (">=", "<=", "!=", ">", "<", "=")

    # Single-character punctuation
    _PUNCT = {
        ",": T.PARAM,
        ":": T.COLON,
        ";": T.SEMI,
        "?": T.QUEST,
        "(": T.LPAR,
        ")": T.RPAR,
        "+": T.PLUS,
        "-": T.MINUS,
        "*": T.STAR,
        "/": T.SLASH,
        "!": T.BANG,
    }

    def __init__(self, code):
        """Initialize the lexer with text code."""

        self.base_code = code
        self.rewind()

    def rewind(self):
        """Restart lexing the same input."""

        self.code = self.base_code
        self.position = 0
        self.errors = 0
        self.pending_param = False

    def lex(self):
        """
        Return the next token in the stream.

        Comments (from "#" to the end of the line) are skipped. Raises
        Exception("Lexical error near '...'") when no token matches; the
        offending word is consumed so that lexing may be resumed.
        """
        self.position += 1

        # Loop rather than recurse over comments, so that a long block of
        # comment lines cannot exhaust the call stack.
        while True:
            # Carriage returns are treated as blanks so that CRLF input
            # lexes like LF input.
            c = self.code.lstrip(" \t\r")

            # Special case of newlines. If a non-statement has been
            # identified and no comma has followed, emit a PARAM token
            # manually.
            if (not c or c[0] == "\n") and self.pending_param:
                self.pending_param = False
                self.code = c.lstrip("\n")
                return Token(T.PARAM)

            c = self.code.lstrip(" \t\r\n")

            # End of file
            if not c:
                self.code = ""
                return Token(T.END)

            if c[0] != "#":
                return self._lex_token(c)

            # Comments: drop everything up to (not including) the newline,
            # then start over so the newline can still yield a PARAM token
            _, newline, rest = c.partition("\n")
            self.code = newline + rest

    def _lex_token(self, c):
        """Lex one token at the start of c, which is neither blank nor a
        comment, and update self.code and self.pending_param."""

        # Statements
        m = re.match(self.RE_STMTS, c)
        if m is not None:
            t = Token(getattr(T, m[0].upper()))
            self.code = c[len(m[0]):]
            return t

        # Relations
        for r in self._RELS:
            if c.startswith(r):
                self.code = c[len(r):]
                self.pending_param = True
                return Token(T.REL, r)

        # Punctuation
        if c[0] in self._PUNCT:
            self.code = c[1:]
            # An explicit comma already separates parameters
            self.pending_param = (c[0] != ",")
            return Token(self._PUNCT[c[0]])

        # Constants
        if c[0] in "0123456789.":
            # Every group of RE_CONST is optional, so this always matches;
            # and since c starts with a digit or a dot, it is not empty.
            m = re.match(self.RE_CONST, c)
            f = str2float(m[1], m[2], m[3], m[4])

            self.code = c[len(m[0]):]
            self.pending_param = True
            return Token(T.CONST, f, m[0])

        # Functions
        m = re.match(self.RE_FUN, c)
        if m is not None:
            self.code = c[len(m[0]):]
            self.pending_param = True
            return Token(T.FUN, m[1])

        # Variables
        if c[0] in "MABCDEFxXyY":
            # x and y are named in lowercase, whatever the input case
            var = c[0].lower() if c[0] in "xXyY" else c[0]
            self.code = c[1:]
            self.pending_param = True
            return Token(T.VAR, var)
        m = re.match(r"theta\b", c)
        if m is not None:
            self.code = c[len(m[0]):]
            self.pending_param = True
            return Token(T.VAR, "theta")

        # If nothing can be found, raise an exception
        s = c.split(maxsplit=1)
        # split() yields nothing when c only holds exotic whitespace
        err = s[0] if s else c
        self.code = s[1] if len(s) > 1 else ""

        raise Exception("Lexical error near '{}'".format(err))

    def at_end(self):
        """Check whether the whole input has been read."""
        return not self.code and not self.pending_param

#

# Names exported by "from ... import *"; LexerBase and str2float are not
# part of this list.
__all__ = ["T", "Token", "BitcodeLexer", "UrlLexer", "TextLexer"]