Azur/libnum/test/isel.py

#-----------------------------------------------------------------------------#
#  ,"  /\  ",    Azur: A game engine for CASIO fx-CG and PC                   #
# |  _/__\_  |   Designed by Lephe' and the Planète Casio community.          #
#  "._`\/'_."    License: MIT <https://opensource.org/licenses/MIT>           #
#-----------------------------------------------------------------------------#
# isel.py: Instruction selection testing facility
#
# Many of the primitive numerical functions of libnum are written to compile
# efficiently on SuperH, where "efficiently" mainly cares about using specific
# instructions and generating short code. This utility tests these assumptions
# by compiling C++ files that use the library, and matching the assembler code
# against simple specifications.
#
# The assembler output from g++ is decoded by simple text analysis. Function
# span from their `_SYMBOL:` to their `.size _SYMBOL, .-_SYMBOL`. Mnemonics are
# split on whitespace and arguments on commas; directives and unused labels are
# removed.
#
# The assembler output is matched using two types of expressions:
# - "Program expressions", which run against an entire function and evaluate to
#   integers. These are used to count, mainly.
# - "Instruction expressions", which run against single instructions and
#   evaluate to booleans. These are used to identify instructions.
#
# Instruction expressions are based on wildcard patterns matching mnemonics,
# like `add*` or `b?.s`, combined with logical operators `!`, `&&` and `||`.
# For instance, `add || addc || addv` identifies addition instructions, while
# `!mov && !rts` eliminates the usual function boilerplate.
#
# Program expressions are based on the count-expression `[e]` where `e` is an
# instruction expression. `[e]` evaluates to the number of instructions in the
# function that match `e`. These are combined with integral constants and the
# usual arithmetic, logical and comparison operators. C-style bool-as-int
# semantics are used, so tests like `!= 0` or `> 0` can often be omitted. `%`
# is a shortcut for the number of "non-trivial" instructions, and currently
# expands to `[!mov && !rts]`.
#
# A test is a normal C++ source file built using the library, which exposes
# functions with C linkage (ie. no name mangling) and has specifications in
# comments of the form:
#
#  // FUNCTION_NAME: PROGRAM_EXPRESSION
#
# The comments should span entire lines and need not be placed near the
# functions that they test.
#---

from dataclasses import dataclass
import subprocess
import functools
import fnmatch
import typing
import enum
import sys
import os
import re

#---
# Program representation and parsing
#---

@dataclass
class Insn:
    """A concrete instruction from a compiled assembly program."""
    mnemonic: str
    args: list[str]

#---
# Specification representation and parsing
#---

@dataclass
class InsnPattern:
    """An abstract pattern that can be matched against asm instructions."""

    # Instruction mnemonic as a wildcard pattern (may use '?' and '*')
    mnemonicWildcard: str
    # Pattern for the arguments
    # args: list

    def evalAtInsn(self, program, i) -> bool:
        return fnmatch.fnmatchcase(program[i].mnemonic, self.mnemonicWildcard)

    def treeStr(self, indent):
        return (" "*indent) + self.mnemonicWildcard

@dataclass
class InsnExpr:
    """An expression built off `InsnPattern`, which runs on single instructions
       and evaluates to a boolean."""

    # Node constructor and arguments
    ctor: str
    args: list[typing.Union[InsnPattern, "InsnExpr"]]

    def evalAtInsn(self, program, i) -> bool:
        args = [a.evalAtInsn(program, i) for a in self.args]

        match self.ctor, args:
            case "mnemonic", [x]: return x
            # Boolean operators
            case "!",  [x]:   return 1 if not x else 0
            case "&&", [x,y]: return 1 if x and y else 0
            case "||", [x,y]: return 1 if x or y else 0

    def treeStr(self, indent):
        args = [a.treeStr(indent+2) for a in self.args]
        return (" "*indent) + self.ctor + ":\n" + "\n".join(args)

@dataclass
class ProgExpr:
    """An expression which runs on full programs and evaluates to counts of
       matched instructions."""

    # Node constructor and arguments
    ctor: str
    args: list[typing.Union[InsnExpr, int, "Expr"]]

    def evalArg(self, program, arg):
        match arg:
            case int(i):
                return i
            case InsnExpr(_, _) as e:
                return sum(int(e.evalAtInsn(program, i))
                           for i in range(len(program)))
            case ProgExpr(_, _) as e:
                return e.evalAtProg(program)

    def evalAtProg(self, program) -> int:
        args = [self.evalArg(program, a) for a in self.args]

        match self.ctor, args:
            case "atom", [x]: return x
            # Comparisons
            case "<",  [x,y]: return x < y
            case ">",  [x,y]: return x > y
            case "<=", [x,y]: return x <= y
            case ">=", [x,y]: return x >= y
            case "=",  [x,y]: return x == y
            case "!=", [x,y]: return x != y
            # Arithmetic
            case "+",  [x,y]: return x + y
            case "-",  [x,y]: return x - y
            case "-",  [x]:   return -x
            # Boolean operators
            case "!",  [x]:   return 1 if not x else 0
            case "&&", [x,y]: return 1 if x and y else 0
            case "||", [x,y]: return 1 if x or y else 0

    def argStr(self, arg, indent):
        match arg:
            case int(i):
                return (" "*indent) + str(i)
            case InsnExpr(_, _) as e:
                return e.treeStr(indent)
            case ProgExpr(_, _) as e:
                return e.treeStr(indent)

    def treeStr(self, indent):
        args = [self.argStr(a, indent+2) for a in self.args]
        return (" "*indent) + self.ctor + ":\n" + "\n".join(args)

T = enum.Enum("T", "NUM OP MNEMONIC LPAR RPAR LBRA RBRA PERCENT END".split())

@dataclass
class Token:
    type: T
    value: typing.Any

    def __str__(self):
        return f"{self.type}({self.value})"

class ExprLexer:
    """A lexer for `InsnPattern`, `InsnExpr` and `ProgExpr`."""

    RE_NUM       = re.compile(r"[0-9]+|0[xX][0-9a-fA-F]+")
    RE_OP        = re.compile(r"!|&&|\|\||<=|>=|<>|<|>|=|!=|\+|-")
    RE_MNEMONIC  = re.compile(r"[a-zA-Z.*?][a-zA-Z0-9.*?]*")
    PUNCT = {
        "(": T.LPAR, ")": T.RPAR,
        "[": T.LBRA, "]": T.RBRA,
        "%": T.PERCENT,
    }

    def __init__(self, code):
        """Initialize the lexer to start analyzing `code`."""
        self.sourceCode = code
        self.rewind()

    def rewind(self):
        """Start or restart lexing the same input."""
        self.code = self.sourceCode
        self.position = 0
        self.errors = 0

    def atEnd(self):
        """Check whether the end of the input has been reached."""
        return len(self.code) == 0

    def dump(self, fp=sys.stdout):
        """Exhaust lexer input and print the result to the specified stream."""
        while not self.atEnd():
            t = self.lex()
            print(f"{self.position:5d}: {t}", file=fp)

    def lex(self):
        """Return the next token in the stream."""
        self.position += 1
        c = self.code.lstrip(" \t")

        if len(c) == 0:
            return Token(T.END, None)

        if c[0] in ExprLexer.PUNCT:
            self.code = c[1:]
            return Token(ExprLexer.PUNCT[c[0]], None)

        if (m := ExprLexer.RE_NUM.match(c)) is not None:
            self.code = c[len(m[0]):]
            return Token(T.NUM, int(m[0], 0))

        if (m := ExprLexer.RE_OP.match(c)) is not None:
            self.code = c[len(m[0]):]
            return Token(T.OP, m[0])

        if (m := ExprLexer.RE_MNEMONIC.match(c)) is not None:
            self.code = c[len(m[0]):]
            return Token(T.MNEMONIC, m[0])

        # Raise a lexing error
        s = c.split(maxsplit=1)
        err = s[0]
        self.code = s[1] if len(s) > 1 else ""
        raise Exception(f"Lexing error near '{err}'")

class ExprParser:
    """An LL(1) recursive descent parser for program expressions."""

    def __init__(self, lexer):
        """Parse the output of a given lexer."""
        self.lexer = lexer
        self.la = None

    def parseProgExpr(self):
        """Parse the entire input as a ProgExpr."""
        self.lexer.rewind()
        self.la = None
        self.advance()
        e = self.pexpr()

        if not self.lexer.atEnd():
            print("Remaining input:")
            self.lexer.dump()
            raise Exception("Syntax error: expected end of input")
        return e

    def advance(self):
        """Return the next token and update the lookahead."""
        next = self.la
        self.la = self.lexer.lex()
        return next

    def expect(self, types, pred=None, optional=False):
        """Read the next token, ensuring it is one of the specified types; if
           `pred` is specified, also tests the predicate. If `optional` is set,
           returns None in case of mismatch rather than raising an error."""

        if isinstance(types, T):
            types = [types]
        if self.la.type in types and (pred is None or pred(self.la)):
            return self.advance()
        if optional:
            return None

        expected = ", ".join(str(t) for t in types)
        pos = self.lexer.position
        err = f"Expected one of {expected}, got {self.la} (at token {pos})"
        if pred is not None:
            err += " (with predicate)"
        raise Exception(f"Syntax error: {err}")

    # Rule combinators implementing unary and binary operators with precedence

    def binaryOpsLeft(ctor, ops):
        def decorate(f):
            def symbol(self):
                e = f(self)
                pred = lambda t: t.value in ops
                while (op := self.expect(T.OP, pred, True)) is not None:
                    e = ctor(op.value, [e, f(self)])
                return e
            return symbol
        return decorate

    def binaryOps(ctor, ops, *, rassoc=False):
        def decorate(f):
            def symbol(self):
                lhs = f(self)
                pred = lambda t: t.value in ops
                if (op := self.expect(T.OP, pred, True)) is not None:
                    rhs = symbol(self) if rassoc else f(self)
                    return ctor(op.value, [lhs, rhs])
                else:
                    return lhs
            return symbol
        return decorate

    def binaryOpsRight(ctor, ops):
        return binaryOps(ctor, ops, rassoc=True)

    def unaryOps(ctor, ops, assoc=True):
        def decorate(f):
            def symbol(self):
                if (op := self.expect(T.OP, optional=True,
                        pred=lambda t: t.value in ops)) is not None:
                    arg = symbol(self) if assoc else f(self)
                    return ctor(op.value, [arg])
                else:
                    return f(self)
            return symbol
        return decorate

    # Parsing rules

    @binaryOpsLeft(ProgExpr, ["||"])
    @binaryOpsLeft(ProgExpr, ["&&"])
    @binaryOps(ProgExpr, [">", ">=", "<", "<=", "=", "!="])
    @binaryOpsLeft(ProgExpr, ["+", "-"])
    @unaryOps(ProgExpr, ["!", "-"])
    def pexpr(self):
        t = self.expect([T.LPAR, T.NUM, T.PERCENT, T.LBRA])
        if t.type == T.LPAR:
            pe = self.pexpr()
            self.expect(T.RPAR)
            return pe
        elif t.type == T.NUM:
            return ProgExpr("atom", [t.value])
        elif t.type == T.PERCENT:
            return parseProgExpr("[!rts && !mov]")
        elif t.type == T.LBRA:
            ie = self.iexpr()
            self.expect(T.RBRA)
            return ProgExpr("atom", [ie])

    @binaryOpsLeft(InsnExpr, ["||"])
    @binaryOpsLeft(InsnExpr, ["&&"])
    @unaryOps(InsnExpr, ["!"])
    def iexpr(self):
        if self.expect(T.LPAR, optional=True):
            ie = self.iexpr()
            self.expect(T.RPAR)
            return ie
        else:
            ip = self.ipat()
            return InsnExpr("mnemonic", [ip])

    def ipat(self):
        t = self.expect(T.MNEMONIC)
        return InsnPattern(t.value)

def parseProgExpr(string):
    l = ExprLexer(string)
    p = ExprParser(l)
    return p.parseProgExpr()

#---
# Main logic
#---

def runCompiler(input, flags):
    p = subprocess.run(
        ["sh-elf-g++", input, *flags, "-S", "-o", "-", "-std=c++20", "-O2"],
        stdout=subprocess.PIPE, check=True)
    return str(p.stdout, "utf8")

def extractFunctions(asm):
    # Normalize spacing
    asm = asm.replace("\t", " ")
    # Split into lines and remove indentation
    lines = [l.strip() for l in asm.splitlines()]
    # Remove directives and local symbols
    lines = [l for l in lines if l and not l.startswith(".")]

    funcs = dict()
    currentFunc = None

    for l in lines:
        if l.endswith(":"):
            currentFunc = l[:-1]
            funcs[currentFunc] = []
        elif currentFunc is None:
            raise Exception(f"instruction '{l}' before symbol name")
        else:
            mnemonic, *args = l.split(maxsplit=1)
            if args != []:
                args = [a.strip() for a in args[0].split(",")]
            funcs[currentFunc].append(Insn(mnemonic, args))

    return funcs

def printRawFunction(asm, sybl):
    # Find symbol definition
    start = asm.index(sybl + ":")
    if start < 0:
        print(f"<Unable to extract function {sybl}>")
        return False

    # Find function size
    end = asm.index(f".size\t{sybl}, .-{sybl}", start)
    if end < 0:
        print(f"<Unable to extract function {sybl}")
        return False

    func = asm[start:end].strip() + "\n"

    # Eliminate labels that are defined but unused
    RE_LABEL = re.compile(r"^(\.[a-zA-Z_][a-zA-Z0-9_]*):$", re.MULTILINE)
    for label in RE_LABEL.findall(func):
        if func.count(label) == 1:
            func = func.replace(f"{label}:\n", "")

    print(func.strip())
    return True

def loadTests(input):
    RE_SPEC = re.compile(r'^//\s*([a-zA-Z_][a-zA-Z0-9_]*):\s*(.+)$')

    with open(input, "r") as fp:
        code = fp.read()

    tests = dict()
    for line in code.split("\n"):
        m = RE_SPEC.match(line)
        if m is not None:
            sybl = "_" + m[1]
            if sybl not in tests:
                tests[sybl] = []
            tests[sybl].append((m[2], parseProgExpr(m[2])))

    return tests

if len(sys.argv) < 2:
    print(f"usage: {sys.argv[0]} <C++ SOURCE> <CXXFLAGS...>", file=sys.stderr)

asm = runCompiler(sys.argv[1], sys.argv[2:])
funcs = extractFunctions(asm)
tests = loadTests(sys.argv[1])
errors = False

for sybl in tests:
    if sybl not in funcs:
        print(f"error: no function '{sybl}' found", file=sys.stderr)
        errors = True

init = True
for sybl in sorted(tests):
    if not init:
        print("")
    init = False
    print(f"\x1b[36m{40*'<>'}\x1b[0m")

    if not printRawFunction(asm, sybl):
        errors = True
    print("")

    for ref, expr in tests[sybl]:
        r = expr.evalAtProg(funcs[sybl])
        if r != 0:
            print(f"\x1b[32mPASSED\x1b[0m  {ref}")
        else:
            print(f"\x1b[31mFAILED\x1b[0m  {ref}")
            errors = True

sys.exit(1 if errors else 0)