From e90abaaabc040be0e51bfe21d6b376571bc89e15 Mon Sep 17 00:00:00 2001
From: Lephenixnoir <sebastien.michelland@protonmail.com>
Date: Tue, 19 Jul 2022 23:18:01 +0100
Subject: [PATCH] libnum: add tests for how optimized compiled assembly code is

This change adds tests for libnum (run with `make -C build-x tests`)
that compile example programs with g++ and evaluate how optimized the
assembly code is. This is done by checking user-provided specifications
of what instructions should and shouldn't be used against the compiled
assembler.
---
 CMakeLists.txt            |   8 +
 libnum/CMakeLists.txt     |  16 ++
 libnum/test/isel.py       | 474 ++++++++++++++++++++++++++++++++++++++
 libnum/test/isel_num8.cpp |  19 ++
 4 files changed, 517 insertions(+)
 create mode 100644 libnum/test/isel.py
 create mode 100644 libnum/test/isel_num8.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9eda42d..4652086 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,6 +34,7 @@
 
 cmake_minimum_required(VERSION 3.15)
 project(Azur VERSION 0.1 LANGUAGES C CXX ASM)
+include(CTest)
 
 set(CMAKE_INSTALL_MESSAGE LAZY)
 
@@ -150,3 +151,10 @@ add_subdirectory(azur)
 
 # CMake module to find the library
 install(FILES FindAzur.cmake DESTINATION ${MODPATH})
+
+#---
+# Testing
+#---
+
+# Force --verbose on CTest
+add_custom_target(tests COMMAND ${CMAKE_CTEST_COMMAND} --verbose)
diff --git a/libnum/CMakeLists.txt b/libnum/CMakeLists.txt
index 7fb42e5..3cc6c59 100644
--- a/libnum/CMakeLists.txt
+++ b/libnum/CMakeLists.txt
@@ -5,6 +5,8 @@
 #-----------------------------------------------------------------------------#
 # libnum build system
 
+include(CTest)
+
 add_library(num STATIC
   src/static_checks.cpp)
 
@@ -14,3 +16,17 @@ target_include_directories(num PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
 install(TARGETS num DESTINATION ${LIBDIR})
 # Headers: azur/*.h
 install(DIRECTORY include/ DESTINATION ${INCDIR})
+
+#---
+# Testing
+#---
+
+set(TESTS
+  test/isel_num8.cpp)
+
+foreach(testfile IN LISTS TESTS)
+  add_test(NAME "${testfile}"
+    COMMAND python "${CMAKE_CURRENT_SOURCE_DIR}/test/isel.py"
+                   "${CMAKE_CURRENT_SOURCE_DIR}/${testfile}"
+                   -I "${CMAKE_CURRENT_SOURCE_DIR}/include")
+endforeach()
diff --git a/libnum/test/isel.py b/libnum/test/isel.py
new file mode 100644
index 0000000..b3be802
--- /dev/null
+++ b/libnum/test/isel.py
@@ -0,0 +1,474 @@
+#-----------------------------------------------------------------------------#
+#  ,"  /\  ",    Azur: A game engine for CASIO fx-CG and PC                   #
+# |  _/__\_  |   Designed by Lephe' and the Planète Casio community.          #
+#  "._`\/'_."    License: MIT <https://opensource.org/licenses/MIT>           #
+#-----------------------------------------------------------------------------#
+# isel.py: Instruction selection testing facility
+#
+# Many of the primitive numerical functions of libnum are written to compile
+# efficiently on SuperH, where "efficiently" mainly cares about using specific
+# instructions and generating short code. This utility tests these assumptions
+# by compiling C++ files that use the library, and matching the assembler code
+# against simple specifications.
+#
+# The assembler output from g++ is decoded by simple text analysis. Function
+# span from their `_SYMBOL:` to their `.size _SYMBOL, .-_SYMBOL`. Mnemonics are
+# split on whitespace and arguments on commas; directives and unused labels are
+# removed.
+#
+# The assembler output is matched using two types of expressions:
+# - "Program expressions", which run against an entire function and evaluate to
+#   integers. These are used to count, mainly.
+# - "Instruction expressions", which run against single instructions and
+#   evaluate to booleans. These are used to identify instructions.
+#
+# Instruction expressions are based on wildcard patterns matching mnemonics,
+# like `add*` or `b?.s`, combined with logical operators `!`, `&&` and `||`.
+# For instance, `add || addc || addv` identifies addition instructions, while
+# `!mov && !rts` eliminates the usual function boilerplate.
+#
+# Program expressions are based on the count-expression `[e]` where `e` is an
+# instruction expression. `[e]` evaluates to the number of instructions in the
+# function that match `e`. These are combined with integral constants and the
+# usual arithmetic, logical and comparison operators. C-style bool-as-int
+# semantics are used, so tests like `!= 0` or `> 0` can often be omitted. `%`
+# is a shortcut for the number of "non-trivial" instructions, and currently
+# expands to `[!mov && !rts]`.
+#
+# A test is a normal C++ source built using the library, which exposes
+# functions with C linkage (ie. no name mangling) and has specifications in
+# comments of the form:
+#
+#  // FUNCTION_NAME: PROGRAM_EXPRESSION
+#
+# The comments should span entire lines and need not be placed near the
+# functions that they test.
+#---
+
+from dataclasses import dataclass
+import subprocess
+import functools
+import fnmatch
+import typing
+import enum
+import sys
+import os
+import re
+
+#---
+# Program representation and parsing
+#---
+
+@dataclass
+class Insn:
+    """A concrete instruction from a compiled assembly program."""
+    mnemonic: str
+    args: list[str]
+
+#---
+# Specification representation and parsing
+#---
+
+@dataclass
+class InsnPattern:
+    """An abstract pattern that can be matched against asm instructions."""
+
+    # Instruction mnemonic as a wildcard pattern (may use '?' and '*')
+    mnemonicWildcard: str
+    # Pattern for the arguments
+    # args: list
+
+    def evalAtInsn(self, program, i) -> bool:
+        return fnmatch.fnmatchcase(program[i].mnemonic, self.mnemonicWildcard)
+
+    def treeStr(self, indent):
+        return (" "*indent) + self.mnemonicWildcard
+
+@dataclass
+class InsnExpr:
+    """An expression built off `InsnPattern`, which runs on single instructions
+       and evaluates to a boolean."""
+
+    # Node constructor and arguments
+    ctor: str
+    args: list[typing.Union[InsnPattern, "InsnExpr"]]
+
+    def evalAtInsn(self, program, i) -> bool:
+        args = [a.evalAtInsn(program, i) for a in self.args]
+
+        match self.ctor, args:
+            case "mnemonic", [x]: return x
+            # Boolean operators
+            case "!",  [x]:   return 1 if not x else 0
+            case "&&", [x,y]: return 1 if x and y else 0
+            case "||", [x,y]: return 1 if x or y else 0
+
+    def treeStr(self, indent):
+        args = [a.treeStr(indent+2) for a in self.args]
+        return (" "*indent) + self.ctor + ":\n" + "\n".join(args)
+
+@dataclass
+class ProgExpr:
+    """An expression which runs on full programs and evaluates to counts of
+       matched instructions."""
+
+    # Node constructor and arguments
+    ctor: str
+    args: list[typing.Union[InsnExpr, int, "Expr"]]
+
+    def evalArg(self, program, arg):
+        match arg:
+            case int(i):
+                return i
+            case InsnExpr(_, _) as e:
+                return sum(int(e.evalAtInsn(program, i))
+                           for i in range(len(program)))
+            case ProgExpr(_, _) as e:
+                return e.evalAtProg(program)
+
+    def evalAtProg(self, program) -> int:
+        args = [self.evalArg(program, a) for a in self.args]
+
+        match self.ctor, args:
+            case "atom", [x]: return x
+            # Comparisons
+            case "<",  [x,y]: return x < y
+            case ">",  [x,y]: return x > y
+            case "<=", [x,y]: return x <= y
+            case ">=", [x,y]: return x >= y
+            case "=",  [x,y]: return x == y
+            case "!=", [x,y]: return x != y
+            # Arithmetic
+            case "+",  [x,y]: return x + y
+            case "-",  [x,y]: return x - y
+            case "-",  [x]:   return -x
+            # Boolean operators
+            case "!",  [x]:   return 1 if not x else 0
+            case "&&", [x,y]: return 1 if x and y else 0
+            case "||", [x,y]: return 1 if x or y else 0
+
+    def argStr(self, arg, indent):
+        match arg:
+            case int(i):
+                return (" "*indent) + str(i)
+            case InsnExpr(_, _) as e:
+                return e.treeStr(indent)
+            case ProgExpr(_, _) as e:
+                return e.treeStr(indent)
+
+    def treeStr(self, indent):
+        args = [self.argStr(a, indent+2) for a in self.args]
+        return (" "*indent) + self.ctor + ":\n" + "\n".join(args)
+
+T = enum.Enum("T", "NUM OP MNEMONIC LPAR RPAR LBRA RBRA PERCENT END".split())
+
+@dataclass
+class Token:
+    type: T
+    value: typing.Any
+
+    def __str__(self):
+        return f"{self.type}({self.value})"
+
+class ExprLexer:
+    """A lexer for `InsnPattern`, `InsnExpr` and `ProgExpr`."""
+
+    RE_NUM       = re.compile(r"[0-9]+|0[xX][0-9a-fA-F]+")
+    RE_OP        = re.compile(r"!|&&|\|\||<=|>=|<>|<|>|=|!=|\+|-")
+    RE_MNEMONIC  = re.compile(r"[a-zA-Z.*?][a-zA-Z0-9.*?]*")
+    PUNCT = {
+        "(": T.LPAR, ")": T.RPAR,
+        "[": T.LBRA, "]": T.RBRA,
+        "%": T.PERCENT,
+    }
+
+    def __init__(self, code):
+        """Initialize the lexer to start analyzing `code`."""
+        self.sourceCode = code
+        self.rewind()
+
+    def rewind(self):
+        """Start or restart lexing the same input."""
+        self.code = self.sourceCode
+        self.position = 0
+        self.errors = 0
+
+    def atEnd(self):
+        """Check whether the end of the input has been reached."""
+        return len(self.code) == 0
+
+    def dump(self, fp=sys.stdout):
+        """Exhaust lexer input and print the result to the specified stream."""
+        while not self.atEnd():
+            t = self.lex()
+            print(f"{self.position:5d}: {t}", file=fp)
+
+    def lex(self):
+        """Return the next token in the stream."""
+        self.position += 1
+        c = self.code.lstrip(" \t")
+
+        if len(c) == 0:
+            return Token(T.END, None)
+
+        if c[0] in ExprLexer.PUNCT:
+            self.code = c[1:]
+            return Token(ExprLexer.PUNCT[c[0]], None)
+
+        if (m := ExprLexer.RE_NUM.match(c)) is not None:
+            self.code = c[len(m[0]):]
+            return Token(T.NUM, int(m[0], 0))
+
+        if (m := ExprLexer.RE_OP.match(c)) is not None:
+            self.code = c[len(m[0]):]
+            return Token(T.OP, m[0])
+
+        if (m := ExprLexer.RE_MNEMONIC.match(c)) is not None:
+            self.code = c[len(m[0]):]
+            return Token(T.MNEMONIC, m[0])
+
+        # Raise a lexing error
+        s = c.split(maxsplit=1)
+        err = s[0]
+        self.code = s[1] if len(s) > 1 else ""
+        raise Exception(f"Lexing error near '{err}'")
+
+class ExprParser:
+    """An LL(1) recursive descent parser for program expressions."""
+
+    def __init__(self, lexer):
+        """Parse the output of a given lexer."""
+        self.lexer = lexer
+        self.la = None
+
+    def parseProgExpr(self):
+        """Parse the entire input as a ProgExpr."""
+        self.lexer.rewind()
+        self.la = None
+        self.advance()
+        e = self.pexpr()
+
+        if not self.lexer.atEnd():
+            print("Remaining input:")
+            self.lexer.dump()
+            raise Exception("Syntax error: expected end of input")
+        return e
+
+    def advance(self):
+        """Return the next token and update the lookahead."""
+        next = self.la
+        self.la = self.lexer.lex()
+        return next
+
+    def expect(self, types, pred=None, optional=False):
+        """Read the next token, ensuring it is one of the specified types; if
+           `pred` is specified, also tests the predicate. If `optional` is set,
+           returns None in case of mismatch rather than raising an error."""
+
+        if isinstance(types, T):
+            types = [types]
+        if self.la.type in types and (pred is None or pred(self.la)):
+            return self.advance()
+        if optional:
+            return None
+
+        expected = ", ".join(str(t) for t in types)
+        pos = self.lexer.position
+        err = f"Expected one of {expected}, got {self.la} (at token {pos})"
+        if pred is not None:
+            err += " (with predicate)"
+        raise Exception(f"Syntax error: {err}")
+
+    # Rule combinators implementing unary and binary operators with precedence
+
+    def binaryOpsLeft(ctor, ops):
+        def decorate(f):
+            def symbol(self):
+                e = f(self)
+                pred = lambda t: t.value in ops
+                while (op := self.expect(T.OP, pred, True)) is not None:
+                    e = ctor(op.value, [e, f(self)])
+                return e
+            return symbol
+        return decorate
+
+    def binaryOps(ctor, ops, *, rassoc=False):
+        def decorate(f):
+            def symbol(self):
+                lhs = f(self)
+                pred = lambda t: t.value in ops
+                if (op := self.expect(T.OP, pred, True)) is not None:
+                    rhs = symbol(self) if rassoc else f(self)
+                    return ctor(op.value, [lhs, rhs])
+                else:
+                    return lhs
+            return symbol
+        return decorate
+
+    def binaryOpsRight(ctor, ops):
+        return binaryOpsRight(ctor, ops, rassoc=True)
+
+    def unaryOps(ctor, ops, assoc=True):
+        def decorate(f):
+            def symbol(self):
+                if (op := self.expect(T.OP, optional=True,
+                        pred=lambda t: t.value in ops)) is not None:
+                    arg = symbol(self) if assoc else f(self)
+                    return ctor(op.value, [arg])
+                else:
+                    return f(self)
+            return symbol
+        return decorate
+
+    # Parsing rules
+
+    @binaryOpsLeft(ProgExpr, ["||"])
+    @binaryOpsLeft(ProgExpr, ["&&"])
+    @binaryOps(ProgExpr, [">", ">=", "<", "<=", "=", "!="])
+    @binaryOpsLeft(ProgExpr, ["+", "-"])
+    @unaryOps(ProgExpr, ["!", "-"])
+    def pexpr(self):
+        t = self.expect([T.LPAR, T.NUM, T.PERCENT, T.LBRA])
+        if t.type == T.LPAR:
+            pe = self.pexpr()
+            self.expect(T.RPAR)
+            return pe
+        elif t.type == T.NUM:
+            return ProgExpr("atom", [t.value])
+        elif t.type == T.PERCENT:
+            return parseProgExpr("[!rts && !mov]")
+        elif t.type == T.LBRA:
+            ie = self.iexpr()
+            self.expect(T.RBRA)
+            return ProgExpr("atom", [ie])
+
+    @binaryOpsLeft(InsnExpr, ["||"])
+    @binaryOpsLeft(InsnExpr, ["&&"])
+    @unaryOps(InsnExpr, ["!"])
+    def iexpr(self):
+        if self.expect(T.LPAR, optional=True):
+            ie = self.iexpr()
+            self.expect(T.RPAR)
+            return ie
+        else:
+            ip = self.ipat()
+            return InsnExpr("mnemonic", [ip])
+
+    def ipat(self):
+        t = self.expect(T.MNEMONIC)
+        return InsnPattern(t.value)
+
+def parseProgExpr(string):
+    l = ExprLexer(string)
+    p = ExprParser(l)
+    return p.parseProgExpr()
+
+#---
+# Main logic
+#---
+
+def runCompiler(input, flags):
+    p = subprocess.run(
+        ["sh-elf-g++", input, *flags, "-S", "-o", "-", "-std=c++20", "-O2"],
+        stdout=subprocess.PIPE, check=True)
+    return str(p.stdout, "utf8")
+
+def extractFunctions(asm):
+    # Normalize spacing
+    asm = asm.replace("\t", " ")
+    # Split into lines and remove indentation
+    lines = [l.strip() for l in asm.splitlines()]
+    # Remove directives and local symbols
+    lines = [l for l in lines if l and not l.startswith(".")]
+
+    funcs = dict()
+    currentFunc = None
+
+    for l in lines:
+        if l.endswith(":"):
+            currentFunc = l[:-1]
+            funcs[currentFunc] = []
+        elif currentFunc is None:
+            raise Exception(f"instruction '{l}' before symbol name")
+        else:
+            mnemonic, *args = l.split(maxsplit=1)
+            if args != []:
+                args = [a.strip() for a in args[0].split(",")]
+            funcs[currentFunc].append(Insn(mnemonic, args))
+
+    return funcs
+
+def printRawFunction(asm, sybl):
+    # Find symbol definition
+    start = asm.index(sybl + ":")
+    if start < 0:
+        print(f"<Unable to extract function {sybl}>")
+        return False
+
+    # Find function size
+    end = asm.index(f".size\t{sybl}, .-{sybl}", start)
+    if end < 0:
+        print(f"<Unable to extract function {sybl}")
+        return False
+
+    func = asm[start:end].strip() + "\n"
+
+    # Eliminate labels that are defined but unused
+    RE_LABEL = re.compile(r"^(\.[a-zA-Z_][a-zA-Z0-9_]*):$", re.MULTILINE)
+    for label in RE_LABEL.findall(func):
+        if func.count(f"{label}:\n") == 1:
+            func = func.replace(f"{label}:\n", "")
+
+    print(func.strip())
+    return True
+
+def loadTests(input):
+    RE_SPEC = re.compile(r'^//\s*([a-zA-Z_][a-zA-Z0-9_]*):\s*(.+)$')
+
+    with open(input, "r") as fp:
+        code = fp.read()
+
+    tests = dict()
+    for line in code.split("\n"):
+        m = RE_SPEC.match(line)
+        if m is not None:
+            sybl = "_" + m[1]
+            if sybl not in tests:
+                tests[sybl] = []
+            tests[sybl].append((m[2], parseProgExpr(m[2])))
+
+    return tests
+
+if len(sys.argv) < 2:
+    print(f"usage: {sys.argv[0]} <C++ SOURCE> <CXXFLAGS...>", file=sys.stderr)
+
+asm = runCompiler(sys.argv[1], sys.argv[2:])
+funcs = extractFunctions(asm)
+tests = loadTests(sys.argv[1])
+errors = False
+
+for sybl in tests:
+    if sybl not in funcs:
+        print(f"error: no function '{sybl}' found", file=sys.stderr)
+        errors = True
+
+init = True
+for sybl in sorted(tests):
+    if not init:
+        print("")
+    init = False
+    print(f"\x1b[36m{40*'<>'}\x1b[0m")
+
+    if not printRawFunction(asm, sybl):
+        errors = True
+    print("")
+
+    for ref, expr in tests[sybl]:
+        r = expr.evalAtProg(funcs[sybl])
+        if r != 0:
+            print(f"\x1b[32mPASSED\x1b[0m  {ref}")
+        else:
+            print(f"\x1b[31mFAILED\x1b[0m  {ref}")
+            errors = True
+
+sys.exit(1 if errors else 0)
diff --git a/libnum/test/isel_num8.cpp b/libnum/test/isel_num8.cpp
new file mode 100644
index 0000000..7fcc0a0
--- /dev/null
+++ b/libnum/test/isel_num8.cpp
@@ -0,0 +1,19 @@
+#include <num/num.h>
+
+using namespace num;
+
+extern "C" {
+
+// num8_of_num16: %=0
+num8 num8_of_num16(num16 x)
+{
+    return num8(x);
+}
+
+// num8_of_num32: %=1 && [shlr8]
+num8 num8_of_num32(num32 x)
+{
+    return num8(x);
+}
+
+}