From b20731c8290a78c0e270c2e2b393b465538a17b8 Mon Sep 17 00:00:00 2001 From: Lephenixnoir Date: Sun, 29 Dec 2019 19:17:33 +0100 Subject: [PATCH] basic setup for running disassembly passes --- data/sh3.txt | 225 ------------------------------- data/sh4.txt | 26 ---- fxos/main.cpp | 90 ++++++++----- include/fxos/disasm-passes/cfg.h | 29 ++++ include/fxos/disassembly.h | 13 +- include/fxos/semantics.h | 7 - lib/disassembly.cpp | 42 +++++- lib/passes/print.cpp | 2 +- 8 files changed, 131 insertions(+), 303 deletions(-) delete mode 100644 data/sh3.txt delete mode 100644 data/sh4.txt create mode 100644 include/fxos/disasm-passes/cfg.h diff --git a/data/sh3.txt b/data/sh3.txt deleted file mode 100644 index 228c578..0000000 --- a/data/sh3.txt +++ /dev/null @@ -1,225 +0,0 @@ -type: assembly -name: sh-3 ---- - -# Format: [01nmdi]{16}, followed by the mnemonic and the list of arguments. -# In each opcode, there should be at most one sequence of "m", "n", "d" and "i" -# each (representing the location of the argument). 
-# -# Possible argument strings are predefined and include: -# rn rm #imm -# jump8 jump12 disp pcdisp -# @rn @rm @rn+ @rm+ @-rn -# @(disp,rn) @(disp,rm) @(r0,rn) @(r0,rm) @(disp,gbr) -# -# The disassembler substitutes some elements as follows: -# rn -> value of the "n"-sequence -# rm -> value of the "m"-sequence -# #imm -> value of the "i"-sequence -# disp -> value of the "d"-sequence -# jump8 -> value of the 8-bit "d"-sequence x2 plus value of PC -# jump12 -> value of the 12-bit "d"-sequence x2 plus value of PC -# @(disp,pc) -> value of the 8-bit "d"-sequence x2 or x4, plus value of PC -# TODO: This list does not exactly reflect the behavior of the parser - -0000000001001000 clrs -0000000000001000 clrt -0000000000101000 clrmac -0000000000011001 div0u -0000000000111000 ldtlb -0000000000001001 nop -0000000000101011 rte -0000000000001011 rts -0000000001011000 sets -0000000000011000 sett -0000000000011011 sleep - -0100nnnn00010101 cmp/pl rn -0100nnnn00010001 cmp/pz rn -0100nnnn00010000 dt rn -0000nnnn00101001 movt rn -0100nnnn00000100 rotl rn -0100nnnn00000101 rotr rn -0100nnnn00100100 rotcl rn -0100nnnn00100101 rotcr rn -0100nnnn00100000 shal rn -0100nnnn00100001 shar rn -0100nnnn00000000 shll rn -0100nnnn00000001 shlr rn -0100nnnn00001000 shll2 rn -0100nnnn00001001 shlr2 rn -0100nnnn00011000 shll8 rn -0100nnnn00011001 shlr8 rn -0100nnnn00101000 shll16 rn -0100nnnn00101001 shlr16 rn - -0011nnnnmmmm1100 add rm, rn -0011nnnnmmmm1110 addc rm, rn -0011nnnnmmmm1111 addv rm, rn -0010nnnnmmmm1001 and rm, rn -0011nnnnmmmm0000 cmp/eq rm, rn -0011nnnnmmmm0010 cmp/hs rm, rn -0011nnnnmmmm0011 cmp/ge rm, rn -0011nnnnmmmm0110 cmp/hi rm, rn -0011nnnnmmmm0111 cmp/gt rm, rn -0010nnnnmmmm1100 cmp/str rm, rn -0011nnnnmmmm0100 div1 rm, rn -0010nnnnmmmm0111 div0s rm, rn -0011nnnnmmmm1101 dmuls.l rm, rn -0011nnnnmmmm0101 dmulu.l rm, rn -0110nnnnmmmm1110 exts.b rm, rn -0110nnnnmmmm1111 exts.w rm, rn -0110nnnnmmmm1100 extu.b rm, rn -0110nnnnmmmm1101 extu.w rm, rn -0110nnnnmmmm0011 mov rm, 
rn -0000nnnnmmmm0111 mul.l rm, rn -0010nnnnmmmm1111 muls.w rm, rn -0010nnnnmmmm1110 mulu.w rm, rn -0110nnnnmmmm1011 neg rm, rn -0110nnnnmmmm1010 negc rm, rn -0110nnnnmmmm0111 not rm, rn -0010nnnnmmmm1011 or rm, rn -0100nnnnmmmm1100 shad rm, rn -0100nnnnmmmm1101 shld rm, rn -0011nnnnmmmm1000 sub rm, rn -0011nnnnmmmm1010 subc rm, rn -0011nnnnmmmm1011 subv rm, rn -0110nnnnmmmm1000 swap.b rm, rn -0110nnnnmmmm1001 swap.w rm, rn -0010nnnnmmmm1000 tst rm, rn -0010nnnnmmmm1010 xor rm, rn -0010nnnnmmmm1101 xtrct rm, rn - -0100mmmm00001110 ldc rm, sr -0100mmmm00011110 ldc rm, gbr -0100mmmm00101110 ldc rm, vbr -0100mmmm00111110 ldc rm, ssr -0100mmmm01001110 ldc rm, spc -0100mmmm10001110 ldc rm, r0_bank -0100mmmm10011110 ldc rm, r1_bank -0100mmmm10101110 ldc rm, r2_bank -0100mmmm10111110 ldc rm, r3_bank -0100mmmm11001110 ldc rm, r4_bank -0100mmmm11011110 ldc rm, r5_bank -0100mmmm11101110 ldc rm, r6_bank -0100mmmm11111110 ldc rm, r7_bank -0100mmmm00001010 lds rm, mach -0100mmmm00011010 lds rm, macl -0100mmmm00101010 lds rm, pr -0000nnnn00000010 stc sr, rn -0000nnnn00010010 stc gbr, rn -0000nnnn00100010 stc vbr, rn -0000nnnn00110010 stc ssr, rn -0000nnnn01000010 stc spc, rn -0000nnnn10000010 stc r0_bank, rn -0000nnnn10010010 stc r1_bank, rn -0000nnnn10100010 stc r2_bank, rn -0000nnnn10110010 stc r3_bank, rn -0000nnnn11000010 stc r4_bank, rn -0000nnnn11010010 stc r5_bank, rn -0000nnnn11100010 stc r6_bank, rn -0000nnnn11110010 stc r7_bank, rn -0000nnnn00001010 sts mach, rn -0000nnnn00011010 sts macl, rn -0000nnnn00101010 sts pr, rn - -0100nnnn00101011 jmp @rn -0100nnnn00001011 jsr @rn -0000nnnn10000011 pref @rn -0100nnnn00011011 tas.b @rn -0010nnnnmmmm0000 mov.b rm, @rn -0010nnnnmmmm0001 mov.w rm, @rn -0010nnnnmmmm0010 mov.l rm, @rn -0110nnnnmmmm0000 mov.b @rm, rn -0110nnnnmmmm0001 mov.w @rm, rn -0110nnnnmmmm0010 mov.l @rm, rn -0000nnnnmmmm1111 mac.l @rm+, @rn+ -0100nnnnmmmm1111 mac.w @rm+, @rn+ - -0110nnnnmmmm0100 mov.b @rm+, rn -0110nnnnmmmm0101 mov.w @rm+, rn -0110nnnnmmmm0110 
mov.l @rm+, rn - -0100mmmm00000111 ldc.l @rm+, sr -0100mmmm00010111 ldc.l @rm+, gbr -0100mmmm00100111 ldc.l @rm+, vbr -0100mmmm00110111 ldc.l @rm+, ssr -0100mmmm01000111 ldc.l @rm+, spc -0100mmmm10000111 ldc.l @rm+, r0_bank -0100mmmm10010111 ldc.l @rm+, r1_bank -0100mmmm10100111 ldc.l @rm+, r2_bank -0100mmmm10110111 ldc.l @rm+, r3_bank -0100mmmm11000111 ldc.l @rm+, r4_bank -0100mmmm11010111 ldc.l @rm+, r5_bank -0100mmmm11100111 ldc.l @rm+, r6_bank -0100mmmm11110111 ldc.l @rm+, r7_bank -0100mmmm00000110 lds.l @rm+, mach -0100mmmm00010110 lds.l @rm+, macl -0100mmmm00100110 lds.l @rm+, pr - -0010nnnnmmmm0100 mov.b rm, @-rn -0010nnnnmmmm0101 mov.w rm, @-rn -0010nnnnmmmm0110 mov.l rm, @-rn - -0100nnnn00000011 stc.l sr, @-rn -0100nnnn00010011 stc.l gbr, @-rn -0100nnnn00100011 stc.l vbr, @-rn -0100nnnn00110011 stc.l ssr, @-rn -0100nnnn01000011 stc.l spc, @-rn -0100nnnn10000011 stc.l r0_bank, @-rn -0100nnnn10010011 stc.l r1_bank, @-rn -0100nnnn10100011 stc.l r2_bank, @-rn -0100nnnn10110011 stc.l r3_bank, @-rn -0100nnnn11000011 stc.l r4_bank, @-rn -0100nnnn11010011 stc.l r5_bank, @-rn -0100nnnn11100011 stc.l r6_bank, @-rn -0100nnnn11110011 stc.l r7_bank, @-rn -0100nnnn00000010 sts.l mach, @-rn -0100nnnn00010010 sts.l macl, @-rn -0100nnnn00100010 sts.l pr, @-rn - -10000000nnnndddd mov.b r0, @(disp,rn) -10000001nnnndddd mov.w r0, @(disp,rn) -0001nnnnmmmmdddd mov.l rm, @(disp,rn) -10000100mmmmdddd mov.b @(disp,rm), r0 -10000101mmmmdddd mov.w @(disp,rm), r0 -0101nnnnmmmmdddd mov.l @(disp,rm), rn -0000nnnnmmmm0100 mov.b rm, @(r0,rn) -0000nnnnmmmm0101 mov.w rm, @(r0,rn) -0000nnnnmmmm0110 mov.l rm, @(r0,rn) -0000nnnnmmmm1100 mov.b @(r0,rm), rn -0000nnnnmmmm1101 mov.w @(r0,rm), rn -0000nnnnmmmm1110 mov.l @(r0,rm), rn -11000000dddddddd mov.b r0, @(disp,gbr) -11000001dddddddd mov.w r0, @(disp,gbr) -11000010dddddddd mov.l r0, @(disp,gbr) -11000100dddddddd mov.b @(disp,gbr), r0 -11000101dddddddd mov.w @(disp,gbr), r0 -11000110dddddddd mov.l @(disp,gbr), r0 - -11001101iiiiiiii and.b 
#imm, @(r0,gbr) -11001111iiiiiiii or.b #imm, @(r0,gbr) -11001100iiiiiiii tst.b #imm, @(r0,gbr) -11001110iiiiiiii xor.b #imm, @(r0,gbr) - -1001nnnndddddddd mov.w @(disp,pc), rn -1101nnnndddddddd mov.l @(disp,pc), rn -11000111dddddddd mova.l @(disp,pc), r0 - -0000mmmm00100011 braf rm -0000mmmm00000011 bsrf rm -10001011dddddddd bf jump8 -10001111dddddddd bf.s jump8 -10001001dddddddd bt jump8 -10001101dddddddd bt.s jump8 -1010dddddddddddd bra jump12 -1011dddddddddddd bsr jump12 - -0111nnnniiiiiiii add #imm, rn -11001001iiiiiiii and #imm, r0 -10001000iiiiiiii cmp/eq #imm, r0 -1110nnnniiiiiiii mov #imm, rn -11001011iiiiiiii or #imm, r0 -11001000iiiiiiii tst #imm, r0 -11001010iiiiiiii xor #imm, r0 -11000011iiiiiiii trapa #imm diff --git a/data/sh4.txt b/data/sh4.txt deleted file mode 100644 index 09f076a..0000000 --- a/data/sh4.txt +++ /dev/null @@ -1,26 +0,0 @@ -type: assembly -name: sh-4a-extensions ---- - -0000nnnn01110011 movco.l r0, @rn -0000mmmm01100011 movli.l @rm, r0 -0100mmmm10101001 movua.l @rm, r0 -0100mmmm11101001 movua.l @rm+, r0 -0000nnnn11000011 movca.l r0, @rn - -0000nnnn11100011 icbi @rn -0000nnnn10010011 ocbi @rn -0000nnnn10100011 ocbp @rn -0000nnnn10110011 ocbwb @rn - -0000nnnn11010011 prefi @rn -0000000010101011 synco - -0100mmmm00111010 ldc rm, sgr -0100mmmm11111010 ldc rm, dbr -0100mmmm00110110 ldc.l @rm+, sgr -0100mmmm11110110 ldc.l @rm+, dbr -0000nnnn00111010 stc sgr, rn -0000nnnn11111010 stc dbr, rn -0100nnnn00110010 stc.l sgr, @-rn -0100nnnn11110010 stc.l dbr, @-rn diff --git a/fxos/main.cpp b/fxos/main.cpp index 4d5978b..20006e4 100644 --- a/fxos/main.cpp +++ b/fxos/main.cpp @@ -1,8 +1,13 @@ #include "fxos-cli.h" -#include + +#include #include #include +#include #include + +#include + #include #include @@ -16,42 +21,43 @@ namespace fs = std::filesystem; using namespace FxOS; static char const *help_string = R"( -usage: fxos info - fxos disasm [options...] - fxos disasm -b [options...] +usage: fxos info + fxos disasm [options...] 
fxos analyze [-f] [-s] [-a] [-r] [options...] fxos is a reverse-engineering tool that disassembles and analyzes SuperH programs and OS dumps for fx9860g and fxcg50-like CASIO calculators, using an -editable database of platforms, syscalls, and OS knowledge. - -Commands: - info Identify an OS image: version, platform, date, checksums... - disasm Disassemble and annotate code with relative address targets, - syscall invocations, control flow, constant propagation and hints - about memory structure. - analyze Dig an address or syscall number, finding syscall references, - 4-aligned occurrences, memory region and probable role. +editable database of platform, syscall, and OS knowledge. General options: - -b Work with an arbitrary binary file, not an OS -3, --sh3 Assume SH3 OS and platform (default: SH4) -4, --sh4 Assume SH4 OS and platform (default: SH4) -Database extensions: - --load Read documentation from - --load Read documentation recursively from +A is either: + A target in library (eg "fx@3.10") + -f An arbitrary file which is loaded as ROM -Disassembly file selection: - Disassemble this target from the database (eg. "fx@3.10") - -f Disassemble this file as standalone ROM +INFO COMMAND -Disassembly options: +Identify an OS image: version, platform, date, checksums... + +DISASM COMMAND + +Disassemble and annotate code with relative address targets, syscalls, control +flow, propagated constants and hints about memory structure. + +Location specifiers:
Start disassembling at this address (hexa)
: Disassemble exactly the specified region. is an - hexadecimal number optionnally followed by k, M, or G. + hexadecimal number optionally followed by k, M, or G. % Start disassembling at this syscall's address (hexa) + Disassemble this library symbol (typically a syscall name). + Note that
takes precedence if ambiguous. + +Disassembly options: -p Execute the specified comma-separated list of passes + --load Read additional documentation from + --load Read additional documentation recursively from Available passes: cfg Build the control flow graph (always required) @@ -61,8 +67,13 @@ Available passes: regs Annotate code with peripheral register addresses The default sequence of passes is cfg,pcrel,cstprop,syscall,regs. When -disassembling a function (ie. no size specified on the command-line), the pcrel -and cfg passes are always executed to explore the function. +disassembling a function (ie. no size specified on the command-line), the cfg +pass is always executed to explore the function. + +ANALYZE COMMAND + +Dig an address or syscall number. Finds syscall references, 4-aligned +occurrences, memory region... Analysis modes: -f, --full Run all analysis passes on (same as -sar) @@ -312,16 +323,6 @@ int main_disassembly(int argc, char **argv) } if(error) return 1; -/* try { - FxOS::load("data/sh3.txt"); - if(mpu == '4') FxOS::load("data/sh4.txt"); - } - catch(FxOS::SyntaxError &e) { - std::cerr << e.file() << ":" << e.line() << ": " << - e.what() << "\n" << std::flush; - return 1; - } */ - if(!file.size()) { std::string targetname = argv[optind + 1]; @@ -333,11 +334,26 @@ int main_disassembly(int argc, char **argv) return 1; } - Target t(targets[targetname], library); + Target target(targets[targetname], library); - char const *ref = argv[optind + 2]; + char const *refstr = argv[optind + 2]; + uint32_t ref; + sscanf(refstr, "%x", &ref); - std::cout << "disassembling target:" << targetname << " ref:" << ref << "\n"; + Disassembly disasm(target); + OS *os = nullptr; + + std::cout << "disassembling target:" << targetname << " ref:" << refstr << "\n"; + + for(auto pass: passes) + { + std::cout << "running pass: " << pass << "\n"; + if(pass == "cfg") + { + CfgPass p(disasm); + p.run(ref); + } + } } else { diff --git a/include/fxos/disasm-passes/cfg.h 
b/include/fxos/disasm-passes/cfg.h new file mode 100644 index 0000000..b03a51f --- /dev/null +++ b/include/fxos/disasm-passes/cfg.h @@ -0,0 +1,29 @@ +//--- +// fxos.disasm-passes.cfg: Control Flow Graph construction +// +// This pass explores functions and computes the [jmptarget] field of concrete +// instructions as it goes. This is required for other passes that work by +// traversing the CFG, such as the abstract interpreter. +// +// This is the main exploration pass. Other passes do not typically load new +// instructions from the underlying disassembly. Straightforward passes such as +// [print] iterate on instructions loaded by this pass. +//--- + +#ifndef LIBFXOS_DISASM_PASSES_CFG_H +#define LIBFXOS_DISASM_PASSES_CFG_H + +#include + +namespace FxOS { + +class CfgPass: public DisassemblyPass +{ +public: + CfgPass(Disassembly &disasm); + void analyze(uint32_t pc, ConcreteInstruction &inst) override; +}; + +} /* namespace FxOS */ + +#endif /* LIBFXOS_DISASM_PASSES_CFG_H */ diff --git a/include/fxos/disassembly.h b/include/fxos/disassembly.h index c33650c..c8732d9 100644 --- a/include/fxos/disassembly.h +++ b/include/fxos/disassembly.h @@ -102,6 +102,12 @@ class Disassembly public: Disassembly(Target &target); + /* Check whether an instruction has been visited so far */ + bool hasins(uint32_t pc); + /* Get the minimum and maximum loaded instruction addresses */ + uint32_t minpc(); + uint32_t maxpc(); + /* Get the storage to any concrete instruction. The instruction will be loaded and initialized if it had not been read before. 
*/ ConcreteInstruction &readins(uint32_t pc); @@ -133,6 +139,8 @@ public: protected: /* Add an instruction to the queue to analyze next */ void enqueue(uint32_t pc); + /* Add the next loaded instruction in address space */ + void enqueue_next(uint32_t pc); /* Enqueue the unseen successors of this instruction */ void enqueue_unseen_successors(uint32_t pc, ConcreteInstruction &inst); /* Enqueue all the success of this instruction */ @@ -150,11 +158,6 @@ private: std::set m_seen; }; -class CfgPass: public DisassemblyPass -{ - CfgPass(Disassembly &disasm); - void analyze(uint32_t pc, ConcreteInstruction &inst) override; -}; class PcrelPass: public DisassemblyPass { diff --git a/include/fxos/semantics.h b/include/fxos/semantics.h index 855cece..0d058b1 100644 --- a/include/fxos/semantics.h +++ b/include/fxos/semantics.h @@ -24,13 +24,6 @@ namespace FxOS { class DataType { public: - /* Copy constructor */ - DataType(DataType const &other); - DataType & operator = (DataType other); - - /* Destructor that takes into account the non-trivial union */ - ~DataType(); - enum DataKind { /* Base types */ Integral, diff --git a/lib/disassembly.cpp b/lib/disassembly.cpp index 6c03cf6..d5e5259 100644 --- a/lib/disassembly.cpp +++ b/lib/disassembly.cpp @@ -45,9 +45,38 @@ Disassembly::Disassembly(Target &target): { } +bool Disassembly::hasins(uint32_t pc) +{ + return m_instructions.count(pc) > 0; +} + +uint32_t Disassembly::minpc() +{ + uint32_t min = 0xffffffff; + + for(auto &it: m_instructions) + { + if(it.first < min) min = it.first; + } + + return min; +} + +uint32_t Disassembly::maxpc() +{ + uint32_t max = 0x00000000; + + for(auto &it: m_instructions) + { + if(it.first > max) max = it.first; + } + + return max; +} + ConcreteInstruction &Disassembly::readins(uint32_t pc) { - if(pc & 1) throw std::runtime_error("Disassembly::readins at odd PC"); + if(pc & 1) throw std::runtime_error("Disassembly::ins_read at odd PC"); try { @@ -64,7 +93,7 @@ ConcreteInstruction 
&Disassembly::readins(uint32_t pc) Instruction &inst = *insmap[opcode]; ConcreteInstruction ci(inst); - m_instructions.emplace(std::make_pair(pc, ci)); + m_instructions.emplace(pc, ci); return m_instructions.at(pc); } } @@ -86,6 +115,15 @@ void DisassemblyPass::enqueue(uint32_t pc) m_queue.push(pc); } +void DisassemblyPass::enqueue_next(uint32_t pc) +{ + /* TODO: DisassemblyPass::enqueue_next is inefficient */ + do pc += 2; + while(!m_disasm.hasins(pc)); + + enqueue(pc); +} + void DisassemblyPass::enqueue_unseen_successors(uint32_t pc, ConcreteInstruction &inst) { diff --git a/lib/passes/print.cpp b/lib/passes/print.cpp index 6e93413..b377219 100644 --- a/lib/passes/print.cpp +++ b/lib/passes/print.cpp @@ -14,7 +14,7 @@ PrintPass::PrintPass(Disassembly &disasm): void PrintPass::analyze(uint32_t pc, ConcreteInstruction &ci) { std::cout << ci.inst.mnemonic << "\n"; - enqueue_unseen_successors(pc, ci); + enqueue_next(pc); } } /* namespace FxOS */