From e94114899659e709c476a8232d2fc0a46a2da6c1 Mon Sep 17 00:00:00 2001
From: Lephenixnoir
Date: Sun, 4 Jun 2023 22:53:37 +0200
Subject: [PATCH] perf/cpu: add tests for mul.l pipeline delays

---
 src/perf/cpu.S | 26 ++++++++++++++++++++++++++
 src/perf/cpu.c |  3 +++
 2 files changed, 29 insertions(+)

diff --git a/src/perf/cpu.S b/src/perf/cpu.S
index 903709a..0bc5307 100644
--- a/src/perf/cpu.S
+++ b/src/perf/cpu.S
@@ -349,6 +349,32 @@ bench raw_LS_LS_addr, 1024
 2:	mov.l	@r5, r6
 end
 
+/* [Multiplication]
+
+   This section measures pipeline delays around mul.l and reads of MACL. */
+
+/* mul.l keeps the multiplier busy for 2 cycles -> 2 cycles /i */
+bench mul_single_32, 1024
+1:	mul.l	r4, r5
+2:	nop
+end
+
+/* The computed value can already be read back on cycle #2 -> 2 cycles /i */
+bench mul_single_32_sts, 1024
+1:	mul.l	r4, r5
+2:	sts	macl, r0	/* read the computed value into r0 */
+end
+
+/* However, the value takes an incredibly long time to actually arrive: 2 delay
+   (stall) cycles are needed, even more than a memory load! -> 5 cycles /i */
+bench mul_single_32_sts_EX, 1024
+1:	mul.l	r4, r5
+	nop
+
+	sts	macl, r0
+2:	add	#1, r0		/* uses r0 right after the sts */
+end
+
 /* [Branching]
 
    In this section, we investigate the cost of conditional execution and
diff --git a/src/perf/cpu.c b/src/perf/cpu.c
index f72e5b1..1015e0d 100644
--- a/src/perf/cpu.c
+++ b/src/perf/cpu.c
@@ -46,6 +46,9 @@
 	MACRO(raw_EX_LS_addr, 1024, "RAW on address: EX/LS") \
 	MACRO(raw_EX_LS_index, 1024, "RAW on index: EX/LS") \
 	MACRO(raw_LS_LS_addr, 1024, "RAW on address: LS/LS") \
+	MACRO(mul_single_32, 1024, "Pipeline: mul.l/mul.l") \
+	MACRO(mul_single_32_sts, 1024, "Pipeline: mul.l/sts") \
+	MACRO(mul_single_32_sts_EX, 1024, "Pipeline: mul.l/sts/EX") \
 	MACRO(branch_bra, 1024, "Branching: bra") \
 	MACRO(branch_bra_cpuloop, 1024, "Branching: bra (CPU loop)") \
 	MACRO(darken_1, 512, "Darken: 32-bit #1") \