From e94114899659e709c476a8232d2fc0a46a2da6c1 Mon Sep 17 00:00:00 2001
From: Lephenixnoir <sebastien.michelland@protonmail.com>
Date: Sun, 4 Jun 2023 22:53:37 +0200
Subject: [PATCH] perf/cpu: add tests for mul.l pipeline delays

---
 src/perf/cpu.S | 26 ++++++++++++++++++++++++++
 src/perf/cpu.c |  3 +++
 2 files changed, 29 insertions(+)

diff --git a/src/perf/cpu.S b/src/perf/cpu.S
index 903709a..0bc5307 100644
--- a/src/perf/cpu.S
+++ b/src/perf/cpu.S
@@ -349,6 +349,32 @@ bench raw_LS_LS_addr, 1024
 2:	mov.l	@r5, r6
 end
 
+/* [Multiplication]
+
+   This section investigates pipeline delays in the multiplier. */
+
+/* mul.l occupies the multiplier for 2 cycles -> 2 cycles /i */
+bench mul_single_32, 1024
+1:	mul.l	r4, r5		/* product goes to MACL; not read back here */
+2:	nop			/* filler, does not touch MACL */
+end
+
+/* The computed value can be retrieved on cycle #2 -> 2 cycles /i */
+bench mul_single_32_sts, 1024
+1:	mul.l	r4, r5		/* product goes to MACL */
+2:	sts	macl, r0	/* copy MACL to r0 right after the mul.l */
+end
+
+/* However, the value takes very long to actually arrive: reading r0 right
+   after sts costs 2 delay cycles, more than a memory load! -> 5 cycles /i */
+bench mul_single_32_sts_EX, 1024
+1:	mul.l	r4, r5		/* product goes to MACL */
+	nop
+
+	sts	macl, r0	/* copy MACL to r0 */
+2:	add	#1, r0		/* EX instruction consuming r0 from the sts */
+end
+
 /* [Branching]
 
    In this section, we investigate the cost of conditional execution and
diff --git a/src/perf/cpu.c b/src/perf/cpu.c
index f72e5b1..1015e0d 100644
--- a/src/perf/cpu.c
+++ b/src/perf/cpu.c
@@ -46,6 +46,9 @@
 	MACRO(raw_EX_LS_addr,		1024,	"RAW on address: EX/LS") \
 	MACRO(raw_EX_LS_index,		1024,	"RAW on index: EX/LS") \
 	MACRO(raw_LS_LS_addr,		1024,	"RAW on address: LS/LS") \
+	MACRO(mul_single_32,		1024,	"Pipeline: mul.l/mul.l") \
+	MACRO(mul_single_32_sts,	1024,	"Pipeline: mul.l/sts") \
+	MACRO(mul_single_32_sts_EX,	1024,	"Pipeline: mul.l/sts/EX") \
 	MACRO(branch_bra,			1024,	"Branching: bra") \
 	MACRO(branch_bra_cpuloop,	1024,	"Branching: bra (CPU loop)") \
 	MACRO(darken_1,				512,	"Darken: 32-bit #1") \