perf/cpucache: evidence the 32-kiB operand cache

Lephenixnoir 2021-06-21 09:39:36 +02:00
parent ebc9b5c1c2
commit 81adef2785
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
4 changed files with 276 additions and 16 deletions

src/perf/cpu.S (new file, 128 lines)

@@ -0,0 +1,128 @@
# We put all the code in ILRAM to avoid measurement variations caused by code
# being fetched from ROM. The ILRAM is ideal for this task because successive
# instruction accesses take only 1 cycle (assuming no interference, of which
# there is none here).
.section .ilram
# Test prologue for COUNT iterations (must be a multiple of 256). Note that the
# prologue has an even number of instructions, so the loop code ends up
# 4-aligned, which is extremely important.
#define PROLOGUE(COUNT) \
mov #(COUNT/256), r0 ; \
shll8 r0 ; \
ldrs 1f ; \
ldre 2f ; \
ldrc r0 ; \
nop
# Test epilogue
#define EPILOGUE() \
rts ; \
nop
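# Note (added): the ldrs/ldre/ldrc sequence programs the DSP repeat unit: RS
# and RE mark the first and last instructions of the body (labels 1: and 2:)
# and RC holds the iteration count, so the hardware replays the body COUNT
# times with no branch instruction in the loop; roughly
# for(i = 0; i < COUNT; i++) { body }.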
/* [Baseline]
In this first section, we want to establish an approximate cost of the
setup, which consists of TMU access for libprof, function calls, and the
loop setup for the DSP. */
.global _perf_cpu_empty
# Empty setup (0 iterations), as baseline
.align 4
_perf_cpu_empty:
PROLOGUE(0)
1: 2: EPILOGUE()
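# cpu.c below averages 16 runs of this empty test (TMU_baseline) and subtracts
# the result from every measurement, leaving only the cost of the loop body.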
/* [Loop control]
In this section, we want to check whether the DSP repeat system has any
added cost per-loop, and we do this by executing the same instructions with
a varying number of DSP repeat jumps.
The DSP jump has no additional cost, which makes testing much simpler by
avoiding the loop unrolling that would otherwise be needed to amortize the
cost of the jump. It also allows for tighter loops in real-world programs,
and tighter code fits better in cache. */
.global _perf_cpu_nop_2048x1
.global _perf_cpu_nop_1024x2
.global _perf_cpu_nop_512x4
.global _perf_cpu_nop_256x8
# nop loop (2048 iterations of 1 nop) -> 2 cycles /i
# Parallel execution likely cannot happen across the DSP jump for architectural
# reasons; my guess is that the next instruction isn't fetched yet.
.align 4
_perf_cpu_nop_2048x1:
PROLOGUE(2048)
1: 2: nop
EPILOGUE()
# nop loop (1024 iterations of 2 nop) -> 1 cycle /i
.align 4
_perf_cpu_nop_1024x2:
PROLOGUE(1024)
1: nop
2: nop
EPILOGUE()
# nop loop (512 iterations of 4 nop) -> 2 cycles /i
.align 4
_perf_cpu_nop_512x4:
PROLOGUE(512)
1: nop
nop
nop
2: nop
EPILOGUE()
# nop loop (256 iterations of 8 nop) -> 4 cycles/i
.align 4
_perf_cpu_nop_256x8:
PROLOGUE(256)
1: nop
nop
nop
nop
nop
nop
nop
2: nop
EPILOGUE()
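# Reading these results with the per-iteration costs above: the measured totals
# should be about 2048*2 = 4096 Iphi cycles for 2048x1, 1024*1 = 1024 for
# 1024x2, 512*2 = 1024 for 512x4, and 256*4 = 1024 for 256x8; cpu.c's
# Iphi_per_iteration() divides by the iteration count to recover the cost /i.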
/* [Parallel execution]
In this section, we reproduce simple cases of superscalar parallelism for
instructions of different types, using only instructions that have trivial
pipelines with no extra cycles. */
.global _perf_cpu_EX_EX
.global _perf_cpu_MT_MT
.global _perf_cpu_LS_LS
# EX/EX (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
.align 4
_perf_cpu_EX_EX:
PROLOGUE(1024)
1: add #0, r0
2: add #0, r1
EPILOGUE()
# MT/MT (1024 iterations of 2 parallel instructions) -> 1 cycle /i
.align 4
_perf_cpu_MT_MT:
PROLOGUE(1024)
1: mov r0, r1
2: mov r2, r3
EPILOGUE()
# LS/LS (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
.align 4
_perf_cpu_LS_LS:
PROLOGUE(1024)
1: mov.l @r15, r0
2: mov.l @r15, r1
EPILOGUE()
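# Reading these results: MT/MT finishes in about 1024 cycles because the two
# mov dual-issue, while EX/EX and LS/LS take about 2048 cycles since two
# instructions of the same non-MT group cannot issue in the same cycle.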

src/perf/cpu.c (new file, 122 lines)

@@ -0,0 +1,122 @@
#include <gint/display.h>
#include <gint/keyboard.h>
#include <gint/clock.h>
#include <gintctl/perf.h>
#include <gintctl/util.h>
#include <libprof.h>
/* Baseline */
void perf_cpu_empty(void);
/* Loop control */
void perf_cpu_nop_2048x1(void);
void perf_cpu_nop_1024x2(void);
void perf_cpu_nop_512x4(void);
void perf_cpu_nop_256x8(void);
/* Parallel execution */
void perf_cpu_EX_EX(void);
void perf_cpu_MT_MT(void);
void perf_cpu_LS_LS(void);
/* Is subtracted from result times if specified; in TMU units (prof.elapsed) */
static uint32_t baseline_ticks = 0;
/* Number of CPU cycles spent executing a function */
uint32_t Iphi_cycles(void (*function)(void))
{
prof_t perf = prof_make();
prof_enter(perf);
(*function)();
prof_leave(perf);
clock_frequency_t const *freq = clock_freq();
uint32_t TMU_cycles = perf.elapsed - baseline_ticks;
uint32_t PLL_cycles = (TMU_cycles * 4) * freq->Pphi_div;
return PLL_cycles / freq->Iphi_div;
}
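/* Worked example of the conversion above (divider values are hypothetical;
the real ones come from clock_freq()): a TMU tick is 4 Pphi cycles, so with
Pphi_div = 4 and Iphi_div = 2, 1000 ticks -> 4000 Pphi cycles -> 16000 PLL
cycles -> 8000 Iphi cycles. */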
/* Number of CPU cycles per iteration; the number of iterations must obviously
match assembler code for that test */
float Iphi_per_iteration(void (*function)(void), int count)
{
return (float)Iphi_cycles(function) / count;
}
/* Number of TMU cycles for an empty function */
uint32_t TMU_baseline(void)
{
prof_t perf = prof_make();
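/* prof.elapsed accumulates over the 16 enter/leave pairs below, so dividing
by 16 at the end yields the average cost of one empty run. */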
for(int i = 0; i < 16; i++)
{
prof_enter(perf);
perf_cpu_empty();
prof_leave(perf);
}
return perf.elapsed / 16;
}
//---
void gintctl_perf_cpu(void)
{
int key = 0;
/* Measure baseline time */
baseline_ticks = TMU_baseline();
uint32_t Iphi_cpu_nop_2048x1 = 0;
uint32_t Iphi_cpu_nop_1024x2 = 0;
uint32_t Iphi_cpu_nop_512x4 = 0;
uint32_t Iphi_cpu_nop_256x8 = 0;
uint32_t Iphi_cpu_EX_EX = 0;
uint32_t Iphi_cpu_MT_MT = 0;
uint32_t Iphi_cpu_LS_LS = 0;
while(key != KEY_EXIT)
{
dclear(C_WHITE);
#ifdef FXCG50
row_title("CPU instruction parallelism and pipelining");
row_print(1, 1, "Baseline ticks: %d",
baseline_ticks);
row_print(3, 1, "Iphi cycles for 2048x1 nop: %d",
Iphi_cpu_nop_2048x1);
row_print(4, 1, "Iphi cycles for 1024x2 nop: %d",
Iphi_cpu_nop_1024x2);
row_print(5, 1, "Iphi cycles for 512x4 nop: %d",
Iphi_cpu_nop_512x4);
row_print(6, 1, "Iphi cycles for 256x8 nop: %d",
Iphi_cpu_nop_256x8);
row_print(8, 1, "Iphi cycles for EX/EX: %d",
Iphi_cpu_EX_EX);
row_print(9, 1, "Iphi cycles for MT/MT: %d",
Iphi_cpu_MT_MT);
row_print(10, 1, "Iphi cycles for LS/LS: %d",
Iphi_cpu_LS_LS);
fkey_button(1, "RUN");
#endif
dupdate();
key = getkey().key;
if(key == KEY_F1)
{
Iphi_cpu_nop_2048x1 = Iphi_cycles(perf_cpu_nop_2048x1);
Iphi_cpu_nop_1024x2 = Iphi_cycles(perf_cpu_nop_1024x2);
Iphi_cpu_nop_512x4 = Iphi_cycles(perf_cpu_nop_512x4);
Iphi_cpu_nop_256x8 = Iphi_cycles(perf_cpu_nop_256x8);
Iphi_cpu_EX_EX = Iphi_cycles(perf_cpu_EX_EX);
Iphi_cpu_MT_MT = Iphi_cycles(perf_cpu_MT_MT);
Iphi_cpu_LS_LS = Iphi_cycles(perf_cpu_LS_LS);
}
}
}

(modified file: cpucache assembly)

@@ -17,20 +17,27 @@ _cpucache_nop1024:
 # r4: Buffer to read from
-# r5: Buffer size (multiple of 4)
+# r5: Buffer size (multiple of 32)
 # r6: Number of rounds
 _cpucache_rounds:
-mov r4, r1
-add r5, r1
-add #-1, r1
 mov r4, r0
+mov r5, r2
+mov #-5, r3
+shld r3, r2
-1: mov.b @r0+, r2
-mov.b @r0+, r2
-mov.b @r0+, r2
-cmp/ge r1, r0
-bf 1b
-mov.b @r0+, r2
+ldrs 1f
+ldre 2f
+ldrc r2
+nop
+1: mov.l @r0+, r1
+mov.l @r0+, r1
+mov.l @r0+, r1
+mov.l @r0+, r1
+mov.l @r0+, r1
+mov.l @r0+, r1
+mov.l @r0+, r1
+2: mov.l @r0+, r1
 dt r6
 bf _cpucache_rounds
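In C terms, the rewritten loop now traverses the buffer as longwords: the shld
by -5 turns the byte size into a repeat count of size/32, and each DSP-repeat
iteration issues eight mov.l reads (32 bytes). A minimal sketch of the
equivalent access pattern; the function name is illustrative, not part of the
commit:

#include <stdint.h>

/* Hedged model of the new _cpucache_rounds: size must be a multiple of 32,
   and every round reads the whole buffer as 32-bit words. */
void cpucache_rounds_model(const uint32_t *buf, uint32_t size, int rounds)
{
    for(int r = 0; r < rounds; r++) {
        volatile uint32_t sink;
        for(uint32_t i = 0; i < size / 4; i++)
            sink = buf[i];      /* one mov.l @r0+, r1 in the assembly */
        (void)sink;
    }
}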

(modified file: cpucache C source)

@@ -9,8 +9,9 @@
 #include <libprof.h>
 #include <stdio.h>
+#include <stdlib.h>
-#define CACHE_MAX 4096
+#define CACHE_MAX 65536
 #define SAMPLES 129
 extern void cpucache_nop1024(int repeats);
@@ -60,7 +61,9 @@ void gintctl_perf_cpucache(void)
 uint32_t nop4096 = test_nop4096();
 nop4096 = test_nop4096();
-uint8_t buf[CACHE_MAX];
+uint8_t *buf = malloc(CACHE_MAX);
+if(!buf) return;
 int32_t x_size[SAMPLES];
 int32_t y_time[SAMPLES];
@@ -103,7 +106,7 @@ void gintctl_perf_cpucache(void)
 .subtick_divisions = 4,
 },
 .ticks_y = {
-.multiples = 10000,
+.multiples = 125000,
 .subtick_divisions = 2,
 },
 .grid = {
@@ -122,7 +125,7 @@ void gintctl_perf_cpucache(void)
 for(int i = 0; i < SAMPLES; i++)
 {
 x_size[i] = (CACHE_MAX / (SAMPLES-1)) * i;
-y_time[i] = test_cpucache_rounds(buf, x_size[i], 16);
+y_time[i] = test_cpucache_rounds(buf, x_size[i], 8);
 if(y_time[i] < y_min || y_min == -1) y_min = y_time[i];
 if(y_time[i] > y_max || y_max == -1) y_max = y_time[i];
@@ -149,9 +152,9 @@
 plot(&plotspec);
 row_print(12, 1, "X: Size of buffer (bytes)");
-row_print(13, 1, "Y: Iphi cycles for 16 8-bit traversals");
+row_print(13, 1, "Y: Iphi cycles for 8x 32-bit traversals");
 row_print(14, 1, "Last sample suggests: %.2D Iphi/byte access",
-100 * y_time[SAMPLES-1] / x_size[SAMPLES-1] / 16);
+100 * y_time[SAMPLES-1] / x_size[SAMPLES-1] / 8);
 #endif
 dupdate();
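/* Illustrative arithmetic for the last-sample figure: if the 65536-byte buffer
took, say, 1048576 Iphi cycles for its 8 traversals, that is
1048576 / 65536 / 8 = 2 Iphi per byte accessed; the factor of 100 only scales
the value, presumably so the %.2D format can display two decimal places. */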