perf/cpucache: evidence the 32-kiB operand cache

Lephenixnoir 2021-06-21 09:39:36 +02:00
parent ebc9b5c1c2
commit 81adef2785
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
4 changed files with 276 additions and 16 deletions

src/perf/cpu.S (new file, 128 lines)

@@ -0,0 +1,128 @@
# We put all the code in ILRAM to avoid measurement variations caused by code
# being fetched from ROM. The ILRAM is ideal for this task because successive
# instruction accesses take only 1 cycle (assuming no interference, of which
# there is none here).
.section .ilram
# Test prologue for COUNT iterations (must be a multiple of 256). Note that the
# prologue has an even number of instructions, so the loop code ends up
# 4-aligned, which is extremely important.
#define PROLOGUE(COUNT) \
mov #(COUNT/256), r0 ; \
shll8 r0 ; \
ldrs 1f ; \
ldre 2f ; \
ldrc r0 ; \
nop
# Test epilogue
#define EPILOGUE() \
rts ; \
nop
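# Note (added): the ldrs/ldre/ldrc sequence programs the DSP repeat unit: RS
# and RE mark the first and last instructions of the body (labels 1: and 2:)
# and RC holds the iteration count, so the hardware replays the body COUNT
# times with no branch instruction in the loop; roughly
# for(i = 0; i < COUNT; i++) { body }.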
/* [Baseline]
In this first section, we want to establish an approximate cost of the
setup, which consists of TMU access for libprof, function calls, and the
loop setup for the DSP. */
.global _perf_cpu_empty
# Empty setup (0 iterations), as baseline
.align 4
_perf_cpu_empty:
PROLOGUE(0)
1: 2: EPILOGUE()
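# cpu.c below averages 16 runs of this empty test (TMU_baseline) and subtracts
# the result from every measurement, leaving only the cost of the loop body.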
/* [Loop control]
In this section, we want to check whether the DSP repeat system has any
added cost per-loop, and we do this by executing the same instructions with
a varying number of DSP repeat jumps.
The DSP jump has no additional cost, which makes testing much simpler by
avoiding the loop unrolling that would otherwise be needed to amortize the
cost of the jump. It also allows for tighter loops in real-world programs,
and tighter code fits better in cache. */
.global _perf_cpu_nop_2048x1
.global _perf_cpu_nop_1024x2
.global _perf_cpu_nop_512x4
.global _perf_cpu_nop_256x8
# nop loop (2048 iterations of 1 nop) -> 2 cycles /i
# Parallel execution likely cannot happen across the DSP jump for architectural
# reasons; my guess is that the next instruction isn't fetched yet.
.align 4
_perf_cpu_nop_2048x1:
PROLOGUE(2048)
1: 2: nop
EPILOGUE()
# nop loop (1024 iterations of 2 nop) -> 1 cycle /i
.align 4
_perf_cpu_nop_1024x2:
PROLOGUE(1024)
1: nop
2: nop
EPILOGUE()
# nop loop (512 iterations of 4 nop) -> 2 cycles /i
.align 4
_perf_cpu_nop_512x4:
PROLOGUE(512)
1: nop
nop
nop
2: nop
EPILOGUE()
# nop loop (256 iterations of 8 nop) -> 4 cycles/i
.align 4
_perf_cpu_nop_256x8:
PROLOGUE(256)
1: nop
nop
nop
nop
nop
nop
nop
2: nop
EPILOGUE()
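# Reading these results with the per-iteration costs above: the measured totals
# should be about 2048*2 = 4096 Iphi cycles for 2048x1, 1024*1 = 1024 for
# 1024x2, 512*2 = 1024 for 512x4, and 256*4 = 1024 for 256x8; cpu.c's
# Iphi_per_iteration() divides by the iteration count to recover the cost /i.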
/* [Parallel execution]
In this section, we reproduce simple cases of superscalar parallelism for
instructions of different types, using only instructions that have trivial
pipelines with no extra cycles. */
.global _perf_cpu_EX_EX
.global _perf_cpu_MT_MT
.global _perf_cpu_LS_LS
# EX/EX (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
.align 4
_perf_cpu_EX_EX:
PROLOGUE(1024)
1: add #0, r0
2: add #0, r1
EPILOGUE()
# MT/MT (1024 iterations of 2 parallel instructions) -> 1 cycle /i
.align 4
_perf_cpu_MT_MT:
PROLOGUE(1024)
1: mov r0, r1
2: mov r2, r3
EPILOGUE()
# LS/LS (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
.align 4
_perf_cpu_LS_LS:
PROLOGUE(1024)
1: mov.l @r15, r0
2: mov.l @r15, r1
EPILOGUE()
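# Reading these results: MT/MT finishes in about 1024 cycles because the two
# mov dual-issue, while EX/EX and LS/LS take about 2048 cycles since two
# instructions of the same non-MT group cannot issue in the same cycle.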

src/perf/cpu.c (new file, 122 lines)

@@ -0,0 +1,122 @@
#include <gint/display.h>
#include <gint/keyboard.h>
#include <gint/clock.h>
#include <gintctl/perf.h>
#include <gintctl/util.h>
#include <libprof.h>
/* Baseline */
void perf_cpu_empty(void);
/* Loop control */
void perf_cpu_nop_2048x1(void);
void perf_cpu_nop_1024x2(void);
void perf_cpu_nop_512x4(void);
void perf_cpu_nop_256x8(void);
/* Parallel execution */
void perf_cpu_EX_EX(void);
void perf_cpu_MT_MT(void);
void perf_cpu_LS_LS(void);
/* Is subtracted from result times if specified; in TMU units (prof.elapsed) */
static uint32_t baseline_ticks = 0;
/* Number of CPU cycles spent executing a function */
uint32_t Iphi_cycles(void (*function)(void))
{
prof_t perf = prof_make();
prof_enter(perf);
(*function)();
prof_leave(perf);
clock_frequency_t const *freq = clock_freq();
uint32_t TMU_cycles = perf.elapsed - baseline_ticks;
uint32_t PLL_cycles = (TMU_cycles * 4) * freq->Pphi_div;
return PLL_cycles / freq->Iphi_div;
}
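/* Worked example of the conversion above (divider values are hypothetical;
the real ones come from clock_freq()): a TMU tick is 4 Pphi cycles, so with
Pphi_div = 4 and Iphi_div = 2, 1000 ticks -> 4000 Pphi cycles -> 16000 PLL
cycles -> 8000 Iphi cycles. */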
/* Number of CPU cycles per iteration; the number of iterations must obviously
match assembler code for that test */
float Iphi_per_iteration(void (*function)(void), int count)
{
return (float)Iphi_cycles(function) / count;
}
/* Number of TMU cycles for an empty function */
uint32_t TMU_baseline(void)
{
prof_t perf = prof_make();
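/* prof.elapsed accumulates over the 16 enter/leave pairs below, so dividing
by 16 at the end yields the average cost of one empty run. */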
for(int i = 0; i < 16; i++)
{
prof_enter(perf);
perf_cpu_empty();
prof_leave(perf);
}
return perf.elapsed / 16;
}
//---
void gintctl_perf_cpu(void)
{
int key = 0;
/* Measure baseline time */
baseline_ticks = TMU_baseline();
uint32_t Iphi_cpu_nop_2048x1 = 0;
uint32_t Iphi_cpu_nop_1024x2 = 0;
uint32_t Iphi_cpu_nop_512x4 = 0;
uint32_t Iphi_cpu_nop_256x8 = 0;
uint32_t Iphi_cpu_EX_EX = 0;
uint32_t Iphi_cpu_MT_MT = 0;
uint32_t Iphi_cpu_LS_LS = 0;
while(key != KEY_EXIT)
{
dclear(C_WHITE);
#ifdef FXCG50
row_title("CPU instruction parallelism and pipelining");
row_print(1, 1, "Baseline ticks: %d",
baseline_ticks);
row_print(3, 1, "Iphi cycles for 2048x1 nop: %d",
Iphi_cpu_nop_2048x1);
row_print(4, 1, "Iphi cycles for 1024x2 nop: %d",
Iphi_cpu_nop_1024x2);
row_print(5, 1, "Iphi cycles for 512x4 nop: %d",
Iphi_cpu_nop_512x4);
row_print(6, 1, "Iphi cycles for 256x8 nop: %d",
Iphi_cpu_nop_256x8);
row_print(8, 1, "Iphi cycles for EX/EX: %d",
Iphi_cpu_EX_EX);
row_print(9, 1, "Iphi cycles for MT/MT: %d",
Iphi_cpu_MT_MT);
row_print(10, 1, "Iphi cycles for LS/LS: %d",
Iphi_cpu_LS_LS);
fkey_button(1, "RUN");
#endif
dupdate();
key = getkey().key;
if(key == KEY_F1)
{
Iphi_cpu_nop_2048x1 = Iphi_cycles(perf_cpu_nop_2048x1);
Iphi_cpu_nop_1024x2 = Iphi_cycles(perf_cpu_nop_1024x2);
Iphi_cpu_nop_512x4 = Iphi_cycles(perf_cpu_nop_512x4);
Iphi_cpu_nop_256x8 = Iphi_cycles(perf_cpu_nop_256x8);
Iphi_cpu_EX_EX = Iphi_cycles(perf_cpu_EX_EX);
Iphi_cpu_MT_MT = Iphi_cycles(perf_cpu_MT_MT);
Iphi_cpu_LS_LS = Iphi_cycles(perf_cpu_LS_LS);
}
}
}

(modified file: cpucache assembly)

@@ -17,20 +17,27 @@ _cpucache_nop1024:
 # r4: Buffer to read from
-# r5: Buffer size (multiple of 4)
+# r5: Buffer size (multiple of 32)
 # r6: Number of rounds
 _cpucache_rounds:
-mov r4, r1
-add r5, r1
-add #-1, r1
 mov r4, r0
+mov r5, r2
+mov #-5, r3
+shld r3, r2
-1: mov.b @r0+, r2
-mov.b @r0+, r2
-mov.b @r0+, r2
-cmp/ge r1, r0
-bf 1b
-mov.b @r0+, r2
+ldrs 1f
+ldre 2f
+ldrc r2
+nop
+1: mov.l @r0+, r1
+mov.l @r0+, r1
+mov.l @r0+, r1
+mov.l @r0+, r1
+mov.l @r0+, r1
+mov.l @r0+, r1
+mov.l @r0+, r1
+2: mov.l @r0+, r1
 dt r6
 bf _cpucache_rounds
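In C terms, the rewritten loop now traverses the buffer as longwords: the shld
by -5 turns the byte size into a repeat count of size/32, and each DSP-repeat
iteration issues eight mov.l reads (32 bytes). A minimal sketch of the
equivalent access pattern; the function name is illustrative, not part of the
commit:

#include <stdint.h>

/* Hedged model of the new _cpucache_rounds: size must be a multiple of 32,
   and every round reads the whole buffer as 32-bit words. */
void cpucache_rounds_model(const uint32_t *buf, uint32_t size, int rounds)
{
    for(int r = 0; r < rounds; r++) {
        volatile uint32_t sink;
        for(uint32_t i = 0; i < size / 4; i++)
            sink = buf[i];      /* one mov.l @r0+, r1 in the assembly */
        (void)sink;
    }
}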

(modified file: cpucache C source)

@@ -9,8 +9,9 @@
 #include <libprof.h>
 #include <stdio.h>
+#include <stdlib.h>
-#define CACHE_MAX 4096
+#define CACHE_MAX 65536
 #define SAMPLES 129
 extern void cpucache_nop1024(int repeats);
@@ -60,7 +61,9 @@ void gintctl_perf_cpucache(void)
 uint32_t nop4096 = test_nop4096();
 nop4096 = test_nop4096();
-uint8_t buf[CACHE_MAX];
+uint8_t *buf = malloc(CACHE_MAX);
+if(!buf) return;
 int32_t x_size[SAMPLES];
 int32_t y_time[SAMPLES];
@@ -103,7 +106,7 @@ void gintctl_perf_cpucache(void)
 .subtick_divisions = 4,
 },
 .ticks_y = {
-.multiples = 10000,
+.multiples = 125000,
 .subtick_divisions = 2,
 },
 .grid = {
@@ -122,7 +125,7 @@ void gintctl_perf_cpucache(void)
 for(int i = 0; i < SAMPLES; i++)
 {
 x_size[i] = (CACHE_MAX / (SAMPLES-1)) * i;
-y_time[i] = test_cpucache_rounds(buf, x_size[i], 16);
+y_time[i] = test_cpucache_rounds(buf, x_size[i], 8);
 if(y_time[i] < y_min || y_min == -1) y_min = y_time[i];
 if(y_time[i] > y_max || y_max == -1) y_max = y_time[i];
@@ -149,9 +152,9 @@
 plot(&plotspec);
 row_print(12, 1, "X: Size of buffer (bytes)");
-row_print(13, 1, "Y: Iphi cycles for 16 8-bit traversals");
+row_print(13, 1, "Y: Iphi cycles for 8x 32-bit traversals");
 row_print(14, 1, "Last sample suggests: %.2D Iphi/byte access",
-100 * y_time[SAMPLES-1] / x_size[SAMPLES-1] / 16);
+100 * y_time[SAMPLES-1] / x_size[SAMPLES-1] / 8);
 #endif
 dupdate();
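/* Illustrative arithmetic for the last-sample figure: if the 65536-byte buffer
took, say, 1048576 Iphi cycles for its 8 traversals, that is
1048576 / 65536 / 8 = 2 Iphi per byte accessed; the factor of 100 only scales
the value, presumably so the %.2D format can display two decimal places. */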