perf/cpucache: evidence the 32-kiB operand cache
This commit is contained in:
parent
ebc9b5c1c2
commit
81adef2785
|
@ -0,0 +1,128 @@
|
|||
# We put all the code in ILRAM to avoid measurement variations caused by code
|
||||
# being fetched from ROM. The ILRAM is ideal for this task because successive
|
||||
# instruction accesses take only 1 cycle (assuming no interference, which there
|
||||
# is none).
|
||||
.section .ilram
|
||||
|
||||
# Test prologue for COUNT iterations (must be a multiple of 256). Note that the
|
||||
# prologue has an even number of instructions, which results in the loop code
|
||||
# being 4-aligned, which is of extreme importance.
|
||||
/* Expands to six instructions: r0 = (COUNT/256) << 8, then ldrs/ldre/ldrc on
   the local labels 1 and 2 (forward references), then a nop. The code placed
   right after the macro must therefore define labels 1 and 2.
   NOTE(review): ldrs/ldre/ldrc presumably load the DSP repeat start address,
   end address and count -- confirm against the SH-DSP manual. */
#define PROLOGUE(COUNT) \
|
||||
mov #(COUNT/256), r0 ; \
|
||||
shll8 r0 ; \
|
||||
ldrs 1f ; \
|
||||
ldre 2f ; \
|
||||
ldrc r0 ; \
|
||||
nop
|
||||
|
||||
# Test epilogue
|
||||
/* Expands to rts followed by a nop.
   NOTE(review): the nop presumably fills the delay slot of rts -- confirm. */
#define EPILOGUE() \
|
||||
rts ; \
|
||||
nop
|
||||
|
||||
|
||||
/* [Baseline]
|
||||
|
||||
In this first section, we want to establish an approximate cost of the
|
||||
setup, which consists of TMU access for libprof, function calls, and the
|
||||
loop setup for the DSP. */
|
||||
|
||||
.global _perf_cpu_empty
|
||||
|
||||
# Empty setup (0 iterations), as baseline
/* The labels 1 and 2 required by PROLOGUE are both placed on the epilogue;
   there is no loop body between the prologue and the return. */
|
||||
.align 4
|
||||
_perf_cpu_empty:
|
||||
PROLOGUE(0)
|
||||
1: 2: EPILOGUE()
|
||||
|
||||
|
||||
/* [Loop control]
|
||||
|
||||
In this section, we want to check whether the DSP repeat system has any
|
||||
added cost per-loop, and we do this by executing the same instructions with
|
||||
a varying number of DSP repeat jumps.
|
||||
|
||||
The DSP jump has no additional cost, which makes testing much simpler by
|
||||
avoiding loop unrolls (that would otherwise be needed to amortize the cost
|
||||
of the jump). In addition, this allows for tighter loops in real-world
|
||||
programs, and tighter code fits better in cache. */
|
||||
|
||||
.global _perf_cpu_nop_2048x1
|
||||
.global _perf_cpu_nop_1024x2
|
||||
.global _perf_cpu_nop_512x4
|
||||
.global _perf_cpu_nop_256x8
|
||||
|
||||
# nop loop (2048 iterations of 1 nop) -> 2 cycles /i
|
||||
# Parallel execution likely cannot read through the DSP jump for architectural
|
||||
# reasons; my guess is that the next instruction isn't fetched yet.
/* Labels 1 and 2 coincide here: the body delimited for PROLOGUE is a single
   nop. */
|
||||
.align 4
|
||||
_perf_cpu_nop_2048x1:
|
||||
PROLOGUE(2048)
|
||||
1: 2: nop
|
||||
EPILOGUE()
|
||||
|
||||
# nop loop (1024 iterations of 2 nop) -> 1 cycle /i
/* Label 1 is on the first nop and label 2 on the last one; these are the two
   labels referenced by ldrs/ldre in PROLOGUE. */
|
||||
.align 4
|
||||
_perf_cpu_nop_1024x2:
|
||||
PROLOGUE(1024)
|
||||
1: nop
|
||||
2: nop
|
||||
EPILOGUE()
|
||||
|
||||
# nop loop (512 iterations of 4 nop) -> 2 cycles /i
/* Same shape as the other nop tests: label 1 on the first nop of the body,
   label 2 on the last, with a body of 4 nops. */
|
||||
.align 4
|
||||
_perf_cpu_nop_512x4:
|
||||
PROLOGUE(512)
|
||||
1: nop
|
||||
nop
|
||||
nop
|
||||
2: nop
|
||||
EPILOGUE()
|
||||
|
||||
# nop loop (256 iterations of 8 nop) -> 4 cycles /i
/* Same shape as the other nop tests: label 1 on the first nop of the body,
   label 2 on the last, with a body of 8 nops. Every nop test uses
   iterations x body size = 2048 nops in total. */
|
||||
.align 4
|
||||
_perf_cpu_nop_256x8:
|
||||
PROLOGUE(256)
|
||||
1: nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
2: nop
|
||||
EPILOGUE()
|
||||
|
||||
/* [Parallel execution]
|
||||
|
||||
In this section, we reproduce simple cases of superscalar parallelism for
|
||||
instructions of different types, using only instructions that have trivial
|
||||
pipelines with no extra cycles. */
|
||||
|
||||
.global _perf_cpu_EX_EX
|
||||
.global _perf_cpu_MT_MT
|
||||
.global _perf_cpu_LS_LS
|
||||
|
||||
# EX/EX (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
/* The body adds the immediate 0 to r0 and then to r1, so neither register
   value changes; the two instructions target different registers. */
|
||||
.align 4
|
||||
_perf_cpu_EX_EX:
|
||||
PROLOGUE(1024)
|
||||
1: add #0, r0
|
||||
2: add #0, r1
|
||||
EPILOGUE()
|
||||
|
||||
# MT/MT (1024 iterations of 2 parallel instructions) -> 1 cycle /i
/* The body is two register-to-register moves on disjoint register pairs
   (r0 -> r1 and r2 -> r3); r1 and r3 are overwritten. */
|
||||
.align 4
|
||||
_perf_cpu_MT_MT:
|
||||
PROLOGUE(1024)
|
||||
1: mov r0, r1
|
||||
2: mov r2, r3
|
||||
EPILOGUE()
|
||||
|
||||
# LS/LS (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
/* The body is two 32-bit loads from the address held in r15, into r0 and r1;
   r15 itself is not modified.
   NOTE(review): r15 is presumably the stack pointer, which would make the
   address always readable -- confirm. */
|
||||
.align 4
|
||||
_perf_cpu_LS_LS:
|
||||
PROLOGUE(1024)
|
||||
1: mov.l @r15, r0
|
||||
2: mov.l @r15, r1
|
||||
EPILOGUE()
|
|
@ -0,0 +1,122 @@
|
|||
#include <gint/display.h>
|
||||
#include <gint/keyboard.h>
|
||||
#include <gint/clock.h>
|
||||
|
||||
#include <gintctl/perf.h>
|
||||
#include <gintctl/util.h>
|
||||
|
||||
#include <libprof.h>
|
||||
|
||||
/* Baseline */
|
||||
void perf_cpu_empty(void);
|
||||
/* Loop control */
|
||||
void perf_cpu_nop_2048x1(void);
|
||||
void perf_cpu_nop_1024x2(void);
|
||||
void perf_cpu_nop_512x4(void);
|
||||
void perf_cpu_nop_256x8(void);
|
||||
/* Parallel execution */
|
||||
void perf_cpu_EX_EX(void);
|
||||
void perf_cpu_MT_MT(void);
|
||||
void perf_cpu_LS_LS(void);
|
||||
|
||||
/* Is subtracted from result times if specified; in TMU units (prof.elapsed) */
|
||||
static uint32_t baseline_ticks = 0;
|
||||
|
||||
/* Number of CPU cycles spent executing a function */
|
||||
uint32_t Iphi_cycles(void (*function)(void))
|
||||
{
|
||||
prof_t perf = prof_make();
|
||||
|
||||
prof_enter(perf);
|
||||
(*function)();
|
||||
prof_leave(perf);
|
||||
|
||||
clock_frequency_t const *freq = clock_freq();
|
||||
uint32_t TMU_cycles = perf.elapsed - baseline_ticks;
|
||||
uint32_t PLL_cycles = (TMU_cycles * 4) * freq->Pphi_div;
|
||||
return PLL_cycles / freq->Iphi_div;
|
||||
}
|
||||
|
||||
/* Average number of CPU cycles per loop iteration of a test routine.

   @function  Routine to measure (see Iphi_cycles())
   @count     Iteration count hard-coded in the assembler code of that test;
              the caller is responsible for passing the matching value
   Returns the total cycle count divided by `count`, as a float. */
float Iphi_per_iteration(void (*function)(void), int count)
{
	float const total = (float)Iphi_cycles(function);
	return total / count;
}
|
||||
|
||||
/* Number of TMU cycles for an empty function */
|
||||
uint32_t TMU_baseline(void)
|
||||
{
|
||||
prof_t perf = prof_make();
|
||||
|
||||
for(int i = 0; i < 16; i++)
|
||||
{
|
||||
prof_enter(perf);
|
||||
perf_cpu_empty();
|
||||
prof_leave(perf);
|
||||
}
|
||||
|
||||
return perf.elapsed / 16;
|
||||
}
|
||||
|
||||
//---
|
||||
|
||||
void gintctl_perf_cpu(void)
|
||||
{
|
||||
int key = 0;
|
||||
|
||||
/* Measure baseline time */
|
||||
baseline_ticks = TMU_baseline();
|
||||
|
||||
uint32_t Iphi_cpu_nop_2048x1 = 0;
|
||||
uint32_t Iphi_cpu_nop_1024x2 = 0;
|
||||
uint32_t Iphi_cpu_nop_512x4 = 0;
|
||||
uint32_t Iphi_cpu_nop_256x8 = 0;
|
||||
|
||||
uint32_t Iphi_cpu_EX_EX = 0;
|
||||
uint32_t Iphi_cpu_MT_MT = 0;
|
||||
uint32_t Iphi_cpu_LS_LS = 0;
|
||||
|
||||
while(key != KEY_EXIT)
|
||||
{
|
||||
dclear(C_WHITE);
|
||||
|
||||
#ifdef FXCG50
|
||||
row_title("CPU instruction parallelism and pipelining");
|
||||
|
||||
row_print(1, 1, "Baseline ticks: %d",
|
||||
baseline_ticks);
|
||||
row_print(3, 1, "Iphi cycles for 2048x1 nop: %d",
|
||||
Iphi_cpu_nop_2048x1);
|
||||
row_print(4, 1, "Iphi cycles for 1024x2 nop: %d",
|
||||
Iphi_cpu_nop_1024x2);
|
||||
row_print(5, 1, "Iphi cycles for 512x4 nop: %d",
|
||||
Iphi_cpu_nop_512x4);
|
||||
row_print(6, 1, "Iphi cycles for 256x8 nop: %d",
|
||||
Iphi_cpu_nop_256x8);
|
||||
row_print(8, 1, "Iphi cycles for EX/EX: %d",
|
||||
Iphi_cpu_EX_EX);
|
||||
row_print(9, 1, "Iphi cycles for MT/MT: %d",
|
||||
Iphi_cpu_MT_MT);
|
||||
row_print(10, 1, "Iphi cycles for LS/LS: %d",
|
||||
Iphi_cpu_LS_LS);
|
||||
|
||||
fkey_button(1, "RUN");
|
||||
#endif
|
||||
|
||||
dupdate();
|
||||
key = getkey().key;
|
||||
|
||||
if(key == KEY_F1)
|
||||
{
|
||||
Iphi_cpu_nop_2048x1 = Iphi_cycles(perf_cpu_nop_2048x1);
|
||||
Iphi_cpu_nop_1024x2 = Iphi_cycles(perf_cpu_nop_1024x2);
|
||||
Iphi_cpu_nop_512x4 = Iphi_cycles(perf_cpu_nop_512x4);
|
||||
Iphi_cpu_nop_256x8 = Iphi_cycles(perf_cpu_nop_256x8);
|
||||
|
||||
Iphi_cpu_EX_EX = Iphi_cycles(perf_cpu_EX_EX);
|
||||
Iphi_cpu_MT_MT = Iphi_cycles(perf_cpu_MT_MT);
|
||||
Iphi_cpu_LS_LS = Iphi_cycles(perf_cpu_LS_LS);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -17,20 +17,27 @@ _cpucache_nop1024:
|
|||
|
||||
|
||||
# r4: Buffer to read from
|
||||
# r5: Buffer size (multiple of 4)
|
||||
# r5: Buffer size (multiple of 32)
|
||||
# r6: Number of rounds
|
||||
_cpucache_rounds:
|
||||
mov r4, r1
|
||||
add r5, r1
|
||||
add #-1, r1
|
||||
mov r4, r0
|
||||
mov r5, r2
|
||||
mov #-5, r3
|
||||
shld r3, r2
|
||||
|
||||
1: mov.b @r0+, r2
|
||||
mov.b @r0+, r2
|
||||
mov.b @r0+, r2
|
||||
cmp/ge r1, r0
|
||||
bf 1b
|
||||
mov.b @r0+, r2
|
||||
ldrs 1f
|
||||
ldre 2f
|
||||
ldrc r2
|
||||
nop
|
||||
|
||||
1: mov.l @r0+, r1
|
||||
mov.l @r0+, r1
|
||||
mov.l @r0+, r1
|
||||
mov.l @r0+, r1
|
||||
mov.l @r0+, r1
|
||||
mov.l @r0+, r1
|
||||
mov.l @r0+, r1
|
||||
2: mov.l @r0+, r1
|
||||
|
||||
dt r6
|
||||
bf _cpucache_rounds
|
||||
|
|
|
@ -9,8 +9,9 @@
|
|||
#include <libprof.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define CACHE_MAX 4096
|
||||
#define CACHE_MAX 65536
|
||||
#define SAMPLES 129
|
||||
|
||||
extern void cpucache_nop1024(int repeats);
|
||||
|
@ -60,7 +61,9 @@ void gintctl_perf_cpucache(void)
|
|||
uint32_t nop4096 = test_nop4096();
|
||||
nop4096 = test_nop4096();
|
||||
|
||||
uint8_t buf[CACHE_MAX];
|
||||
uint8_t *buf = malloc(CACHE_MAX);
|
||||
if(!buf) return;
|
||||
|
||||
int32_t x_size[SAMPLES];
|
||||
int32_t y_time[SAMPLES];
|
||||
|
||||
|
@ -103,7 +106,7 @@ void gintctl_perf_cpucache(void)
|
|||
.subtick_divisions = 4,
|
||||
},
|
||||
.ticks_y = {
|
||||
.multiples = 10000,
|
||||
.multiples = 125000,
|
||||
.subtick_divisions = 2,
|
||||
},
|
||||
.grid = {
|
||||
|
@ -122,7 +125,7 @@ void gintctl_perf_cpucache(void)
|
|||
for(int i = 0; i < SAMPLES; i++)
|
||||
{
|
||||
x_size[i] = (CACHE_MAX / (SAMPLES-1)) * i;
|
||||
y_time[i] = test_cpucache_rounds(buf, x_size[i], 16);
|
||||
y_time[i] = test_cpucache_rounds(buf, x_size[i], 8);
|
||||
|
||||
if(y_time[i] < y_min || y_min == -1) y_min = y_time[i];
|
||||
if(y_time[i] > y_max || y_max == -1) y_max = y_time[i];
|
||||
|
@ -149,9 +152,9 @@ void gintctl_perf_cpucache(void)
|
|||
plot(&plotspec);
|
||||
|
||||
row_print(12, 1, "X: Size of buffer (bytes)");
|
||||
row_print(13, 1, "Y: Iphi cycles for 16 8-bit traversals");
|
||||
row_print(13, 1, "Y: Iphi cycles for 8x 32-bit traversals");
|
||||
row_print(14, 1, "Last samples suggests: %.2D Iphi/byte access",
|
||||
100 * y_time[SAMPLES-1] / x_size[SAMPLES-1] / 16);
|
||||
100 * y_time[SAMPLES-1] / x_size[SAMPLES-1] / 8);
|
||||
#endif
|
||||
|
||||
dupdate();
|
||||
|
|
Loading…
Reference in New Issue