perf/cpu: add CPU pipeline/superscalar parallelism observations

This commit is contained in:
Lephenixnoir 2021-08-05 16:12:36 +02:00
parent 4e748e3c55
commit 12e78d2897
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
5 changed files with 508 additions and 89 deletions

View File

@ -44,6 +44,8 @@ set(SOURCES
src/mem/mem.c
src/perf/cpucache.c
src/perf/cpucache.S
src/perf/cpu.c
src/perf/cpu.S
src/perf/interrupt.c
src/perf/libprof.c
src/perf/memory.c

View File

@ -11,6 +11,9 @@ void gintctl_perf_libprof(void);
/* gintctl_perf_cpucache(): CPU speed and cache size */
void gintctl_perf_cpucache(void);
/* gintctl_perf_cpu(): CPU instruction parallelism and pipelining */
void gintctl_perf_cpu(void);
/* gintctl_perf_interrupts(): Interrupt handling */
void gintctl_perf_interrupts(void);

View File

@ -65,6 +65,8 @@ struct menu menu_perf = {
{ "libprof basics", gintctl_perf_libprof, 0 },
{ "CPU and cache", gintctl_perf_cpucache, 0 },
{ _("CPU parallelism", "Superscalar and pipeline parallelism"),
gintctl_perf_cpu, 0 },
{ "Interrupt stress", gintctl_perf_interrupts, 0 },
{ "Memory access speed", gintctl_perf_memory, 0 },
{ "Rendering functions", gintctl_perf_render, 0 },

View File

@ -1,12 +1,12 @@
# We put all the code in ILRAM to avoid measurement variations caused by code
# being fetched from ROM. The ILRAM is ideal for this task because successive
# instruction accesses take only 1 cycle (assuming no interference, which there
# is none).
/* We put all the code in ILRAM to avoid measurement variations caused by code
being fetched from ROM. The ILRAM is ideal for this task because successive
instruction accesses take only 1 cycle (assuming no interference, which
there is none). */
.section .ilram
# Test prologue for COUNT iterations (must be a multiple of 256). Note that the
# prologue has an even number of instructions, which results in the loop code
# being 4-aligned, which is of extreme importance.
/* Test prologue for COUNT iterations (must be a multiple of 256). Note that
the prologue has an even number of instructions, which results in the loop
code being 4-aligned, which is of extreme importance. */
#define PROLOGUE(COUNT) \
mov #(COUNT/256), r0 ; \
shll8 r0 ; \
@ -15,7 +15,7 @@
ldrc r0 ; \
nop
# Test epilogue
/* Test epilogue */
#define EPILOGUE() \
rts ; \
nop
@ -23,13 +23,16 @@
/* [Baseline]
In this first section, we want to establish an approximate cost of the
setup, which consists of TMU access for libprof, function calls, and the
loop setup for the DSP. */
In this first section, we find an approximate cost of the setup, which
consists of TMU access for libprof, function calls, and the loop setup for
the DSP. This does not include any loop overhead (which is measured later).
This will often take 3~5 Pϕ/4 ticks, which is not a very precise measure,
but helps eliminate noise around tests and bring cycle counts very
close to multiples of the number of iterations. */
.global _perf_cpu_empty
# Empty setup (0 iterations), as baseline
.align 4
_perf_cpu_empty:
PROLOGUE(0)
@ -38,30 +41,33 @@ _perf_cpu_empty:
/* [Loop control]
In this section, we want to check whether the DSP repeat system has any
added cost per-loop, and we do this by executing the same instructions with
a varying number of DSP repeat jumps.
Here we establish that the DSP repeat system has no added cost per-loop in
favorable situations. That is, the loop is as efficient as if it were
unrolled. This is checked by executing the same sequence of instructions
with a varying number of DSP jumps between them.
The DSP jump has no additional cost, which makes testing much simpler by
avoiding loop unrolls (that would otherwise be needed to amortize the cost
of the jump). In addition, this allows for tighter loops in real-world
programs, and tighter code fits better in cache. */
The fact that the DSP jump has no additional cost is very beneficial for
performance measurements, since it means that variations in the size and
iteration count of tests have no influence on the results. (Such influence
would otherwise need to be amortized by unrolling.)
The only observed difference is with the first test where the single
instruction in the loop cannot be executed in parallel with itself in the
next iteration. My guess is that the instruction from the next iteration is
not fetched yet from the perspective of CPU logic. */
.global _perf_cpu_nop_2048x1
.global _perf_cpu_nop_1024x2
.global _perf_cpu_nop_512x4
.global _perf_cpu_nop_256x8
# nop loop (2048 iterations of 1 nop) -> 2 cycles /i
# Parallel execution likely cannot read through the DSP jump for architectural
# reasons; my guess is that the next instruction isn't fetched yet.
/* nop loop (2048 iterations of 1 nop) -> 2 cycles /i
   With a single instruction per iteration, the nop cannot pair with the
   next iteration's nop across the DSP repeat jump (presumably because the
   next instruction is not fetched yet), so each iteration costs 2 cycles
   instead of 1. */
.align 4
_perf_cpu_nop_2048x1:
PROLOGUE(2048)
1: 2: nop /* repeat block start (1:) and end (2:) are the same instruction */
EPILOGUE()
# nop loop (1024 iterations of 2 nop) -> 1 cycle /i
.global _perf_cpu_nop_1024x2
/* nop loop (1024 iterations of 2 nop) -> 1 cycle /i */
.align 4
_perf_cpu_nop_1024x2:
PROLOGUE(1024)
@ -69,7 +75,9 @@ _perf_cpu_nop_1024x2:
2: nop
EPILOGUE()
# nop loop (512 iterations of 4 nop) -> 2 cycles /i
.global _perf_cpu_nop_512x4
/* nop loop (512 iterations of 4 nop) -> 2 cycles /i */
.align 4
_perf_cpu_nop_512x4:
PROLOGUE(512)
@ -79,7 +87,9 @@ _perf_cpu_nop_512x4:
2: nop
EPILOGUE()
# nop loop (256 iterations of 8 nop) -> 4 cycles/i
.global _perf_cpu_nop_256x8
/* nop loop (256 iterations of 8 nop) -> 4 cycles /i */
.align 4
_perf_cpu_nop_256x8:
PROLOGUE(256)
@ -100,10 +110,8 @@ _perf_cpu_nop_256x8:
pipelines with no extra cycles. */
.global _perf_cpu_EX_EX
.global _perf_cpu_MT_MT
.global _perf_cpu_LS_LS
# EX/EX (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
/* EX/EX -> 2 cycles /i */
.align 4
_perf_cpu_EX_EX:
PROLOGUE(1024)
@ -111,7 +119,9 @@ _perf_cpu_EX_EX:
2: add #0, r1
EPILOGUE()
# MT/MT (1024 iterations of 2 parallel instructions) -> 1 cycle /i
.global _perf_cpu_MT_MT
/* MT/MT -> 1 cycle /i */
.align 4
_perf_cpu_MT_MT:
PROLOGUE(1024)
@ -119,10 +129,337 @@ _perf_cpu_MT_MT:
2: mov r2, r3
EPILOGUE()
# LS/LS (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
.global _perf_cpu_LS_LS
/* LS/LS -> 2 cycles /i
   Both instructions are loads (LS group), so they cannot issue in parallel
   on the two pipelines; each takes its own cycle. */
.align 4
_perf_cpu_LS_LS:
PROLOGUE(1024)
1: mov.l @r15, r0 /* LS */
2: mov.l @r15, r1 /* LS: same group as above, no pairing */
EPILOGUE()
/* [Aligned parallelism]
Here, we show that instruction pairs that are not aligned on 4-byte
boundaries can nonetheless be parallelized. Having an instruction be
executed alone because of a lack of parallel-executability with the next one
does not prevent the next one from forming a parallel pair of its own with
its successor. */
.global _perf_cpu_align_4
/* 2 pairs of parallel instructions -> 2 cycles /i
   Both add/mov.l pairs fall on 4-byte boundaries. */
.align 4
_perf_cpu_align_4:
PROLOGUE(1024)
1: add #0, r0 /* EX */
mov.l @r15, r1 /* LS: pairs with the add above */
add #0, r0 /* EX */
2: mov.l @r15, r1 /* LS: pairs with the add above */
EPILOGUE()
.global _perf_cpu_align_2
/* The add/mov.l pair in the middle is parallelized -> 3 cycles /i
   add/add cannot pair (both EX) and mov.l/mov.l cannot pair (both LS), but
   the 2-aligned add/mov.l in the middle still pairs: 1+1+1 = 3 cycles. */
.align 4
_perf_cpu_align_2:
PROLOGUE(1024)
1: add #0, r0 /* EX, issues alone */
add #0, r1 /* EX, pairs with the following load */
mov.l @r15, r0 /* LS (2-aligned pair with the add above) */
2: mov.l @r15, r1 /* LS, issues alone */
EPILOGUE()
/* [Complex pipelines]
Here we measure the behavior of multi-cycle instructions that have complex
pipelines. These tests establish that while mac.w occupies one pipeline for
2 cycles, a series of nop can continue to run on the second pipeline.
Even though mac.w has 2 issue cycles and 4 execution cycles, in a sequence
of mac.w each instruction will actually take 3 cycles. I believe this is
because the WB/M2 stage of the second mac.w has a data dependency on the
MS stage of the previous mac.w instruction, which causes a 1-cycle stall.
This assumes that there is no forwarding at the output of the multiplier. */
.global _perf_cpu_pipeline_1
/* nop executes in parallel with first pipeline of mac.w -> 3 cycles /i */
.align 4
_perf_cpu_pipeline_1:
PROLOGUE(1024)
mov r15, r0 /* mac.w operand pointers; reads whatever the stack holds */
mov r15, r1
1: mac.w @r0+, @r1+
2: nop /* runs on the second pipeline while mac.w occupies the first */
EPILOGUE()
.global _perf_cpu_pipeline_2
/* Without parallel execution, still 3 cycles per mac.w -> 6 cycles /i */
.align 4
_perf_cpu_pipeline_2:
PROLOGUE(1024)
mov r15, r0
mov r15, r1
1: mac.w @r0+, @r1+
2: mac.w @r0+, @r1+ /* back-to-back mac.w: see stall analysis above */
EPILOGUE()
.global _perf_cpu_pipeline_3
/* mac.w/(nop;nop;nop) then nop/nop -> 4 cycles /i */
.align 4
_perf_cpu_pipeline_3:
PROLOGUE(1024)
mov r15, r0
mov r15, r1
1: mac.w @r0+, @r1+
nop /* these three nops overlap the mac.w's occupancy */
nop
nop
nop /* remaining nops pair among themselves */
2: nop
EPILOGUE()
/* [RAW dependencies]
In this section we establish the delay caused by RAW dependencies in
arithmetic and memory access instructions. */
.global _perf_cpu_raw_EX_EX
/* EX -> EX: the second add reads the r0 just written by the first */
.align 4
_perf_cpu_raw_EX_EX:
PROLOGUE(1024)
1: add #1, r0
2: add #1, r0
EPILOGUE()
.global _perf_cpu_raw_LS_LS
/* LS -> LS: the store writes back the r0 just loaded */
.align 4
_perf_cpu_raw_LS_LS:
PROLOGUE(1024)
mov.l .buffer, r4 /* r4 = scratch buffer */
nop /* keeps the setup even so the loop stays 4-aligned */
1: mov.l @r4, r0
2: mov.l r0, @r4
EPILOGUE()
.global _perf_cpu_raw_EX_LS
/* EX -> LS: the stored value r0 is produced by the add */
.align 4
_perf_cpu_raw_EX_LS:
PROLOGUE(1024)
mov.l .buffer, r4
mov #0, r0
1: add #1, r0
2: mov.l r0, @r4
EPILOGUE()
.global _perf_cpu_raw_LS_EX
/* LS -> EX (load-use): the add consumes the freshly loaded r0 */
.align 4
_perf_cpu_raw_LS_EX:
PROLOGUE(1024)
mov.l .buffer, r4
nop
1: mov.l @r4, r0
2: add #1, r0
EPILOGUE()
.global _perf_cpu_noraw_LS_LS
/* Control for raw_LS_LS: same instruction mix, no register dependency */
.align 4
_perf_cpu_noraw_LS_LS:
PROLOGUE(1024)
mov.l .buffer, r4
nop
1: mov.l @r4, r0
2: mov.l r1, @r4 /* stores r1, independent of the load into r0 */
EPILOGUE()
.global _perf_cpu_noraw_LS_EX
/* Control for raw_LS_EX: same instruction mix, no register dependency */
.align 4
_perf_cpu_noraw_LS_EX:
PROLOGUE(1024)
mov.l .buffer, r4
nop
1: mov.l @r4, r0
2: add #1, r1 /* operates on r1, independent of the load into r0 */
EPILOGUE()
.global _perf_cpu_raw_EX_LS_addr
/* EX -> LS on the *address*: the store's address register r4 comes from
   the add (as opposed to the stored value, tested in raw_EX_LS) */
.align 4
_perf_cpu_raw_EX_LS_addr:
PROLOGUE(1024)
mov.l .buffer, r4
nop
1: add #0, r4
2: mov.l r0, @r4
EPILOGUE()
.global _perf_cpu_raw_DSPLS_DSPLS
/* DSP-LS -> DSP-LS: dependency through the DSP register x0 */
.align 4
_perf_cpu_raw_DSPLS_DSPLS:
PROLOGUE(512)
mov.l .buffer, r4 /* read and write the same buffer */
mov r4, r5
1: movs.w @r4, x0
2: movs.w x0, @r5
EPILOGUE()
/* [Iteration weaving]
In this section we analyze how iterations can be woven and opened to improve
performance by reducing RAW dependencies. */
.global _perf_cpu_darken_1
/* Naive darken loop: *p = (*p & r2) >> 1 on each longword, in place.
   Assumes r2 holds the darken mask (its value does not affect timing).
   The load/and/shlr/store chain is one serial RAW dependency on r1. */
.align 4
_perf_cpu_darken_1:
PROLOGUE(512)
mov.l .buffer, r4 /* src pointer */
mov r4, r5 /* dst pointer, kept one word behind and bumped per iteration */
add #-4, r5
nop
1: mov.l @r4+, r1
and r2, r1 /* depends on the load just above */
add #4, r5 /* r5 now points at the word just loaded */
shlr r1
2: mov.l r1, @r5
EPILOGUE()
.global _perf_cpu_darken_2
/* Same as darken_1 with the pointer bump hoisted between the load and the
   dependent and, giving the load time to complete */
.align 4
_perf_cpu_darken_2:
PROLOGUE(512)
mov.l .buffer, r4
mov r4, r5
add #-4, r5
nop
1: mov.l @r4+, r1
add #4, r5 /* independent work separates the load from the and */
and r2, r1
shlr r1
2: mov.l r1, @r5
EPILOGUE()
.global _perf_cpu_darken_3
/* Interwoven version: two longwords per iteration, with the two dependency
   chains (r1 and r3) interleaved to fill each other's stall slots */
.align 4
_perf_cpu_darken_3:
PROLOGUE(256)
mov.l .buffer, r4
mov r4, r5
add #-8, r5 /* dst trails by two words now */
nop
1: mov.l @r4+, r1
add #8, r5
mov.l @r4+, r3 /* second chain's load overlaps the first chain */
and r2, r1
shlr r1
mov.l r1, @r5
and r2, r3
shlr r3
2: mov.l r3, @(4,r5)
EPILOGUE()
.global _perf_cpu_darken_4
/* Open (software-pipelined) version: r1 is pre-loaded before the loop and
   reloaded mid-iteration, so every load has most of an iteration to
   complete before its value is consumed */
.align 4
_perf_cpu_darken_4:
PROLOGUE(256)
mov.l .buffer, r4
mov r4, r5
add #-8, r5
mov.l @r4+, r1
/* Loop starts with r1 loaded, finishes with r1 loaded */
1: mov.l @r4+, r3
add #8, r5
and r2, r1
shlr r1
mov.l r1, @r5
mov.l @r4+, r1 /* pre-load r1 for the *next* iteration */
and r2, r3
shlr r3
2: mov.l r3, @(4,r5)
EPILOGUE()
/* [Advanced dependencies]
This section measures the delay needed to use registers depending on the
type of instruction which modifies them. */
.global _perf_cpu_double_read
/* Two loads from the same address, no register dependency */
.align 4
_perf_cpu_double_read:
PROLOGUE(1024)
mov.l .buffer, r4
nop
1: mov.l @r4, r0
2: mov.l @r4, r1
EPILOGUE()
.global _perf_cpu_double_incr_read
/* Two post-increment byte loads: the second depends on the r4 increment
   performed by the first */
.align 4
_perf_cpu_double_incr_read:
PROLOGUE(1024)
mov.l .buffer, r4
nop
1: mov.b @r4+, r0
2: mov.b @r4+, r0
EPILOGUE()
/* [2D texture copy]
This section is used to investigate the performance of the 2D texture shader
of azur. */
.global _perf_cpu_tex2d
/* Longword copy through the DSP register x0; src and dst both point into
   _buffer2 */
.align 4
_perf_cpu_tex2d:
PROLOGUE(512)
mov.l .buffer2, r3 /* src pointer */
mov r3, r5 /* dst = src = _buffer2 (an earlier revision used .buffer) */
1: movs.l @r3+, x0
2: movs.l x0, @r5+
EPILOGUE()
/* Pointer literals for the buffers used by the tests above */
.align 4
.buffer:
.long _cpu_perf_xram_buffer /* XRAM buffer (defined in the C driver) */
.buffer2:
.long _buffer2
/* 2 KiB scratch buffer in RAM */
.section .data
_buffer2:
.zero 2048

View File

@ -5,19 +5,16 @@
#include <gintctl/perf.h>
#include <gintctl/util.h>
#include <gintctl/widgets/gscreen.h>
#include <gintctl/widgets/gtable.h>
#include <libprof.h>
/* Baseline */
void perf_cpu_empty(void);
/* Loop control */
void perf_cpu_nop_2048x1(void);
void perf_cpu_nop_1024x2(void);
void perf_cpu_nop_512x4(void);
void perf_cpu_nop_256x8(void);
/* Parallel execution */
void perf_cpu_EX_EX(void);
void perf_cpu_MT_MT(void);
void perf_cpu_LS_LS(void);
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
GXRAM uint32_t cpu_perf_xram_buffer[512];
/* Is subtracted from result times if specified; in TMU units (prof.elapsed) */
static uint32_t baseline_ticks = 0;
@ -39,14 +36,22 @@ uint32_t Iphi_cycles(void (*function)(void))
/* Number of CPU cycles per iteration; the number of iterations must obviously
match assembler code for that test */
/* Iphi_cycles_per_iteration(): Round a total cycle count to cycles/iteration.

   Divides (total) by (count) and returns the quotient rounded to the
   nearest integer, but only when the remainder is within 128 cycles of a
   multiple of (count); this tolerance absorbs the small constant setup
   overhead around each test. Returns -1 when the total is not close to a
   whole number of cycles per iteration, signaling a noisy measurement. */
int Iphi_cycles_per_iteration(int total, int count)
{
	div_t d = div(total, count);

	/* Remainder close to 0: round down */
	if(d.rem < 128)
		return d.quot;
	/* Remainder close to count: round up */
	if(d.rem > count - 128)
		return d.quot + 1;
	/* Not near a multiple of count: refuse to interpret */
	return -1;
}
/* Number of TMU cycles for an empty function */
uint32_t TMU_baseline(void)
{
void perf_cpu_empty(void);
prof_t perf = prof_make();
for(int i = 0; i < 16; i++)
@ -61,62 +66,132 @@ uint32_t TMU_baseline(void)
//---
struct results {
int nop_2048x1, nop_1024x2, nop_512x4, nop_256x8;
int EX_EX, MT_MT, LS_LS;
int align_4, align_2;
int pipeline_1, pipeline_2, pipeline_3;
int raw_EX_EX, raw_LS_LS, raw_EX_LS, raw_LS_EX;
int noraw_LS_LS, noraw_LS_EX;
int raw_EX_LS_addr, raw_DSPLS_DSPLS;
int darken_1, darken_2, darken_3, darken_4;
int double_read, double_incr_read;
int tex2d;
};
/* Number of Iphi cycles total, and number of iterations */
static struct results r_cycles, r_iter;
/* table_gen(): Row generator callback for the results table.
   Fills row (row) with the test name, cycles per iteration (or "-" when
   the measurement was not a near-multiple of the iteration count), the raw
   cycle total, and the iteration count. */
static void table_gen(gtable *t, int row)
{
/* Row labels; the order must match the field order of struct results,
   because rows index the structs through a flat int* view below. */
static char const *names[] = {
"Single nop", "2 nop", "4 nop", "8 nop",
"EX/EX pair", "MT/MT pair", "LS/LS pair",
"4-aligned parallel pair", "2-aligned parallel pair",
"mac.w/nop pipeline", "mac.w/mac.w pipeline",
"mac.w/nop*5 pipeline",
"RAW dep.: EX/EX", "RAW dep.: LS/LS", "RAW dep.: EX/LS",
"RAW dep.: LS/EX",
"No dep.: LS/LS", "No dep.: LS/EX",
"RAW on address: EX/LS",
"RAW dep.: DSP-LS/DSP-LS",
"32-bit VRAM darken #1", "32-bit VRAM darken #2",
"Interwoven darken", "Interwoven open darken",
"Double read", "Double increment read",
"Texture2D shader",
};
/* Treat the all-int structs as arrays: field i corresponds to row i */
int cycles = ((int *)&r_cycles)[row];
int iter = ((int *)&r_iter)[row];
int cpi = Iphi_cycles_per_iteration(cycles, iter);
char c2[16], c3[16], c4[16];
sprintf(c2, "%d", cpi);
sprintf(c3, "%d", cycles);
sprintf(c4, "%d", iter);
gtable_provide(t, names[row], (cpi == -1 ? "-" : c2), c3, c4);
}
void gintctl_perf_cpu(void)
{
memset(&r_cycles, 0, sizeof r_cycles);
memset(&r_iter, 0, sizeof r_iter);
gtable *table = gtable_create(4, table_gen, NULL, NULL);
gtable_set_rows(table, sizeof r_cycles / sizeof(int));
gtable_set_row_spacing(table, _(1,2));
gtable_set_column_titles(table, "Name", "CPI", "Cycles", "Iter.");
gtable_set_column_sizes(table, 6, 1, 2, 2);
gtable_set_font(table, _(&font_mini, dfont_default()));
jwidget_set_margin(table, 0, 2, 1, 2);
gscreen *scr = gscreen_create2("CPU parallelism", &img_opt_perf_cpu,
"CPU instruction parallelism and pipelining", "@RUN;;;;;");
gscreen_add_tabs(scr, table, table);
jscene_set_focused_widget(scr->scene, table);
int key = 0;
while(key != KEY_EXIT) {
jevent e = jscene_run(scr->scene);
/* Measure baseline time */
baseline_ticks = TMU_baseline();
if(e.type == JSCENE_PAINT) {
dclear(C_WHITE);
jscene_render(scr->scene);
dupdate();
}
uint32_t Iphi_cpu_nop_2048x1 = 0;
uint32_t Iphi_cpu_nop_1024x2 = 0;
uint32_t Iphi_cpu_nop_512x4 = 0;
uint32_t Iphi_cpu_nop_256x8 = 0;
key = 0;
if(e.type == JSCENE_KEY && e.key.type == KEYEV_DOWN)
key = e.key.key;
uint32_t Iphi_cpu_EX_EX = 0;
uint32_t Iphi_cpu_MT_MT = 0;
uint32_t Iphi_cpu_LS_LS = 0;
if(key == KEY_F1) {
baseline_ticks = TMU_baseline();
while(key != KEY_EXIT)
{
dclear(C_WHITE);
#define run(name, iter) { \
extern void perf_cpu_ ## name (void); \
r_cycles.name = Iphi_cycles(perf_cpu_ ## name); \
r_iter.name = iter; \
}
#ifdef FXCG50
row_title("CPU instruction parallelism and pipelining");
run(nop_2048x1, 2048);
run(nop_1024x2, 1024);
run(nop_512x4, 512);
run(nop_256x8, 256);
row_print(1, 1, "Baseline ticks: %d",
baseline_ticks);
row_print(3, 1, "Iphi cycles for 2048x1 nop: %d",
Iphi_cpu_nop_2048x1);
row_print(4, 1, "Iphi cycles for 1024x2 nop: %d",
Iphi_cpu_nop_1024x2);
row_print(5, 1, "Iphi cycles for 512x4 nop: %d",
Iphi_cpu_nop_512x4);
row_print(6, 1, "Iphi cycles for 256x8 nop: %d",
Iphi_cpu_nop_256x8);
row_print(8, 1, "Iphi cycles for EX/EX: %d",
Iphi_cpu_EX_EX);
row_print(9, 1, "Iphi cycles for MT/MT: %d",
Iphi_cpu_MT_MT);
row_print(10, 1, "Iphi cycles for LS/LS: %d",
Iphi_cpu_LS_LS);
run(EX_EX, 1024);
run(MT_MT, 1024);
run(LS_LS, 1024);
fkey_button(1, "RUN");
#endif
run(align_4, 1024);
run(align_2, 1024);
dupdate();
key = getkey().key;
run(pipeline_1, 1024);
run(pipeline_2, 1024);
run(pipeline_3, 1024);
if(key == KEY_F1)
{
Iphi_cpu_nop_2048x1 = Iphi_cycles(perf_cpu_nop_2048x1);
Iphi_cpu_nop_1024x2 = Iphi_cycles(perf_cpu_nop_1024x2);
Iphi_cpu_nop_512x4 = Iphi_cycles(perf_cpu_nop_512x4);
Iphi_cpu_nop_256x8 = Iphi_cycles(perf_cpu_nop_256x8);
run(raw_EX_EX, 1024);
run(raw_LS_LS, 1024);
run(raw_EX_LS, 1024);
run(raw_LS_EX, 1024);
run(noraw_LS_LS, 1024);
run(noraw_LS_EX, 1024);
run(raw_EX_LS_addr, 1024);
run(raw_DSPLS_DSPLS, 512);
Iphi_cpu_EX_EX = Iphi_cycles(perf_cpu_EX_EX);
Iphi_cpu_MT_MT = Iphi_cycles(perf_cpu_MT_MT);
Iphi_cpu_LS_LS = Iphi_cycles(perf_cpu_LS_LS);
run(darken_1, 512);
run(darken_2, 512);
run(darken_3, 256);
run(darken_4, 256);
run(double_read, 1024);
run(double_incr_read, 1024);
run(tex2d, 512);
table->widget.update = 1;
}
}
gscreen_destroy(scr);
}