From 12e78d2897252aaf1d3ccfaba16f79f43b9cfa7d Mon Sep 17 00:00:00 2001 From: Lephenixnoir Date: Thu, 5 Aug 2021 16:12:36 +0200 Subject: [PATCH] perf/cpu: add CPU pipeline/superscalar parallelism observations --- CMakeLists.txt | 2 + include/gintctl/perf.h | 3 + src/gintctl.c | 2 + src/perf/cpu.S | 403 +++++++++++++++++++++++++++++++++++++---- src/perf/cpu.c | 187 +++++++++++++------ 5 files changed, 508 insertions(+), 89 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 524d8d9..8fc8fbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,8 @@ set(SOURCES src/mem/mem.c src/perf/cpucache.c src/perf/cpucache.S + src/perf/cpu.c + src/perf/cpu.S src/perf/interrupt.c src/perf/libprof.c src/perf/memory.c diff --git a/include/gintctl/perf.h b/include/gintctl/perf.h index d475730..1903044 100644 --- a/include/gintctl/perf.h +++ b/include/gintctl/perf.h @@ -11,6 +11,9 @@ void gintctl_perf_libprof(void); /* gintctl_perf_cpucache(): CPU speed and cache size */ void gintctl_perf_cpucache(void); +/* gintctl_perf_cpu(): CPU instruction parallelism and pipelining */ +void gintctl_perf_cpu(void); + /* gintctl_perf_interrupts(): Interrupt handling */ void gintctl_perf_interrupts(void); diff --git a/src/gintctl.c b/src/gintctl.c index ac44801..2a66104 100644 --- a/src/gintctl.c +++ b/src/gintctl.c @@ -65,6 +65,8 @@ struct menu menu_perf = { { "libprof basics", gintctl_perf_libprof, 0 }, { "CPU and cache", gintctl_perf_cpucache, 0 }, + { _("CPU parallelism", "Superscalar and pipeline parallelism"), + gintctl_perf_cpu, 0 }, { "Interrupt stress", gintctl_perf_interrupts, 0 }, { "Memory access speed", gintctl_perf_memory, 0 }, { "Rendering functions", gintctl_perf_render, 0 }, diff --git a/src/perf/cpu.S b/src/perf/cpu.S index a7afa6b..67a97db 100644 --- a/src/perf/cpu.S +++ b/src/perf/cpu.S @@ -1,12 +1,12 @@ -# We put all the code in ILRAM to avoid measurement variations caused by code -# being fetched from ROM. 
The ILRAM is ideal for this task because successive -# instruction accesses take only 1 cycle (assuming no interference, which there -# is none). +/* We put all the code in ILRAM to avoid measurement variations caused by code + being fetched from ROM. The ILRAM is ideal for this task because successive + instruction accesses take only 1 cycle (assuming no interference, which + there is none). */ .section .ilram -# Test prologue for COUNT iterations (must be a multiple of 256). Note that the -# prologue has an even number of instructions, which results in the loop code -# being 4-aligned, which is of extreme importance. +/* Test prologue for COUNT iterations (must be a multiple of 256). Note that + the prologue has an even number of instructions, which results in the loop + code being 4-aligned, which is of extreme importance. */ #define PROLOGUE(COUNT) \ mov #(COUNT/256), r0 ; \ shll8 r0 ; \ @@ -15,7 +15,7 @@ ldrc r0 ; \ nop -# Test epilogue +/* Test epilogue */ #define EPILOGUE() \ rts ; \ nop @@ -23,13 +23,16 @@ /* [Baseline] - In this first section, we want to establish an approximate cost of the - setup, which consists of TMU access for libprof, function calls, and the - loop setup for the DSP. */ + In this first section, we find an approximate cost of the setup, which + consists of TMU access for libprof, function calls, and the loop setup for + the DSP. This does not include any loop overhead (which is measured later). + + This will often take 3~5 Pϕ/4 ticks, which is not a very precise measure, + but helps eliminate noise around tests and bring cycle counts very + close to multiples of the number of iterations. 
*/ .global _perf_cpu_empty -# Empty setup (0 iterations), as baseline .align 4 _perf_cpu_empty: PROLOGUE(0) @@ -38,30 +41,33 @@ _perf_cpu_empty: /* [Loop control] - In this section, we want to check whether the DSP repeat system has any - added cost per-loop, and we do this by executing the same instructions with - a varying number of DSP repeat jumps. + Here we establish that the DSP repeat system has no added cost per-loop in + favorable situations. That is, the loop is as efficient as if it were + unrolled. This is checked by executing the same sequence of instructions + with a varying number of DSP jumps between them. - The DSP jump has no additional cost, which makes testing much simpler by - avoiding loop unrolls (that would otherwise be needed to amortize the cost - of the jump). In addition, this allows for tighter loops in real-world - programs, and tigher code first better in cache. */ + The fact that the DSP jump has no additional cost is very beneficial for + performance measurements, since it means that variations in the size and + iteration count of tests have no influence on the results. (Such influence + would otherwise need to be amortized by unrolling.) + + The only observed difference is with the first test where the single + instruction in the loop cannot be executed in parallel with itself in the + next iteration. My guess is that the instruction from the next iteration is + not fetched yet from the perspective of CPU logic. */ .global _perf_cpu_nop_2048x1 -.global _perf_cpu_nop_1024x2 -.global _perf_cpu_nop_512x4 -.global _perf_cpu_nop_256x8 -# nop loop (2048 iterations of 1 nop) -> 2 cycles /i -# Parallel execution likely cannot read through the DSP jump for architectural -# reasons; my guess is that the next instruction isn't fetched yet. 
+/* nop loop (2048 iterations of 1 nop) -> 2 cycles /i */ .align 4 _perf_cpu_nop_2048x1: PROLOGUE(2048) 1: 2: nop EPILOGUE() -# nop loop (1024 iterations of 2 nop) -> 1 cycle /i +.global _perf_cpu_nop_1024x2 + +/* nop loop (1024 iterations of 2 nop) -> 1 cycle /i */ .align 4 _perf_cpu_nop_1024x2: PROLOGUE(1024) @@ -69,7 +75,9 @@ _perf_cpu_nop_1024x2: 2: nop EPILOGUE() -# nop loop (512 iterations of 4 nop) -> 2 cycles /i +.global _perf_cpu_nop_512x4 + +/* nop loop (512 iterations of 4 nop) -> 2 cycles /i */ .align 4 _perf_cpu_nop_512x4: PROLOGUE(512) @@ -79,7 +87,9 @@ _perf_cpu_nop_512x4: 2: nop EPILOGUE() -# nop loop (256 iterations of 8 nop) -> 4 cycles/i +.global _perf_cpu_nop_256x8 + +/* nop loop (256 iterations of 8 nop) -> 4 cycles /i */ .align 4 _perf_cpu_nop_256x8: PROLOGUE(256) @@ -100,10 +110,8 @@ _perf_cpu_nop_256x8: pipelines with no extra cycles. */ .global _perf_cpu_EX_EX -.global _perf_cpu_MT_MT -.global _perf_cpu_LS_LS -# EX/EX (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i +/* EX/EX -> 2 cycles /i */ .align 4 _perf_cpu_EX_EX: PROLOGUE(1024) @@ -111,7 +119,9 @@ _perf_cpu_EX_EX: 2: add #0, r1 EPILOGUE() -# MT/MT (1024 iterations of 2 parallel instructions) -> 1 cycle /i +.global _perf_cpu_MT_MT + +/* MT/MT -> 1 cycle /i */ .align 4 _perf_cpu_MT_MT: PROLOGUE(1024) @@ -119,10 +129,337 @@ _perf_cpu_MT_MT: 2: mov r2, r3 EPILOGUE() -# LS/LS (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i +.global _perf_cpu_LS_LS + +/* LS/LS -> 2 cycles /i */ .align 4 _perf_cpu_LS_LS: PROLOGUE(1024) 1: mov.l @r15, r0 2: mov.l @r15, r1 EPILOGUE() + +/* [Aligned parallelism] + + Here, we show that instruction pairs that are not aligned on 4-byte + boundaries can nonetheless be parallelized. Having an instruction be + executed alone because of a lack of parallel-executability with the next one + does not prevent the next one from forming a parallel pair of its own with + its successor. 
*/ + +.global _perf_cpu_align_4 + +/* 2 pairs of parallel instructions -> 2 cycles /i */ +.align 4 +_perf_cpu_align_4: + PROLOGUE(1024) +1: add #0, r0 + mov.l @r15, r1 + add #0, r0 +2: mov.l @r15, r1 + EPILOGUE() + +.global _perf_cpu_align_2 + +/* The add/mov.l pair in the middle is parallelized -> 3 cycles /i */ +.align 4 +_perf_cpu_align_2: + PROLOGUE(1024) +1: add #0, r0 + add #0, r1 + mov.l @r15, r0 +2: mov.l @r15, r1 + EPILOGUE() + +/* [Complex pipelines] + + Here we measure the behavior of multi-cycle instructions that have complex + pipelines. These tests establish that while mac.w occupies one pipeline for 2 + cycles, a series of nop can continue to run on the second pipeline. + + Even though mac.w has 2 issue cycles and 4 execution cycles, in a sequence + of mac.w each instruction will actually take 3 cycles. I believe this is + because the WB/M2 stage of the second mac.w has a data dependency on the + MS stage of the previous mac.w instruction, which causes a 1-cycle stall. + This assumes that there is no forwarding at the output of the multiplier. */ + +.global _perf_cpu_pipeline_1 + +/* nop executes in parallel with first pipeline of mac.w -> 3 cycles /i */ +.align 4 +_perf_cpu_pipeline_1: + PROLOGUE(1024) + mov r15, r0 + mov r15, r1 + +1: mac.w @r0+, @r1+ +2: nop + EPILOGUE() + +.global _perf_cpu_pipeline_2 + +/* Without parallel execution, still 3 cycles per mac.w -> 6 cycles /i */ +.align 4 +_perf_cpu_pipeline_2: + PROLOGUE(1024) + mov r15, r0 + mov r15, r1 + +1: mac.w @r0+, @r1+ +2: mac.w @r0+, @r1+ + EPILOGUE() + +.global _perf_cpu_pipeline_3 + +/* mac.w/(nop;nop;nop) then nop/nop -> 4 cycles /i */ +.align 4 +_perf_cpu_pipeline_3: + PROLOGUE(1024) + mov r15, r0 + mov r15, r1 + +1: mac.w @r0+, @r1+ + nop + nop + nop + nop +2: nop + EPILOGUE() + +/* [RAW dependencies] + + In this section we establish the delay caused by RAW dependencies in + arithmetic and memory access instructions. 
*/ + +.global _perf_cpu_raw_EX_EX + +.align 4 +_perf_cpu_raw_EX_EX: + PROLOGUE(1024) +1: add #1, r0 +2: add #1, r0 + EPILOGUE() + +.global _perf_cpu_raw_LS_LS + +.align 4 +_perf_cpu_raw_LS_LS: + PROLOGUE(1024) + mov.l .buffer, r4 + nop + +1: mov.l @r4, r0 +2: mov.l r0, @r4 + EPILOGUE() + +.global _perf_cpu_raw_EX_LS + +.align 4 +_perf_cpu_raw_EX_LS: + PROLOGUE(1024) + mov.l .buffer, r4 + mov #0, r0 + +1: add #1, r0 +2: mov.l r0, @r4 + EPILOGUE() + +.global _perf_cpu_raw_LS_EX + +.align 4 +_perf_cpu_raw_LS_EX: + PROLOGUE(1024) + mov.l .buffer, r4 + nop + +1: mov.l @r4, r0 +2: add #1, r0 + EPILOGUE() + +.global _perf_cpu_noraw_LS_LS + +.align 4 +_perf_cpu_noraw_LS_LS: + PROLOGUE(1024) + mov.l .buffer, r4 + nop + +1: mov.l @r4, r0 +2: mov.l r1, @r4 + EPILOGUE() + +.global _perf_cpu_noraw_LS_EX + +.align 4 +_perf_cpu_noraw_LS_EX: + PROLOGUE(1024) + mov.l .buffer, r4 + nop + +1: mov.l @r4, r0 +2: add #1, r1 + EPILOGUE() + +.global _perf_cpu_raw_EX_LS_addr + +.align 4 +_perf_cpu_raw_EX_LS_addr: + PROLOGUE(1024) + mov.l .buffer, r4 + nop + +1: add #0, r4 +2: mov.l r0, @r4 + EPILOGUE() + +.global _perf_cpu_raw_DSPLS_DSPLS + +.align 4 +_perf_cpu_raw_DSPLS_DSPLS: + PROLOGUE(512) + mov.l .buffer, r4 + mov r4, r5 + +1: movs.w @r4, x0 +2: movs.w x0, @r5 + EPILOGUE() + +/* [Iteration weaving] + + In this section we analyze how iterations can be woven and opened to improve + performance by reducing RAW dependencies. 
*/ + +.global _perf_cpu_darken_1 + +.align 4 +_perf_cpu_darken_1: + PROLOGUE(512) + mov.l .buffer, r4 + mov r4, r5 + add #-4, r5 + nop + +1: mov.l @r4+, r1 + and r2, r1 + add #4, r5 + shlr r1 +2: mov.l r1, @r5 + EPILOGUE() + +.global _perf_cpu_darken_2 + +.align 4 +_perf_cpu_darken_2: + PROLOGUE(512) + mov.l .buffer, r4 + mov r4, r5 + add #-4, r5 + nop + +1: mov.l @r4+, r1 + add #4, r5 + and r2, r1 + shlr r1 +2: mov.l r1, @r5 + EPILOGUE() + +.global _perf_cpu_darken_3 + +.align 4 +_perf_cpu_darken_3: + PROLOGUE(256) + mov.l .buffer, r4 + mov r4, r5 + add #-8, r5 + nop + +1: mov.l @r4+, r1 + add #8, r5 + mov.l @r4+, r3 + and r2, r1 + shlr r1 + mov.l r1, @r5 + and r2, r3 + shlr r3 +2: mov.l r3, @(4,r5) + EPILOGUE() + +.global _perf_cpu_darken_4 + +.align 4 +_perf_cpu_darken_4: + PROLOGUE(256) + mov.l .buffer, r4 + mov r4, r5 + add #-8, r5 + mov.l @r4+, r1 + + /* Loop starts with r1 loaded, finishes with r1 loaded */ +1: mov.l @r4+, r3 + add #8, r5 + and r2, r1 + shlr r1 + mov.l r1, @r5 + mov.l @r4+, r1 + and r2, r3 + shlr r3 +2: mov.l r3, @(4,r5) + EPILOGUE() + +/* [Advanced dependencies] + + This section measures the delay needed to use registers depending on the + type of instruction which modifies them. */ + +.global _perf_cpu_double_read + +.align 4 +_perf_cpu_double_read: + PROLOGUE(1024) + mov.l .buffer, r4 + nop + +1: mov.l @r4, r0 +2: mov.l @r4, r1 + EPILOGUE() + +.global _perf_cpu_double_incr_read + +.align 4 +_perf_cpu_double_incr_read: + PROLOGUE(1024) + mov.l .buffer, r4 + nop + +1: mov.b @r4+, r0 +2: mov.b @r4+, r0 + EPILOGUE() + +/* [2D texture copy] + + This section is used to investigate the performance of the 2D texture shader + of azur. 
*/ + +.global _perf_cpu_tex2d + +.align 4 +_perf_cpu_tex2d: + PROLOGUE(512) + mov.l .buffer2, r3 + mov r3, r5 /*.buffer, r5 */ + +1: movs.l @r3+, x0 +2: movs.l x0, @r5+ + EPILOGUE() + +/* XRAM buffer */ + +.align 4 +.buffer: + .long _cpu_perf_xram_buffer +.buffer2: + .long _buffer2 + +.section .data +_buffer2: + .zero 2048 diff --git a/src/perf/cpu.c b/src/perf/cpu.c index aa7e323..c7565b7 100644 --- a/src/perf/cpu.c +++ b/src/perf/cpu.c @@ -5,19 +5,16 @@ #include #include +#include +#include + #include -/* Baseline */ -void perf_cpu_empty(void); -/* Loop control */ -void perf_cpu_nop_2048x1(void); -void perf_cpu_nop_1024x2(void); -void perf_cpu_nop_512x4(void); -void perf_cpu_nop_256x8(void); -/* Parallel execution */ -void perf_cpu_EX_EX(void); -void perf_cpu_MT_MT(void); -void perf_cpu_LS_LS(void); +#include +#include +#include + +GXRAM uint32_t cpu_perf_xram_buffer[512]; /* Is subtracted from result times if specified; in TMU units (prof.elapsed) */ static uint32_t baseline_ticks = 0; @@ -39,14 +36,26 @@ uint32_t Iphi_cycles(void (*function)(void)) /* Number of CPU cycles per iteration; the number of iterations must obviously match assembler code for that test */ -float Iphi_per_iteration(void (*function)(void), int count) +int Iphi_cycles_per_iteration(int total, int count) { - return (float)Iphi_cycles(function) / count; + /* No iterations recorded (test not run yet): div() by zero is undefined */ + if(count <= 0) + return -1; + + div_t d = div(total, count); + + if(d.rem < 128) + return d.quot; + if(d.rem > count - 128) + return d.quot + 1; + + return -1; } /* Number of TMU cycles for an empty function */ uint32_t TMU_baseline(void) { + void perf_cpu_empty(void); prof_t perf = prof_make(); for(int i = 0; i < 16; i++) @@ -61,62 +70,132 @@ uint32_t TMU_baseline(void) //--- +struct results { + int nop_2048x1, nop_1024x2, nop_512x4, nop_256x8; + int EX_EX, MT_MT, LS_LS; + int align_4, align_2; + int pipeline_1, pipeline_2, pipeline_3; + int raw_EX_EX, raw_LS_LS, raw_EX_LS, raw_LS_EX; + int noraw_LS_LS, noraw_LS_EX; + int raw_EX_LS_addr, raw_DSPLS_DSPLS; + int darken_1, 
darken_2, darken_3, darken_4; + int double_read, double_incr_read; + int tex2d; +}; + +/* Number of Iphi cycles total, and number of iterations */ +static struct results r_cycles, r_iter; + +static void table_gen(gtable *t, int row) +{ + static char const *names[] = { + "Single nop", "2 nop", "4 nop", "8 nop", + "EX/EX pair", "MT/MT pair", "LS/LS pair", + "4-aligned parallel pair", "2-aligned parallel pair", + "mac.w/nop pipeline", "mac.w/mac.w pipeline", + "mac.w/nop*5 pipeline", + "RAW dep.: EX/EX", "RAW dep.: LS/LS", "RAW dep.: EX/LS", + "RAW dep.: LS/EX", + "No dep.: LS/LS", "No dep.: LS/EX", + "RAW on address: EX/LS", + "RAW dep.: DSP-LS/DSP-LS", + "32-bit VRAM darken #1", "32-bit VRAM darken #2", + "Interwoven darken", "Interwoven open darken", + "Double read", "Double increment read", + "Texture2D shader", + }; + + int cycles = ((int *)&r_cycles)[row]; + int iter = ((int *)&r_iter)[row]; + int cpi = Iphi_cycles_per_iteration(cycles, iter); + + char c2[16], c3[16], c4[16]; + sprintf(c2, "%d", cpi); + sprintf(c3, "%d", cycles); + sprintf(c4, "%d", iter); + + gtable_provide(t, names[row], (cpi == -1 ? 
"-" : c2), c3, c4); +} + void gintctl_perf_cpu(void) { + memset(&r_cycles, 0, sizeof r_cycles); + memset(&r_iter, 0, sizeof r_iter); + + gtable *table = gtable_create(4, table_gen, NULL, NULL); + gtable_set_rows(table, sizeof r_cycles / sizeof(int)); + gtable_set_row_spacing(table, _(1,2)); + gtable_set_column_titles(table, "Name", "CPI", "Cycles", "Iter."); + gtable_set_column_sizes(table, 6, 1, 2, 2); + gtable_set_font(table, _(&font_mini, dfont_default())); + jwidget_set_margin(table, 0, 2, 1, 2); + + gscreen *scr = gscreen_create2("CPU parallelism", &img_opt_perf_cpu, + "CPU instruction parallelism and pipelining", "@RUN;;;;;"); + gscreen_add_tabs(scr, table, table); + jscene_set_focused_widget(scr->scene, table); + int key = 0; + while(key != KEY_EXIT) { + jevent e = jscene_run(scr->scene); - /* Measure baseline time */ - baseline_ticks = TMU_baseline(); + if(e.type == JSCENE_PAINT) { + dclear(C_WHITE); + jscene_render(scr->scene); + dupdate(); + } - uint32_t Iphi_cpu_nop_2048x1 = 0; - uint32_t Iphi_cpu_nop_1024x2 = 0; - uint32_t Iphi_cpu_nop_512x4 = 0; - uint32_t Iphi_cpu_nop_256x8 = 0; + key = 0; + if(e.type == JSCENE_KEY && e.key.type == KEYEV_DOWN) + key = e.key.key; - uint32_t Iphi_cpu_EX_EX = 0; - uint32_t Iphi_cpu_MT_MT = 0; - uint32_t Iphi_cpu_LS_LS = 0; + if(key == KEY_F1) { + baseline_ticks = TMU_baseline(); - while(key != KEY_EXIT) - { - dclear(C_WHITE); + #define run(name, iter) { \ + extern void perf_cpu_ ## name (void); \ + r_cycles.name = Iphi_cycles(perf_cpu_ ## name); \ + r_iter.name = iter; \ + } - #ifdef FXCG50 - row_title("CPU instruction parallelism and pipelining"); + run(nop_2048x1, 2048); + run(nop_1024x2, 1024); + run(nop_512x4, 512); + run(nop_256x8, 256); - row_print(1, 1, "Baseline ticks: %d", - baseline_ticks); - row_print(3, 1, "Iphi cycles for 2048x1 nop: %d", - Iphi_cpu_nop_2048x1); - row_print(4, 1, "Iphi cycles for 1024x2 nop: %d", - Iphi_cpu_nop_1024x2); - row_print(5, 1, "Iphi cycles for 512x4 nop: %d", - 
Iphi_cpu_nop_512x4); - row_print(6, 1, "Iphi cycles for 256x8 nop: %d", - Iphi_cpu_nop_256x8); - row_print(8, 1, "Iphi cycles for EX/EX: %d", - Iphi_cpu_EX_EX); - row_print(9, 1, "Iphi cycles for MT/MT: %d", - Iphi_cpu_MT_MT); - row_print(10, 1, "Iphi cycles for LS/LS: %d", - Iphi_cpu_LS_LS); + run(EX_EX, 1024); + run(MT_MT, 1024); + run(LS_LS, 1024); - fkey_button(1, "RUN"); - #endif + run(align_4, 1024); + run(align_2, 1024); - dupdate(); - key = getkey().key; + run(pipeline_1, 1024); + run(pipeline_2, 1024); + run(pipeline_3, 1024); - if(key == KEY_F1) - { - Iphi_cpu_nop_2048x1 = Iphi_cycles(perf_cpu_nop_2048x1); - Iphi_cpu_nop_1024x2 = Iphi_cycles(perf_cpu_nop_1024x2); - Iphi_cpu_nop_512x4 = Iphi_cycles(perf_cpu_nop_512x4); - Iphi_cpu_nop_256x8 = Iphi_cycles(perf_cpu_nop_256x8); + run(raw_EX_EX, 1024); + run(raw_LS_LS, 1024); + run(raw_EX_LS, 1024); + run(raw_LS_EX, 1024); + run(noraw_LS_LS, 1024); + run(noraw_LS_EX, 1024); + run(raw_EX_LS_addr, 1024); + run(raw_DSPLS_DSPLS, 512); - Iphi_cpu_EX_EX = Iphi_cycles(perf_cpu_EX_EX); - Iphi_cpu_MT_MT = Iphi_cycles(perf_cpu_MT_MT); - Iphi_cpu_LS_LS = Iphi_cycles(perf_cpu_LS_LS); + run(darken_1, 512); + run(darken_2, 512); + run(darken_3, 256); + run(darken_4, 256); + + run(double_read, 1024); + run(double_incr_read, 1024); + + run(tex2d, 512); + + table->widget.update = 1; } } + + gscreen_destroy(scr); }