perf/cpu: more documentation on pipeline stalls

This commit is contained in:
Lephenixnoir 2022-03-24 18:19:53 +00:00
parent 29fc91d60c
commit d37e2bb82f
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
2 changed files with 88 additions and 16 deletions

View File

@ -233,6 +233,7 @@ _perf_cpu_pipeline_3:
.global _perf_cpu_raw_EX_EX
/* Forwarding after the ALU is seamless, no delay -> 2 cycles /i */
.align 4
_perf_cpu_raw_EX_EX:
PROLOGUE(1024)
@ -242,6 +243,8 @@ _perf_cpu_raw_EX_EX:
.global _perf_cpu_raw_LS_LS
/* Value is available immediately for memory... at a *different address* (the
same address would give 4 cycles /i) -> 2 cycles /i */
.align 4
_perf_cpu_raw_LS_LS:
PROLOGUE(1024)
@ -249,11 +252,12 @@ _perf_cpu_raw_LS_LS:
nop
1: mov.l @r4, r0
2: mov.l r0, @r4
2: mov.l r0, @(4,r4)
EPILOGUE()
.global _perf_cpu_raw_EX_LS
/* Perfect forwarding from ALU to memory access -> 1 cycle /i */
.align 4
_perf_cpu_raw_EX_LS:
PROLOGUE(1024)
@ -266,30 +270,40 @@ _perf_cpu_raw_EX_LS:
.global _perf_cpu_raw_LS_EX
/* 1-cycle stall after loading a register from memory -> 3 cycles /i */
.align 4
_perf_cpu_raw_LS_EX:
PROLOGUE(1024)
mov.l .buffer, r4
nop
1: mov.l @r4, r0
1: mov.l @r15, r0
2: add #1, r0
EPILOGUE()
.global _perf_cpu_raw_LS_MT
/* RAW dependency from a memory load (LS) to a plain register move (MT).
   Same outcome as a load followed by an ALU operation: the move needs the
   loaded value, so it cannot be used to dodge the stall -> 3 cycles /i */
.align 4
_perf_cpu_raw_LS_MT:
/* NOTE(review): PROLOGUE/EPILOGUE are macros defined outside this excerpt;
   the argument is presumably the iteration count and labels 1:/2: presumably
   delimit the measured loop body - confirm against the macro definitions */
PROLOGUE(1024)
/* Load r0 from the address held in r15... */
1: mov.l @r15, r0
/* ... then copy it right away, which reads the r0 that was just loaded */
2: mov r0, r1
EPILOGUE()
.global _perf_cpu_noraw_LS_LS
/* Still efficient as long as the addresses are different -> 2 cycles /i */
.align 4
_perf_cpu_noraw_LS_LS:
PROLOGUE(1024)
mov.l .buffer, r4
nop
mov r4, r5
1: mov.l @r4, r0
2: mov.l r1, @r4
2: mov.l r1, @(4,r5)
EPILOGUE()
.global _perf_cpu_noraw_LS_EX
/* Normal superscalar parallelism at work -> 1 cycle /i */
.align 4
_perf_cpu_noraw_LS_EX:
PROLOGUE(1024)
@ -302,6 +316,8 @@ _perf_cpu_noraw_LS_EX:
.global _perf_cpu_raw_EX_LS_addr
/* There is no forwarding to the address operand, so, much like using a value
right after loading it, this takes much longer than when the modified register
is the data operand -> 3 cycles /i */
.align 4
_perf_cpu_raw_EX_LS_addr:
PROLOGUE(1024)
@ -312,13 +328,29 @@ _perf_cpu_raw_EX_LS_addr:
2: mov.l r0, @r4
EPILOGUE()
.global _perf_cpu_raw_LS_LS_addr
/* RAW dependency on an address between two loads: the second load uses the
   register written by the first load as its address. The worst of all;
   2-cycle stall to use a loaded address -> 4 cycles /i */
.align 4
_perf_cpu_raw_LS_LS_addr:
PROLOGUE(1024)
/* r4 = word read from the .buffer literal (defined outside this excerpt;
   presumably the address of a scratch buffer - confirm) */
mov.l .buffer, r4
/* Store r15 at @r4 so that the value loaded into r5 below is itself an
   address that the second load can read from */
mov.l r15, @r4
/* r5 = the value stored above, i.e. the contents of r15 */
1: mov.l @r4, r5
/* Dereference the freshly loaded r5; r6 is not read again in this routine */
2: mov.l @r5, r6
EPILOGUE()
.global _perf_cpu_raw_DSPLS_DSPLS
/* As previously, the addresses must be different -> 2 cycles /i */
.align 4
_perf_cpu_raw_DSPLS_DSPLS:
PROLOGUE(512)
mov.l .buffer, r4
mov r4, r5
add #2, r5
nop
1: movs.w @r4, x0
2: movs.w x0, @r5
@ -327,10 +359,14 @@ _perf_cpu_raw_DSPLS_DSPLS:
/* [Iteration weaving]
In this section we analyze how iterations can be woven and opened to improve
performance by reducing RAW dependencies. */
performance by reducing RAW dependencies. This is illustrated with a
function that darkens a continuous section of VRAM. The initial version
takes 3 cycles /pixel, whereas the optimized version takes 1.25 cycles /pixel. */
.global _perf_cpu_darken_1
/* Darkening RGB565 by (color = (color & 0xf7de) >> 1). This base version does
two pixels at a time but has pretty complex RAWs -> 6 cycles /i */
.align 4
_perf_cpu_darken_1:
PROLOGUE(512)
@ -340,14 +376,17 @@ _perf_cpu_darken_1:
nop
1: mov.l @r4+, r1
/* Stall because of loading r1 */
and r2, r1
add #4, r5
shlr r1
/* Stall because of access to r5 as address */
2: mov.l r1, @r5
EPILOGUE()
.global _perf_cpu_darken_2
/* Here the change to r5 is moved to eliminate both stalls -> 4 cycles /i */
.align 4
_perf_cpu_darken_2:
PROLOGUE(512)
@ -359,12 +398,17 @@ _perf_cpu_darken_2:
1: mov.l @r4+, r1
add #4, r5
and r2, r1
/* The EX/LS pair below is the only one parallelized */
shlr r1
2: mov.l r1, @r5
EPILOGUE()
.global _perf_cpu_darken_3
/* Here iterations are woven together to increase the amount of independent
data. Each iteration processes twice as much data and uses EX cycles of the
first longword to do LS work on the second one. Plus r5 is incremented only
once -> 6 cycles /i */
.align 4
_perf_cpu_darken_3:
PROLOGUE(256)
@ -375,17 +419,25 @@ _perf_cpu_darken_3:
1: mov.l @r4+, r1
add #8, r5
mov.l @r4+, r3
and r2, r1
shlr r1
mov.l r1, @r5
and r2, r3
shlr r3
2: mov.l r3, @(4,r5)
EPILOGUE()
.global _perf_cpu_darken_4
/* Finally iterations are opened here to eliminate the long chain of dependency
from the loads to the stores. Late EX instructions are parallelized with
loads for the next iteration -> 5 cycles /i */
.align 4
_perf_cpu_darken_4:
PROLOGUE(256)
@ -413,6 +465,7 @@ _perf_cpu_darken_4:
.global _perf_cpu_double_read
/* No problem here -> 2 cycles /i */
.align 4
_perf_cpu_double_read:
PROLOGUE(1024)
@ -425,6 +478,7 @@ _perf_cpu_double_read:
.global _perf_cpu_double_incr_read
/* Post-increment feeds into address much faster than ALU -> 2 cycles /i */
.align 4
_perf_cpu_double_incr_read:
PROLOGUE(1024)
@ -435,6 +489,21 @@ _perf_cpu_double_incr_read:
2: mov.b @r4+, r0
EPILOGUE()
.global _perf_cpu_double_write
/* Two back-to-back stores. No delay writing twice, whether with r4/r4 or
   r4/r5 as address registers -> 2 cycles /i */
.align 4
_perf_cpu_double_write:
PROLOGUE(1024)
/* r4 = word read from the .buffer literal (defined outside this excerpt;
   presumably the address of a scratch buffer - confirm) */
mov.l .buffer, r4
/* r0 = r1 = longword currently at @r4, so the stores below write back the
   value that is already there */
mov.l @r4, r0
mov r0, r1
/* r5 is a copy of r4: both stores hit the same address, but through two
   different address registers (the r4/r5 variant of the comment above) */
mov r4, r5
1: mov.l r0, @r4
2: mov.l r1, @r5
EPILOGUE()
/* [2D texture copy]
This section is used to investigate the performance of the 2D texture shader

View File

@ -41,9 +41,9 @@ int Iphi_cycles_per_iteration(int total, int count)
{
div_t d = div(total, count);
if(d.rem < 128)
if(d.rem < 192)
return d.quot;
if(d.rem > count - 128)
if(d.rem > count - 192)
return d.quot + 1;
return -1;
@ -72,11 +72,11 @@ struct results {
int EX_EX, MT_MT, LS_LS;
int align_4, align_2;
int pipeline_1, pipeline_2, pipeline_3;
int raw_EX_EX, raw_LS_LS, raw_EX_LS, raw_LS_EX;
int raw_EX_EX, raw_LS_LS, raw_EX_LS, raw_LS_EX, raw_LS_MT;
int noraw_LS_LS, noraw_LS_EX;
int raw_EX_LS_addr, raw_DSPLS_DSPLS;
int raw_EX_LS_addr, raw_LS_LS_addr, raw_DSPLS_DSPLS;
int darken_1, darken_2, darken_3, darken_4;
int double_read, double_incr_read;
int double_read, double_incr_read, double_write;
#ifdef FXCG50
int tex2d;
#endif
@ -94,13 +94,13 @@ static void table_gen(gtable *t, int row)
"mac.w/nop pipeline", "mac.w/mac.w pipeline",
"mac.w/nop*5 pipeline",
"RAW dep.: EX/EX", "RAW dep.: LS/LS", "RAW dep.: EX/LS",
"RAW dep.: LS/EX",
"RAW dep.: LS/EX", "RAW dep.: LS/MT",
"No dep.: LS/LS", "No dep.: LS/EX",
"RAW on address: EX/LS",
"RAW on address: EX/LS", "RAW on address: LS/LS",
"RAW dep.: DSP-LS/DSP-LS",
"32-bit VRAM darken #1", "32-bit VRAM darken #2",
"Interwoven darken", "Interwoven open darken",
"Double read", "Double increment read",
"Double read", "Double increment read", "Double write",
"Texture2D shader",
};
@ -177,9 +177,11 @@ void gintctl_perf_cpu(void)
run(raw_LS_LS, 1024);
run(raw_EX_LS, 1024);
run(raw_LS_EX, 1024);
run(raw_LS_MT, 1024);
run(noraw_LS_LS, 1024);
run(noraw_LS_EX, 1024);
run(raw_EX_LS_addr, 1024);
run(raw_LS_LS_addr, 1024);
run(raw_DSPLS_DSPLS, 512);
run(darken_1, 512);
@ -189,6 +191,7 @@ void gintctl_perf_cpu(void)
run(double_read, 1024);
run(double_incr_read, 1024);
run(double_write, 1024);
#ifdef FXCG50
run(tex2d, 512);