diff --git a/src/perf/cpu.S b/src/perf/cpu.S
index 0e2ac3b..c36250b 100644
--- a/src/perf/cpu.S
+++ b/src/perf/cpu.S
@@ -233,6 +233,7 @@ _perf_cpu_pipeline_3:
 
 .global _perf_cpu_raw_EX_EX
 
+/* Forwarding after the ALU is seamless, no delay -> 2 cycles /i */
 .align 4
 _perf_cpu_raw_EX_EX:
 	PROLOGUE(1024)
@@ -242,6 +243,8 @@ _perf_cpu_raw_EX_EX:
 
 .global _perf_cpu_raw_LS_LS
 
+/* Value is available immediately for memory... at a *different address* (the
+   same address would give 4 cycles /i) -> 2 cycles /i */
 .align 4
 _perf_cpu_raw_LS_LS:
 	PROLOGUE(1024)
@@ -249,11 +252,12 @@ _perf_cpu_raw_LS_LS:
 	nop
 
 1:	mov.l	@r4, r0
-2:	mov.l	r0, @r4
+2:	mov.l	r0, @(4,r4)
 	EPILOGUE()
 
 .global _perf_cpu_raw_EX_LS
 
+/* Perfect forwarding from ALU to memory access -> 1 cycle /i */
 .align 4
 _perf_cpu_raw_EX_LS:
 	PROLOGUE(1024)
@@ -266,30 +270,40 @@ _perf_cpu_raw_EX_LS:
 
 .global _perf_cpu_raw_LS_EX
 
+/* 1-cycle stall after loading a register from memory -> 3 cycles /i */
 .align 4
 _perf_cpu_raw_LS_EX:
 	PROLOGUE(1024)
-	mov.l	.buffer, r4
-	nop
-
-1:	mov.l	@r4, r0
+1:	mov.l	@r15, r0
 2:	add	#1, r0
 	EPILOGUE()
 
+.global _perf_cpu_raw_LS_MT
+
+/* Same - it's not like you could use a move (MT) to avoid the stall -> 3 cycles /i */
+.align 4
+_perf_cpu_raw_LS_MT:
+	PROLOGUE(1024)
+1:	mov.l	@r15, r0
+2:	mov	r0, r1
+	EPILOGUE()
+
 .global _perf_cpu_noraw_LS_LS
 
+/* Still efficient as long as the addresses are different -> 2 cycles /i */
 .align 4
 _perf_cpu_noraw_LS_LS:
 	PROLOGUE(1024)
 	mov.l	.buffer, r4
-	nop
+	mov	r4, r5
 
 1:	mov.l	@r4, r0
-2:	mov.l	r1, @r4
+2:	mov.l	r1, @(4,r5)
 	EPILOGUE()
 
 .global _perf_cpu_noraw_LS_EX
 
+/* Normal superscalar parallelism at work -> 1 cycle /i */
 .align 4
 _perf_cpu_noraw_LS_EX:
 	PROLOGUE(1024)
@@ -302,6 +316,8 @@ _perf_cpu_noraw_LS_EX:
 
 .global _perf_cpu_raw_EX_LS_addr
 
+/* There is no forwarding on the address, so much like a load this actually
+   takes much longer than when only the operand is modified -> 3 cycles /i */
 .align 4
 _perf_cpu_raw_EX_LS_addr:
 	PROLOGUE(1024)
@@ -312,13 +328,29 @@ _perf_cpu_raw_EX_LS_addr:
 2:	mov.l	r0, @r4
 	EPILOGUE()
 
+.global _perf_cpu_raw_LS_LS_addr
+
+/* The worst of all; 2-cycle stall to use a loaded address -> 4 cycles /i */
+.align 4
+_perf_cpu_raw_LS_LS_addr:
+	PROLOGUE(1024)
+	mov.l	.buffer, r4
+	mov.l	r15, @r4
+
+1:	mov.l	@r4, r5
+2:	mov.l	@r5, r6
+	EPILOGUE()
+
 .global _perf_cpu_raw_DSPLS_DSPLS
 
+/* As before, the addresses must be different -> 2 cycles /i */
 .align 4
 _perf_cpu_raw_DSPLS_DSPLS:
 	PROLOGUE(512)
 	mov.l	.buffer, r4
 	mov	r4, r5
+	add	#2, r5
+	nop
 
 1:	movs.w	@r4, x0
 2:	movs.w	x0, @r5
@@ -327,10 +359,14 @@ _perf_cpu_raw_DSPLS_DSPLS:
 /* [Iteration weaving]
 
    In this section we analyze how iterations can be woven and opened to improve
-   performance by reducing RAW dependencies. */
+   performance by reducing RAW dependencies. This is illustrated with a
+   function that darkens a contiguous section of VRAM. The initial version
+   takes 3 cycles /pixel, whereas the optimized one takes 1.25 cycles /pixel. */
 
 .global _perf_cpu_darken_1
 
+/* Darkening RGB565 by (color = (color & 0xf7de) >> 1). This base version does
+   two pixels at a time but has pretty complex RAWs -> 6 cycles /i */
 .align 4
 _perf_cpu_darken_1:
 	PROLOGUE(512)
@@ -340,14 +376,17 @@ _perf_cpu_darken_1:
 	nop
 
 1:	mov.l	@r4+, r1
+	/* Stall because of loading r1 */
 	and	r2, r1
 	add	#4, r5
 	shlr	r1
+	/* Stall because of access to r5 as address */
 2:	mov.l	r1, @r5
 	EPILOGUE()
 
 .global _perf_cpu_darken_2
 
+/* Here the change to r5 is moved to eliminate both stalls -> 4 cycles /i */
 .align 4
 _perf_cpu_darken_2:
 	PROLOGUE(512)
@@ -359,12 +398,17 @@ _perf_cpu_darken_2:
 1:	mov.l	@r4+, r1
 	add	#4, r5
 	and	r2, r1
+	/* The EX/LS pair below is the only one parallelized */
 	shlr	r1
 2:	mov.l	r1, @r5
 	EPILOGUE()
 
 .global _perf_cpu_darken_3
 
+/* Here iterations are woven together to increase the amount of independent
+   data. Each iteration processes twice as much data and uses the EX cycles of
+   the first longword to do LS work on the second one. Plus r5 is incremented
+   only once -> 6 cycles /i */
 .align 4
 _perf_cpu_darken_3:
 	PROLOGUE(256)
@@ -375,17 +419,25 @@ _perf_cpu_darken_3:
 
 1:	mov.l	@r4+, r1
 	add	#8, r5
+	mov.l	@r4+, r3
+	and	r2, r1
+	shlr	r1
 	mov.l	r1, @r5
+	and	r2, r3
+	shlr	r3
 2:	mov.l	r3, @(4,r5)
 	EPILOGUE()
 
 .global _perf_cpu_darken_4
 
+/* Finally, iterations are opened here to eliminate the long dependency chain
+   from the loads to the stores. Late EX instructions are parallelized with
+   loads for the next iteration -> 5 cycles /i */
 .align 4
 _perf_cpu_darken_4:
 	PROLOGUE(256)
@@ -413,6 +465,7 @@ _perf_cpu_darken_4:
 
 .global _perf_cpu_double_read
 
+/* No problem here -> 2 cycles /i */
 .align 4
 _perf_cpu_double_read:
 	PROLOGUE(1024)
@@ -425,6 +478,7 @@ _perf_cpu_double_read:
 
 .global _perf_cpu_double_incr_read
 
+/* Post-increment feeds into address much faster than ALU -> 2 cycles /i */
 .align 4
 _perf_cpu_double_incr_read:
 	PROLOGUE(1024)
@@ -435,6 +489,21 @@ _perf_cpu_double_incr_read:
 2:	mov.b	@r4+, r0
 	EPILOGUE()
 
+.global _perf_cpu_double_write
+
+/* No delay writing twice, whether with r4/r4 or r4/r5 -> 2 cycles /i */
+.align 4
+_perf_cpu_double_write:
+	PROLOGUE(1024)
+	mov.l	.buffer, r4
+	mov.l	@r4, r0
+	mov	r0, r1
+	mov	r4, r5
+
+1:	mov.l	r0, @r4
+2:	mov.l	r1, @r5
+	EPILOGUE()
+
 /* [2D texture copy]
 
    This section is used to investigate the performance of the 2D texture shader
diff --git a/src/perf/cpu.c b/src/perf/cpu.c
index 28f012c..2d55b1f 100644
--- a/src/perf/cpu.c
+++ b/src/perf/cpu.c
@@ -41,9 +41,9 @@ int Iphi_cycles_per_iteration(int total, int count)
 {
 	div_t d = div(total, count);
 
-	if(d.rem < 128)
+	if(d.rem < 192)
 		return d.quot;
-	if(d.rem > count - 128)
+	if(d.rem > count - 192)
 		return d.quot + 1;
 
 	return -1;
@@ -72,11 +72,11 @@ struct results {
 	int EX_EX, MT_MT, LS_LS;
 	int align_4, align_2;
 	int pipeline_1, pipeline_2, pipeline_3;
-	int raw_EX_EX, raw_LS_LS, raw_EX_LS, raw_LS_EX;
+	int raw_EX_EX, raw_LS_LS, raw_EX_LS, raw_LS_EX, raw_LS_MT;
 	int noraw_LS_LS, noraw_LS_EX;
-	int raw_EX_LS_addr, raw_DSPLS_DSPLS;
+	int raw_EX_LS_addr, raw_LS_LS_addr, raw_DSPLS_DSPLS;
 	int darken_1, darken_2, darken_3, darken_4;
-	int double_read, double_incr_read;
+	int double_read, double_incr_read, double_write;
 	#ifdef FXCG50
 	int tex2d;
 	#endif
@@ -94,13 +94,13 @@ static void table_gen(gtable *t, int row)
 		"mac.w/nop pipeline", "mac.w/mac.w pipeline", "mac.w/nop*5 pipeline",
 		"RAW dep.: EX/EX", "RAW dep.: LS/LS", "RAW dep.: EX/LS",
-		"RAW dep.: LS/EX",
+		"RAW dep.: LS/EX", "RAW dep.: LS/MT",
 		"No dep.: LS/LS", "No dep.: LS/EX",
-		"RAW on address: EX/LS",
+		"RAW on address: EX/LS", "RAW on address: LS/LS",
 		"RAW dep.: DSP-LS/DSP-LS",
 		"32-bit VRAM darken #1", "32-bit VRAM darken #2",
 		"Interwoven darken", "Interwoven open darken",
-		"Double read", "Double increment read",
+		"Double read", "Double increment read", "Double write",
 		"Texture2D shader",
 	};
@@ -177,9 +177,11 @@ void gintctl_perf_cpu(void)
 	run(raw_LS_LS, 1024);
 	run(raw_EX_LS, 1024);
 	run(raw_LS_EX, 1024);
+	run(raw_LS_MT, 1024);
 	run(noraw_LS_LS, 1024);
 	run(noraw_LS_EX, 1024);
 	run(raw_EX_LS_addr, 1024);
+	run(raw_LS_LS_addr, 1024);
 	run(raw_DSPLS_DSPLS, 512);
 
 	run(darken_1, 512);
@@ -189,6 +191,7 @@ void gintctl_perf_cpu(void)
 
 	run(double_read, 1024);
 	run(double_incr_read, 1024);
+	run(double_write, 1024);
 
 	#ifdef FXCG50
 	run(tex2d, 512);