perf/cpu: add CPU pipeline/superscalar parallelism observations
This commit is contained in:
parent
4e748e3c55
commit
12e78d2897
|
@ -44,6 +44,8 @@ set(SOURCES
|
|||
src/mem/mem.c
|
||||
src/perf/cpucache.c
|
||||
src/perf/cpucache.S
|
||||
src/perf/cpu.c
|
||||
src/perf/cpu.S
|
||||
src/perf/interrupt.c
|
||||
src/perf/libprof.c
|
||||
src/perf/memory.c
|
||||
|
|
|
@ -11,6 +11,9 @@ void gintctl_perf_libprof(void);
|
|||
/* gintctl_perf_cpucache(): CPU speed and cache size */
|
||||
void gintctl_perf_cpucache(void);
|
||||
|
||||
/* gintctl_perf_cpu(): CPU instruction parallelism and pipelining */
|
||||
void gintctl_perf_cpu(void);
|
||||
|
||||
/* gintctl_perf_interrupts(): Interrupt handling */
|
||||
void gintctl_perf_interrupts(void);
|
||||
|
||||
|
|
|
@ -65,6 +65,8 @@ struct menu menu_perf = {
|
|||
|
||||
{ "libprof basics", gintctl_perf_libprof, 0 },
|
||||
{ "CPU and cache", gintctl_perf_cpucache, 0 },
|
||||
{ _("CPU parallelism", "Superscalar and pipeline parallelism"),
|
||||
gintctl_perf_cpu, 0 },
|
||||
{ "Interrupt stress", gintctl_perf_interrupts, 0 },
|
||||
{ "Memory access speed", gintctl_perf_memory, 0 },
|
||||
{ "Rendering functions", gintctl_perf_render, 0 },
|
||||
|
|
403
src/perf/cpu.S
403
src/perf/cpu.S
|
@ -1,12 +1,12 @@
|
|||
# We put all the code in ILRAM to avoid measurement variations caused by code
|
||||
# being fetched from ROM. The ILRAM is ideal for this task because successive
|
||||
# instruction accesses take only 1 cycle (assuming no interference, which there
|
||||
# is none).
|
||||
/* We put all the code in ILRAM to avoid measurement variations caused by code
|
||||
being fetched from ROM. The ILRAM is ideal for this task because successive
|
||||
instruction accesses take only 1 cycle (assuming no interference, which
|
||||
there is none). */
|
||||
.section .ilram
|
||||
|
||||
# Test prologue for COUNT iterations (must be a multiple of 256). Note that the
|
||||
# prologue has an even number of instructions, which results in the loop code
|
||||
# being 4-aligned, which is of extreme importance.
|
||||
/* Test prologue for COUNT iterations (must be a multiple of 256). Note that
|
||||
the prologue has an even number of instructions, which results in the loop
|
||||
code being 4-aligned, which is of extreme importance. */
|
||||
#define PROLOGUE(COUNT) \
|
||||
mov #(COUNT/256), r0 ; \
|
||||
shll8 r0 ; \
|
||||
|
@ -15,7 +15,7 @@
|
|||
ldrc r0 ; \
|
||||
nop
|
||||
|
||||
# Test epilogue
|
||||
/* Test epilogue */
|
||||
#define EPILOGUE() \
|
||||
rts ; \
|
||||
nop
|
||||
|
@ -23,13 +23,16 @@
|
|||
|
||||
/* [Baseline]
|
||||
|
||||
In this first section, we want to establish an approximate cost of the
|
||||
setup, which consists of TMU access for libprof, function calls, and the
|
||||
loop setup for the DSP. */
|
||||
In this first section, we find an approximate cost of the setup, which
|
||||
consists of TMU access for libprof, function calls, and the loop setup for
|
||||
the DSP. This does not include any loop overhead (which is measured later).
|
||||
|
||||
This will often take 3~5 Pϕ/4 ticks, which is not a very precise measure,
|
||||
but helps eliminate noise around tests and bring cycle counts very
|
||||
close to multiples of the number of iterations. */
|
||||
|
||||
.global _perf_cpu_empty
|
||||
|
||||
# Empty setup (0 iterations), as baseline
|
||||
.align 4
|
||||
_perf_cpu_empty:
|
||||
PROLOGUE(0)
|
||||
|
@ -38,30 +41,33 @@ _perf_cpu_empty:
|
|||
|
||||
/* [Loop control]
|
||||
|
||||
In this section, we want to check whether the DSP repeat system has any
|
||||
added cost per-loop, and we do this by executing the same instructions with
|
||||
a varying number of DSP repeat jumps.
|
||||
Here we establish that the DSP repeat system has no added cost per-loop in
|
||||
favorable situations. That is, the loop is as efficient as if it were
|
||||
unrolled. This is checked by executing the same sequence of instructions
|
||||
with a varying number of DSP jumps between them.
|
||||
|
||||
The DSP jump has no additional cost, which makes testing much simpler by
|
||||
avoiding loop unrolls (that would otherwise be needed to amortize the cost
|
||||
of the jump). In addition, this allows for tighter loops in real-world
|
||||
programs, and tighter code fits better in cache. */
|
||||
The fact that the DSP jump has no additional cost is very beneficial for
|
||||
performance measurements, since it means that variations in the size and
|
||||
iteration count of tests have no influence on the results. (Such influence
|
||||
would otherwise need to be amortized by unrolling.)
|
||||
|
||||
The only observed difference is with the first test where the single
|
||||
instruction in the loop cannot be executed in parallel with itself in the
|
||||
next iteration. My guess is that the instruction from the next iteration is
|
||||
not fetched yet from the perspective of CPU logic. */
|
||||
|
||||
.global _perf_cpu_nop_2048x1
|
||||
.global _perf_cpu_nop_1024x2
|
||||
.global _perf_cpu_nop_512x4
|
||||
.global _perf_cpu_nop_256x8
|
||||
|
||||
# nop loop (2048 iterations of 1 nop) -> 2 cycles /i
|
||||
# Parallel execution likely cannot read through the DSP jump for architectural
|
||||
# reasons; my guess is that the next instruction isn't fetched yet.
|
||||
/* nop loop (2048 iterations of 1 nop) -> 2 cycles /i
   Labels 1 and 2 both sit on the same nop, so that single instruction is
   the whole loop body. */
|
||||
.align 4
|
||||
_perf_cpu_nop_2048x1:
|
||||
PROLOGUE(2048)
|
||||
1: 2: nop
|
||||
EPILOGUE()
|
||||
|
||||
# nop loop (1024 iterations of 2 nop) -> 1 cycle /i
|
||||
.global _perf_cpu_nop_1024x2
|
||||
|
||||
/* nop loop (1024 iterations of 2 nop) -> 1 cycle /i */
|
||||
.align 4
|
||||
_perf_cpu_nop_1024x2:
|
||||
PROLOGUE(1024)
|
||||
|
@ -69,7 +75,9 @@ _perf_cpu_nop_1024x2:
|
|||
2: nop
|
||||
EPILOGUE()
|
||||
|
||||
# nop loop (512 iterations of 4 nop) -> 2 cycles /i
|
||||
.global _perf_cpu_nop_512x4
|
||||
|
||||
/* nop loop (512 iterations of 4 nop) -> 2 cycles /i */
|
||||
.align 4
|
||||
_perf_cpu_nop_512x4:
|
||||
PROLOGUE(512)
|
||||
|
@ -79,7 +87,9 @@ _perf_cpu_nop_512x4:
|
|||
2: nop
|
||||
EPILOGUE()
|
||||
|
||||
# nop loop (256 iterations of 8 nop) -> 4 cycles/i
|
||||
.global _perf_cpu_nop_256x8
|
||||
|
||||
/* nop loop (256 iterations of 8 nop) -> 4 cycles /i */
|
||||
.align 4
|
||||
_perf_cpu_nop_256x8:
|
||||
PROLOGUE(256)
|
||||
|
@ -100,10 +110,8 @@ _perf_cpu_nop_256x8:
|
|||
pipelines with no extra cycles. */
|
||||
|
||||
.global _perf_cpu_EX_EX
|
||||
.global _perf_cpu_MT_MT
|
||||
.global _perf_cpu_LS_LS
|
||||
|
||||
# EX/EX (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
|
||||
/* EX/EX -> 2 cycles /i */
|
||||
.align 4
|
||||
_perf_cpu_EX_EX:
|
||||
PROLOGUE(1024)
|
||||
|
@ -111,7 +119,9 @@ _perf_cpu_EX_EX:
|
|||
2: add #0, r1
|
||||
EPILOGUE()
|
||||
|
||||
# MT/MT (1024 iterations of 2 parallel instructions) -> 1 cycle /i
|
||||
.global _perf_cpu_MT_MT
|
||||
|
||||
/* MT/MT -> 1 cycle /i */
|
||||
.align 4
|
||||
_perf_cpu_MT_MT:
|
||||
PROLOGUE(1024)
|
||||
|
@ -119,10 +129,337 @@ _perf_cpu_MT_MT:
|
|||
2: mov r2, r3
|
||||
EPILOGUE()
|
||||
|
||||
# LS/LS (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
|
||||
.global _perf_cpu_LS_LS
|
||||
|
||||
/* LS/LS -> 2 cycles /i
   Loop body (1024 iterations): two loads from @r15 into different registers
   (r0 then r1), so the second load does not use the result of the first. */
|
||||
.align 4
|
||||
_perf_cpu_LS_LS:
|
||||
PROLOGUE(1024)
|
||||
1: mov.l @r15, r0
|
||||
2: mov.l @r15, r1
|
||||
EPILOGUE()
|
||||
|
||||
/* [Aligned parallelism]
|
||||
|
||||
Here, we show that instruction pairs that are not aligned on 4-byte
|
||||
boundaries can nonetheless be parallelized. Having an instruction be
|
||||
executed alone because of a lack of parallel-executability with the next one
|
||||
does not prevent the next one from forming a parallel pair of its own with
|
||||
its successor. */
|
||||
|
||||
.global _perf_cpu_align_4
|
||||
|
||||
/* 2 pairs of parallel instructions -> 2 cycles /i
   Loop body (1024 iterations): add / mov.l / add / mov.l. Each add is
   directly followed by a load; since the loop starts 4-aligned (see the
   PROLOGUE comment) both add/mov.l pairs begin on a 4-byte boundary. */
|
||||
.align 4
|
||||
_perf_cpu_align_4:
|
||||
PROLOGUE(1024)
|
||||
1: add #0, r0
|
||||
mov.l @r15, r1
|
||||
add #0, r0
|
||||
2: mov.l @r15, r1
|
||||
EPILOGUE()
|
||||
|
||||
.global _perf_cpu_align_2
|
||||
|
||||
/* The add/mov.l pair in the middle is parallelized -> 3 cycles /i
   Loop body (1024 iterations): add / add / mov.l / mov.l. The only add/mov.l
   pair is made of the 2nd and 3rd instructions, i.e. it starts one
   instruction (2 bytes) after the 4-aligned start of the loop. */
|
||||
.align 4
|
||||
_perf_cpu_align_2:
|
||||
PROLOGUE(1024)
|
||||
1: add #0, r0
|
||||
add #0, r1
|
||||
mov.l @r15, r0
|
||||
2: mov.l @r15, r1
|
||||
EPILOGUE()
|
||||
|
||||
/* [Complex pipelines]
|
||||
|
||||
Here we measure the behavior of multi-cycle instructions that have complex
|
||||
pipelines. These tests establish that while mac.w occupies one pipeline for 2
|
||||
cycles, a series of nop can continue to run on the second pipeline.
|
||||
|
||||
Even though mac.w has 2 issue cycles and 4 execution cycles, in a sequence
|
||||
of mac.w each instruction will actually take 3 cycles. I believe this is
|
||||
because the WB/M2 stage of the second mac.w has a data dependency on the
|
||||
MS stage of the previous mac.w instruction, which causes a 1-cycle stall.
|
||||
This assumes that there is no forwarding at the output of the multiplier. */
|
||||
|
||||
.global _perf_cpu_pipeline_1
|
||||
|
||||
/* nop executes in parallel with first pipeline of mac.w -> 3 cycles /i
   Loop body (1024 iterations): one mac.w followed by one nop. Both mac.w
   pointers (r0, r1) start at r15 and are post-incremented by each mac.w, so
   the operands are read from the memory at and above the stack pointer.
   NOTE(review): presumably the operand values are irrelevant because only
   the timing is measured; MACH/MACL are modified and not restored. */
|
||||
.align 4
|
||||
_perf_cpu_pipeline_1:
|
||||
PROLOGUE(1024)
|
||||
mov r15, r0
|
||||
mov r15, r1
|
||||
|
||||
1: mac.w @r0+, @r1+
|
||||
2: nop
|
||||
EPILOGUE()
|
||||
|
||||
.global _perf_cpu_pipeline_2
|
||||
|
||||
/* Without parallel execution, still 3 cycles per mac.w -> 6 cycles /i
   Loop body (1024 iterations): two consecutive mac.w using the same
   post-incremented pointers r0 and r1, both initialized from r15. */
|
||||
.align 4
|
||||
_perf_cpu_pipeline_2:
|
||||
PROLOGUE(1024)
|
||||
mov r15, r0
|
||||
mov r15, r1
|
||||
|
||||
1: mac.w @r0+, @r1+
|
||||
2: mac.w @r0+, @r1+
|
||||
EPILOGUE()
|
||||
|
||||
.global _perf_cpu_pipeline_3
|
||||
|
||||
/* mac.w/(nop;nop;nop) then nop/nop -> 4 cycles /i
   Loop body (1024 iterations): one mac.w followed by five nop; r0 and r1
   are initialized from r15 and post-incremented by the mac.w. */
|
||||
.align 4
|
||||
_perf_cpu_pipeline_3:
|
||||
PROLOGUE(1024)
|
||||
mov r15, r0
|
||||
mov r15, r1
|
||||
|
||||
1: mac.w @r0+, @r1+
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
2: nop
|
||||
EPILOGUE()
|
||||
|
||||
/* [RAW dependencies]
|
||||
|
||||
In this section we establish the delay caused by RAW dependencies in
|
||||
arithmetic and memory access instructions. */
|
||||
|
||||
/* EX/EX with a RAW dependency (1024 iterations): both instructions are
   add #1 on r0, so each add reads the value written by the previous one. */
.global _perf_cpu_raw_EX_EX
|
||||
|
||||
.align 4
|
||||
_perf_cpu_raw_EX_EX:
|
||||
PROLOGUE(1024)
|
||||
1: add #1, r0
|
||||
2: add #1, r0
|
||||
EPILOGUE()
|
||||
|
||||
/* LS/LS with a RAW dependency (1024 iterations): r0 is loaded from @r4 and
   immediately stored back to @r4, so the store needs the loaded value.
   r4 is the address read from the .buffer literal. The nop after the setup
   presumably keeps the number of instructions before the loop even, so that
   the loop stays 4-aligned (see the PROLOGUE comment). */
.global _perf_cpu_raw_LS_LS
|
||||
|
||||
.align 4
|
||||
_perf_cpu_raw_LS_LS:
|
||||
PROLOGUE(1024)
|
||||
mov.l .buffer, r4
|
||||
nop
|
||||
|
||||
1: mov.l @r4, r0
|
||||
2: mov.l r0, @r4
|
||||
EPILOGUE()
|
||||
|
||||
/* EX/LS with a RAW dependency on the stored data (1024 iterations):
   add #1, r0 is followed by a store of r0 to @r4. r4 is the address read
   from the .buffer literal and r0 starts at 0. */
.global _perf_cpu_raw_EX_LS
|
||||
|
||||
.align 4
|
||||
_perf_cpu_raw_EX_LS:
|
||||
PROLOGUE(1024)
|
||||
mov.l .buffer, r4
|
||||
mov #0, r0
|
||||
|
||||
1: add #1, r0
|
||||
2: mov.l r0, @r4
|
||||
EPILOGUE()
|
||||
|
||||
/* LS/EX with a RAW dependency (1024 iterations): r0 is loaded from @r4 and
   the following add #1 operates on r0, so the add needs the loaded value.
   r4 is the address read from the .buffer literal. */
.global _perf_cpu_raw_LS_EX
|
||||
|
||||
.align 4
|
||||
_perf_cpu_raw_LS_EX:
|
||||
PROLOGUE(1024)
|
||||
mov.l .buffer, r4
|
||||
nop
|
||||
|
||||
1: mov.l @r4, r0
|
||||
2: add #1, r0
|
||||
EPILOGUE()
|
||||
|
||||
/* LS/LS without register dependency (1024 iterations): same load/store pair
   as _perf_cpu_raw_LS_LS, except that the store writes r1 instead of the
   freshly loaded r0. Both accesses still use the same address @r4. */
.global _perf_cpu_noraw_LS_LS
|
||||
|
||||
.align 4
|
||||
_perf_cpu_noraw_LS_LS:
|
||||
PROLOGUE(1024)
|
||||
mov.l .buffer, r4
|
||||
nop
|
||||
|
||||
1: mov.l @r4, r0
|
||||
2: mov.l r1, @r4
|
||||
EPILOGUE()
|
||||
|
||||
/* LS/EX without register dependency (1024 iterations): same pair as
   _perf_cpu_raw_LS_EX, except that the add operates on r1 instead of the
   freshly loaded r0. */
.global _perf_cpu_noraw_LS_EX
|
||||
|
||||
.align 4
|
||||
_perf_cpu_noraw_LS_EX:
|
||||
PROLOGUE(1024)
|
||||
mov.l .buffer, r4
|
||||
nop
|
||||
|
||||
1: mov.l @r4, r0
|
||||
2: add #1, r1
|
||||
EPILOGUE()
|
||||
|
||||
/* EX/LS with a RAW dependency on the address (1024 iterations): add #0, r4
   rewrites r4 (without changing its value) right before the store uses r4
   as its address. The stored data (r0) is not touched inside the loop. */
.global _perf_cpu_raw_EX_LS_addr
|
||||
|
||||
.align 4
|
||||
_perf_cpu_raw_EX_LS_addr:
|
||||
PROLOGUE(1024)
|
||||
mov.l .buffer, r4
|
||||
nop
|
||||
|
||||
1: add #0, r4
|
||||
2: mov.l r0, @r4
|
||||
EPILOGUE()
|
||||
|
||||
/* DSP load/store with a RAW dependency (512 iterations): movs.w loads x0
   from @r4 and the next movs.w stores x0 to @r5. r5 is a copy of r4, which
   is the address read from the .buffer literal. */
.global _perf_cpu_raw_DSPLS_DSPLS
|
||||
|
||||
.align 4
|
||||
_perf_cpu_raw_DSPLS_DSPLS:
|
||||
PROLOGUE(512)
|
||||
mov.l .buffer, r4
|
||||
mov r4, r5
|
||||
|
||||
1: movs.w @r4, x0
|
||||
2: movs.w x0, @r5
|
||||
EPILOGUE()
|
||||
|
||||
/* [Iteration weaving]
|
||||
|
||||
In this section we analyze how iterations can be woven and opened to improve
|
||||
performance by reducing RAW dependencies. */
|
||||
|
||||
/* Darken loop, one 32-bit word per iteration (512 iterations): load a word
   through r4 (post-increment), AND it with r2, shift it right by one bit and
   store it through r5. r5 starts 4 bytes before the buffer and is advanced
   by 4 before each store, so every word is written back where it was read.
   Here add #4, r5 sits between the and and the shlr.
   NOTE(review): r2 (the mask) is not initialized in this routine; presumably
   its value does not matter for the timing. */
.global _perf_cpu_darken_1
|
||||
|
||||
.align 4
|
||||
_perf_cpu_darken_1:
|
||||
PROLOGUE(512)
|
||||
mov.l .buffer, r4
|
||||
mov r4, r5
|
||||
add #-4, r5
|
||||
nop
|
||||
|
||||
1: mov.l @r4+, r1
|
||||
and r2, r1
|
||||
add #4, r5
|
||||
shlr r1
|
||||
2: mov.l r1, @r5
|
||||
EPILOGUE()
|
||||
|
||||
/* Same work as _perf_cpu_darken_1 (512 iterations, one word each); the only
   difference is that add #4, r5 is placed right after the load, i.e. between
   the load of r1 and the and that consumes r1.
   NOTE(review): r2 (the mask) is not initialized in this routine. */
.global _perf_cpu_darken_2
|
||||
|
||||
.align 4
|
||||
_perf_cpu_darken_2:
|
||||
PROLOGUE(512)
|
||||
mov.l .buffer, r4
|
||||
mov r4, r5
|
||||
add #-4, r5
|
||||
nop
|
||||
|
||||
1: mov.l @r4+, r1
|
||||
add #4, r5
|
||||
and r2, r1
|
||||
shlr r1
|
||||
2: mov.l r1, @r5
|
||||
EPILOGUE()
|
||||
|
||||
/* Darken loop handling two words per iteration (256 iterations). The second
   word (r3) is loaded before the first one (r1) is processed; r1 is then
   masked, shifted and stored at @r5, followed by r3 at @(4,r5). r5 starts
   8 bytes before the buffer and advances by 8 at the start of each iteration.
   NOTE(review): r2 (the mask) is not initialized in this routine. */
.global _perf_cpu_darken_3
|
||||
|
||||
.align 4
|
||||
_perf_cpu_darken_3:
|
||||
PROLOGUE(256)
|
||||
mov.l .buffer, r4
|
||||
mov r4, r5
|
||||
add #-8, r5
|
||||
nop
|
||||
|
||||
1: mov.l @r4+, r1
|
||||
add #8, r5
|
||||
mov.l @r4+, r3
|
||||
and r2, r1
|
||||
shlr r1
|
||||
mov.l r1, @r5
|
||||
and r2, r3
|
||||
shlr r3
|
||||
2: mov.l r3, @(4,r5)
|
||||
EPILOGUE()
|
||||
|
||||
/* Two words per iteration like _perf_cpu_darken_3 (256 iterations), but the
   first word is loaded into r1 before the loop. Each iteration loads r3,
   processes and stores r1, reloads r1 for the next iteration, then processes
   and stores r3.
   NOTE(review): this performs 1 + 2*256 = 513 word loads through r4, i.e.
   one word past a 512-word buffer such as cpu_perf_xram_buffer; confirm that
   this extra read is harmless. r2 (the mask) is not initialized here. */
.global _perf_cpu_darken_4
|
||||
|
||||
.align 4
|
||||
_perf_cpu_darken_4:
|
||||
PROLOGUE(256)
|
||||
mov.l .buffer, r4
|
||||
mov r4, r5
|
||||
add #-8, r5
|
||||
mov.l @r4+, r1
|
||||
|
||||
/* Loop starts with r1 loaded, finishes with r1 loaded */
|
||||
1: mov.l @r4+, r3
|
||||
add #8, r5
|
||||
and r2, r1
|
||||
shlr r1
|
||||
mov.l r1, @r5
|
||||
mov.l @r4+, r1
|
||||
and r2, r3
|
||||
shlr r3
|
||||
2: mov.l r3, @(4,r5)
|
||||
EPILOGUE()
|
||||
|
||||
/* [Advanced dependencies]
|
||||
|
||||
This section measures the delay needed to use registers depending on the
|
||||
type of instruction which modifies them. */
|
||||
|
||||
/* Two loads from the same address (1024 iterations): @r4 is read into r0
   and then into r1. r4 is the address read from the .buffer literal and is
   not modified by the loop. */
.global _perf_cpu_double_read
|
||||
|
||||
.align 4
|
||||
_perf_cpu_double_read:
|
||||
PROLOGUE(1024)
|
||||
mov.l .buffer, r4
|
||||
nop
|
||||
|
||||
1: mov.l @r4, r0
|
||||
2: mov.l @r4, r1
|
||||
EPILOGUE()
|
||||
|
||||
/* Two post-increment byte loads through the same pointer (1024 iterations):
   each mov.b @r4+ uses the r4 value updated by the previous one, and both
   write r0. r4 starts at the address read from the .buffer literal and
   advances by 2 bytes per iteration. */
.global _perf_cpu_double_incr_read
|
||||
|
||||
.align 4
|
||||
_perf_cpu_double_incr_read:
|
||||
PROLOGUE(1024)
|
||||
mov.l .buffer, r4
|
||||
nop
|
||||
|
||||
1: mov.b @r4+, r0
|
||||
2: mov.b @r4+, r0
|
||||
EPILOGUE()
|
||||
|
||||
/* [2D texture copy]
|
||||
|
||||
This section is used to investigate the performance of the 2D texture shader
|
||||
of azur. */
|
||||
|
||||
/* 32-bit DSP copy loop (512 iterations): movs.l loads x0 from @r3+ and the
   next movs.l stores x0 to @r5+. r3 is the address read from the .buffer2
   literal and r5 is a copy of it, so the loop reads and rewrites the same
   buffer; 512 iterations of 4 bytes cover 2048 bytes. */
.global _perf_cpu_tex2d
|
||||
|
||||
.align 4
|
||||
_perf_cpu_tex2d:
|
||||
PROLOGUE(512)
|
||||
mov.l .buffer2, r3
|
||||
mov r3, r5 /* r5 = r3; presumably replaces an earlier "mov.l .buffer, r5" */
|
||||
|
||||
1: movs.l @r3+, x0
|
||||
2: movs.l x0, @r5+
|
||||
EPILOGUE()
|
||||
|
||||
/* Buffer addresses loaded by the tests with "mov.l .buffer, rN" and
   "mov.l .buffer2, rN":
   .buffer   holds the address of cpu_perf_xram_buffer (the GXRAM array
             defined in cpu.c)
   .buffer2  holds the address of _buffer2, 2048 zero-initialized bytes
             placed in .data below */
|
||||
|
||||
.align 4
|
||||
.buffer:
|
||||
.long _cpu_perf_xram_buffer
|
||||
.buffer2:
|
||||
.long _buffer2
|
||||
|
||||
.section .data
|
||||
/* NOTE(review): there is no .align directive before _buffer2 although it is
   accessed with movs.l; confirm that its placement in .data guarantees
   4-byte alignment. */
_buffer2:
|
||||
.zero 2048
|
||||
|
|
187
src/perf/cpu.c
187
src/perf/cpu.c
|
@ -5,19 +5,16 @@
|
|||
#include <gintctl/perf.h>
|
||||
#include <gintctl/util.h>
|
||||
|
||||
#include <gintctl/widgets/gscreen.h>
|
||||
#include <gintctl/widgets/gtable.h>
|
||||
|
||||
#include <libprof.h>
|
||||
|
||||
/* Baseline */
|
||||
void perf_cpu_empty(void);
|
||||
/* Loop control */
|
||||
void perf_cpu_nop_2048x1(void);
|
||||
void perf_cpu_nop_1024x2(void);
|
||||
void perf_cpu_nop_512x4(void);
|
||||
void perf_cpu_nop_256x8(void);
|
||||
/* Parallel execution */
|
||||
void perf_cpu_EX_EX(void);
|
||||
void perf_cpu_MT_MT(void);
|
||||
void perf_cpu_LS_LS(void);
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
GXRAM uint32_t cpu_perf_xram_buffer[512];
|
||||
|
||||
/* Is subtracted from result times if specified; in TMU units (prof.elapsed) */
|
||||
static uint32_t baseline_ticks = 0;
|
||||
|
@ -39,14 +36,22 @@ uint32_t Iphi_cycles(void (*function)(void))
|
|||
|
||||
/* Number of CPU cycles per iteration; the number of iterations must obviously
|
||||
match assembler code for that test */
|
||||
float Iphi_per_iteration(void (*function)(void), int count)
|
||||
/* Iphi_cycles_per_iteration(): Round a cycle total to whole cycles/iteration

   @total  Total number of Iphi cycles measured for the test
   @count  Number of loop iterations executed by the test

   Returns the number of cycles per iteration when [total] is less than 128
   cycles away from a multiple of [count]; this slack absorbs setup cost and
   measurement noise. Returns -1 when [total] is too far from any multiple to
   be rounded, or when [count] is not positive. */
int Iphi_cycles_per_iteration(int total, int count)
{
	/* div() by zero is undefined; r_iter is zeroed until the tests have
	   been run, so a count of 0 can legitimately reach this function */
	if(count <= 0)
		return -1;

	div_t d = div(total, count);

	/* Slightly above a multiple of count: round down */
	if(d.rem < 128)
		return d.quot;
	/* Slightly below the next multiple: round up */
	if(d.rem > count - 128)
		return d.quot + 1;

	return -1;
}
|
||||
|
||||
/* Number of TMU cycles for an empty function */
|
||||
uint32_t TMU_baseline(void)
|
||||
{
|
||||
void perf_cpu_empty(void);
|
||||
prof_t perf = prof_make();
|
||||
|
||||
for(int i = 0; i < 16; i++)
|
||||
|
@ -61,62 +66,132 @@ uint32_t TMU_baseline(void)
|
|||
|
||||
//---
|
||||
|
||||
/* One int per test. The structure is also read as a flat array of int
   indexed by table row (see table_gen()), and its size divided by
   sizeof(int) gives the number of rows, so it must contain only int fields
   and their order must match the order of the names listed in table_gen(). */
struct results {
|
||||
int nop_2048x1, nop_1024x2, nop_512x4, nop_256x8;
|
||||
int EX_EX, MT_MT, LS_LS;
|
||||
int align_4, align_2;
|
||||
int pipeline_1, pipeline_2, pipeline_3;
|
||||
int raw_EX_EX, raw_LS_LS, raw_EX_LS, raw_LS_EX;
|
||||
int noraw_LS_LS, noraw_LS_EX;
|
||||
int raw_EX_LS_addr, raw_DSPLS_DSPLS;
|
||||
int darken_1, darken_2, darken_3, darken_4;
|
||||
int double_read, double_incr_read;
|
||||
int tex2d;
|
||||
};
|
||||
|
||||
/* Number of Iphi cycles total, and number of iterations */
|
||||
static struct results r_cycles, r_iter;
|
||||
|
||||
/* table_gen(): Row generator for the results table

   @t    Table requesting the row
   @row  Row index; selects the test name as well as the field of r_cycles
         and r_iter at the same position (struct results declares its
         fields in the same order as the names below)

   Provides four columns: test name, cycles per iteration ("-" when it cannot
   be determined), total Iphi cycles, and number of iterations. Rows outside
   of the name list are ignored. */
static void table_gen(gtable *t, int row)
{
	static char const * const names[] = {
		"Single nop", "2 nop", "4 nop", "8 nop",
		"EX/EX pair", "MT/MT pair", "LS/LS pair",
		"4-aligned parallel pair", "2-aligned parallel pair",
		"mac.w/nop pipeline", "mac.w/mac.w pipeline",
		"mac.w/nop*5 pipeline",
		"RAW dep.: EX/EX", "RAW dep.: LS/LS", "RAW dep.: EX/LS",
		"RAW dep.: LS/EX",
		"No dep.: LS/LS", "No dep.: LS/EX",
		"RAW on address: EX/LS",
		"RAW dep.: DSP-LS/DSP-LS",
		"32-bit VRAM darken #1", "32-bit VRAM darken #2",
		"Interwoven darken", "Interwoven open darken",
		"Double read", "Double increment read",
		"Texture2D shader",
	};
	int const name_count = sizeof names / sizeof names[0];

	/* Never index the names or the result structures out of bounds */
	if(row < 0 || row >= name_count)
		return;

	/* struct results only holds int fields, one per row */
	int cycles = ((int *)&r_cycles)[row];
	int iter = ((int *)&r_iter)[row];
	int cpi = Iphi_cycles_per_iteration(cycles, iter);

	char c2[16], c3[16], c4[16];
	snprintf(c2, sizeof c2, "%d", cpi);
	snprintf(c3, sizeof c3, "%d", cycles);
	snprintf(c4, sizeof c4, "%d", iter);

	gtable_provide(t, names[row], (cpi == -1 ? "-" : c2), c3, c4);
}
|
||||
|
||||
void gintctl_perf_cpu(void)
|
||||
{
|
||||
memset(&r_cycles, 0, sizeof r_cycles);
|
||||
memset(&r_iter, 0, sizeof r_iter);
|
||||
|
||||
gtable *table = gtable_create(4, table_gen, NULL, NULL);
|
||||
gtable_set_rows(table, sizeof r_cycles / sizeof(int));
|
||||
gtable_set_row_spacing(table, _(1,2));
|
||||
gtable_set_column_titles(table, "Name", "CPI", "Cycles", "Iter.");
|
||||
gtable_set_column_sizes(table, 6, 1, 2, 2);
|
||||
gtable_set_font(table, _(&font_mini, dfont_default()));
|
||||
jwidget_set_margin(table, 0, 2, 1, 2);
|
||||
|
||||
gscreen *scr = gscreen_create2("CPU parallelism", &img_opt_perf_cpu,
|
||||
"CPU instruction parallelism and pipelining", "@RUN;;;;;");
|
||||
gscreen_add_tabs(scr, table, table);
|
||||
jscene_set_focused_widget(scr->scene, table);
|
||||
|
||||
int key = 0;
|
||||
while(key != KEY_EXIT) {
|
||||
jevent e = jscene_run(scr->scene);
|
||||
|
||||
/* Measure baseline time */
|
||||
baseline_ticks = TMU_baseline();
|
||||
if(e.type == JSCENE_PAINT) {
|
||||
dclear(C_WHITE);
|
||||
jscene_render(scr->scene);
|
||||
dupdate();
|
||||
}
|
||||
|
||||
uint32_t Iphi_cpu_nop_2048x1 = 0;
|
||||
uint32_t Iphi_cpu_nop_1024x2 = 0;
|
||||
uint32_t Iphi_cpu_nop_512x4 = 0;
|
||||
uint32_t Iphi_cpu_nop_256x8 = 0;
|
||||
key = 0;
|
||||
if(e.type == JSCENE_KEY && e.key.type == KEYEV_DOWN)
|
||||
key = e.key.key;
|
||||
|
||||
uint32_t Iphi_cpu_EX_EX = 0;
|
||||
uint32_t Iphi_cpu_MT_MT = 0;
|
||||
uint32_t Iphi_cpu_LS_LS = 0;
|
||||
if(key == KEY_F1) {
|
||||
baseline_ticks = TMU_baseline();
|
||||
|
||||
while(key != KEY_EXIT)
|
||||
{
|
||||
dclear(C_WHITE);
|
||||
#define run(name, iter) { \
|
||||
extern void perf_cpu_ ## name (void); \
|
||||
r_cycles.name = Iphi_cycles(perf_cpu_ ## name); \
|
||||
r_iter.name = iter; \
|
||||
}
|
||||
|
||||
#ifdef FXCG50
|
||||
row_title("CPU instruction parallelism and pipelining");
|
||||
run(nop_2048x1, 2048);
|
||||
run(nop_1024x2, 1024);
|
||||
run(nop_512x4, 512);
|
||||
run(nop_256x8, 256);
|
||||
|
||||
row_print(1, 1, "Baseline ticks: %d",
|
||||
baseline_ticks);
|
||||
row_print(3, 1, "Iphi cycles for 2048x1 nop: %d",
|
||||
Iphi_cpu_nop_2048x1);
|
||||
row_print(4, 1, "Iphi cycles for 1024x2 nop: %d",
|
||||
Iphi_cpu_nop_1024x2);
|
||||
row_print(5, 1, "Iphi cycles for 512x4 nop: %d",
|
||||
Iphi_cpu_nop_512x4);
|
||||
row_print(6, 1, "Iphi cycles for 256x8 nop: %d",
|
||||
Iphi_cpu_nop_256x8);
|
||||
row_print(8, 1, "Iphi cycles for EX/EX: %d",
|
||||
Iphi_cpu_EX_EX);
|
||||
row_print(9, 1, "Iphi cycles for MT/MT: %d",
|
||||
Iphi_cpu_MT_MT);
|
||||
row_print(10, 1, "Iphi cycles for LS/LS: %d",
|
||||
Iphi_cpu_LS_LS);
|
||||
run(EX_EX, 1024);
|
||||
run(MT_MT, 1024);
|
||||
run(LS_LS, 1024);
|
||||
|
||||
fkey_button(1, "RUN");
|
||||
#endif
|
||||
run(align_4, 1024);
|
||||
run(align_2, 1024);
|
||||
|
||||
dupdate();
|
||||
key = getkey().key;
|
||||
run(pipeline_1, 1024);
|
||||
run(pipeline_2, 1024);
|
||||
run(pipeline_3, 1024);
|
||||
|
||||
if(key == KEY_F1)
|
||||
{
|
||||
Iphi_cpu_nop_2048x1 = Iphi_cycles(perf_cpu_nop_2048x1);
|
||||
Iphi_cpu_nop_1024x2 = Iphi_cycles(perf_cpu_nop_1024x2);
|
||||
Iphi_cpu_nop_512x4 = Iphi_cycles(perf_cpu_nop_512x4);
|
||||
Iphi_cpu_nop_256x8 = Iphi_cycles(perf_cpu_nop_256x8);
|
||||
run(raw_EX_EX, 1024);
|
||||
run(raw_LS_LS, 1024);
|
||||
run(raw_EX_LS, 1024);
|
||||
run(raw_LS_EX, 1024);
|
||||
run(noraw_LS_LS, 1024);
|
||||
run(noraw_LS_EX, 1024);
|
||||
run(raw_EX_LS_addr, 1024);
|
||||
run(raw_DSPLS_DSPLS, 512);
|
||||
|
||||
Iphi_cpu_EX_EX = Iphi_cycles(perf_cpu_EX_EX);
|
||||
Iphi_cpu_MT_MT = Iphi_cycles(perf_cpu_MT_MT);
|
||||
Iphi_cpu_LS_LS = Iphi_cycles(perf_cpu_LS_LS);
|
||||
run(darken_1, 512);
|
||||
run(darken_2, 512);
|
||||
run(darken_3, 256);
|
||||
run(darken_4, 256);
|
||||
|
||||
run(double_read, 1024);
|
||||
run(double_incr_read, 1024);
|
||||
|
||||
run(tex2d, 512);
|
||||
|
||||
table->widget.update = 1;
|
||||
}
|
||||
}
|
||||
|
||||
gscreen_destroy(scr);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue