From 12e78d2897252aaf1d3ccfaba16f79f43b9cfa7d Mon Sep 17 00:00:00 2001
From: Lephenixnoir <sebastien.michelland@protonmail.com>
Date: Thu, 5 Aug 2021 16:12:36 +0200
Subject: [PATCH] perf/cpu: add CPU pipeline/superscalar parallelism
 observations

---
 CMakeLists.txt         |   2 +
 include/gintctl/perf.h |   3 +
 src/gintctl.c          |   2 +
 src/perf/cpu.S         | 403 +++++++++++++++++++++++++++++++++++++----
 src/perf/cpu.c         | 187 +++++++++++++------
 5 files changed, 508 insertions(+), 89 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 524d8d9..8fc8fbd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,6 +44,8 @@ set(SOURCES
   src/mem/mem.c
   src/perf/cpucache.c
   src/perf/cpucache.S
+  src/perf/cpu.c
+  src/perf/cpu.S
   src/perf/interrupt.c
   src/perf/libprof.c
   src/perf/memory.c
diff --git a/include/gintctl/perf.h b/include/gintctl/perf.h
index d475730..1903044 100644
--- a/include/gintctl/perf.h
+++ b/include/gintctl/perf.h
@@ -11,6 +11,9 @@ void gintctl_perf_libprof(void);
 /* gintctl_perf_cpucache(): CPU speed and cache size */
 void gintctl_perf_cpucache(void);
 
+/* gintctl_perf_cpu(): CPU instruction parallelism and pipelining */
+void gintctl_perf_cpu(void);
+
 /* gintctl_perf_interrupts(): Interrupt handling */
 void gintctl_perf_interrupts(void);
 
diff --git a/src/gintctl.c b/src/gintctl.c
index ac44801..2a66104 100644
--- a/src/gintctl.c
+++ b/src/gintctl.c
@@ -65,6 +65,8 @@ struct menu menu_perf = {
 
 	{ "libprof basics",      gintctl_perf_libprof, 0 },
 	{ "CPU and cache",       gintctl_perf_cpucache, 0 },
+	{ _("CPU parallelism", "Superscalar and pipeline parallelism"),
+	                         gintctl_perf_cpu, 0 },
 	{ "Interrupt stress",    gintctl_perf_interrupts, 0 },
 	{ "Memory access speed", gintctl_perf_memory, 0 },
 	{ "Rendering functions", gintctl_perf_render, 0 },
diff --git a/src/perf/cpu.S b/src/perf/cpu.S
index a7afa6b..67a97db 100644
--- a/src/perf/cpu.S
+++ b/src/perf/cpu.S
@@ -1,12 +1,12 @@
-# We put all the code in ILRAM to avoid measurement variations caused by code
-# being fetched from ROM. The ILRAM is ideal for this task because successive
-# instruction accesses take only 1 cycle (assuming no interference, which there
-# is none).
+/* We put all the code in ILRAM to avoid measurement variations caused by code
+   being fetched from ROM. The ILRAM is ideal for this task because successive
+   instruction accesses take only 1 cycle (assuming no interference, which
+   there is none). */
 .section .ilram
 
-# Test prologue for COUNT iterations (must be a multiple of 256). Note that the
-# prologue has an even number of instructions, which results in the loop code
-# being 4-aligned, which is of extreme importance.
+/* Test prologue for COUNT iterations (must be a multiple of 256). Note that
+   the prologue has an even number of instructions, which results in the loop
+   code being 4-aligned, which is of extreme importance. */
 #define PROLOGUE(COUNT) 		\
 	mov	#(COUNT/256), r0 ;	\
 	shll8	r0 ;			\
@@ -15,7 +15,7 @@
 	ldrc	r0 ;			\
 	nop
 
-# Test epilogue
+/* Test epilogue */
 #define EPILOGUE()			\
 	rts ;				\
 	nop
@@ -23,13 +23,16 @@
 
 /* [Baseline]
 
-   In this first section, we want to establish an approximate cost of the
-   setup, which consists of TMU access for libprof, function calls, and the
-   loop setup for the DSP. */
+   In this first section, we find an approximate cost of the setup, which
+   consists of TMU access for libprof, function calls, and the loop setup for
+   the DSP. This does not include any loop overhead (which is measured later).
+
+   This will often take 3~5 Pϕ/4 ticks, which is not a very precise measure,
+   but helps eliminate noise around tests and bring cycle counts very close to
+   multiples of the number of iterations. */
 
 .global _perf_cpu_empty
 
-# Empty setup (0 iterations), as baseline
 .align 4
 _perf_cpu_empty:
 	PROLOGUE(0)
@@ -38,30 +41,33 @@ _perf_cpu_empty:
 
 /* [Loop control]
 
-   In this section, we want to check whether the DSP repeat system has any
-   added cost per-loop, and we do this by executing the same instructions with
-   a varying number of DSP repeat jumps.
+   Here we establish that the DSP repeat system has no added cost per-loop in
+   favorable situations. That is, the loop is as efficient as if it were
+   unrolled. This is checked by executing the same sequence of instructions
+   with a varying number of DSP jumps between them.
 
-   The DSP jump has no additional cost, which makes testing much simpler by
-   avoiding loop unrolls (that would otherwise be needed to amortize the cost
-   of the jump). In addition, this allows for tighter loops in real-world
-   programs, and tigher code first better in cache. */
+   The fact that the DSP jump has no additional cost is very beneficial for
+   performance measurements, since it means that variations in the size and
+   iteration count of tests have no influence on the results. (Such influence
+   would otherwise need to be amortized by unrolling.)
+
+   The only observed difference is with the first test where the single
+   instruction in the loop cannot be executed in parallel with itself in the
+   next iteration. My guess is that the instruction from the next iteration is
+   not fetched yet from the perspective of CPU logic. */
 
 .global _perf_cpu_nop_2048x1
-.global _perf_cpu_nop_1024x2
-.global _perf_cpu_nop_512x4
-.global _perf_cpu_nop_256x8
 
-# nop loop (2048 iterations of 1 nop) -> 2 cycles /i
-# Parallel execution likely cannot read through the DSP jump for architectural
-# reasons; my guess is that the next instruction isn't fetched yet.
+/* nop loop (2048 iterations of 1 nop) -> 2 cycles /i */
 .align 4
 _perf_cpu_nop_2048x1:
 	PROLOGUE(2048)
 1: 2: 	nop
 	EPILOGUE()
 
-# nop loop (1024 iterations of 2 nop) -> 1 cycle /i
+.global _perf_cpu_nop_1024x2
+
+/* nop loop (1024 iterations of 2 nop) -> 1 cycle /i */
 .align 4
 _perf_cpu_nop_1024x2:
 	PROLOGUE(1024)
@@ -69,7 +75,9 @@ _perf_cpu_nop_1024x2:
 2:	nop
 	EPILOGUE()
 
-# nop loop (512 iterations of 4 nop) -> 2 cycles /i
+.global _perf_cpu_nop_512x4
+
+/* nop loop (512 iterations of 4 nop) -> 2 cycles /i */
 .align 4
 _perf_cpu_nop_512x4:
 	PROLOGUE(512)
@@ -79,7 +87,9 @@ _perf_cpu_nop_512x4:
 2:	nop
 	EPILOGUE()
 
-# nop loop (256 iterations of 8 nop) -> 4 cycles/i
+.global _perf_cpu_nop_256x8
+
+/* nop loop (256 iterations of 8 nop) -> 4 cycles /i */
 .align 4
 _perf_cpu_nop_256x8:
 	PROLOGUE(256)
@@ -100,10 +110,8 @@ _perf_cpu_nop_256x8:
    pipelines with no extra cycles. */
 
 .global _perf_cpu_EX_EX
-.global _perf_cpu_MT_MT
-.global _perf_cpu_LS_LS
 
-# EX/EX (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
+/* EX/EX -> 2 cycles /i */
 .align 4
 _perf_cpu_EX_EX:
 	PROLOGUE(1024)
@@ -111,7 +119,9 @@ _perf_cpu_EX_EX:
 2:	add	#0, r1
 	EPILOGUE()
 
-# MT/MT (1024 iterations of 2 parallel instructions) -> 1 cycle /i
+.global _perf_cpu_MT_MT
+
+/* MT/MT -> 1 cycle /i */
 .align 4
 _perf_cpu_MT_MT:
 	PROLOGUE(1024)
@@ -119,10 +129,337 @@ _perf_cpu_MT_MT:
 2:	mov	r2, r3
 	EPILOGUE()
 
-# LS/LS (1024 iterations of 2 non-parallel instructions) -> 2 cycles /i
+.global _perf_cpu_LS_LS
+
+/* LS/LS -> 2 cycles /i */
 .align 4
 _perf_cpu_LS_LS:
 	PROLOGUE(1024)
 1:	mov.l	@r15, r0
 2:	mov.l	@r15, r1
 	EPILOGUE()
+
+/* [Aligned parallelism]
+
+   Here, we show that instruction pairs that are not aligned on 4-byte
+   boundaries can nonetheless be parallelized. Having an instruction be
+   executed alone because of a lack of parallel-executability with the next one
+   does not prevent the next one from forming a parallel pair of its own with
+   its successor. */
+
+.global _perf_cpu_align_4
+
+/* 2 pairs of parallel instructions -> 2 cycles /i */
+.align 4
+_perf_cpu_align_4:
+	PROLOGUE(1024)
+1:	add	#0, r0
+	mov.l	@r15, r1
+	add	#0, r0
+2:	mov.l	@r15, r1
+	EPILOGUE()
+
+.global _perf_cpu_align_2
+
+/* The add/mov.l pair in the middle is parallelized -> 3 cycles /i */
+.align 4
+_perf_cpu_align_2:
+	PROLOGUE(1024)
+1:	add	#0, r0
+	add	#0, r1
+	mov.l	@r15, r0
+2:	mov.l	@r15, r1
+	EPILOGUE()
+
+/* [Complex pipelines]
+
+   Here we measure the behavior of multi-cycle instructions that have complex
+   pipelines. These tests establish that while mac.w occupies one pipeline for 2
+   cycles, a series of nop can continue to run on the second pipeline.
+
+   Even though mac.w has 2 issue cycles and 4 execution cycles, in a sequence
+   of mac.w each instruction will actually take 3 cycles. I believe this is
+   because the WB/M2 stage of the second mac.w has a data dependency on the
+   MS stage of the previous mac.w instruction, which causes a 1-cycle stall.
+   This assumes that there is no forwarding at the output of the multiplier. */
+
+.global _perf_cpu_pipeline_1
+
+/* nop executes in parallel with first pipeline of mac.w -> 3 cycles /i */
+.align 4
+_perf_cpu_pipeline_1:
+	PROLOGUE(1024)
+	mov	r15, r0
+	mov	r15, r1
+
+1:	mac.w	@r0+, @r1+
+2:	nop
+	EPILOGUE()
+
+.global _perf_cpu_pipeline_2
+
+/* Without parallel execution, still 3 cycles per mac.w -> 6 cycles /i */
+.align 4
+_perf_cpu_pipeline_2:
+	PROLOGUE(1024)
+	mov	r15, r0
+	mov	r15, r1
+
+1:	mac.w	@r0+, @r1+
+2:	mac.w	@r0+, @r1+
+	EPILOGUE()
+
+.global _perf_cpu_pipeline_3
+
+/* mac.w/(nop;nop;nop) then nop/nop -> 4 cycles /i */
+.align 4
+_perf_cpu_pipeline_3:
+	PROLOGUE(1024)
+	mov	r15, r0
+	mov	r15, r1
+
+1:	mac.w	@r0+, @r1+
+	nop
+	nop
+	nop
+	nop
+2:	nop
+	EPILOGUE()
+
+/* [RAW dependencies]
+
+   In this section we establish the delay caused by RAW dependencies in
+   arithmetic and memory access instructions. */
+
+.global _perf_cpu_raw_EX_EX
+
+.align 4
+_perf_cpu_raw_EX_EX:
+	PROLOGUE(1024)
+1:	add	#1, r0	/* writes r0 */
+2:	add	#1, r0	/* reads the r0 written just above: RAW dependency */
+	EPILOGUE()
+
+.global _perf_cpu_raw_LS_LS
+
+.align 4
+_perf_cpu_raw_LS_LS:
+	PROLOGUE(1024)
+	mov.l	.buffer, r4	/* r4 = address of cpu_perf_xram_buffer */
+	nop	/* presumably keeps the setup at an even instruction count */
+
+1:	mov.l	@r4, r0	/* load writes r0 */
+2:	mov.l	r0, @r4	/* store reads that r0 as its data: RAW dependency */
+	EPILOGUE()
+
+.global _perf_cpu_raw_EX_LS
+
+.align 4
+_perf_cpu_raw_EX_LS:
+	PROLOGUE(1024)
+	mov.l	.buffer, r4	/* r4 = address of cpu_perf_xram_buffer */
+	mov	#0, r0	/* start the counter at 0 */
+
+1:	add	#1, r0	/* arithmetic instruction writes r0 */
+2:	mov.l	r0, @r4	/* store reads that r0 as its data: RAW dependency */
+	EPILOGUE()
+
+.global _perf_cpu_raw_LS_EX
+
+.align 4
+_perf_cpu_raw_LS_EX:
+	PROLOGUE(1024)
+	mov.l	.buffer, r4	/* r4 = address of cpu_perf_xram_buffer */
+	nop	/* presumably keeps the setup at an even instruction count */
+
+1:	mov.l	@r4, r0	/* load writes r0 */
+2:	add	#1, r0	/* arithmetic on the r0 just loaded: RAW dependency */
+	EPILOGUE()
+
+.global _perf_cpu_noraw_LS_LS
+
+.align 4
+_perf_cpu_noraw_LS_LS:
+	PROLOGUE(1024)
+	mov.l	.buffer, r4	/* r4 = address of cpu_perf_xram_buffer */
+	nop	/* presumably keeps the setup at an even instruction count */
+
+1:	mov.l	@r4, r0	/* load writes r0 */
+2:	mov.l	r1, @r4	/* store takes its data from r1, not the loaded r0 */
+	EPILOGUE()
+
+.global _perf_cpu_noraw_LS_EX
+
+.align 4
+_perf_cpu_noraw_LS_EX:
+	PROLOGUE(1024)
+	mov.l	.buffer, r4	/* r4 = address of cpu_perf_xram_buffer */
+	nop	/* presumably keeps the setup at an even instruction count */
+
+1:	mov.l	@r4, r0	/* load writes r0 */
+2:	add	#1, r1	/* arithmetic on r1, independent of the loaded r0 */
+	EPILOGUE()
+
+.global _perf_cpu_raw_EX_LS_addr
+
+.align 4
+_perf_cpu_raw_EX_LS_addr:
+	PROLOGUE(1024)
+	mov.l	.buffer, r4	/* r4 = address of cpu_perf_xram_buffer */
+	nop	/* presumably keeps the setup at an even instruction count */
+
+1:	add	#0, r4	/* writes r4 (value unchanged, since it adds 0) */
+2:	mov.l	r0, @r4	/* store uses that r4 as its address: RAW on the address */
+	EPILOGUE()
+
+.global _perf_cpu_raw_DSPLS_DSPLS
+
+.align 4
+_perf_cpu_raw_DSPLS_DSPLS:
+	PROLOGUE(512)
+	mov.l	.buffer, r4	/* r4 = address of cpu_perf_xram_buffer */
+	mov	r4, r5	/* r5 = same address, used for the store */
+
+1:	movs.w	@r4, x0	/* DSP load writes x0 */
+2:	movs.w	x0, @r5	/* DSP store reads that x0: RAW dependency */
+	EPILOGUE()
+
+/* [Iteration weaving]
+
+   In this section we analyze how iterations can be woven and opened to improve
+   performance by reducing RAW dependencies. */
+
+.global _perf_cpu_darken_1
+
+.align 4
+_perf_cpu_darken_1:
+	PROLOGUE(512)
+	mov.l	.buffer, r4	/* r4 = read pointer into cpu_perf_xram_buffer */
+	mov	r4, r5
+	add	#-4, r5	/* r5 = write pointer, 4 behind since the loop adds 4 before storing */
+	nop	/* presumably keeps the setup at an even instruction count */
+
+1:	mov.l	@r4+, r1	/* load one longword, advance the read pointer */
+	and	r2, r1	/* mask with r2; uses the r1 just loaded */
+	add	#4, r5	/* advance the write pointer to the word just read */
+	shlr	r1	/* shift right by one bit */
+2:	mov.l	r1, @r5	/* store the result back where it was loaded from */
+	EPILOGUE()
+
+.global _perf_cpu_darken_2
+
+.align 4
+_perf_cpu_darken_2:
+	PROLOGUE(512)
+	mov.l	.buffer, r4	/* r4 = read pointer into cpu_perf_xram_buffer */
+	mov	r4, r5
+	add	#-4, r5	/* r5 = write pointer, 4 behind since the loop adds 4 before storing */
+	nop	/* presumably keeps the setup at an even instruction count */
+
+1:	mov.l	@r4+, r1	/* load one longword, advance the read pointer */
+	add	#4, r5	/* same as darken_1, but moved between the load and its first use */
+	and	r2, r1	/* mask with r2 */
+	shlr	r1	/* shift right by one bit */
+2:	mov.l	r1, @r5	/* store the result back where it was loaded from */
+	EPILOGUE()
+
+.global _perf_cpu_darken_3
+
+.align 4
+_perf_cpu_darken_3:
+	PROLOGUE(256)
+	mov.l	.buffer, r4	/* r4 = read pointer into cpu_perf_xram_buffer */
+	mov	r4, r5
+	add	#-8, r5	/* r5 = write pointer, 8 behind since the loop adds 8 before storing */
+	nop	/* presumably keeps the setup at an even instruction count */
+
+1:	mov.l	@r4+, r1	/* first longword of the pair */
+	add	#8, r5	/* write pointer now addresses the pair being processed */
+	mov.l	@r4+, r3	/* second longword of the pair */
+	and	r2, r1	/* process the first word: mask ... */
+	shlr	r1	/* ... then shift right by one bit */
+	mov.l	r1, @r5	/* store the first result */
+	and	r2, r3	/* process the second word the same way */
+	shlr	r3
+2:	mov.l	r3, @(4,r5)	/* store the second result right after the first */
+	EPILOGUE()
+
+.global _perf_cpu_darken_4
+
+.align 4
+_perf_cpu_darken_4:
+	PROLOGUE(256)
+	mov.l	.buffer, r4	/* r4 = read pointer into cpu_perf_xram_buffer */
+	mov	r4, r5
+	add	#-8, r5	/* r5 = write pointer, 8 behind since the loop adds 8 before storing */
+	mov.l	@r4+, r1	/* preload the first word so the loop starts with r1 ready */
+
+	/* Loop starts with r1 loaded, finishes with r1 loaded */
+1:	mov.l	@r4+, r3	/* second longword of the current pair */
+	add	#8, r5	/* write pointer now addresses the pair being processed */
+	and	r2, r1	/* process the word loaded before this iteration began */
+	shlr	r1
+	mov.l	r1, @r5	/* store the first result */
+	mov.l	@r4+, r1	/* load r1 for the next iteration. NOTE(review): in the last iteration this reads buf[512], past the 512-entry buffer */
+	and	r2, r3	/* process the second word */
+	shlr	r3
+2:	mov.l	r3, @(4,r5)	/* store the second result right after the first */
+	EPILOGUE()
+
+/* [Advanced dependencies]
+
+   This section measures the delay needed to use registers depending on the
+   type of instruction which modifies them. */
+
+.global _perf_cpu_double_read
+
+.align 4
+_perf_cpu_double_read:
+	PROLOGUE(1024)
+	mov.l	.buffer, r4	/* r4 = address of cpu_perf_xram_buffer */
+	nop	/* presumably keeps the setup at an even instruction count */
+
+1:	mov.l	@r4, r0	/* read through r4 into r0 */
+2:	mov.l	@r4, r1	/* read the same address again, into a different register */
+	EPILOGUE()
+
+.global _perf_cpu_double_incr_read
+
+.align 4
+_perf_cpu_double_incr_read:
+	PROLOGUE(1024)
+	mov.l	.buffer, r4	/* r4 = address of cpu_perf_xram_buffer */
+	nop	/* presumably keeps the setup at an even instruction count */
+
+1:	mov.b	@r4+, r0	/* byte read; post-increment writes r4 */
+2:	mov.b	@r4+, r0	/* addresses through the r4 updated just above, and updates it again */
+	EPILOGUE()
+
+/* [2D texture copy]
+
+   This section is used to investigate the performance of the 2D texture shader
+   of azur. */
+
+.global _perf_cpu_tex2d
+
+.align 4
+_perf_cpu_tex2d:
+	PROLOGUE(512)
+	mov.l	.buffer2, r3	/* r3 = address of _buffer2 (read pointer) */
+	mov	r3, r5 /*.buffer, r5 */
+
+1:	movs.l	@r3+, x0	/* DSP load of one longword into x0, advance r3 */
+2:	movs.l	x0, @r5+	/* DSP store of that x0, advance r5; r5 starts equal to r3 */
+	EPILOGUE()
+
+/* XRAM buffer */
+
+.align 4
+.buffer:
+	.long	_cpu_perf_xram_buffer
+.buffer2:
+	.long	_buffer2
+
+.section .data
+_buffer2:
+	.zero	2048
diff --git a/src/perf/cpu.c b/src/perf/cpu.c
index aa7e323..c7565b7 100644
--- a/src/perf/cpu.c
+++ b/src/perf/cpu.c
@@ -5,19 +5,16 @@
 #include <gintctl/perf.h>
 #include <gintctl/util.h>
 
+#include <gintctl/widgets/gscreen.h>
+#include <gintctl/widgets/gtable.h>
+
 #include <libprof.h>
 
-/* Baseline */
-void perf_cpu_empty(void);
-/* Loop control */
-void perf_cpu_nop_2048x1(void);
-void perf_cpu_nop_1024x2(void);
-void perf_cpu_nop_512x4(void);
-void perf_cpu_nop_256x8(void);
-/* Parallel execution */
-void perf_cpu_EX_EX(void);
-void perf_cpu_MT_MT(void);
-void perf_cpu_LS_LS(void);
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+GXRAM uint32_t cpu_perf_xram_buffer[512];
 
 /* Is subtracted from result times if specified; in TMU units (prof.elapsed) */
 static uint32_t baseline_ticks = 0;
@@ -39,14 +36,22 @@ uint32_t Iphi_cycles(void (*function)(void))
 
 /* Number of CPU cycles per iteration; the number of iterations must obviously
    match assembler code for that test */
-float Iphi_per_iteration(void (*function)(void), int count)
+int Iphi_cycles_per_iteration(int total, int count)
 {
-	return (float)Iphi_cycles(function) / count;
+	/* No measurement yet (count == 0): report "unknown", never div() by 0 */
+	if(count <= 0) return -1;
+	div_t d = div(total, count);
+	if(d.rem < 128) /* slightly above a multiple of count: round down */
+		return d.quot;
+	if(d.rem > count - 128) /* slightly below the next multiple: round up */
+		return d.quot + 1;
+	return -1; /* too far from any multiple to give an integer CPI */
 }
 
 /* Number of TMU cycles for an empty function */
 uint32_t TMU_baseline(void)
 {
+	void perf_cpu_empty(void);
 	prof_t perf = prof_make();
 
 	for(int i = 0; i < 16; i++)
@@ -61,62 +66,132 @@ uint32_t TMU_baseline(void)
 
 //---
 
+struct results {	/* one int per test; field order is the table row order */
+	int nop_2048x1, nop_1024x2, nop_512x4, nop_256x8;
+	int EX_EX, MT_MT, LS_LS;
+	int align_4, align_2;
+	int pipeline_1, pipeline_2, pipeline_3;
+	int raw_EX_EX, raw_LS_LS, raw_EX_LS, raw_LS_EX;
+	int noraw_LS_LS, noraw_LS_EX;
+	int raw_EX_LS_addr, raw_DSPLS_DSPLS;
+	int darken_1, darken_2, darken_3, darken_4;
+	int double_read, double_incr_read;
+	int tex2d;
+};	/* table_gen() indexes this as an int array: keep every member an int */
+
+/* Number of Iphi cycles total, and number of iterations */
+static struct results r_cycles, r_iter;
+
+static void table_gen(gtable *t, int row)
+{	/* Row generator given to gtable_create(): provides the 4 cells of row `row` */
+	static char const *names[] = {	/* must follow the field order of struct results */
+		"Single nop", "2 nop", "4 nop", "8 nop",
+		"EX/EX pair", "MT/MT pair", "LS/LS pair",
+		"4-aligned parallel pair", "2-aligned parallel pair",
+		"mac.w/nop pipeline", "mac.w/mac.w pipeline",
+		  "mac.w/nop*5 pipeline",
+		"RAW dep.: EX/EX", "RAW dep.: LS/LS", "RAW dep.: EX/LS",
+		  "RAW dep.: LS/EX",
+		  "No dep.: LS/LS", "No dep.: LS/EX",
+		  "RAW on address: EX/LS",
+		  "RAW dep.: DSP-LS/DSP-LS",
+		"32-bit VRAM darken #1", "32-bit VRAM darken #2",
+		  "Interwoven darken", "Interwoven open darken",
+		"Double read", "Double increment read",
+		"Texture2D shader",
+	};
+
+	int cycles = ((int *)&r_cycles)[row];	/* struct results is all int: index it by row */
+	int iter = ((int *)&r_iter)[row];
+	int cpi = Iphi_cycles_per_iteration(cycles, iter);	/* -1 when not close to an integer */
+
+	char c2[16], c3[16], c4[16];	/* text for the CPI, Cycles and Iter. columns */
+	sprintf(c2, "%d", cpi);
+	sprintf(c3, "%d", cycles);
+	sprintf(c4, "%d", iter);
+
+	gtable_provide(t, names[row], (cpi == -1 ? "-" : c2), c3, c4);	/* "-" replaces an unknown CPI */
+}
+
 void gintctl_perf_cpu(void)
 {
+	memset(&r_cycles, 0, sizeof r_cycles);
+	memset(&r_iter, 0, sizeof r_iter);
+
+	gtable *table = gtable_create(4, table_gen, NULL, NULL);
+	gtable_set_rows(table, sizeof r_cycles / sizeof(int));
+	gtable_set_row_spacing(table, _(1,2));
+	gtable_set_column_titles(table, "Name", "CPI", "Cycles", "Iter.");
+	gtable_set_column_sizes(table, 6, 1, 2, 2);
+	gtable_set_font(table, _(&font_mini, dfont_default()));
+	jwidget_set_margin(table, 0, 2, 1, 2);
+
+	gscreen *scr = gscreen_create2("CPU parallelism", &img_opt_perf_cpu,
+		"CPU instruction parallelism and pipelining", "@RUN;;;;;");
+	gscreen_add_tabs(scr, table, table);
+	jscene_set_focused_widget(scr->scene, table);
+
 	int key = 0;
+	while(key != KEY_EXIT) {
+		jevent e = jscene_run(scr->scene);
 
-	/* Measure baseline time */
-	baseline_ticks = TMU_baseline();
+		if(e.type == JSCENE_PAINT) {
+			dclear(C_WHITE);
+			jscene_render(scr->scene);
+			dupdate();
+		}
 
-	uint32_t Iphi_cpu_nop_2048x1 = 0;
-	uint32_t Iphi_cpu_nop_1024x2 = 0;
-	uint32_t Iphi_cpu_nop_512x4 = 0;
-	uint32_t Iphi_cpu_nop_256x8 = 0;
+		key = 0;
+		if(e.type == JSCENE_KEY && e.key.type == KEYEV_DOWN)
+			key = e.key.key;
 
-	uint32_t Iphi_cpu_EX_EX = 0;
-	uint32_t Iphi_cpu_MT_MT = 0;
-	uint32_t Iphi_cpu_LS_LS = 0;
+		if(key == KEY_F1) {
+			baseline_ticks = TMU_baseline();
 
-	while(key != KEY_EXIT)
-	{
-		dclear(C_WHITE);
+			#define run(name, iter) {								\
+				extern void perf_cpu_ ## name (void);				\
+				r_cycles.name = Iphi_cycles(perf_cpu_ ## name);		\
+				r_iter.name = iter;									\
+			}
 
-		#ifdef FXCG50
-		row_title("CPU instruction parallelism and pipelining");
+			run(nop_2048x1, 2048);
+			run(nop_1024x2, 1024);
+			run(nop_512x4, 512);
+			run(nop_256x8, 256);
 
-		row_print(1, 1, "Baseline ticks: %d",
-			baseline_ticks);
-		row_print(3, 1, "Iphi cycles for 2048x1 nop: %d",
-			Iphi_cpu_nop_2048x1);
-		row_print(4, 1, "Iphi cycles for 1024x2 nop: %d",
-			Iphi_cpu_nop_1024x2);
-		row_print(5, 1, "Iphi cycles for 512x4 nop: %d",
-			Iphi_cpu_nop_512x4);
-		row_print(6, 1, "Iphi cycles for 256x8 nop: %d",
-			Iphi_cpu_nop_256x8);
-		row_print(8, 1, "Iphi cycles for EX/EX: %d",
-			Iphi_cpu_EX_EX);
-		row_print(9, 1, "Iphi cycles for MT/MT: %d",
-			Iphi_cpu_MT_MT);
-		row_print(10, 1, "Iphi cycles for LS/LS: %d",
-			Iphi_cpu_LS_LS);
+			run(EX_EX, 1024);
+			run(MT_MT, 1024);
+			run(LS_LS, 1024);
 
-		fkey_button(1, "RUN");
-		#endif
+			run(align_4, 1024);
+			run(align_2, 1024);
 
-		dupdate();
-		key = getkey().key;
+			run(pipeline_1, 1024);
+			run(pipeline_2, 1024);
+			run(pipeline_3, 1024);
 
-		if(key == KEY_F1)
-		{
-			Iphi_cpu_nop_2048x1 = Iphi_cycles(perf_cpu_nop_2048x1);
-			Iphi_cpu_nop_1024x2 = Iphi_cycles(perf_cpu_nop_1024x2);
-			Iphi_cpu_nop_512x4  = Iphi_cycles(perf_cpu_nop_512x4);
-			Iphi_cpu_nop_256x8  = Iphi_cycles(perf_cpu_nop_256x8);
+			run(raw_EX_EX, 1024);
+			run(raw_LS_LS, 1024);
+			run(raw_EX_LS, 1024);
+			run(raw_LS_EX, 1024);
+			run(noraw_LS_LS, 1024);
+			run(noraw_LS_EX, 1024);
+			run(raw_EX_LS_addr, 1024);
+			run(raw_DSPLS_DSPLS, 512);
 
-			Iphi_cpu_EX_EX = Iphi_cycles(perf_cpu_EX_EX);
-			Iphi_cpu_MT_MT = Iphi_cycles(perf_cpu_MT_MT);
-			Iphi_cpu_LS_LS = Iphi_cycles(perf_cpu_LS_LS);
+			run(darken_1, 512);
+			run(darken_2, 512);
+			run(darken_3, 256);
+			run(darken_4, 256);
+
+			run(double_read, 1024);
+			run(double_incr_read, 1024);
+
+			run(tex2d, 512);
+
+			table->widget.update = 1;
 		}
 	}
+
+	gscreen_destroy(scr);
 }