From 911cc8e5ace8acc7f9bfead3299061e93037a114 Mon Sep 17 00:00:00 2001
From: Lephe <sebastien.michelland@protonmail.com>
Date: Tue, 28 Sep 2021 14:29:09 +0200
Subject: [PATCH] azur: documentation and optimization on rendering

---
 CMakeLists.txt                  |   4 +-
 azur/include/azur/gint/render.h |  16 ++--
 azur/src/gint/render.c          |  78 +++++++++---------
 azur/src/gint/shaders/clear.S   |  15 ++++
 azur/src/gint/shaders/clear.c   |   2 +-
 azur/src/gint/shaders/image.S   | 137 +++++++++++++++++++++++++-------
 azur/src/gint/shaders/image.c   |  15 ++--
 7 files changed, 182 insertions(+), 85 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b5c58d9..c0e7610 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ if("${FXSDK_PLATFORM_LONG}" STREQUAL fxCG50)
 endif()
 
 # General options
-add_compile_options(-Wall -Wextra -O2
+add_compile_options(-Wall -Wextra -O3
   -fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=)
 
 set(CMAKE_C_STANDARD 11)
@@ -60,7 +60,7 @@ if(FACETS_PLATFORM STREQUAL emscripten)
 
   set(PORTS -sUSE_SDL=2 -sUSE_SDL_IMAGE=2 -sSDL2_IMAGE_FORMATS=["png"])
   add_compile_options(${PORTS})
-  add_link_options(${PORTS} -O2)
+  add_link_options(${PORTS} -O3)
 endif()
 
 #---
diff --git a/azur/include/azur/gint/render.h b/azur/include/azur/gint/render.h
index 9933b3e..849c5c3 100644
--- a/azur/include/azur/gint/render.h
+++ b/azur/include/azur/gint/render.h
@@ -49,7 +49,7 @@ extern uint16_t azrp_frag[];
 
 /* Maximum number of commands that can be queued. (This is only one of two
    limits, the other being the size of the command data.) */
-#define AZRP_MAX_COMMANDS 512
+#define AZRP_MAX_COMMANDS 256
 
 /* Maximum number of shaders that can be defined. (This is a loose limit). */
 #define AZRP_MAX_SHADERS 32
@@ -212,8 +212,8 @@ extern prof_t azrp_perf_shaders;
 /* This counter runs during CPU transfers to the R61524 display. */
 extern prof_t azrp_perf_r61524;
 
-/* This counter runs during the whole azrp_update() operation; it is the sum of
-   sort, shaders, r61524, plus some logic overhead. */
+/* This counter runs during rendering; it is the sum of shaders and r61524,
+   plus some logic overhead. */
 extern prof_t azrp_perf_render;
 
 /* azrp_perf_clear(): Clear all performance counters
@@ -243,12 +243,10 @@ void azrp_set_uniforms(int shader_id, void *uniforms);
 
 /* azrp_queue_command(): Add a new command to be rendered next frame
 
-   The command must be a structure starting with an 8-bit shader ID and an
-   8-bit fragment ID.
-
-   Returns true on success, false if the maximum amount of commands or command
-   memory is exceeded. */
-bool azrp_queue_command(void *command, size_t size);
+   The command must be a structure starting with an 8-bit shader ID; fragment
+   is the index of the target fragment. Returns true on success, false if the
+   maximum amount of commands or command memory is exceeded. */
+bool azrp_queue_command(void *command, size_t size, int fragment);
 
 //---
 // Internal shader definitions (for reference; no API guarantee)
diff --git a/azur/src/gint/render.c b/azur/src/gint/render.c
index 457ebc9..9f7c19f 100644
--- a/azur/src/gint/render.c
+++ b/azur/src/gint/render.c
@@ -8,7 +8,8 @@
 
 #define YRAM ((void *)0xe5017000)
 
-/* 8 rows of video memory, occupying 6338/8192 bytes of XRAM. */
+/* 8 rows of video memory, occupying 6336/8192 bytes of XRAM.
+   TODO: Extend this to 16 rows, and move the rest to RAM */
 GXRAM GALIGNED(32) uint16_t azrp_frag[DWIDTH * 8];
 
 /* Super-scaling factor, width and height of output. */
@@ -26,11 +27,11 @@ int azrp_frag_height;
 GXRAM int commands_count = 0, commands_length = 0;
 
 /* Array of pointers to queued commands (stored as an offset into YRAM). */
-GXRAM uint16_t commands_array[AZRP_MAX_COMMANDS];
+GXRAM uint32_t commands_array[AZRP_MAX_COMMANDS];
 
 /* Array of shader programs and uniforms. */
-static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
-static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
+GXRAM static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
+GXRAM static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
 
 /* Next free index in the shader program array. */
 GXRAM static uint16_t shaders_next = 0;
@@ -54,31 +55,25 @@ void azrp_clear_commands(void)
 
 /* Custom quick sort for commands */
 
-static inline int compare(int8_t *c1, int8_t *c2)
-{
-    int d = c1[1] - c2[1];
-    return (d ? d : c1 - c2);
-}
-
 static void cmdsort(int low, int high)
 {
     if(low >= high) return;
 
-    int8_t *pivot = YRAM + commands_array[(low + high) >> 1];
+    uint32_t pivot = commands_array[(low + high) >> 1];
 
     int i = low - 1;
     int j = high + 1;
 
     while(1) {
         do i++;
-        while(compare(YRAM + commands_array[i], pivot) < 0);
+        while(commands_array[i] < pivot);
 
         do j--;
-        while(compare(YRAM + commands_array[j], pivot) > 0);
+        while(commands_array[j] > pivot);
 
         if(i >= j) break;
 
-        uint16_t tmp = commands_array[i];
+        uint32_t tmp = commands_array[i];
         commands_array[i] = commands_array[j];
         commands_array[j] = tmp;
     }
@@ -89,44 +84,48 @@ static void cmdsort(int low, int high)
 
 void azrp_sort_commands(void)
 {
-    prof_enter(azrp_perf_sort);
+    prof_enter_norec(azrp_perf_sort);
     cmdsort(0, commands_count - 1);
-    prof_leave(azrp_perf_sort);
+    prof_leave_norec(azrp_perf_sort);
 }
 
+int azrp_commands_total;
+
 void azrp_render_fragments(void)
 {
-    prof_enter(azrp_perf_render);
+    prof_enter_norec(azrp_perf_render);
+
+    azrp_commands_total = 0;
 
     int i = 0;
     int frag = 0;
+    uint32_t next_frag_threshold = (frag + 1) << 16;
+    uint32_t cmd = commands_array[i];
 
-    uint8_t *cmd = (uint8_t *)YRAM + commands_array[i];
-
-    prof_enter(azrp_perf_r61524);
+    prof_enter_norec(azrp_perf_r61524);
     r61524_start_frame(0, 244);
-    prof_leave(azrp_perf_r61524);
+    prof_leave_norec(azrp_perf_r61524);
 
     while(1) {
-        if(cmd[1] == frag) {
-            if(shaders[cmd[0]]) {
-                prof_enter(azrp_perf_shaders);
-                shaders[cmd[0]](shader_uniforms[cmd[0]], cmd, azrp_frag);
-                prof_leave(azrp_perf_shaders);
-            }
-            cmd = YRAM + commands_array[++i];
-        }
-        else {
-            prof_enter(azrp_perf_r61524);
-            /* TODO: Consider xram_frame() by DMA in parallel? */
-            xram_frame(azrp_frag, 396 * 8);
-            prof_leave(azrp_perf_r61524);
-            frag++;
-            if(frag >= azrp_frag_count) break;
+        while(cmd < next_frag_threshold && i < commands_count) {
+            azrp_commands_total++;
+            uint8_t *data = (uint8_t *)YRAM + (cmd & 0xffff);
+            prof_enter_norec(azrp_perf_shaders);
+            shaders[data[0]](shader_uniforms[data[0]], data, azrp_frag);
+            prof_leave_norec(azrp_perf_shaders);
+            cmd = commands_array[++i];
         }
+
+        /* TODO: Consider xram_frame() by DMA in parallel? */
+        prof_enter_norec(azrp_perf_r61524);
+        xram_frame(azrp_frag, 396 * 8);
+        prof_leave_norec(azrp_perf_r61524);
+
+        if(++frag >= azrp_frag_count) break;
+        next_frag_threshold += (1 << 16);
     }
 
-    prof_leave(azrp_perf_render);
+    prof_leave_norec(azrp_perf_render);
 }
 
 void azrp_update(void)
@@ -210,7 +209,7 @@ void azrp_set_uniforms(int shader_id, void *uniforms)
     shader_uniforms[shader_id] = uniforms;
 }
 
-bool azrp_queue_command(void *command, size_t size)
+bool azrp_queue_command(void *command, size_t size, int fragment)
 {
     if(commands_count >= AZRP_MAX_COMMANDS)
         return false;
@@ -223,7 +222,8 @@ bool azrp_queue_command(void *command, size_t size)
     for(size_t i = 0; i < size; i++)
         dst[i] = src[i];
 
-    commands_array[commands_count++] = commands_length;
+    commands_array[commands_count++] =
+        (fragment << 16) | commands_length;
     commands_length += size;
 
     return true;
diff --git a/azur/src/gint/shaders/clear.S b/azur/src/gint/shaders/clear.S
index a343152..2d01116 100644
--- a/azur/src/gint/shaders/clear.S
+++ b/azur/src/gint/shaders/clear.S
@@ -1,3 +1,18 @@
+/* Azur's built-in shaders: <clear>
+
+   By far the easiest of all. The clear shader is a good benchmark for how fast
+   the rendering pipeline is. And it's pretty damn fast, clocking in at 400 µs
+   for a full-resolution 396x224 clear (compared to 6.1 ms for a VRAM clear by
+   CPU or 2.5 ms by DMA).
+
+   Because this is performed by CPU and therefore versatile, there are many
+   variations with more complex patterns that will perform at the same speed.
+   A gray tiled background for transparency in image viewing comes to mind, for
+   example.
+
+   Affected region: full-screen
+   Asymptotic performance: 0.5 cycle/pixel */
+
 .global _azrp_shader_clear
 .align 4
 
diff --git a/azur/src/gint/shaders/clear.c b/azur/src/gint/shaders/clear.c
index cf1fefa..a2bd03f 100644
--- a/azur/src/gint/shaders/clear.c
+++ b/azur/src/gint/shaders/clear.c
@@ -35,7 +35,7 @@ void azrp_clear(uint16_t color)
 
     for(int i = 0; i < azrp_frag_count; i++) {
         cmd.fragment_id = i;
-        azrp_queue_command(&cmd, sizeof cmd);
+        azrp_queue_command(&cmd, sizeof cmd, i);
     }
 
     prof_leave(azrp_perf_cmdgen);
diff --git a/azur/src/gint/shaders/image.S b/azur/src/gint/shaders/image.S
index 3fa6d30..396b5d7 100644
--- a/azur/src/gint/shaders/image.S
+++ b/azur/src/gint/shaders/image.S
@@ -1,3 +1,77 @@
+/* Azur's built-in shaders: <image>
+
+   If there ever was a fantastic piece of assembler engineering in my work up
+   to this point, this would be it. Every trick in the book is used here, from
+   clever instruction combinations, pipeline flow and tricky DSP abuse all the
+   way up to memory layout planning, transforms on loop structures, and most
+   critically superscalar parallelism.
+
+   While the performance of the shader is not *strictly* proportional to the
+   speed of the tightest loop, it's very close. The use of operand-bus XRAM for
+   graphics data, systematic alignment, and detailed pipeline stalling
+   measurements for common instruction sequences in gintctl allow very accurate
+   speed predictions to be made based on the tightness of the code.
+
+   The palette formats of bopti have been refined for the purpose of this
+   shader, with P8 being split into P8_RGB565A and P8_RGB565 with big changes,
+   and P4 being renamed P4_RGB565A with minimal changes along with a variation
+   aptly named P4_RGB565.
+
+   The asymptotic performance for each format is as follows:
+   * RGB565:      1    cycle/pixel if source and destination align
+                  2   cycles/pixel otherwise
+   * RGB565A:     4   cycles/pixel
+   * P8_RGB565A:  4.5 cycles/pixel
+   * P8_RGB565:   3   cycles/pixel
+   * P4_RGB565A:  5   cycles/pixel
+   * P4_RGB565:   3.5 cycles/pixel
+
+   Entirely documenting this code would take me hours, but some elements are
+   provided in the comments. Superscalar parallelism is most easily appreciated
+   by reading the two-page section 4.2 of the SH4AL-DSP manual. The other main
+   structural technique at play in this code is loop transforms.
+
+   Basically, a loop that loads a pixel, performs computations with it, and
+   writes the result is inefficient because of the RAW dependencies on most
+   operations (with full stall cycles between loads and computations, and
+   between computations and uses as addresses). Well-established loop
+   optimization literature has lots of techniques to help with this problem,
+   and I use two here:
+
+   * _Pipelining_ the loop consists in handling a single pixel over several
+     iterations by doing a little bit of work in each iteration. The data for
+     the pixel would move from register to register at each iteration, with the
+     loop code doing one stage's worth of computation on each register. (You
+     can view it as a diagonal iteration pattern in the pixel*instruction grid
+     if you like such visualizations.)
+
+     By increasing the number of pixels in the pipeline, a lot of independent
+     data can be obtained, reducing dependency pressure and allowing for
+     greater parallelism at the cost of more registers being used.
+
+     The use of pipelining in this shader is very modest, with 2 stages at
+     most, and usually only a couple of instructions being performed in advance
+     for the next pixel while the current one finishes processing. Register
+     assignments have some subtleties though since pressure is high overall.
+
+   * _Unrolling_ iterations of the loop consists in loading two (or more)
+     pixels at the start of each iteration so that we can work on one while
+     waiting for stalls and dependencies on the other.
+
+     Unlike pipelining, a loop iteration starts and ends with full pixels and
+     no work carries between iterations. Unrolling allows different pixels to
+     use different registers and generally better optimize the instruction
+     sequence, at the cost of only supporting pixel counts that are multiples of
+     the unrolling level.
+
+     Handling non-multiple sizes is the everlasting bane of unrolled loops,
+     sometimes requiring duplicate code. Smart maneuvers are used in P8 and P4
+     to only handle even sizes and neutralize unwanted pixels after the fact.
+
+   Both techniques are used simultaneously, with 2-unrolled 2-stage loops for
+   almost all formats (except RGB565A which performs DSP trickery).
+*/
+
 .global _azrp_shader_image
 .align 4
 
@@ -64,19 +138,20 @@ _azrp_shader_image:
    * r3 is the input (with stride r9, in bytes)
    * There are r1 rows with r7 iterations each */
 
-#define START()		\
+#define START()			\
+	nop; /* 4-alignment */	\
 	ldrs	2f;		\
 	ldre	3f;		\
 1:	ldrc	r7
 
-#define END_NORET()	\
+#define END_NORET()		\
 	dt	r1;		\
 	add	r4, r5;		\
 	bf.s	1b;		\
 	add	r9, r3
 
-#define END()		\
-	END_NORET();	\
+#define END()			\
+	END_NORET();		\
 	mov.l	@r15+, r9;	\
 	rts;			\
 	mov.l	@r15+, r8
@@ -100,10 +175,10 @@ _azrp_shader_image:
    a 2-aligned write (or vice-versa). Rearranging words with arithmetic does
    not help because of the stall cycle between loading a register and using it
    in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as
-   the word-based copy). Weaving iterations could help but would be too complex
-   here (adding sub-cases); a super-heavy renderer with more hypotheses (like a
-   tileset shader) should aim for that route though. Also, movua.l followed by
-   mov.l is even slower (5 cycles). */
+   the word-based copy). Unrolling iterations could help but would be too
+   complex here (adding sub-cases); a super-heavy renderer with more hypotheses
+   (like a tileset shader) should aim for that route though. Also, movua.l
+   followed by mov.l is even slower (5 cycles). */
 .align 4
 _RGB565:
 	mov	#8, r0      /* Maximum width for naive method */
@@ -130,12 +205,14 @@ _RGB565.we:
 	tst	r0, r5
 	bf	_RGB565.we_do
 
+/* This is 4-aligned */
 _RGB565.we_de:
 	START()
 2:	movs.l	@r3+, x0
 3:	movs.l	x0, @r5+
 	END()
 
+.align 4
 _RGB565.we_do:
 	add	#-1, r7
 
@@ -150,6 +227,7 @@ _RGB565.we_do:
 	movs.w	x0, @r5+
 	END()
 
+.align 4
 _RGB565.wo:
 	tst	r0, r5
 	bf	_RGB565.wo_do
@@ -163,6 +241,7 @@ _RGB565.wo_de:
 	movs.w	x0, @r5+
 	END()
 
+.align 4
 _RGB565.wo_do:
 	START()
 	movs.w	@r3+, x0
@@ -173,6 +252,7 @@ _RGB565.wo_do:
 	END()
 
 /* Naive method for small widths and opposite source/destination parity */
+.align 4
 _RGB565.naive:
 	START()
 2:	movs.w	@r3+, x0
@@ -210,12 +290,13 @@ _RGB565A:
 
    The work needed for each pixel gets more difficult as we go, with alpha
    being the major culprit due to its additional comparisons, jumps, and
-   limited interweaving opportunities due to conditionally-executed code.
+   limited optimization opportunities when unrolling due to conditionally-
+   executed code.
 
    Because arithmetic is unavoidable and there are 1-cycle delays between both
-   loading-arithmetic, and arithmetic-indexing pairs, the loop has 2 interwoven
-   iterations with an open structure. This fills the stall cycles and increases
-   parallelism significantly. Pure interweaving handbook.
+   loading-arithmetic, and arithmetic-indexing pairs, the loop has 2-unrolled
+   iterations with a 2-stage pipeline structure. This fills the stall cycles
+   and increases parallelism significantly. Pure loop optimization handbook.
 
    Dealing with odd widths is a major pain as usual. Instead of adding logic to
    handle the extra pixel separately, this routine lets the loop overwrite it,
@@ -252,7 +333,7 @@ _P8_RGB565A:
 	sub	r7, r9
 
 	mov	r7, r13
-	add	#-2, r9 /* Input stride compensation for openness */
+	add	#-2, r9 /* Input stride compensation for pipelining */
 
 	mov.l	r12, @-r15
 	shlr	r7
@@ -281,7 +362,6 @@ _P8_RGB565A:
 	shll2	r2
 
 	add	r4, r2
-	nop /* 4-alignment */
 
 	START()
 
@@ -293,7 +373,7 @@ _P8_RGB565A:
 	mov.b	@r3+, r10
 	tst	r6, r6
 
-	/* 2-interwoven open main loop */
+	/* 2-unrolled 2-stage main loop */
 2:	add	r6, r6
 	mov	r6, r0
 
@@ -346,7 +426,7 @@ _P8_RGB565:
 	sub	r7, r9
 
 	mov	r7, r13
-	add	#-2, r9 /* Input stride compensation for openness */
+	add	#-2, r9 /* Input stride compensation for pipelining */
 
 	mov.l	r12, @-r15
 	shlr	r7
@@ -375,7 +455,6 @@ _P8_RGB565:
 	shll2	r2
 
 	add	r4, r2
-	nop /* 4-alignment */
 
 	START()
 
@@ -387,11 +466,14 @@ _P8_RGB565:
 	mov.b	@r3+, r10
 	shll	r0
 
-	/* 2-interwoven open main loop */
+	/* 2-unrolled 2-stage main loop */
 2:	mov.b	@r3+, r6
 	shll	r10
 
 	mov.w	@(r0,r8), r0
+	/* This nop is not for show: it prevents the loop from slowing down to
+	   7 cycles/iteration, probably due to instruction read alignment. */
+	nop
 
 	mov.w	r0, @(4,r5)
 	mov	r10, r0
@@ -434,8 +516,8 @@ _P8_RGB565.palette_distance:
    The special nature of the nibble packing means the simplest loop form writes
    2 pixels from a 2-aligned source image position in a single iteration. Other
    structures don't even come close: selecting nibbles individually is folly,
-   while not interweaving is inefficient. So the whole point of this routine is
-   to forcibly align the subimage on a byte-aligned and never break that grid.
+   while not unrolling is inefficient. So the whole point of this routine is to
+   forcibly align the subimage on a byte boundary and never break that grid.
 
    The command builder for P4 does this alignment before submitting the
    command. Obviously the transform can cause one extra pixel to be overridden
@@ -443,7 +525,7 @@ _P8_RGB565.palette_distance:
    offsets indicating pixels to preserve at each end. When overwrites occurs,
    the edge offsets point to the overwritten pixels so they can be restored.
    Otherwise, they point to the next pixels and the restores are no-ops. See
-   the strategy used for managing interweaving in P8 formats for details.
+   the strategy used for managing unrolling in P8 formats for details.
 
    The only irregularity is image width, which the command builder cannot
    modify. It is rounded up to the next multiple of 2, then halved. There is a
@@ -466,10 +548,10 @@ _P4_RGB565A:
 	mov.l	r12, @-r15
 	sub	r7, r9
 
-	mov.w	@r2+, r11 /* command.edge1 */
-	add	#2, r8 /* image.palette */
+	mov.w	@r2+, r11	/* command.edge1 */
+	add	#2, r8		/* image.palette */
 
-	mov.w	@r2+, r12 /* command.edge2 */
+	mov.w	@r2+, r12	/* command.edge2 */
 	mov	r5, r0
 
 	mov.l	r13, @-r15
@@ -479,6 +561,7 @@ _P4_RGB565A:
 	shll	r12
 
 	add	#-4, r5
+	nop	/* 4-alignment */
 
 	START()
 
@@ -559,10 +642,10 @@ _P4_RGB565:
 	mov.l	r12, @-r15
 	sub	r7, r9
 
-	mov.w	@r2+, r11 /* command.edge1 */
-	add	#2, r8 /* image.palette */
+	mov.w	@r2+, r11	/* command.edge1 */
+	add	#2, r8		/* image.palette */
 
-	mov.w	@r2+, r12 /* command.edge2 */
+	mov.w	@r2+, r12	/* command.edge2 */
 	mov	r5, r0
 
 	mov.l	r13, @-r15
diff --git a/azur/src/gint/shaders/image.c b/azur/src/gint/shaders/image.c
index 4d5f3b6..37ee6de 100644
--- a/azur/src/gint/shaders/image.c
+++ b/azur/src/gint/shaders/image.c
@@ -73,18 +73,19 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
     /* This divides by azrp_frag_height */
     cmd.fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
 
+    /* These settings only apply to the first fragment (clamped to height) */
+    int first_y = (y + azrp_frag_offset) & (azrp_frag_height - 1);
+    cmd.lines = min(height, azrp_frag_height - first_y);
+    cmd.output = 2 * (azrp_width * first_y + x);
+
     while(height > 0) {
-        cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1)));
+        azrp_queue_command(&cmd, cmd_size, cmd.fragment_id);
 
-        cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x);
-
-        y += cmd.lines;
-        top += cmd.lines;
         height -= cmd.lines;
-
-        azrp_queue_command(&cmd, cmd_size);
         cmd.fragment_id++;
         cmd.input += row_stride * cmd.lines;
+        cmd.lines = min(height, azrp_frag_height);
+        cmd.output = 2 * x;
     }
 
     prof_leave(azrp_perf_cmdgen);