From ddff9f6d6bdb0608fb4b83d2ddc01e47a2ba51c4 Mon Sep 17 00:00:00 2001
From: Lephe <sebastien.michelland@protonmail.com>
Date: Fri, 24 Sep 2021 22:56:40 +0200
Subject: [PATCH] azur: replace P8 with P8_RGB565A (4.5 c/p), P8_RGB565 (3 c/p)

The code for P8 failed in some non-transparent cases and I'll admit I
could not be bothered to fix it when the superior formats were already
designed and promised a significant boost.
---
 azur/src/gint/shaders/tex2d.S | 245 ++++++++++++++++++++++------------
 azur/src/gint/shaders/tex2d.c |   9 +-
 2 files changed, 163 insertions(+), 91 deletions(-)

diff --git a/azur/src/gint/shaders/tex2d.S b/azur/src/gint/shaders/tex2d.S
index 8922df8..bec3c47 100644
--- a/azur/src/gint/shaders/tex2d.S
+++ b/azur/src/gint/shaders/tex2d.S
@@ -53,9 +53,10 @@ _azrp_shader_tex2d:
 .formats:
 	.long	_RGB565
 	.long	_RGB565A
-	.long	_P8
+	.long	_NOP
 	.long	_P4
 	.long	_P8_RGB565
+	.long	_P8_RGB565A
 
 /* [Loop macros]
 
@@ -105,7 +106,7 @@ _azrp_shader_tex2d:
    here (adding sub-cases); a super-heavy renderer with more hypotheses (like a
    tileset shader) should aim for that route though. Also, movua.l followed by
    mov.l is even slower (5 cycles). */
-
+.align 4
 _RGB565:
 	mov	#8, r0      /* Maximum width for naive method */
 	cmp/ge	r7, r0
@@ -183,8 +184,8 @@ _RGB565.naive:
    longword-based optimization. Instead, we just go as fast as possible with
    each pixel, using DSP instructions because conditional execution is pretty
    damn good. This takes 4 cycles/pixel. I tried a number of reductions to
-   3 cycles/pixel but could not get that to work. */
-
+   3 cycles/pixel but could not get any of them to work. */
+.align 4
 _RGB565A:
 	shll16	r6
 	mov	#0x0004, r0 /* DC Zero mode */
@@ -202,152 +203,216 @@ _RGB565A:
 3:	                        movx.w  x0, @r5+
 	TEX2D_END()
 
-/* [Rendering strategy for the P8 format]
+/* [Rendering strategy for the P8_RGB565A format]
 
-   The work needed for each pixel gets more difficult as we go. In P8 there is
-   both a palette indexing step (which induces some latency when moving values
-   read from memory to the ALU, unlike RGB565), and an alpha comparison check.
+   The work needed for each pixel gets more difficult as we go, with alpha
+   being the major culprit due to its additional comparisons, jumps, and
+   limited interweaving opportunities due to conditionally-executed code.
 
-   The rendering uses a 2-interwoven open loop. This reduces stall cycles and
-   increases parallelism. Dealing with non-multiple widths is annoying as
-   usual. Instead this routine avoids the clipping problem by overwriting then
-   restoring the next pixel. (A delightfully smart workaround if you ask me.)
+   Because arithmetic is unavoidable and there are 1-cycle delays between both
+   loading-arithmetic and arithmetic-indexing pairs, the loop has 2 interwoven
+   iterations with an open structure. This fills the stall cycles and increases
+   parallelism significantly. Pure interweaving handbook.
 
-   Unless I have missed something this routine achieves 5.5 cycles/pixel.
+   Dealing with odd widths is a major pain as usual. Instead of adding logic to
+   handle the extra pixel separately, this routine lets the loop overwrite it,
+   then restores its original value afterwards - a delightfully elegant trick.
 
-   The format is not extremely friendly. It has alpha for all images, and uses
-   a non-zero value for it, which burns a register. Palette indices are
-   unsigned, which requires an extu.b even though the palette could be indexed
-   with signed values by moving the pointer. Also the palette always takes up
-   512 bytes even when a low amount of colors is used.
+   The P8 format is actually so bad that spending precious time grinding cycles
+   felt completely inappropriate without first refining it. This led to two new
+   variations, P8_RGB565 and P8_RGB565A, which fix the following problems.
 
-   The P8_RGB565 and P8_RGB565A address these issues and supplant P8. In the
-   interim this version of P8 is reasonably elegant despite ample extra
-   registers and initial computations. */
-_P8:
+   -> First there is alpha for all images, which is the most costly feature,
+      single-handedly accounting for half of the work per pixel. P8_RGB565
+      does not support alpha, which basically doubles performance.
+
+   -> Then, there is the alpha value itself. In P8 it is a variable (and fxconv
+      sets it to 0xff), which burns a register for the comparison and enforces
+      a fixed order between comparison and left-shift. P8_RGB565A always sets
+      an alpha value of 0x00 which lifts both constraints.
+
+   -> Then, there are palette indices. In P8 they are unsigned, which requires
+      an extu.b. In P8_RGB565 and P8_RGB565A they are signed, so the sign-
+      extended value of the mov.b can be used directly (once doubled). The
+      palette base is simply offset by 128 entries, with colors numbered
+      -128..-1 first and only then 0..127.
+
+   -> Finally, there's the palette itself. In P8 it always has 256 entries,
+      even when only a few are used. For small images this is a huge waste, so
+      P8_RGB565 and P8_RGB565A only store colors that are actually used.
+
+   P8_RGB565A achieves 4.5 cycles/pixel asymptotically, which is really good
+   compared to 4 cycles/pixel for RGB565A. */
+.align 4
+_P8_RGB565A:
 	mov.l	r13, @-r15
-	add	#2, r8 /* Palette */
+	add	#-2, r9 /* Input stride compensation for openness */
 
 	mov	r7, r13
 	shlr	r7
 
 	mov.l	r12, @-r15
-	movt	r12
-
-	mov.l	r11, @-r15
-	add	r12, r7
+	movt	r6
 
 	mov.l	r10, @-r15
-	extu.b	r6, r6
-
 	shll	r13
 
-	add	#-1, r9
+	mov.w	_P8_RGB565A.palette_distance, r0
+	add	r6, r7
 
-	sub	r12, r9
+	sub	r6, r9
 
-	sub	r12, r4
+	sub	r6, r4
 
-	sub	r12, r4
+	sub	r6, r4
+
+	add	r0, r8
 
 	add	r5, r13
+	mov	r7, r2
+
+	add	#-4, r5 /* Output offset compensation in the loop */
+
+	shll2	r2
+
+	add	r4, r2
+	nop /* 4-alignment */
 
 	TEX2D_START()
 
-	/* Save the first pixel after the line. It will be restored at the end
-	   of the line to correct the odd-width case where the 2-interwoven
-	   main loop writes an additional pixel. */
+	mov.b	@r3+, r6
+
+	/* Save next pixel for the odd-width case */
 	mov.w	@r13, r12
 
-	mov.b	@r3+, r0
-
-2:	/* 2-interwoven open main loop */
 	mov.b	@r3+, r10
-	extu.b	r0, r0
+	tst	r6, r6
 
-	cmp/eq	r0, r6
-	mov.w	@r5+, r2
+	/* 2-interwoven open main loop */
+2:	add	r6, r6
+	mov	r6, r0
 
-	add	r0, r0 /* Don't use shll to keep T */
-	mov.w	@r5, r11
-
-	add	#-2, r5
+	add	r10, r10
 	bt.s	5f
 
-	extu.b	r10, r10
-	mov.w	@(r0,r8), r2
+	tst	r10, r10
+	mov.w	@(r0,r8), r0
 
-     5:	cmp/eq	r10, r6
+	mov.w	r0, @(4,r5)
 
-	add	r10, r10 /* Don't use shll to keep T */
+     5: mov.b	@r3+, r6
 	mov	r10, r0
 
-	mov.w	r2, @r5
-	add	#2, r5
+	bt.s	6f
+	add	#4, r5
 
-	bt	6f
-	mov.w	@(r0,r8), r11
+	mov.w	@(r0,r8), r0
 
-     6:	mov.b	@r3+, r0
+	mov.w	r0, @(2,r5)
 
-	mov.w	r11, @r5
-3:	add	#2, r5
+     6:	mov.b	@r3+, r10
+3:	tst	r6, r6
 
 	/* Restore last pixel */
 	mov.w	r12, @r13
-	add	r4, r13
-
-	mov	r7, r6
-	shll2	r6
-
-	add	r6, r13
+	add	r2, r13
 
 	TEX2D_END_NORET()
 	mov.l	@r15+, r10
-	mov.l	@r15+, r11
 	mov.l	@r15+, r12
 	mov.l	@r15+, r13
 	mov.l	@r15+, r9
 	rts
 	mov.l	@r15+, r8
 
-/* [Rendering strategy for the P8 RGB565 format]
+_P8_RGB565A.palette_distance:
+	/* Distance between image pointer and palette array base */
+	.word	260
 
-   This format does not support alpha, lifting the requirement for comparisons,
-   branches and some register logic. The palette is also designed to support
-   signed indices (from -128 to 127). The interwoven setup becomes much more
-   practical as a result. */
+/* [Rendering strategy for the P8_RGB565 format]
 
+   See P8_RGB565A for format details. Removing the checks for transparency and
+   the jumps simplifies the instruction sequence and allows superior
+   parallelism because all paths are unconditional. This routine achieves
+   3 cycles/pixel asymptotically. */
+.align 4
 _P8_RGB565:
+	mov.l	r13, @-r15
+	add	#-2, r9 /* Input stride compensation for openness */
+
+	mov	r7, r13
 	shlr	r7
-	/* TODO: Odd case */
 
-	mov.b	@r3+, r6
-	add	#-4, r5
+	mov.l	r12, @-r15
+	movt	r6
 
-	shll	r6
+	mov.l	r10, @-r15
+	shll	r13
+
+	mov.w	_P8_RGB565.palette_distance, r0
+	add	r6, r7
+
+	sub	r6, r9
+
+	sub	r6, r4
+
+	sub	r6, r4
+
+	add	r0, r8
+
+	add	r5, r13
+
+	add	#-4, r5 /* Output offset compensation in the loop */
+	mov	r7, r2
+
+	shll2	r2
+
+	add	r4, r2
+	nop /* 4-alignment */
 
 	TEX2D_START()
-2:	mov.b	@r3+, r2
+
+	mov.b	@r3+, r0
+
+	/* Save next pixel for the odd-width case */
+	mov.w	@r13, r12
+
+	mov.b	@r3+, r10
+	shll	r0
+
+	/* 2-interwoven open main loop */
+2:	mov.b	@r3+, r6
+	shll	r10
+
+	mov.w	@(r0,r8), r0
+
+	mov.w	r0, @(4,r5)
+	mov	r10, r0
+
+	mov.b	@r3+, r10
 	add	#4, r5
 
-	shll	r2
-	mov	r6, r0
-
-	/* Stall for r0 */
-
 	mov.w	@(r0,r8), r0
+	shll	r6
 
-	mov.w	r0, @r5
-	mov	r2, r0
+	mov.w	r0, @(2,r5)
+3:	mov	r6, r0
 
-	mov.b	@r3+, r6
+	/* Restore last pixel */
+	mov.w	r12, @r13
+	add	r2, r13
 
-	mov.w	@(r0,r8), r0
+	TEX2D_END_NORET()
+	mov.l	@r15+, r10
+	mov.l	@r15+, r12
+	mov.l	@r15+, r13
+	mov.l	@r15+, r9
+	rts
+	mov.l	@r15+, r8
 
-	mov.w	r0, @(2, r5)
-3:	shll	r6
-	TEX2D_END()
+_P8_RGB565.palette_distance:
+	/* Distance between image pointer and palette array base */
+	.word	260
 
 /* [Rendering strategy for the P4 format] */
 _P4:
@@ -355,3 +420,11 @@ _P4:
 2:
 3:	nop
 	TEX2D_END()
+
+/* [Unsupported formats]
+
+   P8 is unsupported, use P8_RGB565 and P8_RGB565A. */
+_NOP:
+	mov.l	@r15+, r9
+	rts
+	mov.l	@r15+, r8
diff --git a/azur/src/gint/shaders/tex2d.c b/azur/src/gint/shaders/tex2d.c
index 299cc33..fff6fab 100644
--- a/azur/src/gint/shaders/tex2d.c
+++ b/azur/src/gint/shaders/tex2d.c
@@ -17,10 +17,9 @@ void azrp_shader_tex2d_configure(void)
 
 //---
 
-/* Profile values from bopti */
+/* Profile IDs */
 #define PX_RGB565      0
 #define PX_RGB565A     1
-#define PX_P8          2
 #define PX_P4          3
 #define PX_P8_RGB565   4
 #define PX_P8_RGB565A  5
@@ -47,11 +46,11 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
     int input_multiplier = 1;
     void const *data = image->data;
 
-    if(image->profile == PX_P8 || image->profile == PX_P8_RGB565) {
+    if(image->profile == PX_P8_RGB565 || image->profile == PX_P8_RGB565A) {
         input_multiplier = 0;
-        data += 512;
+        data += (image->data[0] * 2) + 2;
     }
-    if(image->profile == PX_P4) {
+    else if(image->profile == PX_P4) {
         input_multiplier = -1;
         data += 32;
     }