From 18ee037693878cc7683e5756c2ab46c14764ddf1 Mon Sep 17 00:00:00 2001
From: Lephe <sebastien.michelland@protonmail.com>
Date: Thu, 23 Sep 2021 16:19:12 +0200
Subject: [PATCH] azur: support for P8 in tex2d (5.5 cycles/pixel)

---
 azur/src/gint/shaders/tex2d.S | 187 ++++++++++++++++++++++++++++++----
 azur/src/gint/shaders/tex2d.c |  25 +++--
 2 files changed, 183 insertions(+), 29 deletions(-)

diff --git a/azur/src/gint/shaders/tex2d.S b/azur/src/gint/shaders/tex2d.S
index 315d223..8922df8 100644
--- a/azur/src/gint/shaders/tex2d.S
+++ b/azur/src/gint/shaders/tex2d.S
@@ -8,10 +8,10 @@
    r3: Input
    r4: [parameter] azrp_width*2; output stride
    r5: [parameter] Command queue; Output
-   r6: [parameter] azrp_frag; alpha value or (temporary)
+   r6: [parameter] azrp_frag; alpha value; (temporary)
    r7: Columns
-   r8: Input stride
-   r9: Image profile */
+   r8: Image pointer; (temporary)
+   r9: Input stride */
 _azrp_shader_tex2d:
 	mov.l	r8, @-r15
 	add	#2, r5
@@ -34,7 +34,7 @@ _azrp_shader_tex2d:
 
 	mov.w	@r8+, r6    /* image.alpha */
 
-	mov.w	@r8, r8     /* image.width */
+	mov.w	@r8+, r9    /* image.width */
 
 	mov.l	@r2+, r3    /* command.input (pointer) */
 	mov	r0, r2
@@ -42,11 +42,12 @@ _azrp_shader_tex2d:
 	mova	.formats, r0
 	shll2	r2
 
+	/* Stall cycle */
+
 	mov.l	@(r0, r2), r0
-	sub	r7, r8
 
 	jmp	@r0
-	shll	r8
+	sub	r7, r9
 
 .align 4
 .formats:
@@ -54,29 +55,29 @@ _azrp_shader_tex2d:
 	.long	_RGB565A
 	.long	_P8
 	.long	_P4
-
-	/* Default below is .format_RGB565 */
+	.long	_P8_RGB565
 
 /* [Loop macros]
 
    The following macros implement the main loop of the image renderer.
    * Each line is rendered in the tight loop between 2: and 3: (both included).
-   * r2 is the output (with stride r4, in bytes)
-   * r3 is the input (with stride r8, in bytes)
+   * r5 is the output (with stride r4, in bytes)
+   * r3 is the input (with stride r9, in bytes)
    * There are r1 rows with r7 iterations each */
 
 #define TEX2D_START()		\
 	ldrs	2f;		\
 	ldre	3f;		\
-				\
-1:	ldrc	r7;		\
-	dt	r1;		\
+1:	ldrc	r7          /* 1: start of each row; repeat the 2:-3: loop r7 times */
 
-#define TEX2D_END()		\
+#define TEX2D_END_NORET()	\
+	dt	r1;         /* Count one row down; T=1 after the last one */ \
 	add	r4, r5;		\
 	bf.s	1b;		\
-	add	r8, r3;		\
-				\
+	add	r9, r3      /* bf.s delay slot: apply the input stride */
+
+#define TEX2D_END()		\
+	TEX2D_END_NORET();	\
 	mov.l	@r15+, r9;	\
 	rts;			\
 	mov.l	@r15+, r8
@@ -94,7 +95,7 @@ _azrp_shader_tex2d:
    When the destination and source have identical parity, the d[eo] variation
    can be defined. In this case the copy is pretty direct, it's a longword copy
    and it takes 2 cycles to copy 4 bytes, plus some extra at the edges if the
-   start or end address if 2-aligned.
+   start or end address is 2-aligned.
 
    However, when they have opposite parity, each longword read matches up with
    a 2-aligned write (or vice-versa). Rearranging words with arithmetic does
@@ -109,6 +110,8 @@ _RGB565:
 	mov	#8, r0      /* Maximum width for naive method */
 	cmp/ge	r7, r0
 
+	shll	r9
+
 	bt.s	_RGB565.naive
 	mov	#2, r0
 
@@ -178,7 +181,7 @@ _RGB565.naive:
 
    Since we have to check for the alpha value in each pixel, there's really no
    longword-based optimization. Instead, we just go as fast as possible with
-   each pixels, using DSP instructions because conditional execution is pretty
+   each pixel, using DSP instructions because conditional execution is pretty
    damn good. This takes 4 cycles/pixel. I tried a number of reductions to
    3 cycles/pixel but could not get that to work. */
 
@@ -186,6 +189,8 @@ _RGB565A:
 	shll16	r6
 	mov	#0x0004, r0 /* DC Zero mode */
 
+	shll	r9
+
 	lds	r6, y0
 
 	lds	r0, dsr
@@ -197,11 +202,151 @@ _RGB565A:
 3:	                        movx.w  x0, @r5+
 	TEX2D_END()
 
-/* [Rendering strategy for the P8 format] */
+/* [Rendering strategy for the P8 format]
+
+   The work needed for each pixel gets more difficult as we go. In P8 there is
+   both a palette indexing step (which induces some latency when moving values
+   read from memory to the ALU, unlike RGB565), and an alpha comparison check.
+
+   The rendering uses a 2-interwoven open loop. This reduces stall cycles and
+   increases parallelism. Dealing with non-multiple widths is annoying as
+   usual. Instead this routine avoids the clipping problem by overwriting then
+   restoring the next pixel. (A delightfully smart workaround if you ask me.)
+
+   Unless I have missed something this routine achieves 5.5 cycles/pixel.
+
+   The format is not extremely friendly. It has alpha for all images, and uses
+   a non-zero value for it, which burns a register. Palette indices are
+   unsigned, which requires an extu.b even though the palette could be indexed
+   with signed values by moving the pointer. Also the palette always takes up
+   512 bytes even when only a few colors are used.
+
+   The P8_RGB565 and P8_RGB565A formats address these issues and supersede P8.
+   In the interim this version of P8 is reasonably elegant despite ample extra
+   registers and initial computations. */
 _P8:
+	mov.l	r13, @-r15
+	add	#2, r8 /* Palette */
+
+	mov	r7, r13     /* r13 = width in pixels */
+	shlr	r7          /* r7 = width / 2, T = 1 if the width is odd */
+
+	mov.l	r12, @-r15
+	movt	r12         /* r12 = 1 if the width is odd, else 0 */
+
+	mov.l	r11, @-r15
+	add	r12, r7     /* r7 = pixel pairs per line, rounded up */
+
+	mov.l	r10, @-r15
+	extu.b	r6, r6      /* r6 = alpha index; must survive every line */
+
+	shll	r13         /* r13 = width in output bytes */
+
+	add	#-1, r9     /* Each line reads one index ahead... */
+
+	sub	r12, r9     /* ...plus one more when the width is odd */
+
+	sub	r12, r4     /* Odd widths write one extra pixel (2 bytes), */
+
+	sub	r12, r4     /* so take it out of the output stride */
+
+	add	r5, r13     /* r13 = first output pixel after the line */
+
 	TEX2D_START()
-2:
-3:	nop
+
+	/* Save the first pixel after the line. It will be restored at the end
+	   of the line to correct the odd-width case where the 2-interwoven
+	   main loop writes an additional pixel. */
+	mov.w	@r13, r12
+
+	mov.b	@r3+, r0    /* First index of the line */
+
+2:	/* 2-interwoven open main loop */
+	mov.b	@r3+, r10
+	extu.b	r0, r0
+
+	cmp/eq	r0, r6
+	mov.w	@r5+, r2    /* Destination pixel, kept if transparent */
+
+	add	r0, r0 /* Don't use shll to keep T */
+	mov.w	@r5, r11    /* Same for the second pixel of the pair */
+
+	add	#-2, r5
+	bt.s	5f
+
+	extu.b	r10, r10
+	mov.w	@(r0,r8), r2    /* Opaque: fetch the palette color */
+
+     5:	cmp/eq	r10, r6
+
+	add	r10, r10 /* Don't use shll to keep T */
+	mov	r10, r0
+
+	mov.w	r2, @r5
+	add	#2, r5
+
+	bt	6f
+	mov.w	@(r0,r8), r11
+
+     6:	mov.b	@r3+, r0    /* Read ahead the next pair's first index */
+
+	mov.w	r11, @r5
+3:	add	#2, r5
+
+	/* Restore last pixel */
+	mov.w	r12, @r13
+	add	r4, r13
+
+	mov	r7, r2      /* Scratch must not be r6: it holds alpha */
+	shll2	r2          /* r2 = bytes written by the loop (4 per pair) */
+
+	add	r2, r13     /* r13 = pixel after the end of the next line */
+
+	TEX2D_END_NORET()
+	mov.l	@r15+, r10
+	mov.l	@r15+, r11
+	mov.l	@r15+, r12
+	mov.l	@r15+, r13
+	mov.l	@r15+, r9
+	rts
+	mov.l	@r15+, r8
+
+/* [Rendering strategy for the P8 RGB565 format]
+
+   This format does not support alpha, lifting the requirement for comparisons,
+   branches and some register logic. The palette is also designed to support
+   signed indices (from -128 to 127). The interwoven setup becomes much more
+   practical as a result. */
+
+_P8_RGB565:
+	shlr	r7          /* r7 = pixel pairs per line */
+	/* TODO: Odd case */
+
+	add	#-1, r9     /* Each line reads one index past its last pixel */
+	add	#-4, r5     /* The loop pre-increments r5 by 4 */
+
+	TEX2D_START()
+	mov.b	@r3+, r6    /* Per line: reload first index (r9 may skip bytes) */
+	shll	r6
+2:	mov.b	@r3+, r2
+	add	#4, r5
+
+	shll	r2
+	mov	r6, r0
+
+	/* Stall for r0 */
+
+	mov.w	@(r0,r8), r0    /* NOTE(review): confirm r8 is the palette base */
+
+	mov.w	r0, @r5
+	mov	r2, r0
+
+	mov.b	@r3+, r6    /* Read ahead the next pair's first index */
+
+	mov.w	@(r0,r8), r0
+
+	mov.w	r0, @(2, r5)
+3:	shll	r6
 	TEX2D_END()
 
 /* [Rendering strategy for the P4 format] */
diff --git a/azur/src/gint/shaders/tex2d.c b/azur/src/gint/shaders/tex2d.c
index 5f2f9e9..299cc33 100644
--- a/azur/src/gint/shaders/tex2d.c
+++ b/azur/src/gint/shaders/tex2d.c
@@ -18,10 +18,12 @@ void azrp_shader_tex2d_configure(void)
 //---
 
 /* Profile values from bopti */
-#define PX_RGB565   0
-#define PX_RGB565A  1
-#define PX_P8       2
-#define PX_P4       3
+#define PX_RGB565      0  /* Input offsets are scaled by 2 (<< 1) */
+#define PX_RGB565A     1  /* Same input scaling as PX_RGB565 */
+#define PX_P8          2  /* Unscaled offsets; pixels start 512 bytes in */
+#define PX_P4          3  /* Pixels start 32 bytes into image->data */
+#define PX_P8_RGB565   4  /* Same data layout handling as PX_P8 */
+#define PX_P8_RGB565A  5  /* NOTE(review): not handled in visible code yet */
 
 void azrp_image(int x, int y, bopti_image_t const *image)
 {
@@ -43,8 +45,16 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
     cmd.image = image;
 
     int input_multiplier = 1;
-    if(image->profile == PX_P8) input_multiplier = 0;
-    if(image->profile == PX_P4) input_multiplier = -1;
+    void const *data = image->data;
+
+    if(image->profile == PX_P8 || image->profile == PX_P8_RGB565) {
+        input_multiplier = 0;
+        data += 512;
+    }
+    if(image->profile == PX_P4) {
+        input_multiplier = -1;
+        data += 32;
+    }
 
     /* This divides by azrp_frag_height */
     cmd.fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
@@ -53,8 +63,7 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
         cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1)));
 
         int input_offset = (image->width * top + left) << input_multiplier;
-        cmd.input = (void *)image->data + input_offset;
-
+        cmd.input = data + input_offset;
         cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x);
 
         y += cmd.lines;