From 0c8efcd63502a4e5b650e2991fb5b11d223563fb Mon Sep 17 00:00:00 2001
From: Lephe <sebastien.michelland@protonmail.com>
Date: Fri, 27 Aug 2021 22:05:21 +0200
Subject: [PATCH] azur: RGB565A in tex2d (still 4c/p)

---
 azur/src/gint/shaders/tex2d.S | 128 +++++++++++++++++-----------------
 1 file changed, 65 insertions(+), 63 deletions(-)

diff --git a/azur/src/gint/shaders/tex2d.S b/azur/src/gint/shaders/tex2d.S
index f71a67a..315d223 100644
--- a/azur/src/gint/shaders/tex2d.S
+++ b/azur/src/gint/shaders/tex2d.S
@@ -1,20 +1,14 @@
 .global _azrp_shader_tex2d
 .align 4
 
-/* Profile values from bopti */
-#define PX_RGB565   0
-#define PX_RGB565A  1
-#define PX_P8       2
-#define PX_P4       3
-
 /* Register assignment
    r0: (temporary)
    r1: Lines
-   r2: Output
+   r2: Command queue; (temporary)
    r3: Input
    r4: [parameter] azrp_width*2; output stride
-   r5: [parameter] Command queue; (temporary)
-   r6: [parameter] azrp_frag; (temporary)
+   r5: [parameter] Command queue; Output
+   r6: [parameter] azrp_frag; alpha value or (temporary)
    r7: Columns
    r8: Input stride
    r9: Image profile */
@@ -23,38 +17,43 @@ _azrp_shader_tex2d:
 	add	#2, r5
 
 	mov.l	r9, @-r15
+	mov	r5, r2      /* r2 now walks the command; r5 is freed for Output */
 
-	mov.w	@r5+, r7    /* command.columns */
+	mov.w	@r2+, r7    /* command.columns */
 
-	mov.l	@r5+, r8    /* command.image */
+	mov.l	@r2+, r8    /* command.image */
 
-	mov.w	@r5+, r2    /* command.output (offset) */
+	mov.w	@r2+, r5    /* command.output (offset) */
 	sub	r7, r4
 
-	mov.w	@r5+, r1    /* command.lines */
+	mov.w	@r2+, r1    /* command.lines */
 	sub	r7, r4
 
 	mov.w	@r8+, r0    /* image.profile */
-	add	r6, r2
+	add	r6, r5      /* Output = azrp_frag + command.output offset */
 
 	mov.w	@r8+, r6    /* image.alpha */
-	cmp/eq	#PX_P4, r0
 
 	mov.w	@r8, r8     /* image.width */
 
-	mov.l	@r5+, r3    /* command.input (pointer) */
+	mov.l	@r2+, r3    /* command.input (pointer) */
+	mov	r0, r2      /* r2 = image.profile, used as index into .formats */
 
+	mova	.formats, r0 /* r0 = address of the jump table */
+	shll2	r2          /* index * 4: one .long per profile */
+
+	mov.l	@(r0, r2), r0 /* r0 = routine for this profile (value is not range-checked) */
 	sub	r7, r8
 
-	bt.s	.format_P4
+	jmp	@r0         /* delayed branch: the shll r8 below still executes */
 	shll	r8
 
-	cmp/eq	#PX_P8, r0
-
-	bt	.format_P8
-	cmp/eq	#PX_RGB565A, r0
-
-	bt	.format_RGB565A
+.align 4                    /* mova needs a longword-aligned target */
+.formats:                   /* indexed by image.profile; order must match bopti */
+	.long	_RGB565     /* image.profile == 0 */
+	.long	_RGB565A    /* image.profile == 1 */
+	.long	_P8         /* image.profile == 2 */
+	.long	_P4         /* image.profile == 3 */
 
-	/* Default below is .format_RGB565 */
+	/* No fallthrough: every profile is reached through .formats above */
 
@@ -74,7 +73,7 @@ _azrp_shader_tex2d:
 	dt	r1;		\
 
 #define TEX2D_END()		\
-	add	r4, r2;		\
+	add	r4, r5;		\
 	bf.s	1b;		\
 	add	r8, r3;		\
 				\
@@ -106,105 +105,108 @@ _azrp_shader_tex2d:
    tileset shader) should aim for that route though. Also, movua.l followed by
    mov.l is even slower (5 cycles). */
 
-.format_RGB565:
+_RGB565:                    /* copies every pixel of each line unchanged */
 	mov	#8, r0      /* Maximum width for naive method */
 	cmp/ge	r7, r0
 
-	bt.s	.naive
+	bt.s	_RGB565.naive /* taken when columns <= 8; mov #2 below is the delay slot */
 	mov	#2, r0
 
 	/* Use naive method for opposite source/destination parity */
-	mov	r2, r6
+	mov	r5, r6      /* r6 = Output, then Output ^ Input */
 	xor	r3, r6
 	tst	r0, r6
-	bf	.naive
+	bf	_RGB565.naive /* bit 1 differs between Input and Output */
 
 	shlr	r7
-	bt	.wo
+	bt	_RGB565.wo  /* T = bit shifted out of r7: odd width */
 
-.we:
-	tst	r0, r2
-	bf	.we_do
+_RGB565.we:                 /* even width; r7 was halved by shlr */
+	tst	r0, r5      /* r0 = 2: test bit 1 of Output */
+	bf	_RGB565.we_do /* Output bit 1 set */
 
-.we_de:
+_RGB565.we_de:              /* even width, Output bit 1 clear: longwords only */
 	TEX2D_START()
 2:	movs.l	@r3+, x0
-3:	movs.l	x0, @r2+
+3:	movs.l	x0, @r5+
 	TEX2D_END()
 
-.we_do:
+_RGB565.we_do:              /* even width, Output bit 1 set: word, longwords, word */
 	add	#-1, r7
 
 	TEX2D_START()
 	movs.w	@r3+, x0
-	movs.w	x0, @r2+
+	movs.w	x0, @r5+
 
 2:	movs.l	@r3+, x0
-3:	movs.l	x0, @r2+
+3:	movs.l	x0, @r5+
 
 	movs.w	@r3+, x0
-	movs.w	x0, @r2+
+	movs.w	x0, @r5+
 	TEX2D_END()
 
-.wo:
-	tst	r0, r2
-	bf	.wo_do
+_RGB565.wo:                 /* odd width; r7 was halved by shlr */
+	tst	r0, r5      /* r0 = 2: test bit 1 of Output */
+	bf	_RGB565.wo_do /* Output bit 1 set */
 
-.wo_de:
+_RGB565.wo_de:              /* odd width, Output bit 1 clear: longwords, then word */
 	TEX2D_START()
 2:	movs.l	@r3+, x0
-3:	movs.l	x0, @r2+
+3:	movs.l	x0, @r5+
 
 	movs.w	@r3+, x0
-	movs.w	x0, @r2+
+	movs.w	x0, @r5+
 	TEX2D_END()
 
-.wo_do:
+_RGB565.wo_do:              /* odd width, Output bit 1 set: word, then longwords */
 	TEX2D_START()
 	movs.w	@r3+, x0
-	movs.w	x0, @r2+
+	movs.w	x0, @r5+
 
 2:	movs.l	@r3+, x0
-3:	movs.l	x0, @r2+
+3:	movs.l	x0, @r5+
 	TEX2D_END()
 
 /* Naive method for small widths and opposite source/destination parity */
-.naive:
+_RGB565.naive:              /* one 16-bit pixel per load/store pair */
 	TEX2D_START()
 2:	movs.w	@r3+, x0
-3:	movs.w	x0, @r2+
+3:	movs.w	x0, @r5+
 	TEX2D_END()
 
 /* [Rendering strategy for the RGB565A format]
 
    Since we have to check for the alpha value in each pixel, there's really no
    longword-based optimization. Instead, we just go as fast as possible with
-   each pixels, using DSP instructions. Branchless jump is pretty useful.
+   each pixel, using DSP instructions because conditional execution is pretty
+   damn good. This takes 4 cycles/pixel. I tried a number of reductions to
+   3 cycles/pixel but could not get that to work. */
 
-   TODO: Opening iterations will definitely save at least 1 cycle per pixel; it
-         just requires a subcase for extremely small images (width = 1). */
+_RGB565A:
+	shll16	r6          /* image.alpha into the upper 16 bits (where DSP word moves put data) */
+	mov	#0x0004, r0 /* DC Zero mode */
 
-.format_RGB565A:
-	mov	r2, r5
+	lds	r6, y0      /* y0 = shifted alpha, compared against every source pixel */
+
+	lds	r0, dsr     /* select the DC mode loaded in r0 above */
 
 	TEX2D_START()
-	/* In the comparison, DC=1 if x0 == image.alpha */
-2:	                         movs.w  @r3+, x0
-	     pcmp    x0, y0      movx.w  @r5, x1
-	dct  pcopy   x1, x0
-3:	     movx.w  x0, @r5+
+2:	                        movs.w  @r3+, x0     /* x0 = source pixel */
+	    pcmp    x0, y0      movx.w  @r5, x1      /* DC=1 if x0 == alpha; x1 = current output pixel */
+	dct pcopy   x1, x0                           /* transparent: keep the output pixel */
+3:	                        movx.w  x0, @r5+     /* store source or preserved pixel */
 	TEX2D_END()
 
 /* [Rendering strategy for the P8 format] */
-.format_P8:
+_P8:
 	TEX2D_START()
 2:
-3:
+3:	nop         /* placeholder body: no pixel is read or written between 2: and 3: */
 	TEX2D_END()
 
 /* [Rendering strategy for the P4 format] */
-.format_P4:
+_P4:
 	TEX2D_START()
 2:
-3:
+3:	nop         /* placeholder body: no pixel is read or written between 2: and 3: */
 	TEX2D_END()