azur: bugfixes and support for P4_RGB565 (3.5 c/p)

2021-09-27 10:04:00 +02:00 · 2021-09-27 10:04:00 +02:00 · 52a66402aa
parent c16b1a85c6
commit 52a66402aa
2 changed files with 153 additions and 58 deletions
--- a/azur/src/gint/shaders/tex2d.S
+++ b/azur/src/gint/shaders/tex2d.S
@ -44,8 +44,7 @@ _azrp_shader_tex2d:
 	mov.w	@r8+, r9    /* image.width */

 	jmp	@r0
-	/* Stall for r9 */
-	sub	r7, r9
+	nop

 .align 4
 .formats:
@ -108,6 +107,8 @@ _azrp_shader_tex2d:
 .align 4
 _RGB565:
 	mov	#8, r0      /* Maximum width for naive method */
+	sub	r7, r9
+
 	cmp/ge	r7, r0

 	shll	r9
@ -118,6 +119,7 @@ _RGB565:
 	/* Use naive method for opposite source/destination parity */
 	mov	r5, r6
 	xor	r3, r6
+
 	tst	r0, r6
 	bf	_RGB565.naive

@ -189,6 +191,8 @@ _RGB565A:
 	shll16	r6
 	mov	#0x0004, r0 /* DC Zero mode */

+	sub	r7, r9
+
 	shll	r9

 	lds	r6, y0
@ -245,18 +249,20 @@ _RGB565A:
 .align 4
 _P8_RGB565A:
 	mov.l	r13, @-r15
-	add	#-2, r9 /* Input stride compensation for openness */
+	sub	r7, r9

 	mov	r7, r13
-	shlr	r7
+	add	#-2, r9 /* Input stride compensation for openness */

 	mov.l	r12, @-r15
-	movt	r6
+	shlr	r7

 	mov.l	r10, @-r15
-	shll	r13
+	movt	r6

 	mov.w	_P8_RGB565A.palette_distance, r0
+	shll	r13
+
 	add	r6, r7

 	sub	r6, r9
@ -337,18 +343,20 @@ _P8_RGB565A.palette_distance:
 .align 4
 _P8_RGB565:
 	mov.l	r13, @-r15
-	add	#-2, r9 /* Input stride compensation for openness */
+	sub	r7, r9

 	mov	r7, r13
-	shlr	r7
+	add	#-2, r9 /* Input stride compensation for openness */

 	mov.l	r12, @-r15
-	movt	r6
+	shlr	r7

 	mov.l	r10, @-r15
-	shll	r13
+	movt	r6

 	mov.w	_P8_RGB565.palette_distance, r0
+	shll	r13
+
 	add	r6, r7

 	sub	r6, r9
@ -437,23 +445,32 @@ _P8_RGB565.palette_distance:
   Otherwise, they point to the next pixels and the restores are no-ops. See
   the strategy used for managing interweaving in P8 formats for details.

-   TODO: Asymptotic performance */
+   The only irregularity is image width, which the command builder cannot
+   modify. It is rounded up to the next multiple of 2, then halved. There is a
+   nice trick for this operation, which is [shlr rX] then adding T to rX. We
+   also need to add -1 for another adjustment, and both are combined into an
+   addc, which saves one add and one movt off the EX critical chain.
+
+   The main loop achieves 5 cycles/pixel. */
 .align 4
 _P4_RGB565A:
-	mov.l	r10, @-r15
 	shlr	r9
+	mov	#-1, r0
+
+	mov.l	r10, @-r15
+	addc	r0, r9

 	mov.l	r11, @-r15
-	add	#-1, r9 /* Input stride compensation for openness */
-
-	mov.l	r12, @-r15
-	add	#2, r8 /* image.palette */
-
-	mov.w	@r2+, r11 /* command.edge1 */
 	shlr	r7

+	mov.l	r12, @-r15
+	sub	r7, r9
+
+	mov.w	@r2+, r11 /* command.edge1 */
+	add	#2, r8 /* image.palette */
+
 	mov.w	@r2+, r12 /* command.edge2 */
-	mov	r5, r10
+	mov	r5, r0

 	mov.l	r13, @-r15
 	shll	r11
@ -461,62 +478,59 @@ _P4_RGB565A:
 	mov.l	r14, @-r15
 	shll	r12

+	add	#-4, r5
+
 	TEX2D_START()

-	mov	r10, r0
 	mov.b	@r3+, r6
-
-	/* Stall for r0 */
+	mov	r0, r10

 	mov.w	@(r0,r11), r13

 	mov.w	@(r0,r12), r14
-
-	/* Main loop with 2 pixels sharing a single byte */
-
-2:	/* Stall for r6 */
-
 	shll	r6

-	mov	r6, r0
+	/* Main loop with 2 pixels sharing a single byte */
+2:	mov	r6, r0
 	and	#0x1e, r0

 	tst	r0, r0

-	bt	4f
-	mov.w	@(r0,r8), r0
-
-	mov.w	r0, @(2,r5)
-     4:	shlr2	r6
-
+	bt.s	4f
 	shlr2	r6

+	mov.w	@(r0,r8), r0
+
+	mov.w	r0, @(6,r5)
+     4:	shlr2	r6
+
 	mov	r6, r0
 	and	#0x1e, r0

 	tst	r0, r0
+	mov.b	@r3+, r6
+
+	bt.s	5f
+	add	#4, r5

-	bt	5f
 	mov.w	@(r0,r8), r0

 	mov.w	r0, @r5
-
-     5: mov.b	@r3+, r6
-3:	add	#4, r5
+3:   5: shll	r6

 	mov	r10, r0
-	add	r7, r10
+	mov	r7, r10

-	/* Stall for r0 */
+	shll2	r10

 	mov.w	r13, @(r0,r11)
-	add	r7, r10
-
-	mov.w	r14, @(r0,r12)
 	add	r4, r10

-	add	r7, r10
-	add	r7, r10
+	mov.w	r14, @(r0,r12)
+	add	r0, r10
+
+	mov	r10, r0
+	/* Parallelizes with [dt r1] expanded from TEX2D_END_NORET() */

 	TEX2D_END_NORET()
 	mov.l	@r15+, r14
@ -529,13 +543,92 @@ _P4_RGB565A:
 	mov.l	@r15+, r8

 /* [Rendering strategy for the P4_RGB565 format]
-   Same as P4_RGB565A without transparency checks (fairly straightforward). */
+   Same as P4_RGB565A without transparency checks (fairly straightforward). The
+   core loop runs in 3.5 cycles/pixel. */
 .align 4
 _P4_RGB565:
+	shlr	r9          /* r9 (image.width) >>= 1, T = dropped low bit */
+	mov	#-1, r0

+	mov.l	r10, @-r15
+	addc	r0, r9      /* r9 = ceil(width / 2) - 1: rounding bit T and the -1 in one add */

+	mov.l	r11, @-r15
+	shlr	r7          /* r7 /= 2: the loop handles 2 pixels per source byte */

+	mov.l	r12, @-r15
+	sub	r7, r9      /* r9 -= halved r7; presumably the input stride applied per row - TODO confirm */

+	mov.w	@r2+, r11 /* command.edge1 */
+	add	#2, r8 /* image.palette */

+	mov.w	@r2+, r12 /* command.edge2 */
+	mov	r5, r0      /* r0 = row start in the output, taken before r5 is biased */

+	mov.l	r13, @-r15
+	shll	r11         /* edge1 index -> byte offset for the mov.w accesses below */

+	mov.l	r14, @-r15
+	shll	r12         /* edge2 index -> byte offset */

+	add	#-4, r5     /* Bias so that @(6,r5), then add #4 and @r5, hit pixels 1 and 0 of a pair */
+	mov	#0x1e, r2   /* r2 reused as the mask selecting a nibble scaled by 2 */

 	TEX2D_START()
-2:
-3:	nop
-	TEX2D_END()

+	mov.b	@r3+, r6    /* Preload the first source byte of the row */
+	mov	#-4, r10    /* shld count: negative means logical right shift, here by 4 */

+	mov.l	r0, @-r15   /* Save row start; r0 is scratch inside the loop */

+	mov.w	@(r0,r11), r13 /* r13 = output pixel currently at edge1, put back after the loop */

+	mov.w	@(r0,r12), r14 /* r14 = output pixel currently at edge2, put back after the loop */
+	shll	r6          /* Scale so that masking with 0x1e gives 2 * nibble */

+	/* Main loop with 2 pixels sharing a single byte */
+2:	mov	r6, r0
+	and	#0x1e, r0   /* r0 = 2 * low nibble */

+	shld	r10, r6     /* r6 >>= 4, bringing the high nibble under the mask */

+	mov.w	@(r0,r8), r0 /* r0 = palette entry for the low nibble */
+	and	r2, r6      /* r6 = 2 * high nibble */

+	mov.w	r0, @(6,r5) /* Low nibble colors the second pixel of the pair */
+	mov	r6, r0

+	mov.b	@r3+, r6    /* Preload the next source byte */
+	add	#4, r5      /* Advance output by one pixel pair */

+	mov.w	@(r0,r8), r0 /* r0 = palette entry for the high nibble */

+	mov.w	r0, @r5     /* High nibble colors the first pixel of the pair */
+3:	shll	r6          /* Pre-scale the byte just loaded for the next iteration */

+	mov.l	@r15+, r0   /* r0 = row start saved before the loop */
+	mov	r7, r10

+	shll2	r10         /* r10 = 4 * r7 */

+	mov.w	r13, @(r0,r11) /* Put back the pixel saved from edge1 */
+	add	r4, r10

+	mov.w	r14, @(r0,r12) /* Put back the pixel saved from edge2 */
+	add	r0, r10     /* r10 = row start + 4 * r7 + r4 */

+	mov	r10, r0     /* Carried in r0 to the next pass; presumably the next row start - TODO confirm */
+	/* Parallelizes with [dt r1] expanded from TEX2D_END_NORET() */

+	TEX2D_END_NORET()
+	mov.l	@r15+, r14
+	mov.l	@r15+, r13
+	mov.l	@r15+, r12
+	mov.l	@r15+, r11
+	mov.l	@r15+, r10
+	mov.l	@r15+, r9   /* r9 and r8 are not pushed in this routine; presumably saved before dispatch - TODO confirm */
+	rts
+	mov.l	@r15+, r8   /* Executes in the rts delay slot */

 /* [Unsupported formats]
   P8 is unsupported, use P8_RGB565 and P8_RGB565A. */
--- a/azur/src/gint/shaders/tex2d.c
+++ b/azur/src/gint/shaders/tex2d.c
@ -44,17 +44,17 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
    cmd.columns = width;
    cmd.image = image;

-    int input_multiplier = 1;
-    void const *data = image->data;
+    int row_stride;
    size_t cmd_size = sizeof cmd - 4;

    if(image->profile == P8_RGB565 || image->profile == P8_RGB565A) {
-        input_multiplier = 0;
-        data += (image->data[0] * 2) + 2;
+        row_stride = image->width;
+        cmd.input = (void *)image->data + (image->data[0] * 2) + 2 +
+            top * row_stride + left;
    }
    else if(image->profile == P4_RGB565 || image->profile == P4_RGB565A) {
-        input_multiplier = -1;
-        data += 32;
+        row_stride = (image->width + 1) >> 1;
+        cmd.input = (void *)image->data + 32 + top * row_stride + (left >> 1);

        int odd_left  = left & 1;
        int odd_right = (left + width) & 1;
@ -65,6 +65,10 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
        x -= odd_left;
        cmd_size += 4;
    }
+    else {
+        row_stride = image->width << 1;
+        cmd.input = (void *)image->data + top * row_stride + (left << 1);
+    }

    /* This divides by azrp_frag_height */
    cmd.fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
@ -72,9 +76,6 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
    while(height > 0) {
        cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1)));

-        int input_offset = image->width * top + left;
-        input_offset = (input_offset << (input_multiplier + 1)) >> 1;
-        cmd.input = data + input_offset;
        cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x);

        y += cmd.lines;
@ -83,6 +84,7 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,

        azrp_queue_command(&cmd, cmd_size);
        cmd.fragment_id++;
+        cmd.input += row_stride * cmd.lines;
    }

    prof_leave(azrp_perf_cmdgen);