azur: RGB565A in tex2d (still 4c/p)

This commit is contained in:
Lephe 2021-08-27 22:05:21 +02:00 committed by Lephenixnoir
parent 0fec6da1c4
commit 0c8efcd635
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
1 changed file with 65 additions and 63 deletions

View File

@ -1,20 +1,14 @@
.global _azrp_shader_tex2d
.align 4
/* Profile values from bopti */
#define PX_RGB565 0
#define PX_RGB565A 1
#define PX_P8 2
#define PX_P4 3
/* Register assignment
r0: (temporary)
r1: Lines
r2: Output
r2: Command queue; (temporary)
r3: Input
r4: [parameter] azrp_width*2; output stride
r5: [parameter] Command queue; (temporary)
r6: [parameter] azrp_frag; (temporary)
r5: [parameter] Command queue; Output
r6: [parameter] azrp_frag; alpha value or (temporary)
r7: Columns
r8: Input stride
r9: Image profile */
@ -23,38 +17,43 @@ _azrp_shader_tex2d:
add #2, r5
mov.l r9, @-r15
mov r5, r2
mov.w @r5+, r7 /* command.columns */
mov.w @r2+, r7 /* command.columns */
mov.l @r5+, r8 /* command.image */
mov.l @r2+, r8 /* command.image */
mov.w @r5+, r2 /* command.output (offset) */
mov.w @r2+, r5 /* command.output (offset) */
sub r7, r4
mov.w @r5+, r1 /* command.lines */
mov.w @r2+, r1 /* command.lines */
sub r7, r4
mov.w @r8+, r0 /* image.profile */
add r6, r2
add r6, r5
mov.w @r8+, r6 /* image.alpha */
cmp/eq #PX_P4, r0
mov.w @r8, r8 /* image.width */
mov.l @r5+, r3 /* command.input (pointer) */
mov.l @r2+, r3 /* command.input (pointer) */
mov r0, r2
mova .formats, r0
shll2 r2
mov.l @(r0, r2), r0
sub r7, r8
bt.s .format_P4
jmp @r0
shll r8
cmp/eq #PX_P8, r0
bt .format_P8
cmp/eq #PX_RGB565A, r0
bt .format_RGB565A
.align 4
.formats:
.long _RGB565
.long _RGB565A
.long _P8
.long _P4
/* Default below is .format_RGB565 */
@ -74,7 +73,7 @@ _azrp_shader_tex2d:
dt r1; \
#define TEX2D_END() \
add r4, r2; \
add r4, r5; \
bf.s 1b; \
add r8, r3; \
\
@ -106,105 +105,108 @@ _azrp_shader_tex2d:
tileset shader) should aim for that route though. Also, movua.l followed by
mov.l is even slower (5 cycles). */
/* RGB565 path: straight copy of the pixels of each line from the input
   pointer (r3) to the output pointer, using the DSP movs transfers through
   x0. Three cases are visible below:
     - width <= 8, or input/output addresses differing in bit 1: copy one
       16-bit word at a time (the "naive" loop at the end);
     - otherwise: copy 32-bit longwords, adding a single word copy before
       and/or after the longword loop depending on the width parity and on
       bit 1 of the output address.
   NOTE(review): every changed line appears twice in this listing, once with
   r2 and `.name` labels and once with r5 and `_RGB565.name` labels. This
   looks like the before/after of moving the output pointer from r2 to r5 --
   confirm which variant is live before assembling.
   NOTE(review): TEX2D_START()/TEX2D_END() are defined outside this block;
   the numeric labels 2: and 3: presumably delimit the repeated part of the
   per-line loop that the macros set up -- confirm against the macros. */
.format_RGB565:
_RGB565:
mov #8, r0 /* Maximum width for naive method */
/* T = (8 >= r7), signed compare; r7 holds the column count */
cmp/ge r7, r0
/* bt.s is a delayed branch: the `mov #2, r0` after it is the delay slot, so
   r0 = 2 (mask for address bit 1) whether or not the branch is taken */
bt.s .naive
bt.s _RGB565.naive
mov #2, r0
/* Use naive method for opposite source/destination parity */
/* r6 = output ^ input; tst sets T when bit 1 of r6 is clear, so bf branches
   when the two addresses differ in bit 1 (they can then never both be
   longword-aligned at the same time) */
mov r2, r6
mov r5, r6
xor r3, r6
tst r0, r6
bf .naive
bf _RGB565.naive
/* r7 = width / 2 (number of pixel pairs); T receives the bit shifted out,
   i.e. T = 1 when the width is odd */
shlr r7
bt .wo
bt _RGB565.wo
/* Even width: select the variant from bit 1 of the output address */
.we:
tst r0, r2
bf .we_do
_RGB565.we:
tst r0, r5
bf _RGB565.we_do
/* Even width, output address bit 1 clear: longword copies only */
.we_de:
_RGB565.we_de:
TEX2D_START()
2: movs.l @r3+, x0
3: movs.l x0, @r2+
3: movs.l x0, @r5+
TEX2D_END()
/* Even width, output address bit 1 set: one word copy, then longwords, then
   one final word copy. r7 is decremented because one pixel pair is handled
   by those two single-word copies instead of a longword. */
.we_do:
_RGB565.we_do:
add #-1, r7
TEX2D_START()
movs.w @r3+, x0
movs.w x0, @r2+
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r2+
3: movs.l x0, @r5+
movs.w @r3+, x0
movs.w x0, @r2+
movs.w x0, @r5+
TEX2D_END()
/* Odd width: select the variant from bit 1 of the output address */
.wo:
tst r0, r2
bf .wo_do
_RGB565.wo:
tst r0, r5
bf _RGB565.wo_do
/* Odd width, output address bit 1 clear: longwords first, then the one
   leftover pixel as a trailing word copy */
.wo_de:
_RGB565.wo_de:
TEX2D_START()
2: movs.l @r3+, x0
3: movs.l x0, @r2+
3: movs.l x0, @r5+
movs.w @r3+, x0
movs.w x0, @r2+
movs.w x0, @r5+
TEX2D_END()
/* Odd width, output address bit 1 set: the leftover pixel is copied first
   as a single word, then the remaining pairs as longwords */
.wo_do:
_RGB565.wo_do:
TEX2D_START()
movs.w @r3+, x0
movs.w x0, @r2+
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r2+
3: movs.l x0, @r5+
TEX2D_END()
/* Naive method for small widths and opposite source/destination parity */
/* One 16-bit load and one 16-bit store per pixel; r7 is still the full
   column count on this path because both branches here are taken before
   the shlr above */
.naive:
_RGB565.naive:
TEX2D_START()
2: movs.w @r3+, x0
3: movs.w x0, @r2+
3: movs.w x0, @r5+
TEX2D_END()
/* [Rendering strategy for the RGB565A format]
Since we have to check for the alpha value in each pixel, there's really no
longword-based optimization. Instead, we just go as fast as possible with
each pixel, using DSP instructions. Branchless jump is pretty useful.
each pixel, using DSP instructions because conditional execution is pretty
damn good. This takes 4 cycles/pixel. I tried a number of reductions to
3 cycles/pixel but could not get that to work.
TODO: Opening iterations will definitely save at least 1 cycle per pixel; it
just requires a subcase for extremely small images (width = 1). */
/* RGB565A path: per-pixel copy that skips pixels equal to the image's alpha
   color. For each pixel the source word is loaded into x0 and compared with
   y0 (the alpha color), while the current output word is loaded into x1;
   when the comparison reports equality, x1 replaces x0, so the store writes
   the output's own value back and the pixel is left unchanged.
   NOTE(review): `.format_RGB565A:` / `mov r2, r5` and the second copy of the
   four-line loop body look like the pre-commit variant of lines that were
   changed (the rest of this listing interleaves old and new lines the same
   way) -- confirm that only one copy of the loop body is assembled.
   NOTE(review): TEX2D_START()/TEX2D_END() are defined outside this block;
   labels 2: and 3: presumably delimit the repeated part of the loop. */
_RGB565A:
/* r6 holds image.alpha; move it into the upper 16 bits before loading it
   into y0 -- presumably because DSP word loads place their data in the
   upper half of the register, so x0 and y0 compare like for like (confirm
   against the SH4AL-DSP manual) */
shll16 r6
mov #0x0004, r0 /* DC Zero mode */
.format_RGB565A:
mov r2, r5
lds r6, y0
/* DSR = 4: selects the condition the DC bit reflects after DSP operations */
lds r0, dsr
TEX2D_START()
/* In the comparison, DC=1 if x0 == image.alpha */
/* pcmp is issued in parallel with a movx.w load of the current output pixel
   into x1; `dct pcopy` runs only when DC is set and overwrites the source
   pixel in x0 with that output pixel */
2: movs.w @r3+, x0
pcmp x0, y0 movx.w @r5, x1
dct pcopy x1, x0
3: movx.w x0, @r5+
2: movs.w @r3+, x0
pcmp x0, y0 movx.w @r5, x1
dct pcopy x1, x0
3: movx.w x0, @r5+
TEX2D_END()
/* [Rendering strategy for the P8 format] */
/* Placeholder: no pixel load or store is visible in the loop body below,
   only a nop, so this path does not appear to be implemented yet.
   NOTE(review): `3:` and `3: nop` look like the before/after of one changed
   line; presumably the nop gives label 3: an instruction to mark -- confirm
   against the TEX2D_START() macro, which is defined outside this block. */
.format_P8:
_P8:
TEX2D_START()
2:
3:
3: nop
TEX2D_END()
/* [Rendering strategy for the P4 format] */
/* Placeholder: no pixel load or store is visible in the loop body below,
   only a nop, so this path does not appear to be implemented yet.
   NOTE(review): `3:` and `3: nop` look like the before/after of one changed
   line; presumably the nop gives label 3: an instruction to mark -- confirm
   against the TEX2D_START() macro, which is defined outside this block. */
.format_P4:
_P4:
TEX2D_START()
2:
3:
3: nop
TEX2D_END()