From 0c8efcd63502a4e5b650e2991fb5b11d223563fb Mon Sep 17 00:00:00 2001 From: Lephe Date: Fri, 27 Aug 2021 22:05:21 +0200 Subject: [PATCH] azur: RGB565A in tex2d (still 4c/p) --- azur/src/gint/shaders/tex2d.S | 128 +++++++++++++++++----------------- 1 file changed, 65 insertions(+), 63 deletions(-) diff --git a/azur/src/gint/shaders/tex2d.S b/azur/src/gint/shaders/tex2d.S index f71a67a..315d223 100644 --- a/azur/src/gint/shaders/tex2d.S +++ b/azur/src/gint/shaders/tex2d.S @@ -1,20 +1,14 @@ .global _azrp_shader_tex2d .align 4 -/* Profile values from bopti */ -#define PX_RGB565 0 -#define PX_RGB565A 1 -#define PX_P8 2 -#define PX_P4 3 - /* Register assignment r0: (temporary) r1: Lines - r2: Output + r2: Command queue; (temporary) r3: Input r4: [parameter] azrp_width*2; output stride - r5: [parameter] Command queue; (temporary) - r6: [parameter] azrp_frag; (temporary) + r5: [parameter] Command queue; Output + r6: [parameter] azrp_frag; alpha value or (temporary) r7: Columns r8: Input stride r9: Image profile */ @@ -23,38 +17,43 @@ _azrp_shader_tex2d: add #2, r5 mov.l r9, @-r15 + mov r5, r2 - mov.w @r5+, r7 /* command.columns */ + mov.w @r2+, r7 /* command.columns */ - mov.l @r5+, r8 /* command.image */ + mov.l @r2+, r8 /* command.image */ - mov.w @r5+, r2 /* command.output (offset) */ + mov.w @r2+, r5 /* command.output (offset) */ sub r7, r4 - mov.w @r5+, r1 /* command.lines */ + mov.w @r2+, r1 /* command.lines */ sub r7, r4 mov.w @r8+, r0 /* image.profile */ - add r6, r2 + add r6, r5 mov.w @r8+, r6 /* image.alpha */ - cmp/eq #PX_P4, r0 mov.w @r8, r8 /* image.width */ - mov.l @r5+, r3 /* command.input (pointer) */ + mov.l @r2+, r3 /* command.input (pointer) */ + mov r0, r2 + mova .formats, r0 + shll2 r2 + + mov.l @(r0, r2), r0 sub r7, r8 - bt.s .format_P4 + jmp @r0 shll r8 - cmp/eq #PX_P8, r0 - - bt .format_P8 - cmp/eq #PX_RGB565A, r0 - - bt .format_RGB565A +.align 4 +.formats: + .long _RGB565 + .long _RGB565A + .long _P8 + .long _P4 /* Default below is .format_RGB565 
*/ @@ -74,7 +73,7 @@ _azrp_shader_tex2d: dt r1; \ #define TEX2D_END() \ - add r4, r2; \ + add r4, r5; \ bf.s 1b; \ add r8, r3; \ \ @@ -106,105 +105,108 @@ _azrp_shader_tex2d: tileset shader) should aim for that route though. Also, movua.l followed by mov.l is even slower (5 cycles). */ -.format_RGB565: +_RGB565: mov #8, r0 /* Maximum width for naive method */ cmp/ge r7, r0 - bt.s .naive + bt.s _RGB565.naive mov #2, r0 /* Use naive method for opposite source/destination parity */ - mov r2, r6 + mov r5, r6 xor r3, r6 tst r0, r6 - bf .naive + bf _RGB565.naive shlr r7 - bt .wo + bt _RGB565.wo -.we: - tst r0, r2 - bf .we_do +_RGB565.we: + tst r0, r5 + bf _RGB565.we_do -.we_de: +_RGB565.we_de: TEX2D_START() 2: movs.l @r3+, x0 -3: movs.l x0, @r2+ +3: movs.l x0, @r5+ TEX2D_END() -.we_do: +_RGB565.we_do: add #-1, r7 TEX2D_START() movs.w @r3+, x0 - movs.w x0, @r2+ + movs.w x0, @r5+ 2: movs.l @r3+, x0 -3: movs.l x0, @r2+ +3: movs.l x0, @r5+ movs.w @r3+, x0 - movs.w x0, @r2+ + movs.w x0, @r5+ TEX2D_END() -.wo: - tst r0, r2 - bf .wo_do +_RGB565.wo: + tst r0, r5 + bf _RGB565.wo_do -.wo_de: +_RGB565.wo_de: TEX2D_START() 2: movs.l @r3+, x0 -3: movs.l x0, @r2+ +3: movs.l x0, @r5+ movs.w @r3+, x0 - movs.w x0, @r2+ + movs.w x0, @r5+ TEX2D_END() -.wo_do: +_RGB565.wo_do: TEX2D_START() movs.w @r3+, x0 - movs.w x0, @r2+ + movs.w x0, @r5+ 2: movs.l @r3+, x0 -3: movs.l x0, @r2+ +3: movs.l x0, @r5+ TEX2D_END() /* Naive method for small widths and opposite source/destination parity */ -.naive: +_RGB565.naive: TEX2D_START() 2: movs.w @r3+, x0 -3: movs.w x0, @r2+ +3: movs.w x0, @r5+ TEX2D_END() /* [Rendering strategy for the RGB565A format] Since we have to check for the alpha value in each pixel, there's really no longword-based optimization. Instead, we just go as fast as possible with - each pixels, using DSP instructions. Branchless jump is pretty useful. + each pixel, using DSP instructions because conditional execution is pretty + damn good. This takes 4 cycles/pixel. 
I tried a number of reductions to + 3 cycles/pixel but could not get that to work. */ - TODO: Opening iterations will definitely save at least 1 cycle per pixel; it - just requires a subcase for extremely small images (width = 1). */ +_RGB565A: + shll16 r6 + mov #0x0004, r0 /* DC Zero mode */ -.format_RGB565A: - mov r2, r5 + lds r6, y0 + + lds r0, dsr TEX2D_START() - /* In the comparison, DC=1 if x0 == image.alpha */ -2: movs.w @r3+, x0 - pcmp x0, y0 movx.w @r5, x1 - dct pcopy x1, x0 -3: movx.w x0, @r5+ +2: movs.w @r3+, x0 + pcmp x0, y0 movx.w @r5, x1 + dct pcopy x1, x0 +3: movx.w x0, @r5+ TEX2D_END() /* [Rendering strategy for the P8 format] */ -.format_P8: +_P8: TEX2D_START() 2: -3: +3: nop TEX2D_END() /* [Rendering strategy for the P4 format] */ -.format_P4: +_P4: TEX2D_START() 2: -3: +3: nop TEX2D_END()