diff --git a/azur/src/gint/shaders/tex2d.S b/azur/src/gint/shaders/tex2d.S index f766ed7..67d3c87 100644 --- a/azur/src/gint/shaders/tex2d.S +++ b/azur/src/gint/shaders/tex2d.S @@ -44,8 +44,7 @@ _azrp_shader_tex2d: mov.w @r8+, r9 /* image.width */ jmp @r0 - /* Stall for r9 */ - sub r7, r9 + nop .align 4 .formats: @@ -108,6 +107,8 @@ _azrp_shader_tex2d: .align 4 _RGB565: mov #8, r0 /* Maximum width for naive method */ + sub r7, r9 + cmp/ge r7, r0 shll r9 @@ -118,6 +119,7 @@ _RGB565: /* Use naive method for opposite source/destination parity */ mov r5, r6 xor r3, r6 + tst r0, r6 bf _RGB565.naive @@ -189,6 +191,8 @@ _RGB565A: shll16 r6 mov #0x0004, r0 /* DC Zero mode */ + sub r7, r9 + shll r9 lds r6, y0 @@ -245,18 +249,20 @@ _RGB565A: .align 4 _P8_RGB565A: mov.l r13, @-r15 - add #-2, r9 /* Input stride compensation for openness */ + sub r7, r9 mov r7, r13 - shlr r7 + add #-2, r9 /* Input stride compensation for openness */ mov.l r12, @-r15 - movt r6 + shlr r7 mov.l r10, @-r15 - shll r13 + movt r6 mov.w _P8_RGB565A.palette_distance, r0 + shll r13 + add r6, r7 sub r6, r9 @@ -337,18 +343,20 @@ _P8_RGB565A.palette_distance: .align 4 _P8_RGB565: mov.l r13, @-r15 - add #-2, r9 /* Input stride compensation for openness */ + sub r7, r9 mov r7, r13 - shlr r7 + add #-2, r9 /* Input stride compensation for openness */ mov.l r12, @-r15 - movt r6 + shlr r7 mov.l r10, @-r15 - shll r13 + movt r6 mov.w _P8_RGB565.palette_distance, r0 + shll r13 + add r6, r7 sub r6, r9 @@ -437,23 +445,32 @@ _P8_RGB565.palette_distance: Otherwise, they point to the next pixels and the restores are no-ops. See the strategy used for managing interweaving in P8 formats for details. - TODO: Asymptotic performance */ + The only irregularity is image width, which the command builder cannot + modify. It is rounded up to the next multiple of 2, then halved. There is a + nice trick for this operation, which is [shlr rX] then adding T to rX. 
We + also need to add -1 for another adjustment, and both are combined into an + addc, which saves one add and one movt off the EX critical chain. + + The main loop achieves 5 cycles/pixel. */ .align 4 _P4_RGB565A: - mov.l r10, @-r15 shlr r9 + mov #-1, r0 + + mov.l r10, @-r15 + addc r0, r9 mov.l r11, @-r15 - add #-1, r9 /* Input stride compensation for openness */ - - mov.l r12, @-r15 - add #2, r8 /* image.palette */ - - mov.w @r2+, r11 /* command.edge1 */ shlr r7 + mov.l r12, @-r15 + sub r7, r9 + + mov.w @r2+, r11 /* command.edge1 */ + add #2, r8 /* image.palette */ + mov.w @r2+, r12 /* command.edge2 */ - mov r5, r10 + mov r5, r0 mov.l r13, @-r15 shll r11 @@ -461,62 +478,59 @@ _P4_RGB565A: mov.l r14, @-r15 shll r12 + add #-4, r5 + TEX2D_START() - mov r10, r0 mov.b @r3+, r6 - - /* Stall for r0 */ + mov r0, r10 mov.w @(r0,r11), r13 mov.w @(r0,r12), r14 - - /* Main loop with 2 pixels sharing a single byte */ - -2: /* Stall for r6 */ - shll r6 - mov r6, r0 + /* Main loop with 2 pixels sharing a single byte */ +2: mov r6, r0 and #0x1e, r0 tst r0, r0 - bt 4f - mov.w @(r0,r8), r0 - - mov.w r0, @(2,r5) - 4: shlr2 r6 - + bt.s 4f shlr2 r6 + mov.w @(r0,r8), r0 + + mov.w r0, @(6,r5) + 4: shlr2 r6 + mov r6, r0 and #0x1e, r0 tst r0, r0 + mov.b @r3+, r6 + + bt.s 5f + add #4, r5 - bt 5f mov.w @(r0,r8), r0 mov.w r0, @r5 - - 5: mov.b @r3+, r6 -3: add #4, r5 +3: 5: shll r6 mov r10, r0 - add r7, r10 + mov r7, r10 - /* Stall for r0 */ + shll2 r10 mov.w r13, @(r0,r11) - add r7, r10 - - mov.w r14, @(r0,r12) add r4, r10 - add r7, r10 - add r7, r10 + mov.w r14, @(r0,r12) + add r0, r10 + + mov r10, r0 + /* Parallelizes with [dt r1] expanded from TEX2D_END_NORET() */ TEX2D_END_NORET() mov.l @r15+, r14 @@ -529,13 +543,92 @@ _P4_RGB565A: mov.l @r15+, r8 /* [Rendering strategy for the P4_RGB565 format] - Same as P4_RGB565A without transparency checks (fairly straightforward). */ + Same as P4_RGB565A without transparency checks (fairly straightforward). 
The + core loop runs in 3.5 cycles/pixel. */ .align 4 _P4_RGB565: + shlr r9 + mov #-1, r0 + + mov.l r10, @-r15 + addc r0, r9 + + mov.l r11, @-r15 + shlr r7 + + mov.l r12, @-r15 + sub r7, r9 + + mov.w @r2+, r11 /* command.edge1 */ + add #2, r8 /* image.palette */ + + mov.w @r2+, r12 /* command.edge2 */ + mov r5, r0 + + mov.l r13, @-r15 + shll r11 + + mov.l r14, @-r15 + shll r12 + + add #-4, r5 + mov #0x1e, r2 + TEX2D_START() -2: -3: nop - TEX2D_END() + + mov.b @r3+, r6 + mov #-4, r10 + + mov.l r0, @-r15 + + mov.w @(r0,r11), r13 + + mov.w @(r0,r12), r14 + shll r6 + + /* Main loop with 2 pixels sharing a single byte */ +2: mov r6, r0 + and #0x1e, r0 + + shld r10, r6 + + mov.w @(r0,r8), r0 + and r2, r6 + + mov.w r0, @(6,r5) + mov r6, r0 + + mov.b @r3+, r6 + add #4, r5 + + mov.w @(r0,r8), r0 + + mov.w r0, @r5 +3: shll r6 + + mov.l @r15+, r0 + mov r7, r10 + + shll2 r10 + + mov.w r13, @(r0,r11) + add r4, r10 + + mov.w r14, @(r0,r12) + add r0, r10 + + mov r10, r0 + /* Parallelizes with [dt r1] expanded from TEX2D_END_NORET() */ + + TEX2D_END_NORET() + mov.l @r15+, r14 + mov.l @r15+, r13 + mov.l @r15+, r12 + mov.l @r15+, r11 + mov.l @r15+, r10 + mov.l @r15+, r9 + rts + mov.l @r15+, r8 /* [Unsupported formats] P8 is unsupported, use P8_RGB565 and P8_RGB565A. 
*/ diff --git a/azur/src/gint/shaders/tex2d.c b/azur/src/gint/shaders/tex2d.c index 60a20e5..eef2ffa 100644 --- a/azur/src/gint/shaders/tex2d.c +++ b/azur/src/gint/shaders/tex2d.c @@ -44,17 +44,17 @@ void azrp_subimage(int x, int y, bopti_image_t const *image, cmd.columns = width; cmd.image = image; - int input_multiplier = 1; - void const *data = image->data; + int row_stride; size_t cmd_size = sizeof cmd - 4; if(image->profile == P8_RGB565 || image->profile == P8_RGB565A) { - input_multiplier = 0; - data += (image->data[0] * 2) + 2; + row_stride = image->width; + cmd.input = (void *)image->data + (image->data[0] * 2) + 2 + + top * row_stride + left; } else if(image->profile == P4_RGB565 || image->profile == P4_RGB565A) { - input_multiplier = -1; - data += 32; + row_stride = (image->width + 1) >> 1; + cmd.input = (void *)image->data + 32 + top * row_stride + (left >> 1); int odd_left = left & 1; int odd_right = (left + width) & 1; @@ -65,6 +65,10 @@ void azrp_subimage(int x, int y, bopti_image_t const *image, x -= odd_left; cmd_size += 4; } + else { + row_stride = image->width << 1; + cmd.input = (void *)image->data + top * row_stride + (left << 1); + } /* This divides by azrp_frag_height */ cmd.fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4); @@ -72,9 +76,6 @@ void azrp_subimage(int x, int y, bopti_image_t const *image, while(height > 0) { cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1))); - int input_offset = image->width * top + left; - input_offset = (input_offset << (input_multiplier + 1)) >> 1; - cmd.input = data + input_offset; cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x); y += cmd.lines; @@ -83,6 +84,7 @@ void azrp_subimage(int x, int y, bopti_image_t const *image, azrp_queue_command(&cmd, cmd_size); cmd.fragment_id++; + cmd.input += row_stride * cmd.lines; } prof_leave(azrp_perf_cmdgen);