From c16b1a85c6b263c4036883b7ddc67fe977808625 Mon Sep 17 00:00:00 2001 From: Lephe Date: Sun, 26 Sep 2021 14:17:52 +0200 Subject: [PATCH] azur: implement support for P4_RGB565A (P4) --- azur/include/azur/gint/render.h | 2 + azur/src/gint/shaders/tex2d.S | 149 ++++++++++++++++++++++++++++---- azur/src/gint/shaders/tex2d.c | 30 +++++-- 3 files changed, 155 insertions(+), 26 deletions(-) diff --git a/azur/include/azur/gint/render.h b/azur/include/azur/gint/render.h index da1c8a9..93797a1 100644 --- a/azur/include/azur/gint/render.h +++ b/azur/include/azur/gint/render.h @@ -268,6 +268,8 @@ struct azrp_shader_tex2d_command { int16_t lines; /* Already offset by start row and column */ void const *input; + /* P4 modes only: */ + int16_t edge1, edge2; }; AZUR_END_DECLS diff --git a/azur/src/gint/shaders/tex2d.S b/azur/src/gint/shaders/tex2d.S index bec3c47..f766ed7 100644 --- a/azur/src/gint/shaders/tex2d.S +++ b/azur/src/gint/shaders/tex2d.S @@ -26,37 +26,36 @@ _azrp_shader_tex2d: mov.w @r2+, r5 /* command.output (offset) */ sub r7, r4 - mov.w @r2+, r1 /* command.lines */ + mov.w @r8+, r9 /* image.profile */ sub r7, r4 - mov.w @r8+, r0 /* image.profile */ + mov.w @r2+, r1 /* command.lines */ add r6, r5 + mov.l @r2+, r3 /* command.input (pointer) */ + shll2 r9 + + mova .formats, r0 + mov.w @r8+, r6 /* image.alpha */ + mov.l @(r0,r9), r0 + mov.w @r8+, r9 /* image.width */ - mov.l @r2+, r3 /* command.input (pointer) */ - mov r0, r2 - - mova .formats, r0 - shll2 r2 - - /* Stall cycle */ - - mov.l @(r0, r2), r0 - jmp @r0 + /* Stall for r9 */ sub r7, r9 .align 4 .formats: .long _RGB565 .long _RGB565A - .long _NOP - .long _P4 + .long _NOP /* P8 */ + .long _P4_RGB565A /* =P4 */ .long _P8_RGB565 .long _P8_RGB565A + .long _P4_RGB565 /* [Loop macros] @@ -414,15 +413,131 @@ _P8_RGB565.palette_distance: /* Distance between image pointer and palette array base */ .word 260 -/* [Rendering strategy for the P4 format] */ -_P4: +/* [Rendering strategy for the P4_RGB565A format] + + This is 
the most complex format. Most of the remarks that apply to + P8_RGB565A also apply here, except that there are fewer opportunities to save + computation because nibbles must be extracted anyway. + + The P4_RGB565A format is simply bopti's P4, but an additional variation + P4_RGB565 is specified to save on transparency handling, which is very + expensive. + + The special nature of the nibble packing means the simplest loop form writes + 2 pixels from a 2-aligned source image position in a single iteration. Other + structures don't even come close: selecting nibbles individually is folly, + while not interweaving is inefficient. So the whole point of this routine is + to forcibly align the subimage on a byte-aligned grid and never break it. + + The command builder for P4 does this alignment before submitting the + command. Obviously the transform can cause one extra pixel to be overwritten + on each side of every line. The command is thus extended with two edge + offsets indicating pixels to preserve at each end. When overwrites occur, + the edge offsets point to the overwritten pixels so they can be restored. + Otherwise, they point to the next pixels and the restores are no-ops. See + the strategy used for managing interweaving in P8 formats for details. 
+ + TODO: Asymptotic performance */ +.align 4 +_P4_RGB565A: + mov.l r10, @-r15 + shlr r9 + + mov.l r11, @-r15 + add #-1, r9 /* Input stride compensation for openness */ + + mov.l r12, @-r15 + add #2, r8 /* image.palette */ + + mov.w @r2+, r11 /* command.edge1 */ + shlr r7 + + mov.w @r2+, r12 /* command.edge2 */ + mov r5, r10 + + mov.l r13, @-r15 + shll r11 + + mov.l r14, @-r15 + shll r12 + + TEX2D_START() + + mov r10, r0 + mov.b @r3+, r6 + + /* Stall for r0 */ + + mov.w @(r0,r11), r13 + + mov.w @(r0,r12), r14 + + /* Main loop with 2 pixels sharing a single byte */ + +2: /* Stall for r6 */ + + shll r6 + + mov r6, r0 + and #0x1e, r0 + + tst r0, r0 + + bt 4f + mov.w @(r0,r8), r0 + + mov.w r0, @(2,r5) + 4: shlr2 r6 + + shlr2 r6 + + mov r6, r0 + and #0x1e, r0 + + tst r0, r0 + + bt 5f + mov.w @(r0,r8), r0 + + mov.w r0, @r5 + + 5: mov.b @r3+, r6 +3: add #4, r5 + + mov r10, r0 + add r7, r10 + + /* Stall for r0 */ + + mov.w r13, @(r0,r11) + add r7, r10 + + mov.w r14, @(r0,r12) + add r4, r10 + + add r7, r10 + add r7, r10 + + TEX2D_END_NORET() + mov.l @r15+, r14 + mov.l @r15+, r13 + mov.l @r15+, r12 + mov.l @r15+, r11 + mov.l @r15+, r10 + mov.l @r15+, r9 + rts + mov.l @r15+, r8 + +/* [Rendering strategy for the P4_RGB565 format] + Same as P4_RGB565A without transparency checks (fairly straightforward). */ +.align 4 +_P4_RGB565: TEX2D_START() 2: 3: nop TEX2D_END() /* [Unsupported formats] - P8 is unsupported, use P8_RGB565 and P8_RGB565A. 
*/ _NOP: mov.l @r15+, r9 diff --git a/azur/src/gint/shaders/tex2d.c b/azur/src/gint/shaders/tex2d.c index fff6fab..60a20e5 100644 --- a/azur/src/gint/shaders/tex2d.c +++ b/azur/src/gint/shaders/tex2d.c @@ -18,11 +18,12 @@ void azrp_shader_tex2d_configure(void) //--- /* Profile IDs */ -#define PX_RGB565 0 -#define PX_RGB565A 1 -#define PX_P4 3 -#define PX_P8_RGB565 4 -#define PX_P8_RGB565A 5 +#define RGB565 0 +#define RGB565A 1 +#define P4_RGB565A 3 +#define P8_RGB565 4 +#define P8_RGB565A 5 +#define P4_RGB565 6 void azrp_image(int x, int y, bopti_image_t const *image) { @@ -45,14 +46,24 @@ void azrp_subimage(int x, int y, bopti_image_t const *image, int input_multiplier = 1; void const *data = image->data; + size_t cmd_size = sizeof cmd - 4; - if(image->profile == PX_P8_RGB565 || image->profile == PX_P8_RGB565A) { + if(image->profile == P8_RGB565 || image->profile == P8_RGB565A) { input_multiplier = 0; data += (image->data[0] * 2) + 2; } - else if(image->profile == PX_P4) { + else if(image->profile == P4_RGB565 || image->profile == P4_RGB565A) { input_multiplier = -1; data += 32; + + int odd_left = left & 1; + int odd_right = (left + width) & 1; + + cmd.edge1 = -1 + odd_left; + cmd.edge2 = width + odd_left; + cmd.columns += odd_left + odd_right; + x -= odd_left; + cmd_size += 4; } /* This divides by azrp_frag_height */ @@ -61,7 +72,8 @@ void azrp_subimage(int x, int y, bopti_image_t const *image, while(height > 0) { cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1))); - int input_offset = (image->width * top + left) << input_multiplier; + int input_offset = image->width * top + left; + input_offset = (input_offset << (input_multiplier + 1)) >> 1; cmd.input = data + input_offset; cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x); @@ -69,7 +81,7 @@ void azrp_subimage(int x, int y, bopti_image_t const *image, top += cmd.lines; height -= cmd.lines; - azrp_queue_command(&cmd, sizeof cmd); + azrp_queue_command(&cmd, cmd_size); 
cmd.fragment_id++; }