azur: implement support for P4_RGB565A (P4)

This commit is contained in:
Lephe 2021-09-26 14:17:52 +02:00 committed by Lephenixnoir
parent ddff9f6d6b
commit c16b1a85c6
Signed by untrusted user: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
3 changed files with 155 additions and 26 deletions

View File

@ -268,6 +268,8 @@ struct azrp_shader_tex2d_command {
int16_t lines;
/* Already offset by start row and column */
void const *input;
/* P4 modes only: */
int16_t edge1, edge2;
};
AZUR_END_DECLS

View File

@ -26,37 +26,36 @@ _azrp_shader_tex2d:
mov.w @r2+, r5 /* command.output (offset) */
sub r7, r4
mov.w @r2+, r1 /* command.lines */
mov.w @r8+, r9 /* image.profile */
sub r7, r4
mov.w @r8+, r0 /* image.profile */
mov.w @r2+, r1 /* command.lines */
add r6, r5
mov.l @r2+, r3 /* command.input (pointer) */
shll2 r9
mova .formats, r0
mov.w @r8+, r6 /* image.alpha */
mov.l @(r0,r9), r0
mov.w @r8+, r9 /* image.width */
mov.l @r2+, r3 /* command.input (pointer) */
mov r0, r2
mova .formats, r0
shll2 r2
/* Stall cycle */
mov.l @(r0, r2), r0
jmp @r0
/* Stall for r9 */
sub r7, r9
.align 4
.formats:
.long _RGB565
.long _RGB565A
.long _NOP
.long _P4
.long _NOP /* P8 */
.long _P4_RGB565A /* =P4 */
.long _P8_RGB565
.long _P8_RGB565A
.long _P4_RGB565
/* [Loop macros]
@ -414,15 +413,131 @@ _P8_RGB565.palette_distance:
/* Distance between image pointer and palette array base */
.word 260
/* [Rendering strategy for the P4 format] */
_P4:
/* [Rendering strategy for the P4_RGB565A format]
This is the most complex format. Most of the remarks that apply to
P8_RGB565A also apply here, except that there are fewer opportunities to save
computation because nibbles must be extracted anyway.
The P4_RGB565A format is simply bopti's P4, but an additional variation
P4_RGB565 is specified to save on transparency handling, which is very
expensive.
The special nature of the nibble packing means the simplest loop form writes
2 pixels from a 2-aligned source image position in a single iteration. Other
structures don't even come close: selecting nibbles individually is folly,
while not interweaving is inefficient. So the whole point of this routine is
to forcibly align the subimage on a byte-aligned grid and never break it.
The command builder for P4 does this alignment before submitting the
command. Obviously the transform can cause one extra pixel to be overwritten
on each side of every line. The command is thus extended with two edge
offsets indicating pixels to preserve at each end. When an overwrite occurs,
the edge offsets point to the overwritten pixels so they can be restored.
Otherwise, they point to the next pixels and the restores are no-ops. See
the strategy used for managing interweaving in P8 formats for details.
TODO: Asymptotic performance */
.align 4
/* Renderer for 4-bit paletted images with transparency. Each source byte
   holds two pixels; one loop iteration consumes one byte and conditionally
   writes two 16-bit output pixels.

   Register roles, as used in this block:
     r3   input pointer (read byte by byte, post-incremented)
     r5   output pointer (advanced by 4 bytes per iteration)
     r6   current source byte (scratch)
     r7   halved on entry; added four times to r10 at the end of each line
     r8   palette base after the +2 adjustment below
     r9   halved, then decremented (see comment on the add #-1)
     r10  output position of the start of the current line
     r11  edge1 * 2, byte offset from r10 of the first preserved pixel
     r12  edge2 * 2, byte offset from r10 of the second preserved pixel
     r13  saved value of the pixel at r10 + r11
     r14  saved value of the pixel at r10 + r12

   NOTE(review): TEX2D_START() and TEX2D_END_NORET() are defined outside this
   block; presumably they set up the per-line and per-column looping around
   labels 2: and 3: and consume r4/r9 as strides -- confirm against the macro
   definitions. r8 and r9 are restored at the end but not saved here, so they
   are presumably pushed by the shared entry code -- confirm. */
_P4_RGB565A:
mov.l r10, @-r15
shlr r9
mov.l r11, @-r15
/* The pixel loop preloads the next source byte (label 5:), so one byte too
   many is consumed per line; presumably this -1 accounts for it. */
add #-1, r9 /* Input stride compensation for openness */
mov.l r12, @-r15
add #2, r8 /* image.palette */
mov.w @r2+, r11 /* command.edge1 */
shlr r7
mov.w @r2+, r12 /* command.edge2 */
/* r10 keeps the start-of-line output position while r5 walks the line */
mov r5, r10
mov.l r13, @-r15
/* Edge offsets are doubled so they can index 16-bit pixels */
shll r11
mov.l r14, @-r15
shll r12
TEX2D_START()
mov r10, r0
/* Preload the first source byte of the line */
mov.b @r3+, r6
/* Stall for r0 */
/* Save the two edge pixels before they can be overwritten */
mov.w @(r0,r11), r13
mov.w @(r0,r12), r14
/* Main loop with 2 pixels sharing a single byte */
2: /* Stall for r6 */
/* Low nibble: r0 = (byte & 0x0f) * 2, a byte offset into the palette.
   The value goes through r0 because and #imm only operates on r0. */
shll r6
mov r6, r0
and #0x1e, r0
/* Palette index 0 is skipped, i.e. treated as transparent */
tst r0, r0
bt 4f
mov.w @(r0,r8), r0
/* The low nibble is the second pixel of the pair */
mov.w r0, @(2,r5)
/* High nibble: r6 is already shifted left by 1, so shifting right by 4
   leaves (byte >> 3); masking gives (high nibble) * 2. */
4: shlr2 r6
shlr2 r6
mov r6, r0
and #0x1e, r0
tst r0, r0
bt 5f
mov.w @(r0,r8), r0
/* The high nibble is the first pixel of the pair */
mov.w r0, @r5
/* Preload the next source byte for the following iteration */
5: mov.b @r3+, r6
3: add #4, r5
/* End of line: restore both edge pixels relative to the old line start (r0)
   while r10 is advanced to the next line by 4 * r7 + r4. The additions are
   interleaved with the stores to fill stall cycles. */
mov r10, r0
add r7, r10
/* Stall for r0 */
mov.w r13, @(r0,r11)
add r7, r10
mov.w r14, @(r0,r12)
add r4, r10
add r7, r10
add r7, r10
TEX2D_END_NORET()
/* Restore saved registers in reverse push order */
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r12
mov.l @r15+, r11
mov.l @r15+, r10
mov.l @r15+, r9
rts
/* Executed in the delay slot of rts */
mov.l @r15+, r8
/* [Rendering strategy for the P4_RGB565 format]
Same as P4_RGB565A without transparency checks (fairly straightforward). */
.align 4
_P4_RGB565:
/* NOTE(review): placeholder. The code between labels 2: and 3: is a single
   nop, so this block contains no nibble extraction, palette lookup or pixel
   store of its own. TODO: implement the opaque variant of the P4 loop. */
TEX2D_START()
2:
3: nop
TEX2D_END()
/* [Unsupported formats]
P8 is unsupported, use P8_RGB565 and P8_RGB565A. */
_NOP:
mov.l @r15+, r9

View File

@ -18,11 +18,12 @@ void azrp_shader_tex2d_configure(void)
//---
/* Profile IDs */
#define PX_RGB565 0
#define PX_RGB565A 1
#define PX_P4 3
#define PX_P8_RGB565 4
#define PX_P8_RGB565A 5
#define RGB565 0
#define RGB565A 1
#define P4_RGB565A 3
#define P8_RGB565 4
#define P8_RGB565A 5
#define P4_RGB565 6
void azrp_image(int x, int y, bopti_image_t const *image)
{
@ -45,14 +46,24 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
int input_multiplier = 1;
void const *data = image->data;
size_t cmd_size = sizeof cmd - 4;
if(image->profile == PX_P8_RGB565 || image->profile == PX_P8_RGB565A) {
if(image->profile == P8_RGB565 || image->profile == P8_RGB565A) {
input_multiplier = 0;
data += (image->data[0] * 2) + 2;
}
else if(image->profile == PX_P4) {
else if(image->profile == P4_RGB565 || image->profile == P4_RGB565A) {
input_multiplier = -1;
data += 32;
int odd_left = left & 1;
int odd_right = (left + width) & 1;
cmd.edge1 = -1 + odd_left;
cmd.edge2 = width + odd_left;
cmd.columns += odd_left + odd_right;
x -= odd_left;
cmd_size += 4;
}
/* This divides by azrp_frag_height */
@ -61,7 +72,8 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
while(height > 0) {
cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1)));
int input_offset = (image->width * top + left) << input_multiplier;
int input_offset = image->width * top + left;
input_offset = (input_offset << (input_multiplier + 1)) >> 1;
cmd.input = data + input_offset;
cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x);
@ -69,7 +81,7 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
top += cmd.lines;
height -= cmd.lines;
azrp_queue_command(&cmd, sizeof cmd);
azrp_queue_command(&cmd, cmd_size);
cmd.fragment_id++;
}