azur: implement all parity cases of the tex2d shader

This commit is contained in:
Lephe 2021-08-17 18:33:17 +02:00 committed by Lephenixnoir
parent 507fee11e3
commit 0005249f71
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
3 changed files with 136 additions and 49 deletions

View File

@ -16,9 +16,9 @@
// fragment shaders.
//
// The command queue stores all rendering commands, split into fragments. Each
// fragment needs to read through all commands to produce its output, and
// because fragments are rendered in order, the sequence of commands must be
// read several times, therefore stored.
// fragment needs to read through a number of commands, and the order does not
// match the order of API calls because each API call typically impacts several
// fragments. Therefore commands need to be stored.
//
// Fragment shaders are the programs that render commands into graphics data
// for each fragment. They are pretty similar to OpenGL shaders, in that they
@ -115,26 +115,24 @@ void azrp_image(int x, int y, uint16_t *pixels, int w, int h, int stride);
// use them, so they are safe to write to and reset when they're not running.
//---
/* This counter runs during command generation and enqueue operations, usually
between azrp_begin_frame() and azrp_render_frame(). */
/* This counter runs during command generation and queue operations. */
extern prof_t azrp_perf_cmdgen;
/* This counter runs during the command sorting step, which occurs at the start
of azrp_render_frame(). */
/* This counter runs during the command sorting step. */
extern prof_t azrp_perf_sort;
/* This counter runs during shader executions in azrp_render_frame(). */
/* This counter runs during shader executions in azrp_render_fragments(). */
extern prof_t azrp_perf_shaders;
/* This counter runs during CPU transfers to the R61524 display. */
extern prof_t azrp_perf_r61524;
/* This counter runs during the whole azrp_frame_render() operation; it is the
sum of sort, shaders, r61524, plus some logic overhead. */
/* This counter runs during the whole azrp_update() operation; it is the sum of
sort, shaders, r61524, plus some logic overhead. */
extern prof_t azrp_perf_render;
/* azrp_perf_clear(): Clear all performance counters
Generally you want to do this before azrp_frame_begin(). */
Generally you want to do this before azrp_update(). */
void azrp_perf_clear(void);
//---
@ -172,12 +170,13 @@ struct azrp_shader_tex2d_command {
int16_t columns;
/* Already offset by start row and column */
void *input;
/* Destination in XRAM */
void *output;
/* Destination in XRAM (offset) */
uint16_t output;
/* Number of lines */
int16_t lines;
/* Distance between two lines (columns excluded) */
int16_t stride;
};
} GPACKED(2);
AZUR_END_DECLS

View File

@ -132,8 +132,13 @@ bool azrp_queue_command(void *command, size_t size)
if(commands_length + size >= 8192)
return false;
uint8_t *dst = YRAM + commands_length;
uint8_t *src = command;
for(size_t i = 0; i < size; i++)
dst[i] = src[i];
commands_array[commands_count++] = commands_length;
memcpy(YRAM + commands_length, command, size);
commands_length += size;
return true;

View File

@ -1,60 +1,143 @@
.global _azrp_shader_tex2d
.align 4
/* TODO [scaling]: Pass the _792 constant and fragment address as uniform */
/* Register assignment
r0: (temporary)
r1: Lines
r2: Columns
r3: Input
r4: Output
r5: Command queue
r7: Constant 396*2 = 0x318
r8: Output stride
r9: Input stride */
r5: Command queue; (temporary)
r6: (temporary)
r7: Output stride
r8: Input stride */
_azrp_shader_tex2d:
mov.l r8, @-r15
mov.w _792, r7
add #2, r5
mov.l r9, @-r15
mov #0x03, r7
mov.w @r5+, r2 /* Columns */
ldrs 1f
shll8 r7
mov.l r8, @-r15
ldre 2f
add #0x18, r7
mov.w @r5+, r6 /* Input (1/2) */
sub r2, r7
/* CHECK: 4-alignment here */
mov.w @r5+, r3 /* Input (2/2) */
sub r2, r7
.texture:
mov.w @r5+, r2 /* Columns */
mov r7, r8
mov.w @r5+, r4 /* Output offset */
mov.l @r5+, r3 /* Input */
mov.w @r5+, r1 /* Lines */
shll16 r3
mov r2, r0
mov.l @r5+, r4 /* Output */
xtrct r6, r3
mov.l .fragment, r6
shll r0
mov.w @r5+, r1 /* Lines */
mov.w @r5+, r8 /* Input stride */
mov #8, r0 /* Maximum width for naive method */
sub r0, r8
mov.w @r5+, r9 /* Input stride */
add r6, r4
cmp/ge r2, r0
bt.s .naive
mov #2, r0
/* The following variations are named based on the parity of each parameter:
* w[eo] (width even, width odd)
* d[eo] (data even, data odd)
where even/odd means 4-aligned/2-aligned in terms of pointers.
When the destination and source have identical parity, the copy is pretty
direct and takes 2 cycles to copy 4 bytes. When they have opposite parity
however, longwords need to be rearranged, which is a problem: arithmetic
operations under a RAW dependency take 3 cycles, so there's no way to
complete the 4-byte copy in less than 4 cycles unless iterations are unrolled
and interleaved, which would add too many sub-cases. So in this case the naive
method that copies 4 bytes in 4 cycles is used. A very heavy image renderer
like a tileset shader should consider the optimized route though. */
#define TEX2D_START() \
ldrs 2f; \
ldre 3f; \
\
1: ldrc r2; \
dt r1; \
#define TEX2D_END() \
add r7, r4; \
bf.s 1b; \
add r8, r3; \
\
rts; \
mov.l @r15+, r8
.case_analysis:
/* Use naive method for opposite source/destination parity */
mov r4, r6
xor r3, r6
tst r0, r6
bf .naive
shlr r2
bt .wo
.line:
ldrc r2
dt r1
.we:
tst r0, r4
bf .we_do
1: movs.l @r3+, x0
2: movs.l x0, @r4+
.we_de:
TEX2D_START()
2: movs.l @r3+, x0
3: movs.l x0, @r4+
TEX2D_END()
add r8, r4
.we_do:
add #-1, r2
bf.s .line
add r9, r3
TEX2D_START()
movs.w @r3+, x0
movs.w x0, @r4+
.end:
mov.l @r15+, r9
rts
mov.l @r15+, r8
2: movs.l @r3+, x0
3: movs.l x0, @r4+
movs.w @r3+, x0
movs.w x0, @r4+
TEX2D_END()
.wo:
tst r0, r4
bf .wo_do
.wo_de:
TEX2D_START()
2: movs.l @r3+, x0
3: movs.l x0, @r4+
movs.w @r3+, x0
movs.w x0, @r4+
TEX2D_END()
.wo_do:
TEX2D_START()
movs.w @r3+, x0
movs.w x0, @r4+
2: movs.l @r3+, x0
3: movs.l x0, @r4+
TEX2D_END()
/* Naive method for small widths and opposite source/destination parity */
.naive:
TEX2D_START()
2: movs.w @r3+, x0
3: movs.w x0, @r4+
TEX2D_END()
.align 4
.fragment:
.long _azrp_frag
_792:
.word 792