azur: documentation and optimization on rendering

This commit is contained in:
Lephe 2021-09-28 14:29:09 +02:00 committed by Lephenixnoir
parent c5cdb2b885
commit 911cc8e5ac
Signed by untrusted user: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
7 changed files with 182 additions and 85 deletions

View File

@ -10,7 +10,7 @@ if("${FXSDK_PLATFORM_LONG}" STREQUAL fxCG50)
endif()
# General options
add_compile_options(-Wall -Wextra -O2
add_compile_options(-Wall -Wextra -O3
-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=)
set(CMAKE_C_STANDARD 11)
@ -60,7 +60,7 @@ if(FACETS_PLATFORM STREQUAL emscripten)
set(PORTS -sUSE_SDL=2 -sUSE_SDL_IMAGE=2 -sSDL2_IMAGE_FORMATS=["png"])
add_compile_options(${PORTS})
add_link_options(${PORTS} -O2)
add_link_options(${PORTS} -O3)
endif()
#---

View File

@ -49,7 +49,7 @@ extern uint16_t azrp_frag[];
/* Maximum number of commands that can be queued. (This is only one of two
limits, the other being the size of the command data.) */
#define AZRP_MAX_COMMANDS 512
#define AZRP_MAX_COMMANDS 256
/* Maximum number of shaders that can be defined. (This is a loose limit). */
#define AZRP_MAX_SHADERS 32
@ -212,8 +212,8 @@ extern prof_t azrp_perf_shaders;
/* This counter runs during CPU transfers to the R61524 display. */
extern prof_t azrp_perf_r61524;
/* This counter runs during the whole azrp_update() operation; it is the sum of
sort, shaders, r61524, plus some logic overhead. */
/* This counter runs during rendering; it is the sum of shaders and r61524,
plus some logic overhead. */
extern prof_t azrp_perf_render;
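For reference, a minimal sketch of reading these counters after a frame. It assumes
libprof's prof_time() (microsecond readout of a prof_t), which comes with gint's
tooling but is not shown in this diff:

    azrp_perf_clear();
    azrp_update();
    uint32_t us_shaders = prof_time(azrp_perf_shaders);
    uint32_t us_r61524  = prof_time(azrp_perf_r61524);
    uint32_t us_render  = prof_time(azrp_perf_render);
    /* us_render should be close to us_shaders + us_r61524, the difference
       being the logic overhead mentioned above. */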
/* azrp_perf_clear(): Clear all performance counters
@ -243,12 +243,10 @@ void azrp_set_uniforms(int shader_id, void *uniforms);
/* azrp_queue_command(): Add a new command to be rendered next frame
The command must be a structure starting with an 8-bit shader ID and an
8-bit fragment ID.
Returns true on success, false if the maximum amount of commands or command
memory is exceeded. */
bool azrp_queue_command(void *command, size_t size);
The command must be a structure starting with an 8-bit shader ID. Returns
true on success, false if the maximum number of commands or the command data
memory is exceeded. */
bool azrp_queue_command(void *command, size_t size, int fragment);
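As a usage sketch of the new signature (the command struct and the wrapper function
are illustrative, not part of Azur's API; azrp_queue_command() is from this header
and azrp_frag_count is assumed to be exported alongside it):

    struct my_cmd {
        uint8_t shader_id;   /* must be the first field */
        uint8_t pad;
        uint16_t color;
    };

    void my_queue_on_all_fragments(int shader_id, uint16_t color)
    {
        struct my_cmd cmd = { .shader_id = (uint8_t)shader_id, .pad = 0, .color = color };
        for(int frag = 0; frag < azrp_frag_count; frag++) {
            if(!azrp_queue_command(&cmd, sizeof cmd, frag))
                break;   /* command count or command data memory exhausted */
        }
    }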
//---
// Internal shader definitions (for reference; no API guarantee)

View File

@ -8,7 +8,8 @@
#define YRAM ((void *)0xe5017000)
/* 8 rows of video memory, occupying 6338/8192 bytes of XRAM. */
/* 8 rows of video memory, occupying 6336/8192 bytes of XRAM.
TODO: Extend this to 16 rows, and move the rest to RAM */
GXRAM GALIGNED(32) uint16_t azrp_frag[DWIDTH * 8];
/* Super-scaling factor, width and height of output. */
@ -26,11 +27,11 @@ int azrp_frag_height;
GXRAM int commands_count = 0, commands_length = 0;
/* Array of pointers to queued commands (stored as an offset into YRAM). */
GXRAM uint16_t commands_array[AZRP_MAX_COMMANDS];
GXRAM uint32_t commands_array[AZRP_MAX_COMMANDS];
/* Array of shader programs and uniforms. */
static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
GXRAM static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
GXRAM static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
/* Next free index in the shader program array. */
GXRAM static uint16_t shaders_next = 0;
@ -54,31 +55,25 @@ void azrp_clear_commands(void)
/* Custom quick sort for commands */
static inline int compare(int8_t *c1, int8_t *c2)
{
int d = c1[1] - c2[1];
return (d ? d : c1 - c2);
}
static void cmdsort(int low, int high)
{
if(low >= high) return;
int8_t *pivot = YRAM + commands_array[(low + high) >> 1];
uint32_t pivot = commands_array[(low + high) >> 1];
int i = low - 1;
int j = high + 1;
while(1) {
do i++;
while(compare(YRAM + commands_array[i], pivot) < 0);
while(commands_array[i] < pivot);
do j--;
while(compare(YRAM + commands_array[j], pivot) > 0);
while(commands_array[j] > pivot);
if(i >= j) break;
uint16_t tmp = commands_array[i];
uint32_t tmp = commands_array[i];
commands_array[i] = commands_array[j];
commands_array[j] = tmp;
}
@ -89,44 +84,48 @@ static void cmdsort(int low, int high)
void azrp_sort_commands(void)
{
prof_enter(azrp_perf_sort);
prof_enter_norec(azrp_perf_sort);
cmdsort(0, commands_count - 1);
prof_leave(azrp_perf_sort);
prof_leave_norec(azrp_perf_sort);
}
int azrp_commands_total;
void azrp_render_fragments(void)
{
prof_enter(azrp_perf_render);
prof_enter_norec(azrp_perf_render);
azrp_commands_total = 0;
int i = 0;
int frag = 0;
uint32_t next_frag_threshold = (frag + 1) << 16;
uint32_t cmd = commands_array[i];
uint8_t *cmd = (uint8_t *)YRAM + commands_array[i];
prof_enter(azrp_perf_r61524);
prof_enter_norec(azrp_perf_r61524);
r61524_start_frame(0, 224);
prof_leave(azrp_perf_r61524);
prof_leave_norec(azrp_perf_r61524);
while(1) {
if(cmd[1] == frag) {
if(shaders[cmd[0]]) {
prof_enter(azrp_perf_shaders);
shaders[cmd[0]](shader_uniforms[cmd[0]], cmd, azrp_frag);
prof_leave(azrp_perf_shaders);
}
cmd = YRAM + commands_array[++i];
}
else {
prof_enter(azrp_perf_r61524);
/* TODO: Consider xram_frame() by DMA in parallel? */
xram_frame(azrp_frag, 396 * 8);
prof_leave(azrp_perf_r61524);
frag++;
if(frag >= azrp_frag_count) break;
while(cmd < next_frag_threshold && i < commands_count) {
azrp_commands_total++;
uint8_t *data = (uint8_t *)YRAM + (cmd & 0xffff);
prof_enter_norec(azrp_perf_shaders);
shaders[data[0]](shader_uniforms[data[0]], data, azrp_frag);
prof_leave_norec(azrp_perf_shaders);
cmd = commands_array[++i];
}
/* TODO: Consider xram_frame() by DMA in parallel? */
prof_enter_norec(azrp_perf_r61524);
xram_frame(azrp_frag, 396 * 8);
prof_leave_norec(azrp_perf_r61524);
if(++frag >= azrp_frag_count) break;
next_frag_threshold += (1 << 16);
}
prof_leave(azrp_perf_render);
prof_leave_norec(azrp_perf_render);
}
void azrp_update(void)
@ -210,7 +209,7 @@ void azrp_set_uniforms(int shader_id, void *uniforms)
shader_uniforms[shader_id] = uniforms;
}
bool azrp_queue_command(void *command, size_t size)
bool azrp_queue_command(void *command, size_t size, int fragment)
{
if(commands_count >= AZRP_MAX_COMMANDS)
return false;
@ -223,7 +222,8 @@ bool azrp_queue_command(void *command, size_t size)
for(size_t i = 0; i < size; i++)
dst[i] = src[i];
commands_array[commands_count++] = commands_length;
commands_array[commands_count++] =
(fragment << 16) | commands_length;
commands_length += size;
return true;
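To make the new bookkeeping explicit (this merely restates the code above): each
commands_array entry is now a 32-bit key packing the target fragment in the high half
and the YRAM offset of the command data in the low half, so sorting the keys as plain
integers orders commands by fragment first, then by submission order within a
fragment. Hypothetical helpers spelling out the two halves:

    /* key = (fragment << 16) | offset_into_YRAM, as built by azrp_queue_command() */
    static inline int key_fragment(uint32_t key)
    {
        return key >> 16;
    }
    static inline uint8_t *key_data(uint32_t key)
    {
        return (uint8_t *)YRAM + (key & 0xffff);
    }

The render loop relies on the same encoding when it drains a fragment: it keeps
consuming keys while key < ((fragment + 1) << 16).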

View File

@ -1,3 +1,18 @@
/* Azur's built-in shaders: <clear>
By far the easiest of all. The clear shader is a good benchmark for how fast
the rendering pipeline is. And it's pretty damn fast, clocking in at 400 µs
for a full-resolution 396x224 clear (compared to 6.1 ms for a VRAM clear by
CPU or 2.5 ms by DMA).
Because this is performed by the CPU and is therefore versatile, there are
many variations with more complex patterns that perform at the same speed. A
gray tiled background for transparency in an image viewer comes to mind, for
example.
Affected region: full-screen
Asymptotic performance: 0.5 cycle/pixel */
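As a rough cross-check of these figures (assuming a CPU clock in the vicinity of
116 MHz on the fx-CG50, which this file does not state): 396 × 224 pixels ×
0.5 cycle/pixel ≈ 44,000 cycles ≈ 0.38 ms, consistent with the 400 µs measurement
quoted above.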
.global _azrp_shader_clear
.align 4

View File

@ -35,7 +35,7 @@ void azrp_clear(uint16_t color)
for(int i = 0; i < azrp_frag_count; i++) {
cmd.fragment_id = i;
azrp_queue_command(&cmd, sizeof cmd);
azrp_queue_command(&cmd, sizeof cmd, i);
}
prof_leave(azrp_perf_cmdgen);

View File

@ -1,3 +1,77 @@
/* Azur's built-in shaders: <image>
If there ever was a fantastic piece of assembler engineering in my work up
to this point, this would be it. Every trick in the book is used here, from
clever instruction combinations, pipeline flow and tricky DSP abuse all the
way up to memory layout planning, transforms on loop structures, and most
critically superscalar parallelism.
While the performance of the shader is not *strictly* proportional to the
speed of the tightest loop, it's very close. The use of operand-bus XRAM for
graphics data, systematic alignment, and detailed pipeline stalling
measurements for common instruction sequences in gintctl allow very accurate
speed predictions to be made based on the tightness of the code.
The palette formats of bopti have been refined for the purpose of this
shader, with P8 being split into P8_RGB565A and P8_RGB565 with big changes,
and P4 being renamed P4_RGB565A with minimal changes along with a variation
aptly named P4_RGB565.
The asymptotic performance for each format is as follows:
* RGB565: 1 cycle/pixel if source and destination align
2 cycles/pixel otherwise
* RGB565A: 4 cycles/pixel
* P8_RGB565A: 4.5 cycles/pixel
* P8_RGB565: 3 cycles/pixel
* P4_RGB565A: 5 cycles/pixel
* P4_RGB565: 3.5 cycles/pixel
Entirely documenting this code would take me hours, but some elements are
provided in the comments. Superscalar parallelism is most easily appreciated
by reading the two-page section 4.2 of the SH4AL-DSP manual. The other main
structural technique at play in this code is loop transforms.
Basically, a loop that loads a pixel, performs computations with it, and
writes the result is inefficient because of the RAW dependencies on most
operations (with full stall cycles between loads and computations, and
between computations and uses as addresses). Well-established loop
optimization literature has lots of techniques to help with this problem,
and I use two here:
* _Pipelining_ the loop consists in handling a single pixel over several
iterations by doing a little bit of work in each iteration. The data for
the pixel would move from register to register at each iteration, with the
loop code doing one stage's worth of computation on each register. (You
can view it as a diagonal iteration pattern in the pixel*instruction grid
if you like such visualizations.)
By increasing the number of pixels in the pipeline, a lot of independent
data can be obtained, reducing dependency pressure and allowing for
greater parallelism at the cost of more registers being used.
The use of pipelining in this shader is very modest, with 2 stages at
most, and usually only a couple of instructions being performed in advance
for the next pixel while the current one finishes processing. Register
assignments have some subtleties though since pressure is high overall.
* _Unrolling_ iterations of the loop consists in loading two (or more)
pixels at the start of each iteration so that we can work on one while
waiting for stalls and dependencies on the other.
Unlike pipelining, a loop iteration starts and ends with full pixels and
no work carries between iterations. Unrolling allows different pixels to
use different registers and generally enables better scheduling of the
instruction sequence, at the cost of only supporting pixel counts that are
multiples of the unrolling level.
Handling non-multiple sizes is the everlasting bane of unrolled loops,
sometimes requiring duplicate code. Smart maneuvers are used in P8 and P4
to only handle even sizes and neutralize unwanted pixels after the fact.
Both techniques are used simultaneously, with 2-unrolled 2-stage loops for
almost all formats (except RGB565A, which uses DSP trickery instead).
*/
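As a schematic illustration of the unrolling described above (a C sketch under
simplifying assumptions, not the shader's actual code, which is SH assembly and
additionally pipelines work across iterations):

    /* 2-unrolled copy: two independent pixels per iteration, so the work on
       one can fill the load-use stall cycles of the other. The width is
       assumed even; the real shaders neutralize the extra pixel of odd
       widths after the fact. */
    static void copy_unrolled2(uint16_t *dst, const uint16_t *src, int width)
    {
        for(int i = 0; i < width; i += 2) {
            uint16_t a = src[i];
            uint16_t b = src[i + 1];
            dst[i]     = a;
            dst[i + 1] = b;
        }
    }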
.global _azrp_shader_image
.align 4
@ -64,19 +138,20 @@ _azrp_shader_image:
* r3 is the input (with stride r9, in bytes)
* There are r1 rows with r7 iterations each */
#define START() \
#define START() \
nop; /* 4-alignment */ \
ldrs 2f; \
ldre 3f; \
1: ldrc r7
#define END_NORET() \
#define END_NORET() \
dt r1; \
add r4, r5; \
bf.s 1b; \
add r9, r3
#define END() \
END_NORET(); \
#define END() \
END_NORET(); \
mov.l @r15+, r9; \
rts; \
mov.l @r15+, r8
@ -100,10 +175,10 @@ _azrp_shader_image:
a 2-aligned write (or vice-versa). Rearranging words with arithmetic does
not help because of the stall cycle between loading a register and using it
in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as
the word-based copy). Weaving iterations could help but would be too complex
here (adding sub-cases); a super-heavy renderer with more hypotheses (like a
tileset shader) should aim for that route though. Also, movua.l followed by
mov.l is even slower (5 cycles). */
the word-based copy). Unrolling iterations could help but would be too
complex here (adding sub-cases); a super-heavy renderer with stronger
assumptions (like a tileset shader) should aim for that route though. Also,
movua.l followed by mov.l is even slower (5 cycles). */
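Schematically, the dispatch this paragraph describes could be restated in C as
follows (a hypothetical restatement for clarity, not the routine's actual control
flow; all names here are invented):

    #include <stdint.h>

    static void copy_words(uint16_t *dst, const uint16_t *src, int n)
    {
        for(int i = 0; i < n; i++) dst[i] = src[i];       /* ~2 cycles/pixel */
    }
    static void copy_longwords(uint16_t *dst, const uint16_t *src, int n)
    {
        /* Assumes both pointers are 4-aligned and n is even; the real routine
           arranges this with head/tail fixups. */
        uint32_t *d = (uint32_t *)dst;
        const uint32_t *s = (const uint32_t *)src;
        for(int i = 0; i < n / 2; i++) d[i] = s[i];       /* ~1 cycle/pixel */
    }

    static void copy_row(uint16_t *dst, const uint16_t *src, int width)
    {
        /* Longword copies only pay off when source and destination share the
           same 2-byte offset within a 4-byte word, and when the row is long
           enough to amortize the setup. */
        int same_parity = (((uintptr_t)src ^ (uintptr_t)dst) & 2) == 0;
        if(width <= 8 || !same_parity)
            copy_words(dst, src, width);
        else
            copy_longwords(dst, src, width);
    }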
.align 4
_RGB565:
mov #8, r0 /* Maximum width for naive method */
@ -130,12 +205,14 @@ _RGB565.we:
tst r0, r5
bf _RGB565.we_do
/* This is 4-aligned */
_RGB565.we_de:
START()
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END()
.align 4
_RGB565.we_do:
add #-1, r7
@ -150,6 +227,7 @@ _RGB565.we_do:
movs.w x0, @r5+
END()
.align 4
_RGB565.wo:
tst r0, r5
bf _RGB565.wo_do
@ -163,6 +241,7 @@ _RGB565.wo_de:
movs.w x0, @r5+
END()
.align 4
_RGB565.wo_do:
START()
movs.w @r3+, x0
@ -173,6 +252,7 @@ _RGB565.wo_do:
END()
/* Naive method for small widths and opposite source/destination parity */
.align 4
_RGB565.naive:
START()
2: movs.w @r3+, x0
@ -210,12 +290,13 @@ _RGB565A:
The work needed for each pixel gets more difficult as we go, with alpha
being the major culprit due to its additional comparisons, jumps, and
limited interweaving opportunities due to conditionally-executed code.
limited optimization opportunities when unrolling due to conditionally-
executed code.
Because arithmetic is unavoidable and there are 1-cycle delays between both
loading-arithmetic, and arithmetic-indexing pairs, the loop has 2 interwoven
iterations with an open structure. This fills the stall cycles and increases
parallelism significantly. Pure interweaving handbook.
loading-arithmetic and arithmetic-indexing pairs, the loop has 2-unrolled
iterations with a 2-stage pipeline structure. This fills the stall cycles
and increases parallelism significantly. Pure loop optimization handbook.
Dealing with odd widths is a major pain as usual. Instead of adding logic to
handle the extra pixel separately, this routine lets the loop overwrite it,
@ -252,7 +333,7 @@ _P8_RGB565A:
sub r7, r9
mov r7, r13
add #-2, r9 /* Input stride compensation for openness */
add #-2, r9 /* Input stride compensation for pipelining */
mov.l r12, @-r15
shlr r7
@ -281,7 +362,6 @@ _P8_RGB565A:
shll2 r2
add r4, r2
nop /* 4-alignment */
START()
@ -293,7 +373,7 @@ _P8_RGB565A:
mov.b @r3+, r10
tst r6, r6
/* 2-interwoven open main loop */
/* 2-unrolled 2-stage main loop */
2: add r6, r6
mov r6, r0
@ -346,7 +426,7 @@ _P8_RGB565:
sub r7, r9
mov r7, r13
add #-2, r9 /* Input stride compensation for openness */
add #-2, r9 /* Input stride compensation for pipelining */
mov.l r12, @-r15
shlr r7
@ -375,7 +455,6 @@ _P8_RGB565:
shll2 r2
add r4, r2
nop /* 4-alignment */
START()
@ -387,11 +466,14 @@ _P8_RGB565:
mov.b @r3+, r10
shll r0
/* 2-interwoven open main loop */
/* 2-unrolled 2-stage main loop */
2: mov.b @r3+, r6
shll r10
mov.w @(r0,r8), r0
/* This nop is not for show: it actually prevents the loop from slowing
down to 7 cycles/iteration, probably due to instruction fetch alignment. */
nop
mov.w r0, @(4,r5)
mov r10, r0
@ -434,8 +516,8 @@ _P8_RGB565.palette_distance:
The special nature of the nibble packing means the simplest loop form writes
2 pixels from a 2-aligned source image position in a single iteration. Other
structures don't even come close: selecting nibbles individually is folly,
while not interweaving is inefficient. So the whole point of this routine is
to forcibly align the subimage on a byte-aligned and never break that grid.
while not unrolling is inefficient. So the whole point of this routine is to
forcibly align the subimage on a byte-aligned position and never break that
grid.
The command builder for P4 does this alignment before submitting the
command. Obviously the transform can cause one extra pixel to be overwritten
@ -443,7 +525,7 @@ _P8_RGB565.palette_distance:
offsets indicating pixels to preserve at each end. When overwrites occur,
the edge offsets point to the overwritten pixels so they can be restored.
Otherwise, they point to the next pixels and the restores are no-ops. See
the strategy used for managing interweaving in P8 formats for details.
the strategy used for managing unrolling in P8 formats for details.
The only irregularity is image width, which the command builder cannot
modify. It is rounded up to the next multiple of 2, then halved. There is a
@ -466,10 +548,10 @@ _P4_RGB565A:
mov.l r12, @-r15
sub r7, r9
mov.w @r2+, r11 /* command.edge1 */
add #2, r8 /* image.palette */
mov.w @r2+, r11 /* command.edge1 */
add #2, r8 /* image.palette */
mov.w @r2+, r12 /* command.edge2 */
mov.w @r2+, r12 /* command.edge2 */
mov r5, r0
mov.l r13, @-r15
@ -479,6 +561,7 @@ _P4_RGB565A:
shll r12
add #-4, r5
nop /* 4-alignment */
START()
@ -559,10 +642,10 @@ _P4_RGB565:
mov.l r12, @-r15
sub r7, r9
mov.w @r2+, r11 /* command.edge1 */
add #2, r8 /* image.palette */
mov.w @r2+, r11 /* command.edge1 */
add #2, r8 /* image.palette */
mov.w @r2+, r12 /* command.edge2 */
mov.w @r2+, r12 /* command.edge2 */
mov r5, r0
mov.l r13, @-r15

View File

@ -73,18 +73,19 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
/* This divides by azrp_frag_height */
cmd.fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
/* These settings only apply to the first fragment */
int first_y = (y + azrp_frag_offset) & (azrp_frag_height - 1);
cmd.lines = azrp_frag_height - first_y;
cmd.output = 2 * (azrp_width * first_y + x);
while(height > 0) {
cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1)));
azrp_queue_command(&cmd, cmd_size, cmd.fragment_id);
cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x);
y += cmd.lines;
top += cmd.lines;
height -= cmd.lines;
azrp_queue_command(&cmd, cmd_size);
cmd.fragment_id++;
cmd.input += row_stride * cmd.lines;
cmd.lines = min(height, azrp_frag_height);
cmd.output = 2 * x;
}
prof_leave(azrp_perf_cmdgen);
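For illustration, here is how the splitting above plays out on concrete numbers
(assumed: azrp_scale == 1, so azrp_frag_height == 8, and azrp_frag_offset == 0).
Drawing an image at y = 10 with height = 20 produces three commands, one per
fragment touched:

    first_y = (10 + 0) & 7 = 2           fragment_id = 10 >> 3 = 1
    fragment 1: lines = 8 - 2 = 6        output starts at row 2 of the fragment
    fragment 2: lines = min(14, 8) = 8   output starts at row 0
    fragment 3: lines = min(6, 8) = 6    output starts at row 0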