From 0fec6da1c44532ae259d18c893269833f2089a6c Mon Sep 17 00:00:00 2001 From: Lephe Date: Fri, 27 Aug 2021 14:23:53 +0200 Subject: [PATCH] azur: progress on tex2d as bopti, custom command sorter --- azur/include/azur/gint/render.h | 19 +++-- azur/src/gint/render.c | 52 +++++++++--- azur/src/gint/shaders/clear.c | 14 ++- azur/src/gint/shaders/tex2d.S | 145 ++++++++++++++++++++++++-------- azur/src/gint/shaders/tex2d.c | 52 ++++++++++++ 5 files changed, 218 insertions(+), 64 deletions(-) diff --git a/azur/include/azur/gint/render.h b/azur/include/azur/gint/render.h index 527d631..da1c8a9 100644 --- a/azur/include/azur/gint/render.h +++ b/azur/include/azur/gint/render.h @@ -113,6 +113,8 @@ extern int azrp_width, azrp_height; extern int azrp_frag_count; /* Offset of first fragment. */ extern int azrp_frag_offset; +/* Height of fragments, in rows of pixels; chosen from the super-scaling factor. */ +extern int azrp_frag_height; /* azrp_config_scale(): Select the renderer's super-scaling factor @@ -178,7 +180,11 @@ extern uint8_t AZRP_SHADER_TEX2D; void azrp_clear(uint16_t color); /* azrp_image(): Queue image command [AZRP_SHADER_TEX2D] */ -void azrp_image(int x, int y, uint16_t *pixels, int w, int h, int stride); +void azrp_image(int x, int y, bopti_image_t const *image); + +/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_TEX2D]. The subsection is the width x height rectangle of the image whose top-left corner is (left, top). */ +void azrp_subimage(int x, int y, bopti_image_t const *image, + int left, int top, int width, int height, int flags); /* Functions to update uniforms for these shaders. You should call them when: * AZRP_SHADER_CLEAR: Changing super-scaling settings.
@@ -254,15 +260,14 @@ struct azrp_shader_tex2d_command { uint8_t fragment_id; /* Pixels per line */ int16_t columns; - /* Already offset by start row and column */ - void *input; + /* Address of the image structure */ + bopti_image_t const *image; /* Destination in XRAM (offset) */ uint16_t output; /* Number of lines */ int16_t lines; - /* Distance between two lines (columns excluded) */ - int16_t stride; - -} GPACKED(2); + /* Already offset by start row and column */ + void const *input; +}; AZUR_END_DECLS diff --git a/azur/src/gint/render.c b/azur/src/gint/render.c index fd66e28..457ebc9 100644 --- a/azur/src/gint/render.c +++ b/azur/src/gint/render.c @@ -17,6 +17,10 @@ int azrp_width, azrp_height; /* Offset of first fragment for alignment, and number of fragments. */ int azrp_frag_offset; int azrp_frag_count; +/* Height of fragment. */ +int azrp_frag_height; + +/* TODO: Either make command queue private or use azrp_ prefix */ /* Number and total size of queued commands. */ GXRAM int commands_count = 0, commands_length = 0; @@ -48,26 +52,45 @@ void azrp_clear_commands(void) commands_length = 0; } -static int compare_commands(void const *c1, void const *c2) +/* Custom quick sort for commands. Commands are ordered by fragment ID (byte 1 of each command), then by address; cmdsort() uses Hoare partitioning around the middle element. */ + +static inline int compare(int8_t *c1, int8_t *c2) { - uint16_t offset1 = *(uint16_t *)c1; - uint16_t offset2 = *(uint16_t *)c2; + int d = (uint8_t)c1[1] - (uint8_t)c2[1]; /* fragment_id is a uint8_t field, so compare it unsigned */ + return (d ?
d : c1 - c2); +} - uint8_t *ptr1 = (uint8_t *)(0xe5017000 + offset1); - uint8_t *ptr2 = (uint8_t *)(0xe5017000 + offset2); +static void cmdsort(int low, int high) +{ + if(low >= high) return; - int diff_fragments = (int)ptr1[1] - (int)ptr2[1]; - if(diff_fragments) return diff_fragments; + int8_t *pivot = YRAM + commands_array[(low + high) >> 1]; - return (int)offset1 - (int)offset2; + int i = low - 1; + int j = high + 1; + + while(1) { + do i++; + while(compare(YRAM + commands_array[i], pivot) < 0); + + do j--; + while(compare(YRAM + commands_array[j], pivot) > 0); + + if(i >= j) break; + + uint16_t tmp = commands_array[i]; + commands_array[i] = commands_array[j]; + commands_array[j] = tmp; + } + + cmdsort(low, j); + cmdsort(j+1, high); } void azrp_sort_commands(void) { prof_enter(azrp_perf_sort); - /* TODO: azrp_sort_commands: Use a custom sorter */ - qsort(commands_array, commands_count, sizeof commands_array[0], - compare_commands); + cmdsort(0, commands_count - 1); prof_leave(azrp_perf_sort); } @@ -95,6 +118,7 @@ void azrp_render_fragments(void) } else { prof_enter(azrp_perf_r61524); + /* TODO: Consider xram_frame() by DMA in parallel?
*/ xram_frame(azrp_frag, 396 * 8); prof_leave(azrp_perf_r61524); frag++; @@ -129,11 +153,11 @@ static void update_frag_count(void) static void update_size(void) { if(azrp_scale == 1) - azrp_width = 396, azrp_height = 198; + azrp_width = 396, azrp_height = 198, azrp_frag_height = 8; /* NOTE(review): 198 is also the scale-2 width, while the other heights are 112 and 75; confirm the scale-1 height */ else if(azrp_scale == 2) - azrp_width = 198, azrp_height = 112; + azrp_width = 198, azrp_height = 112, azrp_frag_height = 16; else if(azrp_scale == 3) - azrp_width = 132, azrp_height = 75; + azrp_width = 132, azrp_height = 75, azrp_frag_height = 16; } void azrp_config_scale(int scale) diff --git a/azur/src/gint/shaders/clear.c b/azur/src/gint/shaders/clear.c index 4e0354b..cf1fefa 100644 --- a/azur/src/gint/shaders/clear.c +++ b/azur/src/gint/shaders/clear.c @@ -11,15 +11,7 @@ static void register_shader(void) void azrp_shader_clear_configure(void) { - int longs_in_fragment = 0; - - if(azrp_scale == 1) - longs_in_fragment = (396 * 2) * 8 / 4; - else if(azrp_scale == 2) - longs_in_fragment = (198 * 2) * 16 / 4; - else if(azrp_scale == 3) - longs_in_fragment = (132 * 2) * 16 / 4; - + int longs_in_fragment = (azrp_width * azrp_frag_height / 2); /* 2 bytes per pixel, 4 bytes per longword */ azrp_set_uniforms(AZRP_SHADER_CLEAR, (void *)longs_in_fragment); } @@ -35,6 +27,8 @@ struct command { void azrp_clear(uint16_t color) { + prof_enter(azrp_perf_cmdgen); + struct command cmd; cmd.shader_id = AZRP_SHADER_CLEAR; cmd.color = color; @@ -43,4 +37,6 @@ void azrp_clear(uint16_t color) cmd.fragment_id = i; azrp_queue_command(&cmd, sizeof cmd); } + + prof_leave(azrp_perf_cmdgen); } diff --git a/azur/src/gint/shaders/tex2d.S b/azur/src/gint/shaders/tex2d.S index 6c65da1..f71a67a 100644 --- a/azur/src/gint/shaders/tex2d.S +++ b/azur/src/gint/shaders/tex2d.S @@ -1,58 +1,70 @@ .global _azrp_shader_tex2d .align 4 +/* Profile values from bopti */ +#define PX_RGB565 0 +#define PX_RGB565A 1 +#define PX_P8 2 +#define PX_P4 3 + /* Register assignment r0: (temporary) r1: Lines r2: Output r3: Input - r4: Output stride (initially uniform: azrp_width*2) -
r5: Command queue; (temporary) - r6: (temporary) (initially azrp_frag) + r4: [parameter] azrp_width*2; output stride + r5: [parameter] Command queue; (temporary) + r6: [parameter] azrp_frag; (temporary) r7: Columns - r8: Input stride */ + r8: Input stride + r9: Image profile */ _azrp_shader_tex2d: + mov.l r8, @-r15 add #2, r5 - mov.w @r5+, r7 /* Columns */ + mov.l r9, @-r15 /* saved here; popped again just before rts */ - mov.l r8, @-r15 + mov.w @r5+, r7 /* command.columns */ - mov.w @r5+, r0 /* Input (1/2) */ + mov.l @r5+, r8 /* command.image; r8 then walks the image header fields read below */ + + mov.w @r5+, r2 /* command.output (offset) */ sub r7, r4 - mov.w @r5+, r3 /* Input (2/2) */ + mov.w @r5+, r1 /* command.lines */ sub r7, r4 - mov.w @r5+, r2 /* Output offset */ - - mov.w @r5+, r1 /* Lines */ - shll16 r3 - - xtrct r0, r3 - - mov.w @r5+, r8 /* Input stride */ - mov #8, r0 /* Maximum width for naive method */ - + mov.w @r8+, r0 /* image.profile */ add r6, r2 - cmp/ge r7, r0 - bt.s .naive - mov #2, r0 + mov.w @r8+, r6 /* image.alpha */ + cmp/eq #PX_P4, r0 -/* The following variations are named based on the parity of each parameter: - * w[eo] (width even, width odd) - * d[eo] (data even, data odd) - where even/odd means 4-aligned/2-aligned in terms of pointers. + mov.w @r8, r8 /* image.width */ - When the destination and source have identical parity, the copy is pretty - direct and takes 2 cycles to copy 4 bytes. When they have opposite parity - however, longwords need to be rearranged, which is a problem: arithmetic - operations under a RAW dependency take 3 cycles, so there's no way to - complete the 4-byte copy in less than 4 cycles unless iterations are opened - and weaved, which would add too much sub-cases. So in this case the naive - method that copies 4 bytes in 4 cycles is used. A very heavy image renderer - like a tileset shader should consider the optimized route though.
*/ + mov.l @r5+, r3 /* command.input (pointer) */ + + sub r7, r8 /* image.width minus columns */ + + bt.s .format_P4 + shll r8 /* times 2 for 16-bit pixels; sits in the bt.s delay slot, so it runs for every format */ + + cmp/eq #PX_P8, r0 + + bt .format_P8 + cmp/eq #PX_RGB565A, r0 + + bt .format_RGB565A + + /* Default below is .format_RGB565 */ + +/* [Loop macros] + + The following macros implement the main loop of the image renderer. + * Each line is rendered in the tight loop between 2: and 3: (both included). + * r2 is the output (with stride r4, in bytes) + * r3 is the input (with stride r8, in bytes) + * There are r1 rows with r7 iterations each */ #define TEX2D_START() \ ldrs 2f; \ @@ -66,10 +78,41 @@ _azrp_shader_tex2d: bf.s 1b; \ add r8, r3; \ \ + mov.l @r15+, r9; \ rts; \ mov.l @r15+, r8 -.case_analysis: +/* [Rendering strategy for the RGB565 format] + + In RGB565, all pixels are copied verbatim. This is a 2D memcpy, which we can + optimize by moving longwords. Since longwords are pairs of pixels, there are + variations and subcases based on the parity of each parameter: + + * w[eo] denotes whether the width of the image is even or odd; + * d[eo] denotes whether the memory accesses to the source and destination + are even (4-aligned) or odd (2-aligned). + + When the destination and source have identical parity, the d[eo] variation + can be defined. In this case the copy is pretty direct, it's a longword copy + and it takes 2 cycles to copy 4 bytes, plus some extra at the edges if the + start or end address is 2-aligned. + + However, when they have opposite parity, each longword read matches up with + a 2-aligned write (or vice-versa). Rearranging words with arithmetic does + not help because of the stall cycle between loading a register and using it + in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as + the word-based copy). Weaving iterations could help but would be too complex + here (adding sub-cases); a super-heavy renderer with more hypotheses (like a + tileset shader) should aim for that route though.
Also, movua.l followed by mov.l is even slower (5 cycles). */ + +.format_RGB565: + mov #8, r0 /* Maximum width for naive method */ + cmp/ge r7, r0 + + bt.s .naive + mov #2, r0 + /* Use naive method for opposite source/destination parity */ mov r2, r6 xor r3, r6 @@ -131,3 +174,37 @@ _azrp_shader_tex2d: 2: movs.w @r3+, x0 3: movs.w x0, @r2+ TEX2D_END() + +/* [Rendering strategy for the RGB565A format] + + Since we have to check for the alpha value in each pixel, there's really no + longword-based optimization. Instead, we just go as fast as possible with + each pixel, using DSP instructions. Branchless jump is pretty useful. + + TODO: Opening iterations will definitely save at least 1 cycle per pixel; it + just requires a subcase for extremely small images (width = 1). */ + +.format_RGB565A: + mov r2, r5 + + TEX2D_START() + /* In the comparison, DC=1 if x0 == image.alpha */ +2: movs.w @r3+, x0 + pcmp x0, y0 movx.w @r5, x1 + dct pcopy x1, x0 +3: movx.w x0, @r5+ + TEX2D_END() + +/* [Rendering strategy for the P8 format] */ +.format_P8: + TEX2D_START() +2: +3: + TEX2D_END() + +/* [Rendering strategy for the P4 format] */ +.format_P4: + TEX2D_START() +2: +3: + TEX2D_END() diff --git a/azur/src/gint/shaders/tex2d.c b/azur/src/gint/shaders/tex2d.c index 6d410e0..5f2f9e9 100644 --- a/azur/src/gint/shaders/tex2d.c +++ b/azur/src/gint/shaders/tex2d.c @@ -1,4 +1,5 @@ #include +#include uint8_t AZRP_SHADER_TEX2D = -1; @@ -15,3 +16,54 @@ void azrp_shader_tex2d_configure(void) } //--- + +/* Profile values from bopti */ +#define PX_RGB565 0 +#define PX_RGB565A 1 +#define PX_P8 2 +#define PX_P4 3 + +void azrp_image(int x, int y, bopti_image_t const *image) +{ + azrp_subimage(x, y, image, 0, 0, image->width, image->height, 0); +} +/* azrp_subimage(): Queue one tex2d command for each fragment covered by the height destination rows starting at y. Each command describes width columns, read from the image at (left, top) and written at column x. Clipping is not implemented yet, so flags currently has no effect. */ +void azrp_subimage(int x, int y, bopti_image_t const *image, + int left, int top, int width, int height, int flags) +{ + prof_enter(azrp_perf_cmdgen); + + if(!(flags & DIMAGE_NOCLIP)) { + /* TODO: tex2d: clip function */ + } + + struct
azrp_shader_tex2d_command cmd; + cmd.shader_id = AZRP_SHADER_TEX2D; + cmd.columns = width; + cmd.image = image; + /* Bits per source pixel. P4 packs two pixels per byte, which a left shift cannot express: a negative shift count is undefined behavior in C. */ + int bits_per_pixel = 16; + if(image->profile == PX_P8) bits_per_pixel = 8; + if(image->profile == PX_P4) bits_per_pixel = 4; + + /* This divides by azrp_frag_height */ + cmd.fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4); + + while(height > 0) { + cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1))); + + int input_offset = (image->width * top + left) * bits_per_pixel / 8; + cmd.input = (char const *)image->data + input_offset; + + cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x); + + y += cmd.lines; + top += cmd.lines; + height -= cmd.lines; + + azrp_queue_command(&cmd, sizeof cmd); + cmd.fragment_id++; + } + + prof_leave(azrp_perf_cmdgen); +}