diff --git a/azur/CMakeLists.txt b/azur/CMakeLists.txt index 1a8f850..7815a75 100644 --- a/azur/CMakeLists.txt +++ b/azur/CMakeLists.txt @@ -28,10 +28,32 @@ endif() if(AZUR_GRAPHICS_GINT_CG) list(APPEND SOURCES src/gint/render.c + src/gint/r61524.s + # Clear shader src/gint/shaders/clear.c src/gint/shaders/clear.S + # Image shader src/gint/shaders/image.c - src/gint/shaders/image.S) + src/gint/shaders/image_rgb16_normal.S + src/gint/shaders/image_rgb16_clearbg.S + src/gint/shaders/image_rgb16_swapcolor.S + src/gint/shaders/image_rgb16_dye.S + src/gint/shaders/image_p8_normal.S + src/gint/shaders/image_p8_swapcolor.S + src/gint/shaders/image_p4_normal.S + # Image shader interface + src/gint/shaders/image_rgb16.c + src/gint/shaders/image_rgb16_effect.c + src/gint/shaders/image_rgb16_swapcolor.c + src/gint/shaders/image_rgb16_dye.c + src/gint/shaders/image_p8.c + src/gint/shaders/image_p8_effect.c + src/gint/shaders/image_p8_swapcolor.c + src/gint/shaders/image_p8_dye.c + src/gint/shaders/image_p4.c + src/gint/shaders/image_p4_effect.c + src/gint/shaders/image_p4_swapcolor.c + src/gint/shaders/image_p4_dye.c) endif() add_library(azur STATIC ${SOURCES}) diff --git a/azur/include/azur/defs.h b/azur/include/azur/defs.h index 953c319..73b9726 100644 --- a/azur/include/azur/defs.h +++ b/azur/include/azur/defs.h @@ -1,5 +1,5 @@ //--- -// azur.defs: Generation definitions +// azur.defs: General definitions that are included in every file //--- /* This exposes compile-time configuration symbols. I don't like running the diff --git a/azur/include/azur/gint/render.h b/azur/include/azur/gint/render.h index 30f773a..794f06e 100644 --- a/azur/include/azur/gint/render.h +++ b/azur/include/azur/gint/render.h @@ -33,8 +33,8 @@ #include AZUR_BEGIN_DECLS -#include #include +#include #include @@ -45,7 +45,7 @@ AZUR_BEGIN_DECLS typedef void azrp_shader_t(void *uniforms, void *command, void *fragment); /* Video memory fragment used as rendering target (in XRAM). 
*/ -extern uint16_t azrp_frag[]; +extern uint16_t *azrp_frag; /* Maximum number of commands that can be queued. (This is only one of two limits, the other being the size of the command data.) */ @@ -128,19 +128,19 @@ extern int azrp_frag_height; The settings on each mode are as follow: * x1: Display resolution: 396x224 - Fragment size: 8 rows (6336 bytes) + Fragment size: 16 rows (12672 bytes) Number of fragments: 28 (29 if an offset is used) - Total size of graphics data: 177.408 kB + Total size of graphics data: 177'408 bytes * x2: Display resolution: 198x112 - Fragment size: 16 rows (6336 bytes) + Fragment size: 16 rows (6336 bytes) # TODO: increase Number of fragments 7 (8 if an offset if used) - Total size of graphics data: 44.352 kB + Total size of graphics data: 44'352 bytes * x3: Display resolution: 132x75 (last row only has 2/3 pixels) - Fragment size: 16 rows (4224 bytes) + Fragment size: 16 rows (4224 bytes) # TODO: increase Number of fragments: 5 (sometimes 6 if an offset is used) - Total size of graphics data: 19.800 kB + Total size of graphics data: 19'800 bytes As one would know when playing modern video games, super-resolution is one of the most useful ways to increase performance. The reduced amount of @@ -167,30 +167,50 @@ void azrp_config_scale(int scale); @offset Fragment offset along the y-axis (0 ... height of fragment-1). */ void azrp_config_frag_offset(int offset); +//--- +// Hooks +//--- + +/* Hook called before a fragment is sent to the display. The fragment can be + accessed and modified freely (however, the time spent in the hook is + counted as overhead and only part of [azrp_perf_render]). */ +typedef void azrp_hook_prefrag_t(int id, void *fragment, int size); + +/* Get or set the prefrag hook. 
*/ +azrp_hook_prefrag_t *azrp_hook_get_prefrag(void); +void azrp_hook_set_prefrag(azrp_hook_prefrag_t *); + //--- // Standard shaders //--- - /* Clears the entire output with a single color */ +/* Clears the entire output with a single color */ extern uint8_t AZRP_SHADER_CLEAR; - /* Renders RGB565 textures/images */ -extern uint8_t AZRP_SHADER_IMAGE; +/* Renders gint images with various dynamic effects */ +extern uint8_t AZRP_SHADER_IMAGE_RGB16; +extern uint8_t AZRP_SHADER_IMAGE_P8; +extern uint8_t AZRP_SHADER_IMAGE_P4; /* azrp_clear(): Clear output [ARZP_SHADER_CLEAR] */ void azrp_clear(uint16_t color); -/* azrp_image(): Queue image command [AZRP_SHADER_IMAGE] */ +/* azrp_image(): Queue image command [AZRP_SHADER_IMAGE_*] */ void azrp_image(int x, int y, bopti_image_t const *image); -/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_IMAGE] */ +/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_IMAGE_*] */ void azrp_subimage(int x, int y, bopti_image_t const *image, int left, int top, int width, int height, int flags); +/* See below for more detailed image functions. Dynamic effects are provided + with the same naming convention as gint. */ + /* Functions to update uniforms for these shaders. You should call them when: * AZRP_SHADER_CLEAR: Changing super-scaling settings. - * AZRP_SHADER_IMAGE: Changing super-scaling or or fragment offsets. */ + * AZRP_SHADER_IMAGE_*: Changing super-scaling or fragment offsets. */ void azrp_shader_clear_configure(void); -void azrp_shader_image_configure(void); +void azrp_shader_image_rgb16_configure(void); +void azrp_shader_image_p8_configure(void); +void azrp_shader_image_p4_configure(void); //--- // Performance indicators @@ -250,32 +270,79 @@ void azrp_set_uniforms(int shader_id, void *uniforms); exceeded. 
*/ bool azrp_queue_command(void *command, size_t size, int fragment, int count); +/* azrp_queue_image(): Split and queue a gint image command + + The command must have been completely prepared with gint_image_mkcmd() and + have had its color effect sections filled. This function sets the shader ID + and adjusts the command for fragmented rendering. */ +void azrp_queue_image(struct gint_image_box *box, image_t const *img, + struct gint_image_cmd *cmd); + //--- -// Internal shader definitions (for reference; no API guarantee) +// Internal R61524 functions //--- -struct azrp_shader_image_command { - uint8_t shader_id; - /* First edge-preserved pixel offset (P4 only) */ - int8_t edge1; - /* Pixels per line */ - int16_t columns; - /* Address of the image structure */ - bopti_image_t const *image; - /* Destination in XRAM (offset) */ - uint16_t output; - /* Number of lines */ - int16_t lines; - /* Already offset by start row and column */ - void const *input; +void azrp_r61524_fragment_x1(void *fragment, int size); - /* Info for structure update between fragments: */ - int16_t height; - int16_t row_stride; - int16_t x; +void azrp_r61524_fragment_x2(void *fragment, int width, int height); - /* Second edge-preserved pixel offset (P4 only) */ - int16_t edge2; -}; +//--- +// Internal functions for the image shader +// +// We use gint's image rendering API but replace some of the core loops with +// Azur-specific versions that are faster in the CPU-bound context of this +// rendering engine. Some of the main loops from Azur actually perform better +// in RAM than bopti used to do, and are already in gint. +//--- + +/* azrp_image_effect(): Generalized azrp_image() with dynamic effects */ +#define azrp_image_effect(x, y, img, eff, ...) 
\ + azrp_subimage_effect(x, y, img, 0, 0, (img)->width, (img)->height, eff, \ + ##__VA_ARGS__) +/* azrp_subimage_effect(): Generalized azrp_subimage() with dynamic effects */ +void azrp_subimage_effect(int x, int y, image_t const *img, + int left, int top, int w, int h, int effects, ...); + +/* Specific versions for each format */ +#define AZRP_IMAGE_SIG1(NAME, ...) \ + void azrp_image_ ## NAME(int x, int y, image_t const *img,##__VA_ARGS__); \ + void azrp_subimage_ ## NAME(int x, int y, image_t const *img, \ + int left, int top, int w, int h, ##__VA_ARGS__); +#define AZRP_IMAGE_SIG(NAME, ...) \ + AZRP_IMAGE_SIG1(rgb16 ## NAME, ##__VA_ARGS__) \ + AZRP_IMAGE_SIG1(p8 ## NAME, ##__VA_ARGS__) \ + AZRP_IMAGE_SIG1(p4 ## NAME, ##__VA_ARGS__) + +AZRP_IMAGE_SIG(_effect, int effects, ...) +AZRP_IMAGE_SIG(, int effects) +AZRP_IMAGE_SIG(_clearbg, int effects, int bg_color_or_index) +AZRP_IMAGE_SIG(_swapcolor, int effects, int source, int replacement) +AZRP_IMAGE_SIG(_addbg, int effects, int bg_color) +AZRP_IMAGE_SIG(_dye, int effects, int dye_color) + +#define azrp_image_rgb16_effect(x, y, img, eff, ...) \ + azrp_subimage_rgb16_effect(x, y, img, 0, 0, (img)->width, (img)->height, \ + eff, ##__VA_ARGS__) +#define azrp_image_p8_effect(x, y, img, eff, ...) \ + azrp_subimage_p8_effect(x, y, img, 0, 0, (img)->width, (img)->height, \ + eff, ##__VA_ARGS__) +#define azrp_image_p4_effect(x, y, img, eff, ...) \ + azrp_subimage_p4_effect(x, y, img, 0, 0, (img)->width, (img)->height, \ + eff, ##__VA_ARGS__) + +#undef AZRP_IMAGE_SIG +#undef AZRP_IMAGE_SIG1 + +/* Main loop provided by Azur; as usual, these are not real functions; their + only use is as the [.loop] field of a command. 
*/ + +void azrp_image_shader_rgb16_normal(void); +void azrp_image_shader_rgb16_clearbg(void); +void azrp_image_shader_rgb16_swapcolor(void); +void azrp_image_shader_rgb16_dye(void); +void azrp_image_shader_p8_normal(void); +void azrp_image_shader_p8_swapcolor(void); +void azrp_image_shader_p4_normal(void); +void azrp_image_shader_p4_clearbg(void); AZUR_END_DECLS diff --git a/azur/src/gint/r61524.s b/azur/src/gint/r61524.s new file mode 100644 index 0000000..e3ff4b7 --- /dev/null +++ b/azur/src/gint/r61524.s @@ -0,0 +1,65 @@ +.section .ilram, "ax" + +.balign 4 +.global _azrp_r61524_fragment_x1 +_azrp_r61524_fragment_x1: + mov.l .R61524_DATA, r2 + shlr r5 + + ldrs 1f + ldre 2f + ldrc r5 + nop + + /* Read a word from XRAM */ +1: mov.l @r4+, r0 + /* Write that word to the display */ +2: mov.l r0, @r2 + + rts + nop + +.balign 4 +.global _azrp_r61524_fragment_x2 +_azrp_r61524_fragment_x2: + mov.l .R61524_DATA, r2 + nop + + /* Read a word, write it twice */ + ldrs 1f + ldre 2f + ldrc r5 + nop + +1: mov.w @r4+, r0 + nop + mov.w r0, @r2 + nop + mov.w r0, @r2 +2: nop + + sub r5, r4 + sub r5, r4 + + /* Do that again on a second line */ + ldrs 3f + ldre 4f + ldrc r5 + nop + +3: mov.w @r4+, r0 + nop + mov.w r0, @r2 + nop + mov.w r0, @r2 +4: nop + + dt r6 + bf _azrp_r61524_fragment_x2 + + rts + nop + +.balign 4 +.R61524_DATA: + .long 0xb4000000 diff --git a/azur/src/gint/render.c b/azur/src/gint/render.c index 736796e..5ccb39a 100644 --- a/azur/src/gint/render.c +++ b/azur/src/gint/render.c @@ -7,11 +7,8 @@ #include #include -#define YRAM ((void *)0xe5017000) - -/* 8 rows of video memory, occupying 6338/8192 bytes of XRAM. - TODO: Extend this to 16 rows, and move the rest to RAM */ -GXRAM GALIGNED(32) uint16_t azrp_frag[DWIDTH * 8]; +/* 16 rows of video memory, occupying 12736/16384 bytes or XYRAM (77.7%). */ +uint16_t *azrp_frag = (void *)0xe500e000 + 32; /* Super-scaling factor, width and height of output. 
*/ int azrp_scale; @@ -22,27 +19,33 @@ int azrp_frag_count; /* Height of fragment. */ int azrp_frag_height; -/* TODO: Either make command queue private or use azrp_ prefix */ - /* Number and total size of queued commands. */ -GXRAM int commands_count = 0, commands_length = 0; +static int commands_count=0, commands_length=0; -/* Array of pointers to queued commands (stored as an offset into YRAM). */ -GXRAM uint32_t commands_array[AZRP_MAX_COMMANDS]; +/* Array of pointers to queued commands. Each command has: + * Top 16 bits: fragment number + * Bottom 16 bits: offset into command data buffer + Rendering order is integer order. */ +static uint32_t commands_array[AZRP_MAX_COMMANDS]; + +static GALIGNED(4) uint8_t commands_data[8192]; /* Array of shader programs and uniforms. */ -GXRAM static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL }; -GXRAM static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL }; +static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL }; +static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL }; /* Next free index in the shader program array. */ -GXRAM static uint16_t shaders_next = 0; +static uint16_t shaders_next = 0; + +/* Hooks. */ +static azrp_hook_prefrag_t *azrp_hook_prefrag = NULL; /* Performance counters. 
*/ -GXRAM prof_t azrp_perf_cmdgen; -GXRAM prof_t azrp_perf_sort; -GXRAM prof_t azrp_perf_shaders; -GXRAM prof_t azrp_perf_r61524; -GXRAM prof_t azrp_perf_render; +prof_t azrp_perf_cmdgen; +prof_t azrp_perf_sort; +prof_t azrp_perf_shaders; +prof_t azrp_perf_r61524; +prof_t azrp_perf_render; //--- // High and low-level pipeline functions @@ -110,25 +113,23 @@ void azrp_render_fragments(void) while(1) { while(cmd < next_frag_threshold && i < commands_count) { azrp_commands_total++; - uint8_t *data = (uint8_t *)YRAM + (cmd & 0xffff); + uint8_t *data = commands_data + (cmd & 0xffff); prof_enter_norec(azrp_perf_shaders); shaders[data[0]](shader_uniforms[data[0]], data, azrp_frag); prof_leave_norec(azrp_perf_shaders); - - if(data[0] == AZRP_SHADER_IMAGE) { - struct azrp_shader_image_command *cmd = (void *)data; - cmd->height -= cmd->lines; - cmd->input += cmd->row_stride * cmd->lines; - cmd->lines = min(cmd->height, azrp_frag_height); - cmd->output = 2 * cmd->x; - } - cmd = commands_array[++i]; } - /* TODO: Consider xram_frame() by DMA in parallel? 
*/ + if(azrp_hook_prefrag) { + int size = azrp_width * azrp_frag_height * 2; + (*azrp_hook_prefrag)(frag, azrp_frag, size); + } + prof_enter_norec(azrp_perf_r61524); - xram_frame(azrp_frag, 396 * 8); + if(azrp_scale == 1) + azrp_r61524_fragment_x1(azrp_frag, 396 * azrp_frag_height); + else if(azrp_scale == 2) + azrp_r61524_fragment_x2(azrp_frag, azrp_width, azrp_frag_height); prof_leave_norec(azrp_perf_r61524); if(++frag >= azrp_frag_count) break; @@ -149,10 +150,12 @@ void azrp_update(void) // Configuration calls //--- +// TODO: Use larger fragments in upscales x2 and x3 + static void update_frag_count(void) { if(azrp_scale == 1) - azrp_frag_count = 28 + (azrp_frag_offset > 0); + azrp_frag_count = 14 + (azrp_frag_offset > 0); else if(azrp_scale == 2) azrp_frag_count = 7 + (azrp_frag_offset > 0); else if(azrp_scale == 3) @@ -162,7 +165,7 @@ static void update_frag_count(void) static void update_size(void) { if(azrp_scale == 1) - azrp_width = 396, azrp_height = 198, azrp_frag_height = 8; + azrp_width = 396, azrp_height = 224, azrp_frag_height = 16; else if(azrp_scale == 2) azrp_width = 198, azrp_height = 112, azrp_frag_height = 16; else if(azrp_scale == 3) @@ -194,6 +197,20 @@ static void default_settings(void) azrp_config_scale(1); } +//--- +// Hooks +//--- + +azrp_hook_prefrag_t *azrp_hook_get_prefrag(void) +{ + return azrp_hook_prefrag; +} + +void azrp_hook_set_prefrag(azrp_hook_prefrag_t *hook) +{ + azrp_hook_prefrag = hook; +} + //--- // Custom shaders //--- @@ -226,7 +243,7 @@ bool azrp_queue_command(void *command, size_t size, int fragment, int count) if(commands_length + size >= 8192) return false; - uint8_t *dst = YRAM + commands_length; + uint8_t *dst = commands_data + commands_length; uint8_t *src = command; for(size_t i = 0; i < size; i++) diff --git a/azur/src/gint/shaders/image.S b/azur/src/gint/shaders/image.S deleted file mode 100644 index d7dc298..0000000 --- a/azur/src/gint/shaders/image.S +++ /dev/null @@ -1,727 +0,0 @@ -/* Azur's built-in 
shaders: - - If there ever was a fantastic piece of assembler engineering in my work up - to this point, this would be it. Every trick in the book is used here, from - clever instruction combinations, pipeline flow and tricky DSP abuse all the - way up to memory layout planning, transforms on loop structures, and most - critically superscalar parallelism. - - While the performance of the shader is not *strictly* proportional to the - speed of the tightest loop, it's very close. The use of operand-bus XRAM for - graphics data, systematic alignment, and detailed pipeline stalling - measurements for common instruction sequences in gintctl allow very accurate - speed predictions to be made based on the tightness of the code. - - The palette formats of bopti have been refined for the purpose of this - shader, with P8 being split into P8_RGB565A and P8_RGB565 with big changes, - and P4 being renamed P4_RGB565A with minimal changes along with a variation - aptly named P4_RGB565. - - The asymptotic performance for each format is as follows: - * RGB565: 1 cycle/pixel if source and destination align - 2 cycles/pixel otherwise - * RGB565A: 4 cycles/pixel - * P8_RGB565A: 4.5 cycles/pixel - * P8_RGB565: 3 cycles/pixel - * P4_RGB565A: 5 cycles/pixel - * P4_RGB565: 3.5 cycles/pixel - - Entirely documenting this code would take me hours, but some elements are - provided in the comments. Superscalar parallelism is most easily appreciated - by reading the two-page section 4.2 of the SH4AL-DSP manual. The other main - structural technique at play in this code is loop transforms. - - Basically, a loop that loads a pixel, performs computations with it, and - writes the result is inefficient because of the RAW dependencies on most - operations (with full stall cycles between loads and computations, and - between computations and uses as addresses). 
Well-established loop - optimization literature has lots of techniques to help with this problem, - and I use two here: - - * _Pipelining_ the loop consists in handling a single pixel over several - iterations by doing a little bit of work in each iteration. The data for - the pixel would move from register to register at each iteration, with the - loop code doing one stage's worth of computation on each register. (You - can view it as a diagonal iteration pattern in the pixel*instruction grid - if you like such visualizations.) - - By increasing the number of pixels in the pipeline, a lot of independent - data can be obtained, reducing dependency pressure and allowing for - greater parallelism at the cost of more registers being used. - - The use of pipelining in this shader is very modest, with 2 stages at - most, and usually only a couple of instructions being performed in advance - for the next pixel while the current one finishes processing. Register - assignments have some subtleties though since pressure is high overall. - - * _Unrolling_ iterations of the loop consists in loading two (or more) - pixels at the start of each iteration so that we can work on one while - waiting for stalls and dependencies on the other. - - Unlike pipelining, a loop iteration starts and ends with full pixels and - no work carries between iterations. Unrolling allows different pixels to - use different registers and generally better optimize the instruction - sequence, at the cost of only supporting pixel counts that are multipes of - the unrolling level. - - Handling non-multiple sizes is the everlasting bane of unrolled loops, - sometimes requiring duplicate code. Smart maneuvers are used in P8 and P4 - to only handle even sizes and neutralize unwanted pixels after the fact. - - Both techniques are used simultaneously, with 2-unrolled 2-stage loops for - almost all formats (except RGB556A which performs DSP trickery). 
-*/ - -.global _azrp_shader_image -.align 4 - -/* Register assignment - r0: (temporary) - r1: Lines - r2: Command queue; (temporary) - r3: Input - r4: [parameter] azrp_width*2; output stride - r5: [parameter] Command queue; Output - r6: [parameter] azrp_frag; alpha value; (temporary) - r7: Columns - r8: Image pointer; (temporary) - r9: Input stride */ -_azrp_shader_image: - mov.l r8, @-r15 - add #2, r5 - - mov.l r9, @-r15 - mov r5, r2 - - mov.w @r2+, r7 /* command.columns */ - - mov.l @r2+, r8 /* command.image */ - - mov.w @r2+, r5 /* command.output (offset) */ - sub r7, r4 - - mov.w @r8+, r9 /* image.profile */ - sub r7, r4 - - mov.w @r2+, r1 /* command.lines */ - add r6, r5 - - mov.l @r2+, r3 /* command.input (pointer) */ - shll2 r9 - - mova .formats, r0 - - mov.w @r8+, r6 /* image.alpha */ - - mov.l @(r0,r9), r0 - - mov.w @r8+, r9 /* image.width */ - - jmp @r0 - nop - -.align 4 -.formats: - .long _RGB565 - .long _RGB565A - .long _NOP /* P8 */ - .long _P4_RGB565A /* =P4 */ - .long _P8_RGB565 - .long _P8_RGB565A - .long _P4_RGB565 - -/* [Loop macros] - - The following macros implement the main loop of the image renderer. - * Each line is rendered in the tight loop between 2: and 3: (both included). - * r5 is the output (with stride r4, in bytes) - * r3 is the input (with stride r9, in bytes) - * There are r1 rows with r7 iterations each */ - -#define START() \ - nop; /* 4-alignment */ \ - ldrs 2f; \ - ldre 3f; \ -1: ldrc r7 - -#define END_NORET() \ - dt r1; \ - add r4, r5; \ - bf.s 1b; \ - add r9, r3 - -#define END() \ - END_NORET(); \ - mov.l @r15+, r9; \ - rts; \ - mov.l @r15+, r8 - -/* [Rendering strategy for the RGB565 format] - - In RGB565, all pixels are copied verbatim. This is a 2D memcpy, which we can - optimize by moving longwords. 
Since longwords are pairs of pixels, there are - variations and subcases based on the parity of each parameter: - - * w[eo] denotes whether the width of the image is even or odd; - * d[eo] denotes whether the memory accesses to the source and destination - are even (4-aligned) or odd (2-aligned). - - When the destination and source have identical parity, the d[eo] variation - can be defined. In this case the copy is pretty direct, it's a longword copy - and it takes 2 cycles to copy 4 bytes, plus some extra at the edges if the - start or end address is 2-aligned. - - However, when they have opposite parity, each longword read matches up with - a 2-aligned write (or vice-versa). Rearranging words with arithmetic does - not help because of the stall cycle between loading a register and using it - in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as - the word-based copy). Unrolling iterations could help but would be too - complex here (adding sub-cases); a super-heavy renderer with more hypotheses - (like a tileset shader) should aim for that route though. Also, movua.l - followed by mov.l is even slower (5 cycles). 
*/ -.align 4 -_RGB565: - mov #8, r0 /* Maximum width for naive method */ - sub r7, r9 - - cmp/ge r7, r0 - - shll r9 - - bt.s _RGB565.naive - mov #2, r0 - - /* Use naive method for opposite source/destination parity */ - mov r5, r6 - xor r3, r6 - - tst r0, r6 - bf _RGB565.naive - - shlr r7 - bt _RGB565.wo - -_RGB565.we: - tst r0, r5 - bf _RGB565.we_do - -/* This is 4-aligned */ -_RGB565.we_de: - START() -2: movs.l @r3+, x0 -3: movs.l x0, @r5+ - END() - -.align 4 -_RGB565.we_do: - add #-1, r7 - - START() - movs.w @r3+, x0 - movs.w x0, @r5+ - -2: movs.l @r3+, x0 -3: movs.l x0, @r5+ - - movs.w @r3+, x0 - movs.w x0, @r5+ - END() - -.align 4 -_RGB565.wo: - tst r0, r5 - bf _RGB565.wo_do - -_RGB565.wo_de: - START() -2: movs.l @r3+, x0 -3: movs.l x0, @r5+ - - movs.w @r3+, x0 - movs.w x0, @r5+ - END() - -.align 4 -_RGB565.wo_do: - START() - movs.w @r3+, x0 - movs.w x0, @r5+ - -2: movs.l @r3+, x0 -3: movs.l x0, @r5+ - END() - -/* Naive method for small widths and opposite source/destination parity */ -.align 4 -_RGB565.naive: - START() -2: movs.w @r3+, x0 -3: movs.w x0, @r5+ - END() - -/* [Rendering strategy for the RGB565A format] - - Since we have to check for the alpha value in each pixel, there's really no - longword-based optimization. Instead, we just go as fast as possible with - each pixel, using DSP instructions because conditional execution is pretty - damn good. This takes 4 cycles/pixel. I tried a number of reductions to - 3 cycles/pixel but could not get any of them to work. 
*/ -.align 4 -_RGB565A: - shll16 r6 - mov #0x0004, r0 /* DC Zero mode */ - - sub r7, r9 - - shll r9 - - lds r6, y0 - - lds r0, dsr - - START() -2: movs.w @r3+, x0 - pcmp x0, y0 movx.w @r5, x1 - dct pcopy x1, x0 -3: movx.w x0, @r5+ - END() - -/* [Rendering strategy for the P8_RGB565A format] - - The work needed for each pixel gets more difficult as we go, with alpha - being the major culprit due to its additional comparisons, jumps, and - limited optimization opportunities when unrolling due to conditionally- - executed code. - - Because arithmetic is unavoidable and there are 1-cycle delays between both - loading-arithmetic, and arithmetic-indexing pairs, the loop has 2-unrolled - iterations with a 2-stage pipeline structure. This fills the stall cycles - and increases parallelism significantly. Pure loop optimization handbook. - - Dealing with odd widths is a major pain as usual. Instead of adding logic to - handle the extra pixel separately, this routine lets the loop overwrite it, - then restores its original value afterwards - a delightfully elegant trick. - - The P8 format is actually so bad that spending precious time grinding cycles - felt completely inappropriate without first refining it. This led to two new - variations, P8_RGB565 and P8_RGB565A, which fix the following problems. - - -> First there is alpha for all images, which is the most costly feature, - single-handedly accounting for half of the work per pixel. P8_RGB565 - does no support alpha, which basically doubles performance. - - -> Then, there is the alpha value itself. In P8 it is a variable (and fxconv - sets it to 0xff), which burns a register for the comparison and enforces - a fixed order between comparison and left-shift. P8_RGB565A always sets - an alpha value of 0x00 which lifts both constraints. - - -> Then, there are palette indices. In P8 they are unsigned, which requires - an extu.b. 
In P8_RGB565 and P8_RGB565A they are signed, so the sign- - extended value of the mov.b can be used directly (once doubled). The - palette base is simply offset by 128 entries, with colors numbered - -128..-1 first and only then 0..127. - - -> Finally, there's the palette itself. In P8 it always has 256 entries, - even when only a few are used. For small images this is a huge waste, so - P8_RGB565 and P8_RGB565A only store colors that are actually used. - - P8_RGB565A achieves 4.5 cycles/pixel asymptotically, which is really good - compared to 4 cycles/pixel for RGB565A. */ -.align 4 -_P8_RGB565A: - mov.l r13, @-r15 - sub r7, r9 - - mov r7, r13 - add #-2, r9 /* Input stride compensation for pipelining */ - - mov.l r12, @-r15 - shlr r7 - - mov.l r10, @-r15 - movt r6 - - mov.w _P8_RGB565A.palette_distance, r0 - shll r13 - - add r6, r7 - - sub r6, r9 - - sub r6, r4 - - sub r6, r4 - - add r0, r8 - - add r5, r13 - mov r7, r2 - - add #-4, r5 /* Output offset compensation in the loop */ - - shll2 r2 - - add r4, r2 - - START() - - mov.b @r3+, r6 - - /* Save next pixel for the odd-width case */ - mov.w @r13, r12 - - mov.b @r3+, r10 - tst r6, r6 - - /* 2-unrolled 2-stage main loop */ -2: add r6, r6 - mov r6, r0 - - add r10, r10 - bt.s 5f - - tst r10, r10 - mov.w @(r0,r8), r0 - - mov.w r0, @(4,r5) - - 5: mov.b @r3+, r6 - mov r10, r0 - - bt.s 6f - add #4, r5 - - mov.w @(r0,r8), r0 - - mov.w r0, @(2,r5) - - 6: mov.b @r3+, r10 -3: tst r6, r6 - - /* Restore last pixel */ - mov.w r12, @r13 - add r2, r13 - - END_NORET() - mov.l @r15+, r10 - mov.l @r15+, r12 - mov.l @r15+, r13 - mov.l @r15+, r9 - rts - mov.l @r15+, r8 - -_P8_RGB565A.palette_distance: - /* Distance between image pointer and palette array base */ - .word 260 - -/* [Rendering strategy for the P8_RGB565 format] - - See P8_RGB565A for format details. Removing the checks for transparency and - the jumps simplifies the instruction sequence and allows superior - parallelism because all paths are unconditional. 
This routines achieves - 3 cycles/pixel asymptotically. */ -.align 4 -_P8_RGB565: - mov.l r13, @-r15 - sub r7, r9 - - mov r7, r13 - add #-2, r9 /* Input stride compensation for pipelining */ - - mov.l r12, @-r15 - shlr r7 - - mov.l r10, @-r15 - movt r6 - - mov.w _P8_RGB565.palette_distance, r0 - shll r13 - - add r6, r7 - - sub r6, r9 - - sub r6, r4 - - sub r6, r4 - - add r0, r8 - - add r5, r13 - - add #-4, r5 /* Output offset compensation in the loop */ - mov r7, r2 - - shll2 r2 - - add r4, r2 - - START() - - mov.b @r3+, r0 - - /* Save next pixel for the odd-width case */ - mov.w @r13, r12 - - mov.b @r3+, r10 - shll r0 - - /* 2-unrolled 2-stage main loop */ -2: mov.b @r3+, r6 - shll r10 - - mov.w @(r0,r8), r0 - /* This nop is not for show, it actually prevents the loop from slowing - down to 7 cycles /i, probably due to instruction reads alignment. */ - nop - - mov.w r0, @(4,r5) - mov r10, r0 - - mov.b @r3+, r10 - add #4, r5 - - mov.w @(r0,r8), r0 - shll r6 - - mov.w r0, @(2,r5) -3: mov r6, r0 - - /* Restore last pixel */ - mov.w r12, @r13 - add r2, r13 - - END_NORET() - mov.l @r15+, r10 - mov.l @r15+, r12 - mov.l @r15+, r13 - mov.l @r15+, r9 - rts - mov.l @r15+, r8 - -_P8_RGB565.palette_distance: - /* Distance between image pointer and palette array base */ - .word 260 - -/* [Rendering strategy for the P4_RGB565A format] - - This is the most complex format. Most of the remarks that apply to - P8_RGB565A also apply here, except that there are less opportunities to save - computation because nibbles must be extracted anyway. - - The P4_RGB565A format is simply bopti's P4, but an additional variation - P4_RGB565 is specified to save on transparency handling, which is very - expensive. - - The special nature of the nibble packing means the simplest loop form writes - 2 pixels from a 2-aligned source image position in a single iteration. Other - structures don't even come close: selecting nibbles individually is folly, - while not unrolling is inefficient. 
So the whole point of this routine is to - forcibly align the subimage on a byte-aligned and never break that grid. - - The command builder for P4 does this alignment before submitting the - command. Obviously the transform can cause one extra pixel to be overridden - on each side of every line. The command is thus extended with two edge - offsets indicating pixels to preserve at each end. When overwrites occurs, - the edge offsets point to the overwritten pixels so they can be restored. - Otherwise, they point to the next pixels and the restores are no-ops. See - the strategy used for managing unrolling in P8 formats for details. - - The only irregularity is image width, which the command builder cannot - modify. It is rounded up to the next multiple of 2, then halved. There is a - nice trick for this operation, which is [shlr rX] then adding T to rX. We - also need to add -1 for another adjustement, and both are combined into an - addc, which saves one add and one movt off the EX critical chain. - - The main loop achieves 5 cycles/pixel. 
*/ -.align 4 -_P4_RGB565A: - shlr r7 - mov.w @(6, r2), r0 /* command.edge2 */ - - mov.l r12, @-r15 - add #-15, r2 /* Go back to start of command */ - - mov #-1, r12 - shlr r9 - - mov.l r11, @-r15 - addc r12, r9 - - mov r0, r12 - add r12, r12 - - mov.l r10, @-r15 - sub r7, r9 - - mov.b @r2, r11 /* command.edge1 */ - add #2, r8 /* image.palette */ - - mov.l r13, @-r15 - mov r5, r0 - - mov.l r14, @-r15 - shll r11 - - add #-4, r5 - nop /* 4-alignment */ - - START() - - mov.b @r3+, r6 - mov r0, r10 - - mov.w @(r0,r11), r13 - - mov.w @(r0,r12), r14 - shll r6 - - /* Main loop with 2 pixels sharing a single byte */ -2: mov r6, r0 - and #0x1e, r0 - - tst r0, r0 - - bt.s 4f - shlr2 r6 - - mov.w @(r0,r8), r0 - - mov.w r0, @(6,r5) - 4: shlr2 r6 - - mov r6, r0 - and #0x1e, r0 - - tst r0, r0 - mov.b @r3+, r6 - - bt.s 5f - add #4, r5 - - mov.w @(r0,r8), r0 - - mov.w r0, @r5 -3: 5: shll r6 - - mov r10, r0 - mov r7, r10 - - shll2 r10 - - mov.w r13, @(r0,r11) - add r4, r10 - - mov.w r14, @(r0,r12) - add r0, r10 - - mov r10, r0 - /* Parallelizes with [dt r1] expanded from END_NORET() */ - - END_NORET() - mov.l @r15+, r14 - mov.l @r15+, r13 - mov.l @r15+, r10 - mov.l @r15+, r11 - mov.l @r15+, r12 - mov.l @r15+, r9 - rts - mov.l @r15+, r8 - -/* [Rendering strategy for the P4_RGB565 format] - Same as P4_RGB565A without transparency checks (fairly straightforward). The - core loop runs in 3.5 cycles/pixel. 
*/ -.align 4 -_P4_RGB565: - shlr r7 - mov.w @(6, r2), r0 /* command.edge2 */ - - mov.l r10, @-r15 - add #-15, r2 /* Go back to start of command */ - - mov.l r12, @-r15 - shlr r9 - - add #2, r8 /* image.palette */ - mov #-1, r12 - - mov.l r11, @-r15 - addc r12, r9 - - mov r0, r12 - add r12, r12 - - mov.b @r2, r11 /* command.edge1 */ - sub r7, r9 - - mov.l r13, @-r15 - mov #0x1e, r2 - - mov.l r14, @-r15 - shll r11 - - mov r5, r0 - add #-4, r5 - - START() - - mov.b @r3+, r6 - mov #-4, r10 - - mov.l r0, @-r15 - - mov.w @(r0,r11), r13 - - mov.w @(r0,r12), r14 - shll r6 - - /* Main loop with 2 pixels sharing a single byte */ -2: mov r6, r0 - and #0x1e, r0 - - shld r10, r6 - - mov.w @(r0,r8), r0 - and r2, r6 - - mov.w r0, @(6,r5) - mov r6, r0 - - mov.b @r3+, r6 - add #4, r5 - - mov.w @(r0,r8), r0 - - mov.w r0, @r5 -3: shll r6 - - mov.l @r15+, r0 - mov r7, r10 - - shll2 r10 - - mov.w r13, @(r0,r11) - add r4, r10 - - mov.w r14, @(r0,r12) - add r0, r10 - - mov r10, r0 - /* Parallelizes with [dt r1] expanded from END_NORET() */ - - END_NORET() - mov.l @r15+, r14 - mov.l @r15+, r13 - mov.l @r15+, r11 - mov.l @r15+, r12 - mov.l @r15+, r10 - mov.l @r15+, r9 - rts - mov.l @r15+, r8 - -/* [Unsupported formats] - P8 is unsupported, use P8_RGB565 and P8_RGB565A. 
*/ -_NOP: - mov.l @r15+, r9 - rts - mov.l @r15+, r8 diff --git a/azur/src/gint/shaders/image.c b/azur/src/gint/shaders/image.c index 0ef5294..82e439d 100644 --- a/azur/src/gint/shaders/image.c +++ b/azur/src/gint/shaders/image.c @@ -1,88 +1,45 @@ #include #include -uint8_t AZRP_SHADER_IMAGE = -1; - -__attribute__((constructor)) -static void register_shader(void) +void azrp_queue_image(struct gint_image_box *box, image_t const *img, + struct gint_image_cmd *cmd) { - extern azrp_shader_t azrp_shader_image; - AZRP_SHADER_IMAGE = azrp_register_shader(azrp_shader_image); -} - -void azrp_shader_image_configure(void) -{ - azrp_set_uniforms(AZRP_SHADER_IMAGE, (void *)(2 * azrp_width)); -} - -//--- - -/* Profile IDs */ -#define RGB565 0 -#define RGB565A 1 -#define P4_RGB565A 3 -#define P8_RGB565 4 -#define P8_RGB565A 5 -#define P4_RGB565 6 - -void azrp_image(int x, int y, bopti_image_t const *image) -{ - azrp_subimage(x, y, image, 0, 0, image->width, image->height, 0); -} - -void azrp_subimage(int x, int y, bopti_image_t const *image, - int left, int top, int width, int height, int flags) -{ - prof_enter(azrp_perf_cmdgen); - - if(!(flags & DIMAGE_NOCLIP)) { - /* TODO: image: clip function */ - } - - struct azrp_shader_image_command cmd; - cmd.shader_id = AZRP_SHADER_IMAGE; - cmd.columns = width; - cmd.image = image; - - int row_stride; - - if(image->profile == P8_RGB565 || image->profile == P8_RGB565A) { - row_stride = image->width; - cmd.input = (void *)image->data + (image->data[0] * 2) + 2 + - top * row_stride + left; - } - else if(image->profile == P4_RGB565 || image->profile == P4_RGB565A) { - row_stride = (image->width + 1) >> 1; - cmd.input = (void *)image->data + 32 + top * row_stride + (left >> 1); - - int odd_left = left & 1; - int odd_right = (left + width) & 1; - - cmd.edge1 = -1 + odd_left; - cmd.edge2 = width + odd_left; - cmd.columns += odd_left + odd_right; - x -= odd_left; - } - else { - row_stride = image->width << 1; - cmd.input = (void *)image->data + 
top * row_stride + (left << 1); - } + /* TODO: Ironically, this loads all 3 entry points */ + int p = img->profile; + if(p == IMAGE_RGB565 || p == IMAGE_RGB565A) + cmd->shader_id = AZRP_SHADER_IMAGE_RGB16; + else if(p == IMAGE_P8_RGB565 || p == IMAGE_P8_RGB565A) + cmd->shader_id = AZRP_SHADER_IMAGE_P8; + else + cmd->shader_id = AZRP_SHADER_IMAGE_P4; /* This divides by azrp_frag_height */ - int fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4); + /* TODO: Have a proper way to do optimized-division by azrp_frag_height */ + int fragment_id = (azrp_scale == 1) ? (box->y >> 4) : (box->y >> 4); /* These settings only apply to the first fragment */ - int first_y = (y + azrp_frag_offset) & (azrp_frag_height - 1); - cmd.lines = azrp_frag_height - first_y; - cmd.output = 2 * (azrp_width * first_y + x); + int first_y = (box->y + azrp_frag_offset) & (azrp_frag_height - 1); + cmd->lines = min(box->h, azrp_frag_height - first_y); + cmd->output = (void *)azrp_frag + (azrp_width * first_y + cmd->x) * 2; - /* Settings for further updates */ - cmd.height = height; - cmd.row_stride = row_stride; - cmd.x = x; - - int n = 1 + (height - cmd.lines + azrp_frag_height - 1) / azrp_frag_height; - azrp_queue_command(&cmd, sizeof cmd, fragment_id, n); - - prof_leave(azrp_perf_cmdgen); + int n = 1 + (box->h - cmd->lines + azrp_frag_height-1) / azrp_frag_height; + azrp_queue_command(cmd, sizeof *cmd, fragment_id, n); +} + +void azrp_subimage(int x, int y, image_t const *img, + int left, int top, int width, int height, int flags) +{ + int p = img->profile; + + if(p == IMAGE_RGB565 || p == IMAGE_RGB565A) + return azrp_subimage_rgb16(x, y, img, left, top, width, height, flags); + if(p == IMAGE_P8_RGB565 || p == IMAGE_P8_RGB565A) + return azrp_subimage_p8(x, y, img, left, top, width, height, flags); + if(p == IMAGE_P4_RGB565 || p == IMAGE_P4_RGB565A) + return azrp_subimage_p4(x, y, img, left, top, width, height, flags); +} + +void azrp_image(int x, int y, image_t const *img) +{ + 
azrp_subimage(x, y, img, 0, 0, img->width, img->height, 0); } diff --git a/azur/src/gint/shaders/image_macros.S b/azur/src/gint/shaders/image_macros.S new file mode 100644 index 0000000..8649621 --- /dev/null +++ b/azur/src/gint/shaders/image_macros.S @@ -0,0 +1,37 @@ +/* mov.wv: Move at a variable offset. This macro is functionally identical to + mov.w \SRC, @(\OFF, \DST) + except that when OFF=0 it simplifies into [mov.w \SRC, @\DST] so that SRC is + not constrained to be r0. */ +.macro mov.wv SRC, OFF, DST + .if (\OFF == 0) + mov.w \SRC, @\DST + .else + mov.w \SRC, @(\OFF, \DST) + .endif +.endm + +/* START: Sets up the inner and outer loop. The outer loop is anything between + the calls to macros START and END, while the inner loop is the code between + labels 2: and 3: (both *INCLUDED*). */ +.macro START + ldrs 2f + ldre 3f +1: ldrc r2 + nop +.endm + +/* END: Finishes the outer loop and adds strides. */ +.macro END + dt r1 + add r4, r3 + bf.s 1b + add r6, r5 +.endm + +/* EPILOGUE: Finishes the call by reloading registers saved in the prologue. 
*/ +.macro EPILOGUE + mov.l @r15+, r9 + mov r3, r0 + rts + mov.l @r15+, r8 +.endm diff --git a/azur/src/gint/shaders/image_p4.c b/azur/src/gint/shaders/image_p4.c new file mode 100644 index 0000000..4938c10 --- /dev/null +++ b/azur/src/gint/shaders/image_p4.c @@ -0,0 +1,70 @@ +#include +#include + +uint8_t AZRP_SHADER_IMAGE_P4 = -1; + +static void shader_p4(void *uniforms, void *command, void *fragment) +{ + struct gint_image_cmd *cmd = (void *)command; + cmd->input = gint_image_p4_loop((int)uniforms, cmd); + cmd->height -= cmd->lines; + cmd->lines = min(cmd->height, azrp_frag_height); + cmd->output = fragment + cmd->x * 2; +} + +__attribute__((constructor)) +static void register_shader(void) +{ + AZRP_SHADER_IMAGE_P4 = azrp_register_shader(shader_p4); +} + +void azrp_shader_image_p4_configure(void) +{ + azrp_set_uniforms(AZRP_SHADER_IMAGE_P4, (void *)azrp_width); +} + +void azrp_image_p4(int x, int y, image_t const *img, int eff) +{ + azrp_subimage_p4(x, y, img, 0, 0, img->width, img->height, eff); +} + +void azrp_subimage_p4(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff) +{ + if(img->profile == IMAGE_P4_RGB565A) + return azrp_subimage_p4_clearbg(x, y, img, left, top, w, h, eff, + img->alpha); + + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, true, true, &cmd, azrp_width, + azrp_height)) { + cmd.loop = azrp_image_shader_p4_normal; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} + +void azrp_image_p4_clearbg(int x, int y, image_t const *img, int eff, int bg) +{ + azrp_subimage_p4_clearbg(x, y, img, 0, 0, img->width, img->height, eff,bg); +} + +void azrp_subimage_p4_clearbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + 
if(gint_image_mkcmd(&box, img, eff, true, true, &cmd, azrp_width, + azrp_height)) { + cmd.effect += 4; + cmd.color_1 = bg_color; + cmd.loop = gint_image_p4_clearbg_alt; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} diff --git a/azur/src/gint/shaders/image_p4_dye.c b/azur/src/gint/shaders/image_p4_dye.c new file mode 100644 index 0000000..f9a860f --- /dev/null +++ b/azur/src/gint/shaders/image_p4_dye.c @@ -0,0 +1,26 @@ +#include + +void azrp_image_p4_dye(int x, int y, image_t const *img, int eff, + int dye_color) +{ + azrp_subimage_p4_dye(x, y, img, 0, 0, img->width, img->height, eff, + dye_color); +} + +void azrp_subimage_p4_dye(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int dye_color) +{ + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, true, true, &cmd, azrp_width, + azrp_height)) { + cmd.effect += 4; + cmd.color_1 = img->alpha; + cmd.color_2 = dye_color; + cmd.loop = gint_image_p4_dye; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} diff --git a/azur/src/gint/shaders/image_p4_effect.c b/azur/src/gint/shaders/image_p4_effect.c new file mode 100644 index 0000000..8df2935 --- /dev/null +++ b/azur/src/gint/shaders/image_p4_effect.c @@ -0,0 +1,31 @@ +#include + +void azrp_subimage_p4_effect(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, ...) 
+{ + va_list args; + va_start(args, eff); + + if(eff & IMAGE_CLEARBG) { + int bg = va_arg(args, int); + azrp_subimage_p4_clearbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_SWAPCOLOR) { + int from = va_arg(args, int); + int to = va_arg(args, int); + azrp_subimage_p4_swapcolor(x, y, img, left, top, w, h, eff, from, to); + } + else if(eff & IMAGE_ADDBG) { + int bg = va_arg(args, int); + azrp_subimage_p4_addbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_DYE) { + int dye = va_arg(args, int); + azrp_subimage_p4_dye(x, y, img, left, top, w, h, eff, dye); + } + else { + azrp_subimage_p4(x, y, img, left, top, w, h, eff); + } + + va_end(args); +} diff --git a/azur/src/gint/shaders/image_p4_normal.S b/azur/src/gint/shaders/image_p4_normal.S new file mode 100644 index 0000000..b41db6f --- /dev/null +++ b/azur/src/gint/shaders/image_p4_normal.S @@ -0,0 +1,119 @@ +.global _azrp_image_shader_p4_normal +#include "image_macros.S" + +/* P4 Opaque rendering, Azur version: trivial with loop transforms. + + This is a pretty direct loop with no difficult tricks involved; it expands + on P8 by adding another edge pointer. The main change is the decoding logic + which now only involves a single byte to load for every two pixels, but more + arithmetic to extract the nibbles. + + All the loops in Azur's P4 functions are obvious EX chains and thus any + optimization would need to simplify the arithmetic to gain any half-cycles. 
+ + r0: [temporary] + r7: Right edge pointer + r8: Right edge value + r9: Palette + r10: Left edge pointer + r11: Left edge value + r12: Edge stride + r13: [temporary] + r14: [temporary] */ + +.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + shlr r2 + nop + + add r10, r10 + nop + + mov.l @r8+, r9 /* cmd.palette */ + mov r2, r0 + + mov.w @r8+, r7 /* cmd.edge_2 */ + shll2 r0 + + mov.l r12, @-r15 + shll r7 + + mov.l r11, @-r15 + add r5, r7 + + mov r0, r12 + add r6, r12 + + mov.l r13, @-r15 + add r5, r10 + + mov.l r14, @-r15 + add #-4, r5 + + add #-1, r4 /* Input stride compensation for pipelining */ + nop + + .if \HFLIP + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + START + + mov.b @r3+, \TMP1 + mov #-4, \TMP2 + + mov.w @r7, r8 /* Save right edge */ + nop + + mov.w @r10, r11 /* Save left edge */ + shll \TMP1 + +2: mov \TMP1, r0 + and #0x1e, r0 + + shld \TMP2, \TMP1 + mov #0x1e, \TMP2 + + mov.w @(r0,r9), r0 + and \TMP2, \TMP1 + + mov.w r0, @(\OFF1,r5) + mov \TMP1, r0 + + mov.b @r3+, \TMP1 + add #\OUT_DIR, r5 + + mov.w @(r0,r9), r0 + mov #-4, \TMP2 + + mov.w r0, @(\OFF2,r5) +3: shll \TMP1 + + mov.w r8, @r7 /* Restore right edge */ + add r12, r7 + + mov.w r11, @r10 /* Restore left edge */ + add r12, r10 + + END + + mov.l @r15+, r14 + mov.l @r15+, r13 + mov.l @r15+, r11 + mov.l @r15+, r12 + mov.l @r15+, r10 + EPILOGUE +.endm + +_azrp_image_shader_p4_normal: + tst #1, r0 + bf 9f + + GEN_NORMAL_LOOP 0, 4, r13, r14, 6, 0 +9: GEN_NORMAL_LOOP 1, -4, r13, r14, 0, 6 diff --git a/azur/src/gint/shaders/image_p4_swapcolor.c b/azur/src/gint/shaders/image_p4_swapcolor.c new file mode 100644 index 0000000..4464027 --- /dev/null +++ b/azur/src/gint/shaders/image_p4_swapcolor.c @@ -0,0 +1,51 @@ +#include + +void azrp_image_p4_swapcolor(int x, int y, image_t const *img, int eff, + int old_color, int new_color) +{ + azrp_subimage_p4_swapcolor(x, y, img, 0, 0, img->width, img->height, + eff, old_color, new_color); +} + +void 
azrp_subimage_p4_swapcolor(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int old_index, int new_color) +{ + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, true, true, &cmd, azrp_width, + azrp_height)) { + cmd.effect += 8; + cmd.color_1 = old_index; + cmd.color_2 = new_color; + cmd.loop = gint_image_p4_swapcolor; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} + +void azrp_image_p4_addbg(int x, int y, image_t const *img, int eff, + int bg_color) +{ + azrp_subimage_p4_addbg(x, y, img, 0, 0, img->width, img->height, + eff, bg_color); +} + +void azrp_subimage_p4_addbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, true, true, &cmd, azrp_width, + azrp_height)) { + cmd.effect += 8; + cmd.color_1 = img->alpha; + cmd.color_2 = bg_color; + cmd.loop = gint_image_p4_swapcolor; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} diff --git a/azur/src/gint/shaders/image_p8.c b/azur/src/gint/shaders/image_p8.c new file mode 100644 index 0000000..34cd47c --- /dev/null +++ b/azur/src/gint/shaders/image_p8.c @@ -0,0 +1,71 @@ +#include +#include + +uint8_t AZRP_SHADER_IMAGE_P8 = -1; + +static void shader_p8(void *uniforms, void *command, void *fragment) +{ + struct gint_image_cmd *cmd = (void *)command; + cmd->input = gint_image_p8_loop((int)uniforms, cmd); + cmd->height -= cmd->lines; + cmd->lines = min(cmd->height, azrp_frag_height); + cmd->output = fragment + cmd->x * 2; +} + +__attribute__((constructor)) +static void register_shader(void) +{ + AZRP_SHADER_IMAGE_P8 = azrp_register_shader(shader_p8); +} + +void azrp_shader_image_p8_configure(void) +{ + 
azrp_set_uniforms(AZRP_SHADER_IMAGE_P8, (void *)azrp_width); +} + +void azrp_image_p8(int x, int y, image_t const *img, int eff) +{ + azrp_subimage_p8(x, y, img, 0, 0, img->width, img->height, eff); +} + +void azrp_subimage_p8(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff) +{ + if(img->profile == IMAGE_P8_RGB565A) + return azrp_subimage_p8_clearbg(x, y, img, left, top, w, h, eff, + img->alpha); + + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width, + azrp_height)) { + cmd.loop = azrp_image_shader_p8_normal; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} + +void azrp_image_p8_clearbg(int x, int y, image_t const *img, int eff, int bg) +{ + azrp_subimage_p8_clearbg(x, y, img, 0, 0, img->width, img->height, eff, + bg); +} + +void azrp_subimage_p8_clearbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width, + azrp_height)) { + cmd.effect += 4; + cmd.color_1 = bg_color; + cmd.loop = gint_image_p8_clearbg; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} diff --git a/azur/src/gint/shaders/image_p8_dye.c b/azur/src/gint/shaders/image_p8_dye.c new file mode 100644 index 0000000..a0128ee --- /dev/null +++ b/azur/src/gint/shaders/image_p8_dye.c @@ -0,0 +1,26 @@ +#include + +void azrp_image_p8_dye(int x, int y, image_t const *img, int eff, + int dye_color) +{ + azrp_subimage_p8_dye(x, y, img, 0, 0, img->width, img->height, eff, + dye_color); +} + +void azrp_subimage_p8_dye(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int dye_color) +{ + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { 
x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width, + azrp_height)) { + cmd.effect += 4; + cmd.color_1 = img->alpha; + cmd.color_2 = dye_color; + cmd.loop = gint_image_p8_dye; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} diff --git a/azur/src/gint/shaders/image_p8_effect.c b/azur/src/gint/shaders/image_p8_effect.c new file mode 100644 index 0000000..1c264f2 --- /dev/null +++ b/azur/src/gint/shaders/image_p8_effect.c @@ -0,0 +1,31 @@ +#include + +void azrp_subimage_p8_effect(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, ...) +{ + va_list args; + va_start(args, eff); + + if(eff & IMAGE_CLEARBG) { + int bg = va_arg(args, int); + azrp_subimage_p8_clearbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_SWAPCOLOR) { + int from = va_arg(args, int); + int to = va_arg(args, int); + azrp_subimage_p8_swapcolor(x, y, img, left, top, w, h, eff, from, to); + } + else if(eff & IMAGE_ADDBG) { + int bg = va_arg(args, int); + azrp_subimage_p8_addbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_DYE) { + int dye = va_arg(args, int); + azrp_subimage_p8_dye(x, y, img, left, top, w, h, eff, dye); + } + else { + azrp_subimage_p8(x, y, img, left, top, w, h, eff); + } + + va_end(args); +} diff --git a/azur/src/gint/shaders/image_p8_normal.S b/azur/src/gint/shaders/image_p8_normal.S new file mode 100644 index 0000000..1a3e874 --- /dev/null +++ b/azur/src/gint/shaders/image_p8_normal.S @@ -0,0 +1,100 @@ +.global _azrp_image_shader_p8_normal +#include "image_macros.S" + +/* P8 Opaque rendering, Azur version: trivial with loop transforms. + + This is fairly straightforward, with no particular tricks; just index the + palette as fast as possible in a 2-unrolled 2-stage-pipeline loop that maxes + out CPU speed. 
+ + r0: [temporary] + r7: Right edge pointer + r8: Right edge value + r9: Palette + r10: [temporary] + r11: [temporary] + r12: Right edge stride */ + +.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + mov.l @r8+, r9 /* cmd.palette */ + shlr r2 + + mov.w @r8+, r7 /* cmd.edge_2 */ + mov r2, r0 + + mov.l r12, @-r15 + shll2 r0 + + mov.l r10, @-r15 + shll r7 + + mov.l r11, @-r15 + add r5, r7 + + mov r0, r12 + add r6, r12 + + add #-4, r5 + nop + + add #-2, r4 /* Input stride compensation for pipelining */ + nop + + .if \HFLIP + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + START + + mov.b @r3+, r0 + nop + + mov.w @r7, r8 /* Save right edge */ + nop + + mov.b @r3+, \TMP1 + shll r0 + +2: mov.b @r3+, \TMP2 + shll \TMP1 + + mov.w @(r0,r9), r0 + /* Fun fact: omitting this nop slows the loop to 7 cycles/i */ + nop + + mov.w r0, @(\OFF1,r5) + mov \TMP1, r0 + + mov.b @r3+, \TMP1 + add #\OUT_DIR, r5 + + mov.w @(r0,r9), r0 + shll \TMP2 + + mov.w r0, @(\OFF2,r5) +3: mov \TMP2, r0 + + mov.w r8, @r7 /* Restore right edge */ + add r12, r7 + + END + + mov.l @r15+, r11 + mov.l @r15+, r10 + mov.l @r15+, r12 + EPILOGUE +.endm + +_azrp_image_shader_p8_normal: + tst #1, r0 + bf 9f + + GEN_NORMAL_LOOP 0, 4, r10, r11, 4, 2 +9: GEN_NORMAL_LOOP 1, -4, r10, r11, 2, 4 diff --git a/azur/src/gint/shaders/image_p8_swapcolor.S b/azur/src/gint/shaders/image_p8_swapcolor.S new file mode 100644 index 0000000..b6366a5 --- /dev/null +++ b/azur/src/gint/shaders/image_p8_swapcolor.S @@ -0,0 +1,142 @@ +.global _azrp_image_shader_p8_swapcolor +#include "image_macros.S" + +/* P8 SWAPCOLOR, Azur version: by branchless xor selection. + + This is essentially the same logic as gint's P8 SWAPCOLOR version, but with + a 2-unrolled 2-stage-pipeline since the bottleneck on RAM is now on the CPU. + + r0: [temporary] + r7: Right edge pointer + r8: palette[cmd.color_1] ^ cmd.color_2 (ie. 
x ^ y) + r9: Palette + r10: Holds (x ^ y) & -(c == x) during selection + r11: cmd.color_1 + r12: Right edge stride + r13: [temporary] + r14: [temporary] + + Spilled to stack: + @(-4,r15): Right edge value */ + +.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + mov.l @r8+, r9 /* cmd.palette */ + shlr r2 + + mov.w @r8+, r7 /* cmd.edge_2 */ + mov r2, r0 + + mov.l r12, @-r15 + shll2 r0 + + mov.l r11, @-r15 + shll r7 + + mov.w @r8+, r11 /* cmd.color_1 */ + add r5, r7 + + mov.l r10, @-r15 + add #-4, r5 + + mov.l r13, @-r15 + exts.b r11, r11 + + mov r11, r13 + add r13, r13 + + mov.w @r8, r8 /* cmd.color_2 */ + add r9, r13 + + mov r0, r12 + add r6, r12 + + mov.w @r13, r13 + add #-2, r4 /* Input stride compensation for pipelining */ + + mov.l r14, @-r15 + nop + + xor r13, r8 + nop + + .if \HFLIP + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + START + + mov.b @r3+, \TMP2 + nop + + mov.w @r7, r0 /* Save right edge */ + nop + + mov.l r0, @-r15 + cmp/eq \TMP2, r11 + + mov.b @r3+, \TMP1 + add \TMP2, \TMP2 + +2: subc r10, r10 + mov \TMP2, r0 + + cmp/eq \TMP1, r11 + mov.w @(r0, r9), r0 + + and r8, r10 + nop + + xor r10, r0 + nop + + mov.w r0, @(\OFF1, r5) + add #\OUT_DIR, r5 + + mov.b @r3+, \TMP2 + subc r10, r10 + + add \TMP1, \TMP1 + mov \TMP1, r0 + + mov.w @(r0, r9), r0 + cmp/eq \TMP2, r11 + + mov.b @r3+, \TMP1 + and r8, r10 + + xor r10, r0 + nop + + mov.w r0, @(\OFF2, r5) +3: add \TMP2, \TMP2 + + /* TODO: Use x0 as temporary storage by moving the main registers */ + mov.l @r15+, r0 + nop + + mov.w r0, @r7 /* Restore right edge */ + add r12, r7 + + END + + mov.l @r15+, r14 + mov.l @r15+, r13 + mov.l @r15+, r10 + mov.l @r15+, r11 + mov.l @r15+, r12 + EPILOGUE +.endm + +_azrp_image_shader_p8_swapcolor: + tst #1, r0 + bf 9f + + GEN_SWAPCOLOR_LOOP 0, 4, r13, r14, 4, 2 +9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r14, 2, 4 diff --git a/azur/src/gint/shaders/image_p8_swapcolor.c b/azur/src/gint/shaders/image_p8_swapcolor.c new file mode 100644 index 
0000000..ddb8da9 --- /dev/null +++ b/azur/src/gint/shaders/image_p8_swapcolor.c @@ -0,0 +1,51 @@ +#include + +void azrp_image_p8_swapcolor(int x, int y, image_t const *img, int eff, + int old_color, int new_color) +{ + azrp_subimage_p8_swapcolor(x, y, img, 0, 0, img->width, img->height, + eff, old_color, new_color); +} + +void azrp_subimage_p8_swapcolor(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int old_index, int new_color) +{ + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width, + azrp_height)) { + cmd.effect += 8; + cmd.color_1 = old_index; + cmd.color_2 = new_color; + cmd.loop = azrp_image_shader_p8_swapcolor; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} + +void azrp_image_p8_addbg(int x, int y, image_t const *img, int eff, + int bg_color) +{ + azrp_subimage_p8_addbg(x, y, img, 0, 0, img->width, img->height, + eff, bg_color); +} + +void azrp_subimage_p8_addbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width, + azrp_height)) { + cmd.effect += 8; + cmd.color_1 = img->alpha; + cmd.color_2 = bg_color; + cmd.loop = azrp_image_shader_p8_swapcolor; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} diff --git a/azur/src/gint/shaders/image_rgb16.c b/azur/src/gint/shaders/image_rgb16.c new file mode 100644 index 0000000..f7b4f82 --- /dev/null +++ b/azur/src/gint/shaders/image_rgb16.c @@ -0,0 +1,71 @@ +#include +#include + +uint8_t AZRP_SHADER_IMAGE_RGB16 = -1; + +static void shader_rgb16(void *uniforms, void *command, void *fragment) +{ + struct gint_image_cmd *cmd = (void *)command; + cmd->input = 
gint_image_rgb16_loop((int)uniforms, cmd); + cmd->height -= cmd->lines; + cmd->lines = min(cmd->height, azrp_frag_height); + cmd->output = fragment + cmd->x * 2; +} + +__attribute__((constructor)) +static void register_shader(void) +{ + AZRP_SHADER_IMAGE_RGB16 = azrp_register_shader(shader_rgb16); +} + +void azrp_shader_image_rgb16_configure(void) +{ + azrp_set_uniforms(AZRP_SHADER_IMAGE_RGB16, (void *)azrp_width); +} + +void azrp_image_rgb16(int x, int y, image_t const *img, int eff) +{ + azrp_subimage_rgb16(x, y, img, 0, 0, img->width, img->height, eff); +} + +void azrp_subimage_rgb16(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff) +{ + if(img->profile == IMAGE_RGB565A) + return azrp_subimage_rgb16_clearbg(x, y, img, left, top, w, h, eff, + img->alpha); + + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, false, false, &cmd, azrp_width, + azrp_height)) { + cmd.loop = azrp_image_shader_rgb16_normal; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} + +void azrp_image_rgb16_clearbg(int x, int y, image_t const *img, int eff, int bg) +{ + azrp_subimage_rgb16_clearbg(x, y, img, 0, 0, img->width, img->height, eff, + bg); +} + +void azrp_subimage_rgb16_clearbg(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int bg_color) +{ + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width, + azrp_height)) { + cmd.effect += 4; + cmd.color_1 = bg_color; + cmd.loop = azrp_image_shader_rgb16_clearbg; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} diff --git a/azur/src/gint/shaders/image_rgb16_clearbg.S b/azur/src/gint/shaders/image_rgb16_clearbg.S new file mode 100644 index 0000000..2996159 --- /dev/null +++ 
b/azur/src/gint/shaders/image_rgb16_clearbg.S @@ -0,0 +1,135 @@ +.global _azrp_image_shader_rgb16_clearbg +#include "image_macros.S" + +/* RGB16 CLEARBG and DYE, Azur version: by NULL canceling. + + This function handles both CLEARBG and DYE, which happen to work identically + on RGB16, save for the fact that the DYE loop ignores the value of opaque + pixels and uses the dye color instead. It's one of the standard 2-unrolled + 2-stage-pipeline loops with a right edge, using NULL canceling for + transparency. + + r0: [temporary] (CLEARBG) or dye value (DYE) + r7: Right edge pointer + r8: Right edge value + r9: Background color + r10: Nullable output pointer + r11: 0 (to neutralize addc during NULL-cancelling) + r12: Right edge stride + r13: [temporary] (one of the pixels) + r14: [temporary] (one of the pixels in DYE) + + The GEN_CLEARBG_LOOP macro parameters are as follows. All of them except for + SRC1 and SRC2 are determined by HFLIP; it's just simpler to set their values + on the macro's call site than have .if statements everywhere. This set of + parameters is used for virtually all the functions of all the formats. + + SRC1 and SRC2 are used in DYE mode to replace the pixel values read from + memory with a constant register. 
+ + HFLIP: Whether to enable HFLIP + OUT_DIR: Variation of r5 at each loop, either 4 or -4 + TMP1: Temporary register for first pixel + TMP2: Temporary register for second pixel + OFF1: Offset for first pixel write + OFF2: Offset for second pixel write + SRC1: Source of first write (here either TMP1 or r0) + SRC2: Source of second write (here either TMP2 or r0) */ + +.macro GEN_CLEARBG_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2, SRC1, SRC2 + mov.w @r8+, r7 /* cmd.edge_2 */ + shlr r2 + + mov.l r11, @-r15 + mov #0, r11 + + mov.w @r8+, r9 /* cmd.color_1 */ + shll r7 + + mov.l r10, @-r15 + add r5, r7 + + mov.l r12, @-r15 + add #-2, r5 /* Pre-decrement, see output logic */ + + mov r2, r12 + shll2 r12 + + mov.l r13, @-r15 + add r6, r12 + + mov.l r14, @-r15 + add #-2, r4 /* Input stride compensation for pipelining */ + + .if \HFLIP + mov r2, r0 + shll2 r0 + + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + mov.w @r8+, r0 /* cmd.color_2 */ + nop + + START + + mov.w @r3+, \TMP1 + nop + + mov.w @r7, r8 /* Save right edge */ + nop + + cmp/eq \TMP1, r9 + nop + +2: mov #-1, r10 + addc r11, r10 + + mov.w @r3+, \TMP2 + and r5, r10 + + add #\OUT_DIR, r5 + nop + + mov.wv \SRC1, \OFF1, r10 + cmp/eq \TMP2, r9 + + mov #-1, r10 + addc r11, r10 + + mov.w @r3+, \TMP1 + and r5, r10 + + cmp/eq \TMP1, r9 +3: mov.wv \SRC2, \OFF2, r10 + + mov.w r8, @r7 /* Restore right edge */ + add r12, r7 + + END + + mov.l @r15+, r14 + mov.l @r15+, r13 + mov.l @r15+, r12 + mov.l @r15+, r10 + mov.l @r15+, r11 + EPILOGUE +.endm + +#ifndef AZRP_RGB16_DYE + +_azrp_image_shader_rgb16_clearbg: + tst #1, r0 + bf 9f + + GEN_CLEARBG_DYE_LOOP 0, 4, r0, r13, 2, 0, r0, r13 +9: GEN_CLEARBG_DYE_LOOP 1, -4, r13, r0, 0, 2, r13, r0 + +#endif diff --git a/azur/src/gint/shaders/image_rgb16_dye.S b/azur/src/gint/shaders/image_rgb16_dye.S new file mode 100644 index 0000000..bbc9725 --- /dev/null +++ b/azur/src/gint/shaders/image_rgb16_dye.S @@ -0,0 +1,12 @@ +.global _azrp_image_shader_rgb16_dye 
+#define AZRP_RGB16_DYE +#include "image_rgb16_clearbg.S" + +/* See image_rgb16_clearbg.S for details on this function. */ + +_azrp_image_shader_rgb16_dye: + tst #1, r0 + bf 9f + + GEN_CLEARBG_DYE_LOOP 0, 4, r14, r13, 2, 0, r0, r0 +9: GEN_CLEARBG_DYE_LOOP 1, -4, r13, r14, 0, 2, r0, r0 diff --git a/azur/src/gint/shaders/image_rgb16_dye.c b/azur/src/gint/shaders/image_rgb16_dye.c new file mode 100644 index 0000000..d1572a1 --- /dev/null +++ b/azur/src/gint/shaders/image_rgb16_dye.c @@ -0,0 +1,26 @@ +#include + +void azrp_image_rgb16_dye(int x, int y, image_t const *img, int eff, + int dye_color) +{ + azrp_subimage_rgb16_dye(x, y, img, 0, 0, img->width, img->height, eff, + dye_color); +} + +void azrp_subimage_rgb16_dye(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, int dye_color) +{ + prof_enter(azrp_perf_cmdgen); + struct gint_image_box box = { x, y, w, h, left, top }; + struct gint_image_cmd cmd; + + if(gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width, + azrp_height)) { + cmd.effect += 12; + cmd.color_1 = img->alpha; + cmd.color_2 = dye_color; + cmd.loop = azrp_image_shader_rgb16_dye; + azrp_queue_image(&box, img, &cmd); + } + prof_leave(azrp_perf_cmdgen); +} diff --git a/azur/src/gint/shaders/image_rgb16_effect.c b/azur/src/gint/shaders/image_rgb16_effect.c new file mode 100644 index 0000000..ec12669 --- /dev/null +++ b/azur/src/gint/shaders/image_rgb16_effect.c @@ -0,0 +1,31 @@ +#include + +void azrp_subimage_rgb16_effect(int x, int y, image_t const *img, + int left, int top, int w, int h, int eff, ...) 
+{ + va_list args; + va_start(args, eff); + + if(eff & IMAGE_CLEARBG) { + int bg = va_arg(args, int); + azrp_subimage_rgb16_clearbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_SWAPCOLOR) { + int c1 = va_arg(args, int); + int c2 = va_arg(args, int); + azrp_subimage_rgb16_swapcolor(x, y, img, left, top, w, h, eff, c1, c2); + } + else if(eff & IMAGE_ADDBG) { + int bg = va_arg(args, int); + azrp_subimage_rgb16_addbg(x, y, img, left, top, w, h, eff, bg); + } + else if(eff & IMAGE_DYE) { + int dye = va_arg(args, int); + azrp_subimage_rgb16_dye(x, y, img, left, top, w, h, eff, dye); + } + else { + azrp_subimage_rgb16(x, y, img, left, top, w, h, eff); + } + + va_end(args); +} diff --git a/azur/src/gint/shaders/image_rgb16_normal.S b/azur/src/gint/shaders/image_rgb16_normal.S new file mode 100644 index 0000000..6baa0cb --- /dev/null +++ b/azur/src/gint/shaders/image_rgb16_normal.S @@ -0,0 +1,124 @@ +.global _azrp_image_shader_rgb16_normal +#include "image_macros.S" + +/* RGB16 Opaque rendering, Azur version: by straightforward copy. + + This function of the image renderer is designed for Azur's streaming model + only. Unlike its RAM-model counterpart which is bottlenecked by its writing + speed, this function is entirely limited by the CPU's ability to output the + data in the required format. + + In the simple case where there is no color effect and no HFLIP, the task of + rendering a 16-bit opaque image boils down to a 2-dimensional memcpy. This + task can be optimized by moving longwords if the source and destination are + co-4-aligned, with four variations depending on the width and initial + position, identified by the following parameters: + + * w1 / w2 denotes the parity of the command width; + * o2 / o4 denotes the alignment of the output. 
+ + It is easy to see that when input and output are not co-aligned, any attempt + to combine two word reads into a single long write requires at least 3 + cycles per 2 pixels and needs parallelism over several pixels to not get + immediately shut down by the LS-to-EX delay. Here we decide to naively copy + by words, which achieves 4 cycles per 2 pixels, mainly because large RGB16 + images are very quickly bottlenecked in reading by their own size anyway. + + The HFLIP version also needs to rearrange pixels, and is thus performed with + word-based copies in all situations, which is a straightforward process. */ + +_azrp_image_shader_rgb16_normal: + /* Not a single cycle */ + tst #1, r0 + bf _BACKWARD_WORD_COPY + + mov #8, r0 /* Use the naive method for width ≤ 8 */ + cmp/ge r2, r0 + + bt.s _FORWARD_WORD_COPY + nop + + mov r5, r0 /* Check if r3 and r5 are co-aligned */ + xor r3, r0 + + /* Not a single cycle */ + tst #2, r0 + bt _FORWARD_LONG_COPY + +_FORWARD_WORD_COPY: + START +2: movs.w @r3+, x0 +3: movs.w x0, @r5+ + END + EPILOGUE + +_FORWARD_LONG_COPY: + shlr r2 /* Test width parity */ + mov #2, r0 + + bt .w1 + nop + +.w2: tst r0, r3 /* Test alignment of input */ + bf .w2d2 + +.w2d4: START +2: movs.l @r3+, x0 +3: movs.l x0, @r5+ + END + EPILOGUE + +.w2d2: add #-1, r2 + nop + + START + movs.w @r3+, x0 + movs.w x0, @r5+ + +2: movs.l @r3+, x0 +3: movs.l x0, @r5+ + + movs.w @r3+, x0 + movs.w x0, @r5+ + END + EPILOGUE + +.w1: tst r0, r3 /* Test alignment of input */ + bf .w1d2 + +.w1d4: START +2: movs.l @r3+, x0 +3: movs.l x0, @r5+ + + movs.w @r3+, x0 + movs.w x0, @r5+ + END + EPILOGUE + +.w1d2: START + movs.w @r3+, x0 + movs.w x0, @r5+ + +2: movs.l @r3+, x0 +3: movs.l x0, @r5+ + END + EPILOGUE + +_BACKWARD_WORD_COPY: + mov r2, r0 + shll r0 + + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + + START +2: movs.w @r3+, x0 +3: movs.w x0, @-r5 + END + EPILOGUE diff --git a/azur/src/gint/shaders/image_rgb16_swapcolor.S 
b/azur/src/gint/shaders/image_rgb16_swapcolor.S new file mode 100644 index 0000000..b36aa3f --- /dev/null +++ b/azur/src/gint/shaders/image_rgb16_swapcolor.S @@ -0,0 +1,116 @@ +.global _azrp_image_shader_rgb16_swapcolor +#include "image_macros.S" + +/* RGB16 SWAPCOLOR, Azur version: by branchless xor selection. + + The xor selection is explained in gint's version of P8 SWAPCOLOR. This + version's selection is slightly simpler because we don't have to index the + palette to find the source color. We use a 2-unrolled 2-stage-pipeline loop + to optimize for CPU speed. + + r7: Right edge pointer + r8: Right edge value + r9: cmd.color_1 + r10: Holds (x ^ y) & -(c == x) during selection + r11: cmd.color_1 ^ cmd.color_2 (ie. x ^ y) + r12: Right edge stride + r13: [temporary] */ + +.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 + mov.w @r8+, r7 /* cmd.edge_2 */ + shlr r2 + + mov.l r11, @-r15 + add #-2, r4 /* Input stride compensation for pipelining */ + + mov.w @r8+, r9 /* cmd.color_1 */ + shll r7 + + mov.l r10, @-r15 + add r5, r7 + + mov.l r12, @-r15 + add #-2, r5 /* Predecrement, see output logic */ + + mov.w @r8+, r11 /* cmd.color_2 */ + mov r2, r12 + + mov.l r13, @-r15 + shll2 r12 + + add r6, r12 + nop + + xor r9, r11 + nop + + .if \HFLIP + mov r2, r0 + shll2 r0 + + add r0, r5 + nop + + shll r0 + nop + + add r0, r6 + nop + .endif + + START + + mov.w @r3+, \TMP1 + nop + + mov.w @r7, r8 /* Save right edge */ + nop + + cmp/eq \TMP1, r9 + nop + +2: subc r10, r10 + nop + + and r11, r10 + mov.w @r3+, \TMP2 + + xor r10, \TMP1 + nop + + mov.wv \TMP1 \OFF1 r5 + cmp/eq \TMP2, r9 + + add #\OUT_DIR, r5 + nop + + subc r10, r10 + nop + + and r11, r10 + mov.w @r3+, \TMP1 + + xor r10, \TMP2 + nop + + cmp/eq \TMP1, r9 +3: mov.wv \TMP2 \OFF2 r5 + + mov.w r8, @r7 /* Restore right edge */ + add r12, r7 + + END + + mov.l @r15+, r13 + mov.l @r15+, r12 + mov.l @r15+, r10 + mov.l @r15+, r11 + EPILOGUE +.endm + +_azrp_image_shader_rgb16_swapcolor: + tst #1, r0 + bf 9f + + 
@@ -0,0 +1,51 @@ +#include <azur/gint/render.h> +