azur: image shader with dynamic effects, and 16-row fragment

This commit is contained in:
Lephe 2022-05-07 18:17:33 +01:00 committed by Lephenixnoir
parent e124719de3
commit 8ac9ac747a
Signed by untrusted user: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
27 changed files with 1599 additions and 877 deletions

View File

@ -28,10 +28,32 @@ endif()
if(AZUR_GRAPHICS_GINT_CG)
list(APPEND SOURCES
src/gint/render.c
src/gint/r61524.s
# Clear shader
src/gint/shaders/clear.c
src/gint/shaders/clear.S
# Image shader
src/gint/shaders/image.c
src/gint/shaders/image.S)
src/gint/shaders/image_rgb16_normal.S
src/gint/shaders/image_rgb16_clearbg.S
src/gint/shaders/image_rgb16_swapcolor.S
src/gint/shaders/image_rgb16_dye.S
src/gint/shaders/image_p8_normal.S
src/gint/shaders/image_p8_swapcolor.S
src/gint/shaders/image_p4_normal.S
# Image shader interface
src/gint/shaders/image_rgb16.c
src/gint/shaders/image_rgb16_effect.c
src/gint/shaders/image_rgb16_swapcolor.c
src/gint/shaders/image_rgb16_dye.c
src/gint/shaders/image_p8.c
src/gint/shaders/image_p8_effect.c
src/gint/shaders/image_p8_swapcolor.c
src/gint/shaders/image_p8_dye.c
src/gint/shaders/image_p4.c
src/gint/shaders/image_p4_effect.c
src/gint/shaders/image_p4_swapcolor.c
src/gint/shaders/image_p4_dye.c)
endif()
add_library(azur STATIC ${SOURCES})

View File

@ -1,5 +1,5 @@
//---
// azur.defs: Generation definitions
// azur.defs: General definitions that are included in every file
//---
/* This exposes compile-time configuration symbols. I don't like running the

View File

@ -33,8 +33,8 @@
#include <azur/defs.h>
AZUR_BEGIN_DECLS
#include <gint/defs/types.h>
#include <gint/display.h>
#include <gint/image.h>
#include <libprof.h>
@ -45,7 +45,7 @@ AZUR_BEGIN_DECLS
typedef void azrp_shader_t(void *uniforms, void *command, void *fragment);
/* Video memory fragment used as rendering target (in XRAM). */
extern uint16_t azrp_frag[];
extern uint16_t *azrp_frag;
/* Maximum number of commands that can be queued. (This is only one of two
limits, the other being the size of the command data.) */
@ -128,19 +128,19 @@ extern int azrp_frag_height;
The settings in each mode are as follows:
* x1: Display resolution: 396x224
Fragment size: 8 rows (6336 bytes)
Fragment size: 16 rows (12672 bytes)
Number of fragments: 14 (15 if an offset is used)
Total size of graphics data: 177.408 kB
Total size of graphics data: 177'408 bytes
* x2: Display resolution: 198x112
Fragment size: 16 rows (6336 bytes)
Fragment size: 16 rows (6336 bytes) # TODO: increase
Number of fragments: 7 (8 if an offset is used)
Total size of graphics data: 44.352 kB
Total size of graphics data: 44'352 bytes
* x3: Display resolution: 132x75 (last row only has 2/3 pixels)
Fragment size: 16 rows (4224 bytes)
Fragment size: 16 rows (4224 bytes) # TODO: increase
Number of fragments: 5 (sometimes 6 if an offset is used)
Total size of graphics data: 19.800 kB
Total size of graphics data: 19'800 bytes
As one would know when playing modern video games, super-resolution is one
of the most useful ways to increase performance. The reduced amount of
@ -167,30 +167,50 @@ void azrp_config_scale(int scale);
@offset Fragment offset along the y-axis (0 ... height of fragment-1). */
void azrp_config_frag_offset(int offset);
//---
// Hooks
//---
/* Hook called before a fragment is sent to the display. The fragment can be
accessed and modified freely (however, the time spent in the hook is
counted as overhead and is only part of [azrp_perf_render]). */
typedef void azrp_hook_prefrag_t(int id, void *fragment, int size);
/* Get or set the prefrag hook. */
azrp_hook_prefrag_t *azrp_hook_get_prefrag(void);
void azrp_hook_set_prefrag(azrp_hook_prefrag_t *);
//---
// Standard shaders
//---
/* Clears the entire output with a single color */
/* Clears the entire output with a single color */
extern uint8_t AZRP_SHADER_CLEAR;
/* Renders RGB565 textures/images */
extern uint8_t AZRP_SHADER_IMAGE;
/* Renders gint images with various dynamic effects */
extern uint8_t AZRP_SHADER_IMAGE_RGB16;
extern uint8_t AZRP_SHADER_IMAGE_P8;
extern uint8_t AZRP_SHADER_IMAGE_P4;
/* azrp_clear(): Clear output [AZRP_SHADER_CLEAR] */
void azrp_clear(uint16_t color);
/* azrp_image(): Queue image command [AZRP_SHADER_IMAGE] */
/* azrp_image(): Queue image command [AZRP_SHADER_IMAGE_*] */
void azrp_image(int x, int y, bopti_image_t const *image);
/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_IMAGE] */
/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_IMAGE_*] */
void azrp_subimage(int x, int y, bopti_image_t const *image,
int left, int top, int width, int height, int flags);
/* See below for more detailed image functions. Dynamic effects are provided
with the same naming convention as gint. */
/* Functions to update uniforms for these shaders. You should call them when:
* AZRP_SHADER_CLEAR: Changing super-scaling settings.
* AZRP_SHADER_IMAGE: Changing super-scaling or or fragment offsets. */
* AZRP_SHADER_IMAGE_*: Changing super-scaling or fragment offsets. */
void azrp_shader_clear_configure(void);
void azrp_shader_image_configure(void);
void azrp_shader_image_rgb16_configure(void);
void azrp_shader_image_p8_configure(void);
void azrp_shader_image_p4_configure(void);
//---
// Performance indicators
@ -250,32 +270,79 @@ void azrp_set_uniforms(int shader_id, void *uniforms);
exceeded. */
bool azrp_queue_command(void *command, size_t size, int fragment, int count);
/* azrp_queue_image(): Split and queue a gint image command
The command must have been completely prepared with gint_image_mkcmd() and
have had its color effect sections filled. This function sets the shader ID
and adjusts the command for fragmented rendering. */
void azrp_queue_image(struct gint_image_box *box, image_t const *img,
struct gint_image_cmd *cmd);
//---
// Internal shader definitions (for reference; no API guarantee)
// Internal R61524 functions
//---
struct azrp_shader_image_command {
uint8_t shader_id;
/* First edge-preserved pixel offset (P4 only) */
int8_t edge1;
/* Pixels per line */
int16_t columns;
/* Address of the image structure */
bopti_image_t const *image;
/* Destination in XRAM (offset) */
uint16_t output;
/* Number of lines */
int16_t lines;
/* Already offset by start row and column */
void const *input;
void azrp_r61524_fragment_x1(void *fragment, int size);
/* Info for structure update between fragments: */
int16_t height;
int16_t row_stride;
int16_t x;
void azrp_r61524_fragment_x2(void *fragment, int width, int height);
/* Second edge-preserved pixel offset (P4 only) */
int16_t edge2;
};
//---
// Internal functions for the image shader
//
// We use gint's image rendering API but replace some of the core loops with
// Azur-specific versions that are faster in the CPU-bound context of this
// rendering engine. Some of the main loops from Azur actually perform better
// in RAM than bopti used to do, and are already in gint.
//---
/* azrp_image_effect(): Generalized azrp_image() with dynamic effects */
#define azrp_image_effect(x, y, img, eff, ...) \
azrp_image_effect(x, y, img, 0, 0, (img)->width, (img)->height, eff, \
##__VA_ARGS__)
/* azrp_subimage_effect(): Generalized azrp_subimage() with dynamic effects */
void azrp_subimage_effect(int x, int y, image_t const *img,
int left, int top, int w, int h, int effects, ...);
/* Specific versions for each format */
#define AZRP_IMAGE_SIG1(NAME, ...) \
void azrp_image_ ## NAME(int x, int y, image_t const *img,##__VA_ARGS__); \
void azrp_subimage_ ## NAME(int x, int y, image_t const *img, \
int left, int top, int w, int h, ##__VA_ARGS__);
#define AZRP_IMAGE_SIG(NAME, ...) \
AZRP_IMAGE_SIG1(rgb16 ## NAME, ##__VA_ARGS__) \
AZRP_IMAGE_SIG1(p8 ## NAME, ##__VA_ARGS__) \
AZRP_IMAGE_SIG1(p4 ## NAME, ##__VA_ARGS__)
AZRP_IMAGE_SIG(_effect, int effects, ...)
AZRP_IMAGE_SIG(, int effects)
AZRP_IMAGE_SIG(_clearbg, int effects, int bg_color_or_index)
AZRP_IMAGE_SIG(_swapcolor, int effects, int source, int replacement)
AZRP_IMAGE_SIG(_addbg, int effects, int bg_color)
AZRP_IMAGE_SIG(_dye, int effects, int dye_color)
#define azrp_image_rgb16_effect(x, y, img, eff, ...) \
azrp_subimage_rgb16_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
eff, ##__VA_ARGS__)
#define azrp_image_p8_effect(x, y, img, eff, ...) \
azrp_subimage_p8_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
eff, ##__VA_ARGS__)
#define azrp_image_p4_effect(x, y, img, eff, ...) \
azrp_subimage_p4_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
eff, ##__VA_ARGS__)
#undef AZRP_IMAGE_SIG
#undef AZRP_IMAGE_SIG1
/* Main loop provided by Azur; as usual, these are not real functions; their
only use is as the [.loop] field of a command. */
void azrp_image_shader_rgb16_normal(void);
void azrp_image_shader_rgb16_clearbg(void);
void azrp_image_shader_rgb16_swapcolor(void);
void azrp_image_shader_rgb16_dye(void);
void azrp_image_shader_p8_normal(void);
void azrp_image_shader_p8_swapcolor(void);
void azrp_image_shader_p4_normal(void);
void azrp_image_shader_p4_clearbg(void);
AZUR_END_DECLS

65
azur/src/gint/r61524.s Normal file
View File

@ -0,0 +1,65 @@
/* azrp_r61524_fragment_x1(): Push an x1-scale fragment to the display.
   r4: fragment data (XRAM), r5: fragment size in pixels (16-bit words).
   Streams the fragment to the R61524 data port with a DSP repeat loop
   (ldrs/ldre/ldrc), moving longwords (2 pixels per transfer).
   NOTE(review): r5 is halved into a longword count, so this assumes an
   even pixel count — confirm with callers (396 * rows is always even). */
.section .ilram, "ax"
.balign 4
.global _azrp_r61524_fragment_x1
_azrp_r61524_fragment_x1:
mov.l .R61524_DATA, r2
/* Halve the pixel count: each iteration copies one longword (2 pixels) */
shlr r5
ldrs 1f
ldre 2f
ldrc r5
nop
/* Read a word from XRAM */
1: mov.l @r4+, r0
/* Write that word to the display */
2: mov.l r0, @r2
rts
nop
/* azrp_r61524_fragment_x2(): Push an x2-upscaled fragment to the display.
   r4: fragment data, r5: fragment width in pixels, r6: height in rows.
   Each source pixel is written twice (horizontal doubling) and each source
   line is replayed once (vertical doubling), yielding the x2 upscale. */
.balign 4
.global _azrp_r61524_fragment_x2
_azrp_r61524_fragment_x2:
mov.l .R61524_DATA, r2
nop
/* Read a word, write it twice */
ldrs 1f
ldre 2f
ldrc r5
nop
1: mov.w @r4+, r0
nop
mov.w r0, @r2
nop
mov.w r0, @r2
2: nop
/* Rewind r4 by one source line (r5 words = 2*r5 bytes) to replay it */
sub r5, r4
sub r5, r4
/* Do that again on a second line */
ldrs 3f
ldre 4f
ldrc r5
nop
3: mov.w @r4+, r0
nop
mov.w r0, @r2
nop
mov.w r0, @r2
4: nop
/* One source row done (emitted twice); loop until all rows are sent */
dt r6
bf _azrp_r61524_fragment_x2
rts
nop
.balign 4
/* Address of the R61524 display driver's data write area */
.R61524_DATA:
.long 0xb4000000

View File

@ -7,11 +7,8 @@
#include <string.h>
#include <stdlib.h>
#define YRAM ((void *)0xe5017000)
/* 8 rows of video memory, occupying 6338/8192 bytes of XRAM.
TODO: Extend this to 16 rows, and move the rest to RAM */
GXRAM GALIGNED(32) uint16_t azrp_frag[DWIDTH * 8];
/* 16 rows of video memory, occupying 12736/16384 bytes of XYRAM (77.7%). */
uint16_t *azrp_frag = (void *)0xe500e000 + 32;
/* Super-scaling factor, width and height of output. */
int azrp_scale;
@ -22,27 +19,33 @@ int azrp_frag_count;
/* Height of fragment. */
int azrp_frag_height;
/* TODO: Either make command queue private or use azrp_ prefix */
/* Number and total size of queued commands. */
GXRAM int commands_count = 0, commands_length = 0;
static int commands_count=0, commands_length=0;
/* Array of pointers to queued commands (stored as an offset into YRAM). */
GXRAM uint32_t commands_array[AZRP_MAX_COMMANDS];
/* Array of pointers to queued commands. Each command has:
* Top 16 bits: fragment number
* Bottom 16 bits: offset into command data buffer
Rendering order is integer order. */
static uint32_t commands_array[AZRP_MAX_COMMANDS];
static GALIGNED(4) uint8_t commands_data[8192];
/* Array of shader programs and uniforms. */
GXRAM static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
GXRAM static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
/* Next free index in the shader program array. */
GXRAM static uint16_t shaders_next = 0;
static uint16_t shaders_next = 0;
/* Hooks. */
static azrp_hook_prefrag_t *azrp_hook_prefrag = NULL;
/* Performance counters. */
GXRAM prof_t azrp_perf_cmdgen;
GXRAM prof_t azrp_perf_sort;
GXRAM prof_t azrp_perf_shaders;
GXRAM prof_t azrp_perf_r61524;
GXRAM prof_t azrp_perf_render;
prof_t azrp_perf_cmdgen;
prof_t azrp_perf_sort;
prof_t azrp_perf_shaders;
prof_t azrp_perf_r61524;
prof_t azrp_perf_render;
//---
// High and low-level pipeline functions
@ -110,25 +113,23 @@ void azrp_render_fragments(void)
while(1) {
while(cmd < next_frag_threshold && i < commands_count) {
azrp_commands_total++;
uint8_t *data = (uint8_t *)YRAM + (cmd & 0xffff);
uint8_t *data = commands_data + (cmd & 0xffff);
prof_enter_norec(azrp_perf_shaders);
shaders[data[0]](shader_uniforms[data[0]], data, azrp_frag);
prof_leave_norec(azrp_perf_shaders);
if(data[0] == AZRP_SHADER_IMAGE) {
struct azrp_shader_image_command *cmd = (void *)data;
cmd->height -= cmd->lines;
cmd->input += cmd->row_stride * cmd->lines;
cmd->lines = min(cmd->height, azrp_frag_height);
cmd->output = 2 * cmd->x;
}
cmd = commands_array[++i];
}
/* TODO: Consider xram_frame() by DMA in parallel? */
if(azrp_hook_prefrag) {
int size = azrp_width * azrp_frag_height * 2;
(*azrp_hook_prefrag)(frag, azrp_frag, size);
}
prof_enter_norec(azrp_perf_r61524);
xram_frame(azrp_frag, 396 * 8);
if(azrp_scale == 1)
azrp_r61524_fragment_x1(azrp_frag, 396 * azrp_frag_height);
else if(azrp_scale == 2)
azrp_r61524_fragment_x2(azrp_frag, azrp_width, azrp_frag_height);
prof_leave_norec(azrp_perf_r61524);
if(++frag >= azrp_frag_count) break;
@ -149,10 +150,12 @@ void azrp_update(void)
// Configuration calls
//---
// TODO: Use larger fragments in upscales x2 and x3
static void update_frag_count(void)
{
if(azrp_scale == 1)
azrp_frag_count = 28 + (azrp_frag_offset > 0);
azrp_frag_count = 14 + (azrp_frag_offset > 0);
else if(azrp_scale == 2)
azrp_frag_count = 7 + (azrp_frag_offset > 0);
else if(azrp_scale == 3)
@ -162,7 +165,7 @@ static void update_frag_count(void)
static void update_size(void)
{
if(azrp_scale == 1)
azrp_width = 396, azrp_height = 198, azrp_frag_height = 8;
azrp_width = 396, azrp_height = 224, azrp_frag_height = 16;
else if(azrp_scale == 2)
azrp_width = 198, azrp_height = 112, azrp_frag_height = 16;
else if(azrp_scale == 3)
@ -194,6 +197,20 @@ static void default_settings(void)
azrp_config_scale(1);
}
//---
// Hooks
//---
azrp_hook_prefrag_t *azrp_hook_get_prefrag(void)
{
return azrp_hook_prefrag;
}
void azrp_hook_set_prefrag(azrp_hook_prefrag_t *hook)
{
azrp_hook_prefrag = hook;
}
//---
// Custom shaders
//---
@ -226,7 +243,7 @@ bool azrp_queue_command(void *command, size_t size, int fragment, int count)
if(commands_length + size >= 8192)
return false;
uint8_t *dst = YRAM + commands_length;
uint8_t *dst = commands_data + commands_length;
uint8_t *src = command;
for(size_t i = 0; i < size; i++)

View File

@ -1,727 +0,0 @@
/* Azur's built-in shaders: <image>
If there ever was a fantastic piece of assembler engineering in my work up
to this point, this would be it. Every trick in the book is used here, from
clever instruction combinations, pipeline flow and tricky DSP abuse all the
way up to memory layout planning, transforms on loop structures, and most
critically superscalar parallelism.
While the performance of the shader is not *strictly* proportional to the
speed of the tightest loop, it's very close. The use of operand-bus XRAM for
graphics data, systematic alignment, and detailed pipeline stalling
measurements for common instruction sequences in gintctl allow very accurate
speed predictions to be made based on the tightness of the code.
The palette formats of bopti have been refined for the purpose of this
shader, with P8 being split into P8_RGB565A and P8_RGB565 with big changes,
and P4 being renamed P4_RGB565A with minimal changes along with a variation
aptly named P4_RGB565.
The asymptotic performance for each format is as follows:
* RGB565: 1 cycle/pixel if source and destination align
2 cycles/pixel otherwise
* RGB565A: 4 cycles/pixel
* P8_RGB565A: 4.5 cycles/pixel
* P8_RGB565: 3 cycles/pixel
* P4_RGB565A: 5 cycles/pixel
* P4_RGB565: 3.5 cycles/pixel
Entirely documenting this code would take me hours, but some elements are
provided in the comments. Superscalar parallelism is most easily appreciated
by reading the two-page section 4.2 of the SH4AL-DSP manual. The other main
structural technique at play in this code is loop transforms.
Basically, a loop that loads a pixel, performs computations with it, and
writes the result is inefficient because of the RAW dependencies on most
operations (with full stall cycles between loads and computations, and
between computations and uses as addresses). Well-established loop
optimization literature has lots of techniques to help with this problem,
and I use two here:
* _Pipelining_ the loop consists in handling a single pixel over several
iterations by doing a little bit of work in each iteration. The data for
the pixel would move from register to register at each iteration, with the
loop code doing one stage's worth of computation on each register. (You
can view it as a diagonal iteration pattern in the pixel*instruction grid
if you like such visualizations.)
By increasing the number of pixels in the pipeline, a lot of independent
data can be obtained, reducing dependency pressure and allowing for
greater parallelism at the cost of more registers being used.
The use of pipelining in this shader is very modest, with 2 stages at
most, and usually only a couple of instructions being performed in advance
for the next pixel while the current one finishes processing. Register
assignments have some subtleties though since pressure is high overall.
* _Unrolling_ iterations of the loop consists in loading two (or more)
pixels at the start of each iteration so that we can work on one while
waiting for stalls and dependencies on the other.
Unlike pipelining, a loop iteration starts and ends with full pixels and
no work carries between iterations. Unrolling allows different pixels to
use different registers and generally better optimize the instruction
sequence, at the cost of only supporting pixel counts that are multiples of
the unrolling level.
Handling non-multiple sizes is the everlasting bane of unrolled loops,
sometimes requiring duplicate code. Smart maneuvers are used in P8 and P4
to only handle even sizes and neutralize unwanted pixels after the fact.
Both techniques are used simultaneously, with 2-unrolled 2-stage loops for
almost all formats (except RGB565A which performs DSP trickery).
*/
.global _azrp_shader_image
.align 4
/* Register assignment
r0: (temporary)
r1: Lines
r2: Command queue; (temporary)
r3: Input
r4: [parameter] azrp_width*2; output stride
r5: [parameter] Command queue; Output
r6: [parameter] azrp_frag; alpha value; (temporary)
r7: Columns
r8: Image pointer; (temporary)
r9: Input stride */
_azrp_shader_image:
mov.l r8, @-r15
add #2, r5
mov.l r9, @-r15
mov r5, r2
mov.w @r2+, r7 /* command.columns */
mov.l @r2+, r8 /* command.image */
mov.w @r2+, r5 /* command.output (offset) */
sub r7, r4
mov.w @r8+, r9 /* image.profile */
sub r7, r4
mov.w @r2+, r1 /* command.lines */
add r6, r5
mov.l @r2+, r3 /* command.input (pointer) */
shll2 r9
mova .formats, r0
mov.w @r8+, r6 /* image.alpha */
mov.l @(r0,r9), r0
mov.w @r8+, r9 /* image.width */
jmp @r0
nop
.align 4
.formats:
.long _RGB565
.long _RGB565A
.long _NOP /* P8 */
.long _P4_RGB565A /* =P4 */
.long _P8_RGB565
.long _P8_RGB565A
.long _P4_RGB565
/* [Loop macros]
The following macros implement the main loop of the image renderer.
* Each line is rendered in the tight loop between 2: and 3: (both included).
* r5 is the output (with stride r4, in bytes)
* r3 is the input (with stride r9, in bytes)
* There are r1 rows with r7 iterations each */
#define START() \
nop; /* 4-alignment */ \
ldrs 2f; \
ldre 3f; \
1: ldrc r7
#define END_NORET() \
dt r1; \
add r4, r5; \
bf.s 1b; \
add r9, r3
#define END() \
END_NORET(); \
mov.l @r15+, r9; \
rts; \
mov.l @r15+, r8
/* [Rendering strategy for the RGB565 format]
In RGB565, all pixels are copied verbatim. This is a 2D memcpy, which we can
optimize by moving longwords. Since longwords are pairs of pixels, there are
variations and subcases based on the parity of each parameter:
* w[eo] denotes whether the width of the image is even or odd;
* d[eo] denotes whether the memory accesses to the source and destination
are even (4-aligned) or odd (2-aligned).
When the destination and source have identical parity, the d[eo] variation
can be defined. In this case the copy is pretty direct, it's a longword copy
and it takes 2 cycles to copy 4 bytes, plus some extra at the edges if the
start or end address is 2-aligned.
However, when they have opposite parity, each longword read matches up with
a 2-aligned write (or vice-versa). Rearranging words with arithmetic does
not help because of the stall cycle between loading a register and using it
in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as
the word-based copy). Unrolling iterations could help but would be too
complex here (adding sub-cases); a super-heavy renderer with more hypotheses
(like a tileset shader) should aim for that route though. Also, movua.l
followed by mov.l is even slower (5 cycles). */
/* RGB565 renderer: a 2D memcpy specialized by width parity (w[eo]) and
   source/destination access parity (d[eo]); see strategy comment above.
   Registers per the assignment table at the top of the file: r3 input,
   r5 output, r7 columns, r9 input stride (converted to bytes here). */
.align 4
_RGB565:
mov #8, r0 /* Maximum width for naive method */
sub r7, r9
cmp/ge r7, r0
shll r9
bt.s _RGB565.naive
mov #2, r0
/* Use naive method for opposite source/destination parity */
mov r5, r6
xor r3, r6
tst r0, r6
bf _RGB565.naive
/* T after shlr = width parity; dispatch on even/odd width */
shlr r7
bt _RGB565.wo
_RGB565.we:
tst r0, r5
bf _RGB565.we_do
/* This is 4-aligned */
_RGB565.we_de:
START()
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END()
.align 4
/* Even width, 2-aligned accesses: word copy at each edge, longwords inside */
_RGB565.we_do:
add #-1, r7
START()
movs.w @r3+, x0
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r5+
movs.w @r3+, x0
movs.w x0, @r5+
END()
.align 4
_RGB565.wo:
tst r0, r5
bf _RGB565.wo_do
/* Odd width, 4-aligned: longwords plus one trailing word per line */
_RGB565.wo_de:
START()
2: movs.l @r3+, x0
3: movs.l x0, @r5+
movs.w @r3+, x0
movs.w x0, @r5+
END()
.align 4
/* Odd width, 2-aligned: one leading word, then longwords */
_RGB565.wo_do:
START()
movs.w @r3+, x0
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END()
/* Naive method for small widths and opposite source/destination parity */
.align 4
_RGB565.naive:
START()
2: movs.w @r3+, x0
3: movs.w x0, @r5+
END()
/* [Rendering strategy for the RGB565A format]
Since we have to check for the alpha value in each pixel, there's really no
longword-based optimization. Instead, we just go as fast as possible with
each pixel, using DSP instructions because conditional execution is pretty
damn good. This takes 4 cycles/pixel. I tried a number of reductions to
3 cycles/pixel but could not get any of them to work. */
/* RGB565A renderer: per-pixel alpha test using DSP conditional execution;
   4 cycles/pixel (see strategy comment above).
   r6 holds the alpha (transparent) color value on entry. */
.align 4
_RGB565A:
/* Move alpha to the high word to match movs.w loading into the top of x0 */
shll16 r6
mov #0x0004, r0 /* DC Zero mode */
sub r7, r9
shll r9
lds r6, y0
lds r0, dsr
START()
/* Load source pixel; compare against alpha while fetching the current
   destination pixel; if equal (transparent), keep the destination */
2: movs.w @r3+, x0
pcmp x0, y0 movx.w @r5, x1
dct pcopy x1, x0
3: movx.w x0, @r5+
END()
/* [Rendering strategy for the P8_RGB565A format]
The work needed for each pixel gets more difficult as we go, with alpha
being the major culprit due to its additional comparisons, jumps, and
limited optimization opportunities when unrolling due to conditionally-
executed code.
Because arithmetic is unavoidable and there are 1-cycle delays between both
loading-arithmetic, and arithmetic-indexing pairs, the loop has 2-unrolled
iterations with a 2-stage pipeline structure. This fills the stall cycles
and increases parallelism significantly. Pure loop optimization handbook.
Dealing with odd widths is a major pain as usual. Instead of adding logic to
handle the extra pixel separately, this routine lets the loop overwrite it,
then restores its original value afterwards - a delightfully elegant trick.
The P8 format is actually so bad that spending precious time grinding cycles
felt completely inappropriate without first refining it. This led to two new
variations, P8_RGB565 and P8_RGB565A, which fix the following problems.
-> First there is alpha for all images, which is the most costly feature,
single-handedly accounting for half of the work per pixel. P8_RGB565
does not support alpha, which basically doubles performance.
-> Then, there is the alpha value itself. In P8 it is a variable (and fxconv
sets it to 0xff), which burns a register for the comparison and enforces
a fixed order between comparison and left-shift. P8_RGB565A always sets
an alpha value of 0x00 which lifts both constraints.
-> Then, there are palette indices. In P8 they are unsigned, which requires
an extu.b. In P8_RGB565 and P8_RGB565A they are signed, so the sign-
extended value of the mov.b can be used directly (once doubled). The
palette base is simply offset by 128 entries, with colors numbered
-128..-1 first and only then 0..127.
-> Finally, there's the palette itself. In P8 it always has 256 entries,
even when only a few are used. For small images this is a huge waste, so
P8_RGB565 and P8_RGB565A only store colors that are actually used.
P8_RGB565A achieves 4.5 cycles/pixel asymptotically, which is really good
compared to 4 cycles/pixel for RGB565A. */
.align 4
_P8_RGB565A:
mov.l r13, @-r15
sub r7, r9
mov r7, r13
add #-2, r9 /* Input stride compensation for pipelining */
mov.l r12, @-r15
shlr r7
mov.l r10, @-r15
movt r6
mov.w _P8_RGB565A.palette_distance, r0
shll r13
add r6, r7
sub r6, r9
sub r6, r4
sub r6, r4
add r0, r8
add r5, r13
mov r7, r2
add #-4, r5 /* Output offset compensation in the loop */
shll2 r2
add r4, r2
START()
mov.b @r3+, r6
/* Save next pixel for the odd-width case */
mov.w @r13, r12
mov.b @r3+, r10
tst r6, r6
/* 2-unrolled 2-stage main loop */
2: add r6, r6
mov r6, r0
add r10, r10
bt.s 5f
tst r10, r10
mov.w @(r0,r8), r0
mov.w r0, @(4,r5)
5: mov.b @r3+, r6
mov r10, r0
bt.s 6f
add #4, r5
mov.w @(r0,r8), r0
mov.w r0, @(2,r5)
6: mov.b @r3+, r10
3: tst r6, r6
/* Restore last pixel */
mov.w r12, @r13
add r2, r13
END_NORET()
mov.l @r15+, r10
mov.l @r15+, r12
mov.l @r15+, r13
mov.l @r15+, r9
rts
mov.l @r15+, r8
_P8_RGB565A.palette_distance:
/* Distance between image pointer and palette array base */
.word 260
/* [Rendering strategy for the P8_RGB565 format]
See P8_RGB565A for format details. Removing the checks for transparency and
the jumps simplifies the instruction sequence and allows superior
parallelism because all paths are unconditional. This routines achieves
3 cycles/pixel asymptotically. */
.align 4
_P8_RGB565:
mov.l r13, @-r15
sub r7, r9
mov r7, r13
add #-2, r9 /* Input stride compensation for pipelining */
mov.l r12, @-r15
shlr r7
mov.l r10, @-r15
movt r6
mov.w _P8_RGB565.palette_distance, r0
shll r13
add r6, r7
sub r6, r9
sub r6, r4
sub r6, r4
add r0, r8
add r5, r13
add #-4, r5 /* Output offset compensation in the loop */
mov r7, r2
shll2 r2
add r4, r2
START()
mov.b @r3+, r0
/* Save next pixel for the odd-width case */
mov.w @r13, r12
mov.b @r3+, r10
shll r0
/* 2-unrolled 2-stage main loop */
2: mov.b @r3+, r6
shll r10
mov.w @(r0,r8), r0
/* This nop is not for show, it actually prevents the loop from slowing
down to 7 cycles /i, probably due to instruction reads alignment. */
nop
mov.w r0, @(4,r5)
mov r10, r0
mov.b @r3+, r10
add #4, r5
mov.w @(r0,r8), r0
shll r6
mov.w r0, @(2,r5)
3: mov r6, r0
/* Restore last pixel */
mov.w r12, @r13
add r2, r13
END_NORET()
mov.l @r15+, r10
mov.l @r15+, r12
mov.l @r15+, r13
mov.l @r15+, r9
rts
mov.l @r15+, r8
_P8_RGB565.palette_distance:
/* Distance between image pointer and palette array base */
.word 260
/* [Rendering strategy for the P4_RGB565A format]
This is the most complex format. Most of the remarks that apply to
P8_RGB565A also apply here, except that there are less opportunities to save
computation because nibbles must be extracted anyway.
The P4_RGB565A format is simply bopti's P4, but an additional variation
P4_RGB565 is specified to save on transparency handling, which is very
expensive.
The special nature of the nibble packing means the simplest loop form writes
2 pixels from a 2-aligned source image position in a single iteration. Other
structures don't even come close: selecting nibbles individually is folly,
while not unrolling is inefficient. So the whole point of this routine is to
forcibly align the subimage on a byte-aligned and never break that grid.
The command builder for P4 does this alignment before submitting the
command. Obviously the transform can cause one extra pixel to be overridden
on each side of every line. The command is thus extended with two edge
offsets indicating pixels to preserve at each end. When overwrites occurs,
the edge offsets point to the overwritten pixels so they can be restored.
Otherwise, they point to the next pixels and the restores are no-ops. See
the strategy used for managing unrolling in P8 formats for details.
The only irregularity is image width, which the command builder cannot
modify. It is rounded up to the next multiple of 2, then halved. There is a
nice trick for this operation, which is [shlr rX] then adding T to rX. We
also need to add -1 for another adjustment, and both are combined into an
addc, which saves one add and one movt off the EX critical chain.
The main loop achieves 5 cycles/pixel. */
.align 4
_P4_RGB565A:
shlr r7
mov.w @(6, r2), r0 /* command.edge2 */
mov.l r12, @-r15
add #-15, r2 /* Go back to start of command */
mov #-1, r12
shlr r9
mov.l r11, @-r15
addc r12, r9
mov r0, r12
add r12, r12
mov.l r10, @-r15
sub r7, r9
mov.b @r2, r11 /* command.edge1 */
add #2, r8 /* image.palette */
mov.l r13, @-r15
mov r5, r0
mov.l r14, @-r15
shll r11
add #-4, r5
nop /* 4-alignment */
START()
mov.b @r3+, r6
mov r0, r10
mov.w @(r0,r11), r13
mov.w @(r0,r12), r14
shll r6
/* Main loop with 2 pixels sharing a single byte */
2: mov r6, r0
and #0x1e, r0
tst r0, r0
bt.s 4f
shlr2 r6
mov.w @(r0,r8), r0
mov.w r0, @(6,r5)
4: shlr2 r6
mov r6, r0
and #0x1e, r0
tst r0, r0
mov.b @r3+, r6
bt.s 5f
add #4, r5
mov.w @(r0,r8), r0
mov.w r0, @r5
3: 5: shll r6
mov r10, r0
mov r7, r10
shll2 r10
mov.w r13, @(r0,r11)
add r4, r10
mov.w r14, @(r0,r12)
add r0, r10
mov r10, r0
/* Parallelizes with [dt r1] expanded from END_NORET() */
END_NORET()
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r10
mov.l @r15+, r11
mov.l @r15+, r12
mov.l @r15+, r9
rts
mov.l @r15+, r8
/* [Rendering strategy for the P4_RGB565 format]
Same as P4_RGB565A without transparency checks (fairly straightforward). The
core loop runs in 3.5 cycles/pixel. */
.align 4
_P4_RGB565:
shlr r7
mov.w @(6, r2), r0 /* command.edge2 */
mov.l r10, @-r15
add #-15, r2 /* Go back to start of command */
mov.l r12, @-r15
shlr r9
add #2, r8 /* image.palette */
mov #-1, r12
mov.l r11, @-r15
addc r12, r9
mov r0, r12
add r12, r12
mov.b @r2, r11 /* command.edge1 */
sub r7, r9
mov.l r13, @-r15
mov #0x1e, r2
mov.l r14, @-r15
shll r11
mov r5, r0
add #-4, r5
START()
mov.b @r3+, r6
mov #-4, r10
mov.l r0, @-r15
mov.w @(r0,r11), r13
mov.w @(r0,r12), r14
shll r6
/* Main loop with 2 pixels sharing a single byte */
2: mov r6, r0
and #0x1e, r0
shld r10, r6
mov.w @(r0,r8), r0
and r2, r6
mov.w r0, @(6,r5)
mov r6, r0
mov.b @r3+, r6
add #4, r5
mov.w @(r0,r8), r0
mov.w r0, @r5
3: shll r6
mov.l @r15+, r0
mov r7, r10
shll2 r10
mov.w r13, @(r0,r11)
add r4, r10
mov.w r14, @(r0,r12)
add r0, r10
mov r10, r0
/* Parallelizes with [dt r1] expanded from END_NORET() */
END_NORET()
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r11
mov.l @r15+, r12
mov.l @r15+, r10
mov.l @r15+, r9
rts
mov.l @r15+, r8
/* [Unsupported formats]
P8 is unsupported, use P8_RGB565 and P8_RGB565A. */
_NOP:
/* Epilogue only: restore r9/r8 saved by the shader entry point, render
   nothing for this command */
mov.l @r15+, r9
rts
mov.l @r15+, r8

View File

@ -1,88 +1,45 @@
#include <azur/gint/render.h>
#include <gint/defs/util.h>
uint8_t AZRP_SHADER_IMAGE = -1;
__attribute__((constructor))
static void register_shader(void)
void azrp_queue_image(struct gint_image_box *box, image_t const *img,
struct gint_image_cmd *cmd)
{
extern azrp_shader_t azrp_shader_image;
AZRP_SHADER_IMAGE = azrp_register_shader(azrp_shader_image);
}
void azrp_shader_image_configure(void)
{
azrp_set_uniforms(AZRP_SHADER_IMAGE, (void *)(2 * azrp_width));
}
//---
/* Profile IDs */
#define RGB565 0
#define RGB565A 1
#define P4_RGB565A 3
#define P8_RGB565 4
#define P8_RGB565A 5
#define P4_RGB565 6
void azrp_image(int x, int y, bopti_image_t const *image)
{
azrp_subimage(x, y, image, 0, 0, image->width, image->height, 0);
}
void azrp_subimage(int x, int y, bopti_image_t const *image,
int left, int top, int width, int height, int flags)
{
prof_enter(azrp_perf_cmdgen);
if(!(flags & DIMAGE_NOCLIP)) {
/* TODO: image: clip function */
}
struct azrp_shader_image_command cmd;
cmd.shader_id = AZRP_SHADER_IMAGE;
cmd.columns = width;
cmd.image = image;
int row_stride;
if(image->profile == P8_RGB565 || image->profile == P8_RGB565A) {
row_stride = image->width;
cmd.input = (void *)image->data + (image->data[0] * 2) + 2 +
top * row_stride + left;
}
else if(image->profile == P4_RGB565 || image->profile == P4_RGB565A) {
row_stride = (image->width + 1) >> 1;
cmd.input = (void *)image->data + 32 + top * row_stride + (left >> 1);
int odd_left = left & 1;
int odd_right = (left + width) & 1;
cmd.edge1 = -1 + odd_left;
cmd.edge2 = width + odd_left;
cmd.columns += odd_left + odd_right;
x -= odd_left;
}
else {
row_stride = image->width << 1;
cmd.input = (void *)image->data + top * row_stride + (left << 1);
}
/* TODO: Ironically, this loads all 3 entry points */
int p = img->profile;
if(p == IMAGE_RGB565 || p == IMAGE_RGB565A)
cmd->shader_id = AZRP_SHADER_IMAGE_RGB16;
else if(p == IMAGE_P8_RGB565 || p == IMAGE_P8_RGB565A)
cmd->shader_id = AZRP_SHADER_IMAGE_P8;
else
cmd->shader_id = AZRP_SHADER_IMAGE_P4;
/* This divides by azrp_frag_height */
int fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
/* TODO: Have a proper way to do optimized-division by azrp_frag_height */
int fragment_id = (azrp_scale == 1) ? (box->y >> 4) : (box->y >> 4);
/* These settings only apply to the first fragment */
int first_y = (y + azrp_frag_offset) & (azrp_frag_height - 1);
cmd.lines = azrp_frag_height - first_y;
cmd.output = 2 * (azrp_width * first_y + x);
int first_y = (box->y + azrp_frag_offset) & (azrp_frag_height - 1);
cmd->lines = min(box->h, azrp_frag_height - first_y);
cmd->output = (void *)azrp_frag + (azrp_width * first_y + cmd->x) * 2;
/* Settings for further updates */
cmd.height = height;
cmd.row_stride = row_stride;
cmd.x = x;
int n = 1 + (height - cmd.lines + azrp_frag_height - 1) / azrp_frag_height;
azrp_queue_command(&cmd, sizeof cmd, fragment_id, n);
prof_leave(azrp_perf_cmdgen);
int n = 1 + (box->h - cmd->lines + azrp_frag_height-1) / azrp_frag_height;
azrp_queue_command(cmd, sizeof *cmd, fragment_id, n);
}
/* Render a sub-rectangle of an image. Dispatches on the image's pixel format
   to the matching format-specific renderer; unsupported profiles are
   silently ignored. */
void azrp_subimage(int x, int y, image_t const *img,
    int left, int top, int width, int height, int flags)
{
    switch(img->profile) {
    case IMAGE_RGB565:
    case IMAGE_RGB565A:
        azrp_subimage_rgb16(x, y, img, left, top, width, height, flags);
        break;
    case IMAGE_P8_RGB565:
    case IMAGE_P8_RGB565A:
        azrp_subimage_p8(x, y, img, left, top, width, height, flags);
        break;
    case IMAGE_P4_RGB565:
    case IMAGE_P4_RGB565A:
        azrp_subimage_p4(x, y, img, left, top, width, height, flags);
        break;
    }
}

/* Render a full image: the sub-image covering everything, with no effects. */
void azrp_image(int x, int y, image_t const *img)
{
    int w = img->width, h = img->height;
    azrp_subimage(x, y, img, 0, 0, w, h, 0);
}

View File

@ -0,0 +1,37 @@
/* mov.wv: Move at a variable offset. This macro is functionally identical to
mov.w \SRC, @(\OFF, \DST)
except that when OFF=0 it simplifies into [mov.w \SRC, @\DST] so that SRC is
not constrained to be r0. */
.macro mov.wv SRC, OFF, DST
.if (\OFF == 0)
mov.w \SRC, @\DST
.else
/* Displacement form: the ISA constrains \SRC to be r0 here */
mov.w \SRC, @(\OFF, \DST)
.endif
.endm
/* START: Sets up the inner and outer loop. The outer loop is anything between
the calls to macros START and END, while the inner loop is the code between
labels 2: and 3: (both *INCLUDED*).
NOTE(review): ldrs/ldre/ldrc are SH DSP hardware repeat-loop instructions:
they set the repeat start/end addresses (labels 2:/3:) and load the repeat
counter from r2 — assumes DSP mode is enabled by the renderer; confirm. */
.macro START
ldrs 2f
ldre 3f
1: ldrc r2
nop
.endm
/* END: Finishes the outer loop and adds strides.
r1 is the outer (row) counter; r4 and r6 are the input and output strides,
added to the input pointer r3 and output pointer r5 after each row. */
.macro END
dt r1
add r4, r3
bf.s 1b
add r6, r5
.endm
/* EPILOGUE: Finishes the call by reloading registers saved in the prologue.
Returns the advanced input pointer (r3) in r0; the C-side fragment shaders
store it back into cmd->input to resume on the next fragment. */
.macro EPILOGUE
mov.l @r15+, r9
mov r3, r0
rts
mov.l @r15+, r8
.endm

View File

@ -0,0 +1,70 @@
#include <azur/gint/render.h>
#include <gint/defs/util.h>
uint8_t AZRP_SHADER_IMAGE_P4 = -1;

/* Fragment shader: render the queued P4 command into the current fragment,
   then update the command in-place so rendering resumes on the next one. */
static void shader_p4(void *uniforms, void *command, void *fragment)
{
    struct gint_image_cmd *cmd = command;

    /* The assembly loop returns the input pointer for the next fragment */
    cmd->input = gint_image_p4_loop((int)uniforms, cmd);

    /* Account for the rows just rendered, then set up the next fragment */
    int remaining = cmd->height - cmd->lines;
    cmd->height = remaining;
    cmd->lines = min(remaining, azrp_frag_height);
    cmd->output = fragment + cmd->x * 2;
}

__attribute__((constructor))
static void register_shader(void)
{
    /* Registered at startup so the shader ID behaves as a runtime constant */
    AZRP_SHADER_IMAGE_P4 = azrp_register_shader(shader_p4);
}

void azrp_shader_image_p4_configure(void)
{
    /* Single uniform: the width of the rendering surface, in pixels */
    azrp_set_uniforms(AZRP_SHADER_IMAGE_P4, (void *)azrp_width);
}

void azrp_image_p4(int x, int y, image_t const *img, int eff)
{
    int w = img->width, h = img->height;
    azrp_subimage_p4(x, y, img, 0, 0, w, h, eff);
}

void azrp_subimage_p4(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff)
{
    /* Transparent P4 images go through the CLEARBG path, clearing the
       image's own alpha value */
    if(img->profile == IMAGE_P4_RGB565A) {
        azrp_subimage_p4_clearbg(x, y, img, left, top, w, h, eff,
            img->alpha);
        return;
    }

    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.loop = azrp_image_shader_p4_normal;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

void azrp_image_p4_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
    int w = img->width, h = img->height;
    azrp_subimage_p4_clearbg(x, y, img, 0, 0, w, h, eff, bg);
}

void azrp_subimage_p4_clearbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = bg_color;
        cmd.effect += 4;
        cmd.loop = gint_image_p4_clearbg_alt;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,26 @@
#include <azur/gint/render.h>
/* DYE: draw the image with every opaque pixel replaced by dye_color. */
void azrp_image_p4_dye(int x, int y, image_t const *img, int eff,
    int dye_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p4_dye(x, y, img, 0, 0, w, h, eff, dye_color);
}

void azrp_subimage_p4_dye(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int dye_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha; /* transparent index to leave untouched */
        cmd.color_2 = dye_color;
        cmd.effect += 4;
        cmd.loop = gint_image_p4_dye;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,31 @@
#include <azur/gint/render.h>
/* Variadic effect dispatcher: decodes the effect-specific extra arguments
   and forwards to the matching specialized P4 renderer. The order of the
   checks fixes the priority when several effect bits are set at once. */
void azrp_subimage_p4_effect(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, ...)
{
    va_list va;
    va_start(va, eff);

    if(eff & IMAGE_CLEARBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_p4_clearbg(x, y, img, left, top, w, h, eff, bg_color);
    }
    else if(eff & IMAGE_SWAPCOLOR) {
        int old_color = va_arg(va, int);
        int new_color = va_arg(va, int);
        azrp_subimage_p4_swapcolor(x, y, img, left, top, w, h, eff,
            old_color, new_color);
    }
    else if(eff & IMAGE_ADDBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_p4_addbg(x, y, img, left, top, w, h, eff, bg_color);
    }
    else if(eff & IMAGE_DYE) {
        int dye_color = va_arg(va, int);
        azrp_subimage_p4_dye(x, y, img, left, top, w, h, eff, dye_color);
    }
    else {
        azrp_subimage_p4(x, y, img, left, top, w, h, eff);
    }

    va_end(va);
}

View File

@ -0,0 +1,119 @@
.global _azrp_image_shader_p4_normal
#include "image_macros.S"
/* P4 Opaque rendering, Azur version: trivial with loop transforms.
This is a pretty direct loop with no difficult tricks involved; it expands
on P8 by adding another edge pointer. The main change is the decoding logic
which now only involves a single byte to load for every two pixels, but more
arithmetic to extract the nibbles.
All the loops in Azur's P4 functions are obvious EX chains and thus any
optimization would need to simplify the arithmetic to gain any half-cycles.
r0: [temporary]
r7: Right edge pointer
r8: Right edge value
r9: Palette
r10: Left edge pointer
r11: Left edge value
r12: Edge stride
r13: [temporary]
r14: [temporary] */
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
shlr r2
nop
add r10, r10
nop
mov.l @r8+, r9 /* cmd.palette */
mov r2, r0
mov.w @r8+, r7 /* cmd.edge_2 */
shll2 r0
mov.l r12, @-r15
shll r7
mov.l r11, @-r15
add r5, r7
mov r0, r12
add r6, r12
mov.l r13, @-r15
add r5, r10
mov.l r14, @-r15
add #-4, r5
add #-1, r4 /* Input stride compensation for pipelining */
nop
.if \HFLIP
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
START
mov.b @r3+, \TMP1
mov #-4, \TMP2
mov.w @r7, r8 /* Save right edge */
nop
mov.w @r10, r11 /* Save left edge */
shll \TMP1
/* Inner loop: one source byte decoded into two palette lookups; the nibble
extraction (and #0x1e) directly produces a word offset into the palette */
2: mov \TMP1, r0
and #0x1e, r0
shld \TMP2, \TMP1
mov #0x1e, \TMP2
mov.w @(r0,r9), r0
and \TMP2, \TMP1
mov.w r0, @(\OFF1,r5)
mov \TMP1, r0
mov.b @r3+, \TMP1
add #\OUT_DIR, r5
mov.w @(r0,r9), r0
mov #-4, \TMP2
mov.w r0, @(\OFF2,r5)
3: shll \TMP1
mov.w r8, @r7 /* Restore right edge */
add r12, r7
mov.w r11, @r10 /* Restore left edge */
add r12, r10
END
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r11
mov.l @r15+, r12
mov.l @r15+, r10
EPILOGUE
.endm
/* Entry point: bit 0 of r0 selects between the straight and HFLIP variants
of the loop (see the macro parameters above) */
_azrp_image_shader_p4_normal:
tst #1, r0
bf 9f
GEN_NORMAL_LOOP 0, 4, r13, r14, 6, 0
9: GEN_NORMAL_LOOP 1, -4, r13, r14, 0, 6

View File

@ -0,0 +1,51 @@
#include <azur/gint/render.h>
/* SWAPCOLOR: draw the image with palette index old_color replaced by
   new_color. */
void azrp_image_p4_swapcolor(int x, int y, image_t const *img, int eff,
    int old_color, int new_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p4_swapcolor(x, y, img, 0, 0, w, h, eff, old_color,
        new_color);
}

void azrp_subimage_p4_swapcolor(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int old_index, int new_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = old_index;
        cmd.color_2 = new_color;
        cmd.effect += 8;
        cmd.loop = gint_image_p4_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

/* ADDBG reuses SWAPCOLOR: substituting the transparent index with the
   background color is exactly a palette-index swap. */
void azrp_image_p4_addbg(int x, int y, image_t const *img, int eff,
    int bg_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p4_addbg(x, y, img, 0, 0, w, h, eff, bg_color);
}

void azrp_subimage_p4_addbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, true, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha;
        cmd.color_2 = bg_color;
        cmd.effect += 8;
        cmd.loop = gint_image_p4_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,71 @@
#include <azur/gint/render.h>
#include <gint/defs/util.h>
uint8_t AZRP_SHADER_IMAGE_P8 = -1;

/* Fragment shader: render the queued P8 command into the current fragment,
   then update the command in-place so rendering resumes on the next one. */
static void shader_p8(void *uniforms, void *command, void *fragment)
{
    struct gint_image_cmd *cmd = command;

    /* The assembly loop returns the input pointer for the next fragment */
    cmd->input = gint_image_p8_loop((int)uniforms, cmd);

    /* Account for the rows just rendered, then set up the next fragment */
    int remaining = cmd->height - cmd->lines;
    cmd->height = remaining;
    cmd->lines = min(remaining, azrp_frag_height);
    cmd->output = fragment + cmd->x * 2;
}

__attribute__((constructor))
static void register_shader(void)
{
    /* Registered at startup so the shader ID behaves as a runtime constant */
    AZRP_SHADER_IMAGE_P8 = azrp_register_shader(shader_p8);
}

void azrp_shader_image_p8_configure(void)
{
    /* Single uniform: the width of the rendering surface, in pixels */
    azrp_set_uniforms(AZRP_SHADER_IMAGE_P8, (void *)azrp_width);
}

void azrp_image_p8(int x, int y, image_t const *img, int eff)
{
    int w = img->width, h = img->height;
    azrp_subimage_p8(x, y, img, 0, 0, w, h, eff);
}

void azrp_subimage_p8(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff)
{
    /* Transparent P8 images go through the CLEARBG path, clearing the
       image's own alpha value */
    if(img->profile == IMAGE_P8_RGB565A) {
        azrp_subimage_p8_clearbg(x, y, img, left, top, w, h, eff,
            img->alpha);
        return;
    }

    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.loop = azrp_image_shader_p8_normal;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

void azrp_image_p8_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
    int w = img->width, h = img->height;
    azrp_subimage_p8_clearbg(x, y, img, 0, 0, w, h, eff, bg);
}

void azrp_subimage_p8_clearbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = bg_color;
        cmd.effect += 4;
        cmd.loop = gint_image_p8_clearbg;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,26 @@
#include <azur/gint/render.h>
/* DYE: draw the image with every opaque pixel replaced by dye_color. */
void azrp_image_p8_dye(int x, int y, image_t const *img, int eff,
    int dye_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p8_dye(x, y, img, 0, 0, w, h, eff, dye_color);
}

void azrp_subimage_p8_dye(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int dye_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha; /* transparent index to leave untouched */
        cmd.color_2 = dye_color;
        cmd.effect += 4;
        cmd.loop = gint_image_p8_dye;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,31 @@
#include <azur/gint/render.h>
/* Variadic effect dispatcher: decodes the effect-specific extra arguments
   and forwards to the matching specialized P8 renderer. The order of the
   checks fixes the priority when several effect bits are set at once. */
void azrp_subimage_p8_effect(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, ...)
{
    va_list va;
    va_start(va, eff);

    if(eff & IMAGE_CLEARBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_p8_clearbg(x, y, img, left, top, w, h, eff, bg_color);
    }
    else if(eff & IMAGE_SWAPCOLOR) {
        int old_color = va_arg(va, int);
        int new_color = va_arg(va, int);
        azrp_subimage_p8_swapcolor(x, y, img, left, top, w, h, eff,
            old_color, new_color);
    }
    else if(eff & IMAGE_ADDBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_p8_addbg(x, y, img, left, top, w, h, eff, bg_color);
    }
    else if(eff & IMAGE_DYE) {
        int dye_color = va_arg(va, int);
        azrp_subimage_p8_dye(x, y, img, left, top, w, h, eff, dye_color);
    }
    else {
        azrp_subimage_p8(x, y, img, left, top, w, h, eff);
    }

    va_end(va);
}

View File

@ -0,0 +1,100 @@
.global _azrp_image_shader_p8_normal
#include "image_macros.S"
/* P8 Opaque rendering, Azur version: trivial with loop transforms.
This is fairly straightforward, with no particular tricks; just index the
palette as fast as possible in a 2-unrolled 2-stage-pipeline loop that maxes
out CPU speed.
r0: [temporary]
r7: Right edge pointer
r8: Right edge value
r9: Palette
r10: [temporary]
r11: [temporary]
r12: Right edge stride */
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
mov.l @r8+, r9 /* cmd.palette */
shlr r2
mov.w @r8+, r7 /* cmd.edge_2 */
mov r2, r0
mov.l r12, @-r15
shll2 r0
mov.l r10, @-r15
shll r7
mov.l r11, @-r15
add r5, r7
mov r0, r12
add r6, r12
add #-4, r5
nop
add #-2, r4 /* Input stride compensation for pipelining */
nop
.if \HFLIP
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
START
mov.b @r3+, r0
nop
mov.w @r7, r8 /* Save right edge */
nop
mov.b @r3+, \TMP1
shll r0
/* Inner loop: two palette lookups per iteration; indices are doubled
(shll) to address the 16-bit palette entries */
2: mov.b @r3+, \TMP2
shll \TMP1
mov.w @(r0,r9), r0
/* Fun fact: omitting this nop slows the loop to 7 cycles/i */
nop
mov.w r0, @(\OFF1,r5)
mov \TMP1, r0
mov.b @r3+, \TMP1
add #\OUT_DIR, r5
mov.w @(r0,r9), r0
shll \TMP2
mov.w r0, @(\OFF2,r5)
3: mov \TMP2, r0
mov.w r8, @r7 /* Restore right edge */
add r12, r7
END
mov.l @r15+, r11
mov.l @r15+, r10
mov.l @r15+, r12
EPILOGUE
.endm
/* Entry point: bit 0 of r0 selects between the straight and HFLIP variants */
_azrp_image_shader_p8_normal:
tst #1, r0
bf 9f
GEN_NORMAL_LOOP 0, 4, r10, r11, 4, 2
9: GEN_NORMAL_LOOP 1, -4, r10, r11, 2, 4

View File

@ -0,0 +1,142 @@
.global _azrp_image_shader_p8_swapcolor
#include "image_macros.S"
/* P8 SWAPCOLOR, Azur version: by branchless xor selection.
This is essentially the same logic as gint's P8 SWAPCOLOR version, but with
a 2-unrolled 2-stage-pipeline since the bottleneck on RAM is now on the CPU.
r0: [temporary]
r7: Right edge pointer
r8: palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
r9: Palette
r10: Holds (x ^ y) & -(c == x) during selection
r11: cmd.color_1
r12: Right edge stride
r13: [temporary]
r14: [temporary]
Spilled to stack:
@(-4,r15): Right edge value */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
mov.l @r8+, r9 /* cmd.palette */
shlr r2
mov.w @r8+, r7 /* cmd.edge_2 */
mov r2, r0
mov.l r12, @-r15
shll2 r0
mov.l r11, @-r15
shll r7
mov.w @r8+, r11 /* cmd.color_1 */
add r5, r7
mov.l r10, @-r15
add #-4, r5
mov.l r13, @-r15
exts.b r11, r11
mov r11, r13
add r13, r13
mov.w @r8, r8 /* cmd.color_2 */
add r9, r13
mov r0, r12
add r6, r12
mov.w @r13, r13 /* r13 = palette[cmd.color_1] */
add #-2, r4 /* Input stride compensation for pipelining */
mov.l r14, @-r15
nop
xor r13, r8 /* r8 = palette[cmd.color_1] ^ cmd.color_2 */
nop
.if \HFLIP
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
START
mov.b @r3+, \TMP2
nop
mov.w @r7, r0 /* Save right edge */
nop
mov.l r0, @-r15
cmp/eq \TMP2, r11
mov.b @r3+, \TMP1
add \TMP2, \TMP2
/* Inner loop: subc turns the T bit of the index comparison into a 0/-1
mask, so the xor swap only applies to pixels equal to cmd.color_1 */
2: subc r10, r10
mov \TMP2, r0
cmp/eq \TMP1, r11
mov.w @(r0, r9), r0
and r8, r10
nop
xor r10, r0
nop
mov.w r0, @(\OFF1, r5)
add #\OUT_DIR, r5
mov.b @r3+, \TMP2
subc r10, r10
add \TMP1, \TMP1
mov \TMP1, r0
mov.w @(r0, r9), r0
cmp/eq \TMP2, r11
mov.b @r3+, \TMP1
and r8, r10
xor r10, r0
nop
mov.w r0, @(\OFF2, r5)
3: add \TMP2, \TMP2
/* TODO: Use x0 as temporary storage by moving the main registers */
mov.l @r15+, r0
nop
mov.w r0, @r7 /* Restore right edge */
add r12, r7
END
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r10
mov.l @r15+, r11
mov.l @r15+, r12
EPILOGUE
.endm
/* Entry point: bit 0 of r0 selects between the straight and HFLIP variants */
_azrp_image_shader_p8_swapcolor:
tst #1, r0
bf 9f
GEN_SWAPCOLOR_LOOP 0, 4, r13, r14, 4, 2
9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r14, 2, 4

View File

@ -0,0 +1,51 @@
#include <azur/gint/render.h>
/* SWAPCOLOR: draw the image with palette index old_color replaced by
   new_color. */
void azrp_image_p8_swapcolor(int x, int y, image_t const *img, int eff,
    int old_color, int new_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p8_swapcolor(x, y, img, 0, 0, w, h, eff, old_color,
        new_color);
}

void azrp_subimage_p8_swapcolor(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int old_index, int new_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = old_index;
        cmd.color_2 = new_color;
        cmd.effect += 8;
        cmd.loop = azrp_image_shader_p8_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

/* ADDBG reuses SWAPCOLOR: substituting the transparent index with the
   background color is exactly a palette-index swap. */
void azrp_image_p8_addbg(int x, int y, image_t const *img, int eff,
    int bg_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_p8_addbg(x, y, img, 0, 0, w, h, eff, bg_color);
}

void azrp_subimage_p8_addbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha;
        cmd.color_2 = bg_color;
        cmd.effect += 8;
        cmd.loop = azrp_image_shader_p8_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,71 @@
#include <azur/gint/render.h>
#include <gint/defs/util.h>
uint8_t AZRP_SHADER_IMAGE_RGB16 = -1;

/* Fragment shader: render the queued RGB16 command into the current
   fragment, then update it in-place so rendering resumes on the next one. */
static void shader_rgb16(void *uniforms, void *command, void *fragment)
{
    struct gint_image_cmd *cmd = command;

    /* The assembly loop returns the input pointer for the next fragment */
    cmd->input = gint_image_rgb16_loop((int)uniforms, cmd);

    /* Account for the rows just rendered, then set up the next fragment */
    int remaining = cmd->height - cmd->lines;
    cmd->height = remaining;
    cmd->lines = min(remaining, azrp_frag_height);
    cmd->output = fragment + cmd->x * 2;
}

__attribute__((constructor))
static void register_shader(void)
{
    /* Registered at startup so the shader ID behaves as a runtime constant */
    AZRP_SHADER_IMAGE_RGB16 = azrp_register_shader(shader_rgb16);
}

void azrp_shader_image_rgb16_configure(void)
{
    /* Single uniform: the width of the rendering surface, in pixels */
    azrp_set_uniforms(AZRP_SHADER_IMAGE_RGB16, (void *)azrp_width);
}

void azrp_image_rgb16(int x, int y, image_t const *img, int eff)
{
    int w = img->width, h = img->height;
    azrp_subimage_rgb16(x, y, img, 0, 0, w, h, eff);
}

void azrp_subimage_rgb16(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff)
{
    /* Transparent RGB16 images go through the CLEARBG path, clearing the
       image's own alpha value */
    if(img->profile == IMAGE_RGB565A) {
        azrp_subimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
            img->alpha);
        return;
    }

    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, false, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.loop = azrp_image_shader_rgb16_normal;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

void azrp_image_rgb16_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
    int w = img->width, h = img->height;
    azrp_subimage_rgb16_clearbg(x, y, img, 0, 0, w, h, eff, bg);
}

void azrp_subimage_rgb16_clearbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = bg_color;
        cmd.effect += 4;
        cmd.loop = azrp_image_shader_rgb16_clearbg;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,135 @@
.global _azrp_image_shader_rgb16_clearbg
#include "image_macros.S"
/* RGB16 CLEARBG and DYE, Azur version: by NULL canceling.
This function handles both CLEARBG and DYE, which happen to work identically
on RGB16, save for the fact that the DYE loop ignores the value of opaque
pixels and uses the dye color instead. It's one of the standard 2-unrolled
2-stage-pipeline loops with a right edge, using NULL canceling for
transparency.
r0: [temporary] (CLEARBG) or dye value (DYE)
r7: Right edge pointer
r8: Right edge value
r9: Background color
r10: Nullable output pointer
r11: 0 (to neutralize addc during NULL-cancelling)
r12: Right edge stride
r13: [temporary] (one of the pixels)
r14: [temporary] (one of the pixels in DYE)
The GEN_CLEARBG_LOOP macro parameters are as follows. All of them except for
SRC1 and SRC2 are determined by HFLIP; it's just simpler to set their values
on the macro's call site than have .if statements everywhere. This set of
parameters is used for virtually all the functions of all the formats.
SRC1 and SRC2 are used in DYE mode to replace the pixel values read from
memory with a constant register.
HFLIP: Whether to enable HFLIP
OUT_DIR: Variation of r5 at each loop, either 4 or -4
TMP1: Temporary register for first pixel
TMP2: Temporary register for second pixel
OFF1: Offset for first pixel write
OFF2: Offset for second pixel write
SRC1: Source of first write (here either TMP1 or r0)
SRC2: Source of second write (here either TMP2 or r0) */
.macro GEN_CLEARBG_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2, SRC1, SRC2
mov.w @r8+, r7 /* cmd.edge_2 */
shlr r2
mov.l r11, @-r15
mov #0, r11
mov.w @r8+, r9 /* cmd.color_1 */
shll r7
mov.l r10, @-r15
add r5, r7
mov.l r12, @-r15
add #-2, r5 /* Pre-decrement, see output logic */
mov r2, r12
shll2 r12
mov.l r13, @-r15
add r6, r12
mov.l r14, @-r15
add #-2, r4 /* Input stride compensation for pipelining */
.if \HFLIP
mov r2, r0
shll2 r0
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
mov.w @r8+, r0 /* cmd.color_2 */
nop
START
mov.w @r3+, \TMP1
nop
mov.w @r7, r8 /* Save right edge */
nop
cmp/eq \TMP1, r9
nop
/* NULL canceling: addc folds the T bit of the transparency compare, so
r10 = (pixel == background) ? 0 : r5; transparent pixels are thus written
to address 0 — presumably harmless on this target; verify memory map */
2: mov #-1, r10
addc r11, r10
mov.w @r3+, \TMP2
and r5, r10
add #\OUT_DIR, r5
nop
mov.wv \SRC1, \OFF1, r10
cmp/eq \TMP2, r9
mov #-1, r10
addc r11, r10
mov.w @r3+, \TMP1
and r5, r10
cmp/eq \TMP1, r9
3: mov.wv \SRC2, \OFF2, r10
mov.w r8, @r7 /* Restore right edge */
add r12, r7
END
mov.l @r15+, r14
mov.l @r15+, r13
mov.l @r15+, r12
mov.l @r15+, r10
mov.l @r15+, r11
EPILOGUE
.endm
/* The CLEARBG entry point is compiled out when this file is re-included by
image_rgb16_dye.S, which only needs the macro above */
#ifndef AZRP_RGB16_DYE
_azrp_image_shader_rgb16_clearbg:
tst #1, r0
bf 9f
GEN_CLEARBG_DYE_LOOP 0, 4, r0, r13, 2, 0, r0, r13
9: GEN_CLEARBG_DYE_LOOP 1, -4, r13, r0, 0, 2, r13, r0
#endif

View File

@ -0,0 +1,12 @@
.global _azrp_image_shader_rgb16_dye
/* Defining AZRP_RGB16_DYE makes the included file provide only the shared
GEN_CLEARBG_DYE_LOOP macro, not its own entry point */
#define AZRP_RGB16_DYE
#include "image_rgb16_clearbg.S"
/* See image_rgb16_clearbg.S for details on this function. */
_azrp_image_shader_rgb16_dye:
tst #1, r0
bf 9f
/* SRC1 = SRC2 = r0: the dye color held in r0 replaces the value of every
opaque pixel (cf. register map in image_rgb16_clearbg.S) */
GEN_CLEARBG_DYE_LOOP 0, 4, r14, r13, 2, 0, r0, r0
9: GEN_CLEARBG_DYE_LOOP 1, -4, r13, r14, 0, 2, r0, r0

View File

@ -0,0 +1,26 @@
#include <azur/gint/render.h>
/* DYE: draw the image with every opaque pixel replaced by dye_color. */
void azrp_image_rgb16_dye(int x, int y, image_t const *img, int eff,
    int dye_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_rgb16_dye(x, y, img, 0, 0, w, h, eff, dye_color);
}

void azrp_subimage_rgb16_dye(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int dye_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha; /* transparent value to leave untouched */
        cmd.color_2 = dye_color;
        cmd.effect += 12;
        cmd.loop = azrp_image_shader_rgb16_dye;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

View File

@ -0,0 +1,31 @@
#include <azur/gint/render.h>
/* Variadic effect dispatcher: decodes the effect-specific extra arguments
   and forwards to the matching specialized RGB16 renderer. The order of the
   checks fixes the priority when several effect bits are set at once. */
void azrp_subimage_rgb16_effect(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, ...)
{
    va_list va;
    va_start(va, eff);

    if(eff & IMAGE_CLEARBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
            bg_color);
    }
    else if(eff & IMAGE_SWAPCOLOR) {
        int old_color = va_arg(va, int);
        int new_color = va_arg(va, int);
        azrp_subimage_rgb16_swapcolor(x, y, img, left, top, w, h, eff,
            old_color, new_color);
    }
    else if(eff & IMAGE_ADDBG) {
        int bg_color = va_arg(va, int);
        azrp_subimage_rgb16_addbg(x, y, img, left, top, w, h, eff, bg_color);
    }
    else if(eff & IMAGE_DYE) {
        int dye_color = va_arg(va, int);
        azrp_subimage_rgb16_dye(x, y, img, left, top, w, h, eff, dye_color);
    }
    else {
        azrp_subimage_rgb16(x, y, img, left, top, w, h, eff);
    }

    va_end(va);
}

View File

@ -0,0 +1,124 @@
.global _azrp_image_shader_rgb16_normal
#include "image_macros.S"
/* RGB16 Opaque rendering, Azur version: by straightforward copy.
This function of the image renderer is designed for Azur's streaming model
only. Unlike its RAM-model counterpart which is bottlenecked by its writing
speed, this function is entirely limited by the CPU's ability to output the
data in the required format.
In the simple case where there is no color effect and no HFLIP, the task of
rendering a 16-bit opaque image boils down to a 2-dimensional memcpy. This
task can be optimized by moving longwords if the source and destination and
co-4-aligned, with four variations depending on the width and initial
position, identified by the following parameters:
* w1 / w2 denotes the parity of the command width;
* o2 / o4 denotes the alignment of the output.
It is easy to see that when input and output are not co-aligned, any attempt
to combine two word reads into a single long write requires at least 3
cycles per 2 pixels and needs parallelism over several pixels to not get
immediately shut down by the LS-to-EX delay. Here we decide to naively copy
by words, which achieves 4 cycles per 2 pixels, mainly because large RGB16
images are very quickly bottlenecked in reading by their own size anyway.
The HFLIP version also needs to rearrange pixels, and is thus performed with
word-based copies in all situations, which is a straightforward process.
NOTE(review): register conventions inferred from the shared macros — r2 is
the row width (in pixels), r3/r5 the input/output pointers, r4/r6 the
strides; confirm against the command layout. */
_azrp_image_shader_rgb16_normal:
/* Not a single cycle */
tst #1, r0
bf _BACKWARD_WORD_COPY
mov #8, r0 /* Use the naive method for width 8 */
cmp/ge r2, r0
bt.s _FORWARD_WORD_COPY
nop
mov r5, r0 /* Check if r3 and r5 are co-aligned */
xor r3, r0
/* Not a single cycle */
tst #2, r0
bt _FORWARD_LONG_COPY
_FORWARD_WORD_COPY:
START
2: movs.w @r3+, x0
3: movs.w x0, @r5+
END
EPILOGUE
_FORWARD_LONG_COPY:
shlr r2 /* Test width parity */
mov #2, r0
bt .w1
nop
.w2: tst r0, r3 /* Test alignment of input */
bf .w2d2
.w2d4: START
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END
EPILOGUE
.w2d2: add #-1, r2
nop
START
movs.w @r3+, x0
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r5+
movs.w @r3+, x0
movs.w x0, @r5+
END
EPILOGUE
.w1: tst r0, r3 /* Test alignment of input */
bf .w1d2
.w1d4: START
2: movs.l @r3+, x0
3: movs.l x0, @r5+
movs.w @r3+, x0
movs.w x0, @r5+
END
EPILOGUE
.w1d2: START
movs.w @r3+, x0
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END
EPILOGUE
/* HFLIP variant: write words backwards from the end of the output row */
_BACKWARD_WORD_COPY:
mov r2, r0
shll r0
add r0, r5
nop
shll r0
nop
add r0, r6
nop
START
2: movs.w @r3+, x0
3: movs.w x0, @-r5
END
EPILOGUE

View File

@ -0,0 +1,116 @@
.global _azrp_image_shader_rgb16_swapcolor
#include "image_macros.S"
/* RGB16 SWAPCOLOR, Azur version: by branchless xor selection.
The xor selection is explained in gint's version of P8 SWAPCOLOR. This
version's selection is slightly simpler because we don't have to index the
palette to find the source color. We use a 2-unrolled 2-stage-pipeline loop
to optimize for CPU speed.
r7: Right edge pointer
r8: Right edge value
r9: cmd.color_1
r10: Holds (x ^ y) & -(c == x) during selection
r11: cmd.color_1 ^ cmd.color_2 (ie. x ^ y)
r12: Right edge stride
r13: [temporary] */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
mov.w @r8+, r7 /* cmd.edge_2 */
shlr r2
mov.l r11, @-r15
add #-2, r4 /* Input stride compensation for pipelining */
mov.w @r8+, r9 /* cmd.color_1 */
shll r7
mov.l r10, @-r15
add r5, r7
mov.l r12, @-r15
add #-2, r5 /* Predecrement, see output logic */
mov.w @r8+, r11 /* cmd.color_2 */
mov r2, r12
mov.l r13, @-r15
shll2 r12
add r6, r12
nop
xor r9, r11 /* r11 = cmd.color_1 ^ cmd.color_2 */
nop
.if \HFLIP
mov r2, r0
shll2 r0
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
START
mov.w @r3+, \TMP1
nop
mov.w @r7, r8 /* Save right edge */
nop
cmp/eq \TMP1, r9
nop
/* Inner loop: subc folds the T bit of the compare into a 0/-1 mask so the
xor swap only applies to pixels equal to cmd.color_1 */
2: subc r10, r10
nop
and r11, r10
mov.w @r3+, \TMP2
xor r10, \TMP1
nop
mov.wv \TMP1 \OFF1 r5
cmp/eq \TMP2, r9
add #\OUT_DIR, r5
nop
subc r10, r10
nop
and r11, r10
mov.w @r3+, \TMP1
xor r10, \TMP2
nop
cmp/eq \TMP1, r9
3: mov.wv \TMP2 \OFF2 r5
mov.w r8, @r7 /* Restore right edge */
add r12, r7
END
mov.l @r15+, r13
mov.l @r15+, r12
mov.l @r15+, r10
mov.l @r15+, r11
EPILOGUE
.endm
/* Entry point: bit 0 of r0 selects between the straight and HFLIP variants */
_azrp_image_shader_rgb16_swapcolor:
tst #1, r0
bf 9f
GEN_SWAPCOLOR_LOOP 0, 4, r0, r13, 2, 0
9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r0, 0, 2

View File

@ -0,0 +1,51 @@
#include <azur/gint/render.h>
/* SWAPCOLOR: draw the image with old_color replaced by new_color. */
void azrp_image_rgb16_swapcolor(int x, int y, image_t const *img, int eff,
    int old_color, int new_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_rgb16_swapcolor(x, y, img, 0, 0, w, h, eff, old_color,
        new_color);
}

void azrp_subimage_rgb16_swapcolor(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int old_color, int new_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = old_color;
        cmd.color_2 = new_color;
        cmd.effect += 8;
        cmd.loop = azrp_image_shader_rgb16_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}

/* ADDBG reuses SWAPCOLOR: substituting the transparent value with the
   background color is exactly a color swap. */
void azrp_image_rgb16_addbg(int x, int y, image_t const *img, int eff,
    int bg_color)
{
    int w = img->width, h = img->height;
    azrp_subimage_rgb16_addbg(x, y, img, 0, 0, w, h, eff, bg_color);
}

void azrp_subimage_rgb16_addbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;
    bool ok = gint_image_mkcmd(&box, img, eff, false, true, &cmd,
        azrp_width, azrp_height);

    if(ok) {
        cmd.color_1 = img->alpha;
        cmd.color_2 = bg_color;
        cmd.effect += 8;
        cmd.loop = azrp_image_shader_rgb16_swapcolor;
        azrp_queue_image(&box, img, &cmd);
    }

    prof_leave(azrp_perf_cmdgen);
}