forked from Lephenixnoir/Azur
azur: image shader with dynamic effects, and 16-row fragment
This commit is contained in:
parent
e124719de3
commit
8ac9ac747a
|
@ -28,10 +28,32 @@ endif()
|
|||
if(AZUR_GRAPHICS_GINT_CG)
|
||||
list(APPEND SOURCES
|
||||
src/gint/render.c
|
||||
src/gint/r61524.s
|
||||
# Clear shader
|
||||
src/gint/shaders/clear.c
|
||||
src/gint/shaders/clear.S
|
||||
# Image shader
|
||||
src/gint/shaders/image.c
|
||||
src/gint/shaders/image.S)
|
||||
src/gint/shaders/image_rgb16_normal.S
|
||||
src/gint/shaders/image_rgb16_clearbg.S
|
||||
src/gint/shaders/image_rgb16_swapcolor.S
|
||||
src/gint/shaders/image_rgb16_dye.S
|
||||
src/gint/shaders/image_p8_normal.S
|
||||
src/gint/shaders/image_p8_swapcolor.S
|
||||
src/gint/shaders/image_p4_normal.S
|
||||
# Image shader interface
|
||||
src/gint/shaders/image_rgb16.c
|
||||
src/gint/shaders/image_rgb16_effect.c
|
||||
src/gint/shaders/image_rgb16_swapcolor.c
|
||||
src/gint/shaders/image_rgb16_dye.c
|
||||
src/gint/shaders/image_p8.c
|
||||
src/gint/shaders/image_p8_effect.c
|
||||
src/gint/shaders/image_p8_swapcolor.c
|
||||
src/gint/shaders/image_p8_dye.c
|
||||
src/gint/shaders/image_p4.c
|
||||
src/gint/shaders/image_p4_effect.c
|
||||
src/gint/shaders/image_p4_swapcolor.c
|
||||
src/gint/shaders/image_p4_dye.c)
|
||||
endif()
|
||||
|
||||
add_library(azur STATIC ${SOURCES})
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//---
|
||||
// azur.defs: Generation definitions
|
||||
// azur.defs: General definitions that are included in every file
|
||||
//---
|
||||
|
||||
/* This exposes compile-time configuration symbols. I don't like running the
|
||||
|
|
|
@ -33,8 +33,8 @@
|
|||
#include <azur/defs.h>
|
||||
AZUR_BEGIN_DECLS
|
||||
|
||||
#include <gint/defs/types.h>
|
||||
#include <gint/display.h>
|
||||
#include <gint/image.h>
|
||||
|
||||
#include <libprof.h>
|
||||
|
||||
|
@ -45,7 +45,7 @@ AZUR_BEGIN_DECLS
|
|||
typedef void azrp_shader_t(void *uniforms, void *command, void *fragment);
|
||||
|
||||
/* Video memory fragment used as rendering target (in XRAM). */
|
||||
extern uint16_t azrp_frag[];
|
||||
extern uint16_t *azrp_frag;
|
||||
|
||||
/* Maximum number of commands that can be queued. (This is only one of two
|
||||
limits, the other being the size of the command data.) */
|
||||
|
@ -128,19 +128,19 @@ extern int azrp_frag_height;
|
|||
The settings on each mode are as follow:
|
||||
|
||||
* x1: Display resolution: 396x224
|
||||
Fragment size: 8 rows (6336 bytes)
|
||||
Fragment size: 16 rows (12672 bytes)
|
||||
Number of fragments: 14 (15 if an offset is used)
|
||||
Total size of graphics data: 177.408 kB
|
||||
Total size of graphics data: 177'408 bytes
|
||||
|
||||
* x2: Display resolution: 198x112
|
||||
Fragment size: 16 rows (6336 bytes)
|
||||
Fragment size: 16 rows (6336 bytes) # TODO: increase
|
||||
Number of fragments: 7 (8 if an offset is used)
|
||||
Total size of graphics data: 44.352 kB
|
||||
Total size of graphics data: 44'352 bytes
|
||||
|
||||
* x3: Display resolution: 132x75 (last row only has 2/3 pixels)
|
||||
Fragment size: 16 rows (4224 bytes)
|
||||
Fragment size: 16 rows (4224 bytes) # TODO: increase
|
||||
Number of fragments: 5 (sometimes 6 if an offset is used)
|
||||
Total size of graphics data: 19.800 kB
|
||||
Total size of graphics data: 19'800 bytes
|
||||
|
||||
As one would know when playing modern video games, super-resolution is one
|
||||
of the most useful ways to increase performance. The reduced amount of
|
||||
|
@ -167,30 +167,50 @@ void azrp_config_scale(int scale);
|
|||
@offset Fragment offset along the y-axis (0 ... height of fragment-1). */
|
||||
void azrp_config_frag_offset(int offset);
|
||||
|
||||
//---
|
||||
// Hooks
|
||||
//---
|
||||
|
||||
/* Hook called before a fragment is sent to the display. The fragment can be
|
||||
accessed and modified freely (however, the time spent in the hook is
|
||||
counted as overhead and only part of [azrp_perf_render]). */
|
||||
typedef void azrp_hook_prefrag_t(int id, void *fragment, int size);
|
||||
|
||||
/* Get or set the prefrag hook. */
|
||||
azrp_hook_prefrag_t *azrp_hook_get_prefrag(void);
|
||||
void azrp_hook_set_prefrag(azrp_hook_prefrag_t *);
|
||||
|
||||
//---
|
||||
// Standard shaders
|
||||
//---
|
||||
|
||||
/* Clears the entire output with a single color */
|
||||
/* Clears the entire output with a single color */
|
||||
extern uint8_t AZRP_SHADER_CLEAR;
|
||||
/* Renders RGB565 textures/images */
|
||||
extern uint8_t AZRP_SHADER_IMAGE;
|
||||
/* Renders gint images with various dynamic effects */
|
||||
extern uint8_t AZRP_SHADER_IMAGE_RGB16;
|
||||
extern uint8_t AZRP_SHADER_IMAGE_P8;
|
||||
extern uint8_t AZRP_SHADER_IMAGE_P4;
|
||||
|
||||
/* azrp_clear(): Clear output [AZRP_SHADER_CLEAR] */
|
||||
void azrp_clear(uint16_t color);
|
||||
|
||||
/* azrp_image(): Queue image command [AZRP_SHADER_IMAGE] */
|
||||
/* azrp_image(): Queue image command [AZRP_SHADER_IMAGE_*] */
|
||||
void azrp_image(int x, int y, bopti_image_t const *image);
|
||||
|
||||
/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_IMAGE] */
|
||||
/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_IMAGE_*] */
|
||||
void azrp_subimage(int x, int y, bopti_image_t const *image,
|
||||
int left, int top, int width, int height, int flags);
|
||||
|
||||
/* See below for more detailed image functions. Dynamic effects are provided
|
||||
with the same naming convention as gint. */
|
||||
|
||||
/* Functions to update uniforms for these shaders. You should call them when:
|
||||
* AZRP_SHADER_CLEAR: Changing super-scaling settings.
|
||||
* AZRP_SHADER_IMAGE: Changing super-scaling or fragment offsets. */
|
||||
* AZRP_SHADER_IMAGE_*: Changing super-scaling or fragment offsets. */
|
||||
void azrp_shader_clear_configure(void);
|
||||
void azrp_shader_image_configure(void);
|
||||
void azrp_shader_image_rgb16_configure(void);
|
||||
void azrp_shader_image_p8_configure(void);
|
||||
void azrp_shader_image_p4_configure(void);
|
||||
|
||||
//---
|
||||
// Performance indicators
|
||||
|
@ -250,32 +270,79 @@ void azrp_set_uniforms(int shader_id, void *uniforms);
|
|||
exceeded. */
|
||||
bool azrp_queue_command(void *command, size_t size, int fragment, int count);
|
||||
|
||||
/* azrp_queue_image(): Split and queue a gint image command
|
||||
|
||||
The command must have been completely prepared with gint_image_mkcmd() and
|
||||
have had its color effect sections filled. This function sets the shader ID
|
||||
and adjusts the command for fragmented rendering. */
|
||||
void azrp_queue_image(struct gint_image_box *box, image_t const *img,
|
||||
struct gint_image_cmd *cmd);
|
||||
|
||||
//---
|
||||
// Internal shader definitions (for reference; no API guarantee)
|
||||
// Internal R61524 functions
|
||||
//---
|
||||
|
||||
struct azrp_shader_image_command {
|
||||
uint8_t shader_id;
|
||||
/* First edge-preserved pixel offset (P4 only) */
|
||||
int8_t edge1;
|
||||
/* Pixels per line */
|
||||
int16_t columns;
|
||||
/* Address of the image structure */
|
||||
bopti_image_t const *image;
|
||||
/* Destination in XRAM (offset) */
|
||||
uint16_t output;
|
||||
/* Number of lines */
|
||||
int16_t lines;
|
||||
/* Already offset by start row and column */
|
||||
void const *input;
|
||||
void azrp_r61524_fragment_x1(void *fragment, int size);
|
||||
|
||||
/* Info for structure update between fragments: */
|
||||
int16_t height;
|
||||
int16_t row_stride;
|
||||
int16_t x;
|
||||
void azrp_r61524_fragment_x2(void *fragment, int width, int height);
|
||||
|
||||
/* Second edge-preserved pixel offset (P4 only) */
|
||||
int16_t edge2;
|
||||
};
|
||||
//---
|
||||
// Internal functions for the image shader
|
||||
//
|
||||
// We use gint's image rendering API but replace some of the core loops with
|
||||
// Azur-specific versions that are faster in the CPU-bound context of this
|
||||
// rendering engine. Some of the main loops from Azur actually perform better
|
||||
// in RAM than bopti used to do, and are already in gint.
|
||||
//---
|
||||
|
||||
/* azrp_image_effect(): Generalized azrp_image() with dynamic effects */
|
||||
/* Forwards to azrp_subimage_effect() with the whole image selected as the
   subrectangle. The (img) argument is evaluated more than once, so it must
   not have side effects. */
#define azrp_image_effect(x, y, img, eff, ...) \
    azrp_subimage_effect(x, y, img, 0, 0, (img)->width, (img)->height, eff, \
        ##__VA_ARGS__)
|
||||
/* azrp_subimage_effect(): Generalized azrp_subimage() with dynamic effects */
|
||||
void azrp_subimage_effect(int x, int y, image_t const *img,
|
||||
int left, int top, int w, int h, int effects, ...);
|
||||
|
||||
/* Specific versions for each format */
|
||||
#define AZRP_IMAGE_SIG1(NAME, ...) \
|
||||
void azrp_image_ ## NAME(int x, int y, image_t const *img,##__VA_ARGS__); \
|
||||
void azrp_subimage_ ## NAME(int x, int y, image_t const *img, \
|
||||
int left, int top, int w, int h, ##__VA_ARGS__);
|
||||
#define AZRP_IMAGE_SIG(NAME, ...) \
|
||||
AZRP_IMAGE_SIG1(rgb16 ## NAME, ##__VA_ARGS__) \
|
||||
AZRP_IMAGE_SIG1(p8 ## NAME, ##__VA_ARGS__) \
|
||||
AZRP_IMAGE_SIG1(p4 ## NAME, ##__VA_ARGS__)
|
||||
|
||||
AZRP_IMAGE_SIG(_effect, int effects, ...)
|
||||
AZRP_IMAGE_SIG(, int effects)
|
||||
AZRP_IMAGE_SIG(_clearbg, int effects, int bg_color_or_index)
|
||||
AZRP_IMAGE_SIG(_swapcolor, int effects, int source, int replacement)
|
||||
AZRP_IMAGE_SIG(_addbg, int effects, int bg_color)
|
||||
AZRP_IMAGE_SIG(_dye, int effects, int dye_color)
|
||||
|
||||
#define azrp_image_rgb16_effect(x, y, img, eff, ...) \
|
||||
azrp_subimage_rgb16_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
|
||||
eff, ##__VA_ARGS__)
|
||||
#define azrp_image_p8_effect(x, y, img, eff, ...) \
|
||||
azrp_subimage_p8_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
|
||||
eff, ##__VA_ARGS__)
|
||||
#define azrp_image_p4_effect(x, y, img, eff, ...) \
|
||||
azrp_subimage_p4_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
|
||||
eff, ##__VA_ARGS__)
|
||||
|
||||
#undef AZRP_IMAGE_SIG
|
||||
#undef AZRP_IMAGE_SIG1
|
||||
|
||||
/* Main loop provided by Azur; as usual, these are not real functions; their
|
||||
only use is as the [.loop] field of a command. */
|
||||
|
||||
void azrp_image_shader_rgb16_normal(void);
|
||||
void azrp_image_shader_rgb16_clearbg(void);
|
||||
void azrp_image_shader_rgb16_swapcolor(void);
|
||||
void azrp_image_shader_rgb16_dye(void);
|
||||
void azrp_image_shader_p8_normal(void);
|
||||
void azrp_image_shader_p8_swapcolor(void);
|
||||
void azrp_image_shader_p4_normal(void);
|
||||
void azrp_image_shader_p4_clearbg(void);
|
||||
|
||||
AZUR_END_DECLS
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
.section .ilram, "ax"
|
||||
|
||||
.balign 4
|
||||
.global _azrp_r61524_fragment_x1
|
||||
_azrp_r61524_fragment_x1:
|
||||
mov.l .R61524_DATA, r2
|
||||
shlr r5
|
||||
|
||||
ldrs 1f
|
||||
ldre 2f
|
||||
ldrc r5
|
||||
nop
|
||||
|
||||
/* Read a longword (two pixels) from XRAM */
|
||||
1: mov.l @r4+, r0
|
||||
/* Write that longword to the display */
|
||||
2: mov.l r0, @r2
|
||||
|
||||
rts
|
||||
nop
|
||||
|
||||
.balign 4
|
||||
.global _azrp_r61524_fragment_x2
|
||||
_azrp_r61524_fragment_x2:
|
||||
mov.l .R61524_DATA, r2
|
||||
nop
|
||||
|
||||
/* Read a word, write it twice */
|
||||
ldrs 1f
|
||||
ldre 2f
|
||||
ldrc r5
|
||||
nop
|
||||
|
||||
1: mov.w @r4+, r0
|
||||
nop
|
||||
mov.w r0, @r2
|
||||
nop
|
||||
mov.w r0, @r2
|
||||
2: nop
|
||||
|
||||
sub r5, r4
|
||||
sub r5, r4
|
||||
|
||||
/* Do that again on a second line */
|
||||
ldrs 3f
|
||||
ldre 4f
|
||||
ldrc r5
|
||||
nop
|
||||
|
||||
3: mov.w @r4+, r0
|
||||
nop
|
||||
mov.w r0, @r2
|
||||
nop
|
||||
mov.w r0, @r2
|
||||
4: nop
|
||||
|
||||
dt r6
|
||||
bf _azrp_r61524_fragment_x2
|
||||
|
||||
rts
|
||||
nop
|
||||
|
||||
.balign 4
|
||||
.R61524_DATA:
|
||||
.long 0xb4000000
|
|
@ -7,11 +7,8 @@
|
|||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define YRAM ((void *)0xe5017000)
|
||||
|
||||
/* 8 rows of video memory, occupying 6338/8192 bytes of XRAM.
|
||||
TODO: Extend this to 16 rows, and move the rest to RAM */
|
||||
GXRAM GALIGNED(32) uint16_t azrp_frag[DWIDTH * 8];
|
||||
/* 16 rows of video memory, occupying 12736/16384 bytes of XYRAM (77.7%). */
|
||||
uint16_t *azrp_frag = (void *)0xe500e000 + 32;
|
||||
|
||||
/* Super-scaling factor, width and height of output. */
|
||||
int azrp_scale;
|
||||
|
@ -22,27 +19,33 @@ int azrp_frag_count;
|
|||
/* Height of fragment. */
|
||||
int azrp_frag_height;
|
||||
|
||||
/* TODO: Either make command queue private or use azrp_ prefix */
|
||||
|
||||
/* Number and total size of queued commands. */
|
||||
GXRAM int commands_count = 0, commands_length = 0;
|
||||
static int commands_count=0, commands_length=0;
|
||||
|
||||
/* Array of pointers to queued commands (stored as an offset into YRAM). */
|
||||
GXRAM uint32_t commands_array[AZRP_MAX_COMMANDS];
|
||||
/* Array of pointers to queued commands. Each command has:
|
||||
* Top 16 bits: fragment number
|
||||
* Bottom 16 bits: offset into command data buffer
|
||||
Rendering order is integer order. */
|
||||
static uint32_t commands_array[AZRP_MAX_COMMANDS];
|
||||
|
||||
static GALIGNED(4) uint8_t commands_data[8192];
|
||||
|
||||
/* Array of shader programs and uniforms. */
|
||||
GXRAM static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
|
||||
GXRAM static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
|
||||
static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
|
||||
static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
|
||||
|
||||
/* Next free index in the shader program array. */
|
||||
GXRAM static uint16_t shaders_next = 0;
|
||||
static uint16_t shaders_next = 0;
|
||||
|
||||
/* Hooks. */
|
||||
static azrp_hook_prefrag_t *azrp_hook_prefrag = NULL;
|
||||
|
||||
/* Performance counters. */
|
||||
GXRAM prof_t azrp_perf_cmdgen;
|
||||
GXRAM prof_t azrp_perf_sort;
|
||||
GXRAM prof_t azrp_perf_shaders;
|
||||
GXRAM prof_t azrp_perf_r61524;
|
||||
GXRAM prof_t azrp_perf_render;
|
||||
prof_t azrp_perf_cmdgen;
|
||||
prof_t azrp_perf_sort;
|
||||
prof_t azrp_perf_shaders;
|
||||
prof_t azrp_perf_r61524;
|
||||
prof_t azrp_perf_render;
|
||||
|
||||
//---
|
||||
// High and low-level pipeline functions
|
||||
|
@ -110,25 +113,23 @@ void azrp_render_fragments(void)
|
|||
while(1) {
|
||||
while(cmd < next_frag_threshold && i < commands_count) {
|
||||
azrp_commands_total++;
|
||||
uint8_t *data = (uint8_t *)YRAM + (cmd & 0xffff);
|
||||
uint8_t *data = commands_data + (cmd & 0xffff);
|
||||
prof_enter_norec(azrp_perf_shaders);
|
||||
shaders[data[0]](shader_uniforms[data[0]], data, azrp_frag);
|
||||
prof_leave_norec(azrp_perf_shaders);
|
||||
|
||||
if(data[0] == AZRP_SHADER_IMAGE) {
|
||||
struct azrp_shader_image_command *cmd = (void *)data;
|
||||
cmd->height -= cmd->lines;
|
||||
cmd->input += cmd->row_stride * cmd->lines;
|
||||
cmd->lines = min(cmd->height, azrp_frag_height);
|
||||
cmd->output = 2 * cmd->x;
|
||||
}
|
||||
|
||||
cmd = commands_array[++i];
|
||||
}
|
||||
|
||||
/* TODO: Consider xram_frame() by DMA in parallel? */
|
||||
if(azrp_hook_prefrag) {
|
||||
int size = azrp_width * azrp_frag_height * 2;
|
||||
(*azrp_hook_prefrag)(frag, azrp_frag, size);
|
||||
}
|
||||
|
||||
prof_enter_norec(azrp_perf_r61524);
|
||||
xram_frame(azrp_frag, 396 * 8);
|
||||
if(azrp_scale == 1)
|
||||
azrp_r61524_fragment_x1(azrp_frag, 396 * azrp_frag_height);
|
||||
else if(azrp_scale == 2)
|
||||
azrp_r61524_fragment_x2(azrp_frag, azrp_width, azrp_frag_height);
|
||||
prof_leave_norec(azrp_perf_r61524);
|
||||
|
||||
if(++frag >= azrp_frag_count) break;
|
||||
|
@ -149,10 +150,12 @@ void azrp_update(void)
|
|||
// Configuration calls
|
||||
//---
|
||||
|
||||
// TODO: Use larger fragments in upscales x2 and x3
|
||||
|
||||
static void update_frag_count(void)
|
||||
{
|
||||
if(azrp_scale == 1)
|
||||
azrp_frag_count = 28 + (azrp_frag_offset > 0);
|
||||
azrp_frag_count = 14 + (azrp_frag_offset > 0);
|
||||
else if(azrp_scale == 2)
|
||||
azrp_frag_count = 7 + (azrp_frag_offset > 0);
|
||||
else if(azrp_scale == 3)
|
||||
|
@ -162,7 +165,7 @@ static void update_frag_count(void)
|
|||
static void update_size(void)
|
||||
{
|
||||
if(azrp_scale == 1)
|
||||
azrp_width = 396, azrp_height = 198, azrp_frag_height = 8;
|
||||
azrp_width = 396, azrp_height = 224, azrp_frag_height = 16;
|
||||
else if(azrp_scale == 2)
|
||||
azrp_width = 198, azrp_height = 112, azrp_frag_height = 16;
|
||||
else if(azrp_scale == 3)
|
||||
|
@ -194,6 +197,20 @@ static void default_settings(void)
|
|||
azrp_config_scale(1);
|
||||
}
|
||||
|
||||
//---
|
||||
// Hooks
|
||||
//---
|
||||
|
||||
azrp_hook_prefrag_t *azrp_hook_get_prefrag(void)
|
||||
{
|
||||
return azrp_hook_prefrag;
|
||||
}
|
||||
|
||||
void azrp_hook_set_prefrag(azrp_hook_prefrag_t *hook)
|
||||
{
|
||||
azrp_hook_prefrag = hook;
|
||||
}
|
||||
|
||||
//---
|
||||
// Custom shaders
|
||||
//---
|
||||
|
@ -226,7 +243,7 @@ bool azrp_queue_command(void *command, size_t size, int fragment, int count)
|
|||
if(commands_length + size >= 8192)
|
||||
return false;
|
||||
|
||||
uint8_t *dst = YRAM + commands_length;
|
||||
uint8_t *dst = commands_data + commands_length;
|
||||
uint8_t *src = command;
|
||||
|
||||
for(size_t i = 0; i < size; i++)
|
||||
|
|
|
@ -1,727 +0,0 @@
|
|||
/* Azur's built-in shaders: <image>
|
||||
|
||||
If there ever was a fantastic piece of assembler engineering in my work up
|
||||
to this point, this would be it. Every trick in the book is used here, from
|
||||
clever instruction combinations, pipeline flow and tricky DSP abuse all the
|
||||
way up to memory layout planning, transforms on loop structures, and most
|
||||
critically superscalar parallelism.
|
||||
|
||||
While the performance of the shader is not *strictly* proportional to the
|
||||
speed of the tightest loop, it's very close. The use of operand-bus XRAM for
|
||||
graphics data, systematic alignment, and detailed pipeline stalling
|
||||
measurements for common instruction sequences in gintctl allow very accurate
|
||||
speed predictions to be made based on the tightness of the code.
|
||||
|
||||
The palette formats of bopti have been refined for the purpose of this
|
||||
shader, with P8 being split into P8_RGB565A and P8_RGB565 with big changes,
|
||||
and P4 being renamed P4_RGB565A with minimal changes along with a variation
|
||||
aptly named P4_RGB565.
|
||||
|
||||
The asymptotic performance for each format is as follows:
|
||||
* RGB565: 1 cycle/pixel if source and destination align
|
||||
2 cycles/pixel otherwise
|
||||
* RGB565A: 4 cycles/pixel
|
||||
* P8_RGB565A: 4.5 cycles/pixel
|
||||
* P8_RGB565: 3 cycles/pixel
|
||||
* P4_RGB565A: 5 cycles/pixel
|
||||
* P4_RGB565: 3.5 cycles/pixel
|
||||
|
||||
Entirely documenting this code would take me hours, but some elements are
|
||||
provided in the comments. Superscalar parallelism is most easily appreciated
|
||||
by reading the two-page section 4.2 of the SH4AL-DSP manual. The other main
|
||||
structural technique at play in this code is loop transforms.
|
||||
|
||||
Basically, a loop that loads a pixel, performs computations with it, and
|
||||
writes the result is inefficient because of the RAW dependencies on most
|
||||
operations (with full stall cycles between loads and computations, and
|
||||
between computations and uses as addresses). Well-established loop
|
||||
optimization literature has lots of techniques to help with this problem,
|
||||
and I use two here:
|
||||
|
||||
* _Pipelining_ the loop consists in handling a single pixel over several
|
||||
iterations by doing a little bit of work in each iteration. The data for
|
||||
the pixel would move from register to register at each iteration, with the
|
||||
loop code doing one stage's worth of computation on each register. (You
|
||||
can view it as a diagonal iteration pattern in the pixel*instruction grid
|
||||
if you like such visualizations.)
|
||||
|
||||
By increasing the number of pixels in the pipeline, a lot of independent
|
||||
data can be obtained, reducing dependency pressure and allowing for
|
||||
greater parallelism at the cost of more registers being used.
|
||||
|
||||
The use of pipelining in this shader is very modest, with 2 stages at
|
||||
most, and usually only a couple of instructions being performed in advance
|
||||
for the next pixel while the current one finishes processing. Register
|
||||
assignments have some subtleties though since pressure is high overall.
|
||||
|
||||
* _Unrolling_ iterations of the loop consists in loading two (or more)
|
||||
pixels at the start of each iteration so that we can work on one while
|
||||
waiting for stalls and dependencies on the other.
|
||||
|
||||
Unlike pipelining, a loop iteration starts and ends with full pixels and
|
||||
no work carries between iterations. Unrolling allows different pixels to
|
||||
use different registers and generally better optimize the instruction
|
||||
sequence, at the cost of only supporting pixel counts that are multiples of
|
||||
the unrolling level.
|
||||
|
||||
Handling non-multiple sizes is the everlasting bane of unrolled loops,
|
||||
sometimes requiring duplicate code. Smart maneuvers are used in P8 and P4
|
||||
to only handle even sizes and neutralize unwanted pixels after the fact.
|
||||
|
||||
Both techniques are used simultaneously, with 2-unrolled 2-stage loops for
|
||||
almost all formats (except RGB565A which performs DSP trickery).
|
||||
*/
|
||||
|
||||
.global _azrp_shader_image
|
||||
.align 4
|
||||
|
||||
/* Register assignment
|
||||
r0: (temporary)
|
||||
r1: Lines
|
||||
r2: Command queue; (temporary)
|
||||
r3: Input
|
||||
r4: [parameter] azrp_width*2; output stride
|
||||
r5: [parameter] Command queue; Output
|
||||
r6: [parameter] azrp_frag; alpha value; (temporary)
|
||||
r7: Columns
|
||||
r8: Image pointer; (temporary)
|
||||
r9: Input stride */
|
||||
_azrp_shader_image:
|
||||
mov.l r8, @-r15
|
||||
add #2, r5
|
||||
|
||||
mov.l r9, @-r15
|
||||
mov r5, r2
|
||||
|
||||
mov.w @r2+, r7 /* command.columns */
|
||||
|
||||
mov.l @r2+, r8 /* command.image */
|
||||
|
||||
mov.w @r2+, r5 /* command.output (offset) */
|
||||
sub r7, r4
|
||||
|
||||
mov.w @r8+, r9 /* image.profile */
|
||||
sub r7, r4
|
||||
|
||||
mov.w @r2+, r1 /* command.lines */
|
||||
add r6, r5
|
||||
|
||||
mov.l @r2+, r3 /* command.input (pointer) */
|
||||
shll2 r9
|
||||
|
||||
mova .formats, r0
|
||||
|
||||
mov.w @r8+, r6 /* image.alpha */
|
||||
|
||||
mov.l @(r0,r9), r0
|
||||
|
||||
mov.w @r8+, r9 /* image.width */
|
||||
|
||||
jmp @r0
|
||||
nop
|
||||
|
||||
.align 4
|
||||
.formats:
|
||||
.long _RGB565
|
||||
.long _RGB565A
|
||||
.long _NOP /* P8 */
|
||||
.long _P4_RGB565A /* =P4 */
|
||||
.long _P8_RGB565
|
||||
.long _P8_RGB565A
|
||||
.long _P4_RGB565
|
||||
|
||||
/* [Loop macros]
|
||||
|
||||
The following macros implement the main loop of the image renderer.
|
||||
* Each line is rendered in the tight loop between 2: and 3: (both included).
|
||||
* r5 is the output (with stride r4, in bytes)
|
||||
* r3 is the input (with stride r9, in bytes)
|
||||
* There are r1 rows with r7 iterations each */
|
||||
|
||||
#define START() \
|
||||
nop; /* 4-alignment */ \
|
||||
ldrs 2f; \
|
||||
ldre 3f; \
|
||||
1: ldrc r7
|
||||
|
||||
#define END_NORET() \
|
||||
dt r1; \
|
||||
add r4, r5; \
|
||||
bf.s 1b; \
|
||||
add r9, r3
|
||||
|
||||
#define END() \
|
||||
END_NORET(); \
|
||||
mov.l @r15+, r9; \
|
||||
rts; \
|
||||
mov.l @r15+, r8
|
||||
|
||||
/* [Rendering strategy for the RGB565 format]
|
||||
|
||||
In RGB565, all pixels are copied verbatim. This is a 2D memcpy, which we can
|
||||
optimize by moving longwords. Since longwords are pairs of pixels, there are
|
||||
variations and subcases based on the parity of each parameter:
|
||||
|
||||
* w[eo] denotes whether the width of the image is even or odd;
|
||||
* d[eo] denotes whether the memory accesses to the source and destination
|
||||
are even (4-aligned) or odd (2-aligned).
|
||||
|
||||
When the destination and source have identical parity, the d[eo] variation
|
||||
can be defined. In this case the copy is pretty direct, it's a longword copy
|
||||
and it takes 2 cycles to copy 4 bytes, plus some extra at the edges if the
|
||||
start or end address is 2-aligned.
|
||||
|
||||
However, when they have opposite parity, each longword read matches up with
|
||||
a 2-aligned write (or vice-versa). Rearranging words with arithmetic does
|
||||
not help because of the stall cycle between loading a register and using it
|
||||
in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as
|
||||
the word-based copy). Unrolling iterations could help but would be too
|
||||
complex here (adding sub-cases); a super-heavy renderer with more hypotheses
|
||||
(like a tileset shader) should aim for that route though. Also, movua.l
|
||||
followed by mov.l is even slower (5 cycles). */
|
||||
.align 4
|
||||
_RGB565:
|
||||
mov #8, r0 /* Maximum width for naive method */
|
||||
sub r7, r9
|
||||
|
||||
cmp/ge r7, r0
|
||||
|
||||
shll r9
|
||||
|
||||
bt.s _RGB565.naive
|
||||
mov #2, r0
|
||||
|
||||
/* Use naive method for opposite source/destination parity */
|
||||
mov r5, r6
|
||||
xor r3, r6
|
||||
|
||||
tst r0, r6
|
||||
bf _RGB565.naive
|
||||
|
||||
shlr r7
|
||||
bt _RGB565.wo
|
||||
|
||||
_RGB565.we:
|
||||
tst r0, r5
|
||||
bf _RGB565.we_do
|
||||
|
||||
/* This is 4-aligned */
|
||||
_RGB565.we_de:
|
||||
START()
|
||||
2: movs.l @r3+, x0
|
||||
3: movs.l x0, @r5+
|
||||
END()
|
||||
|
||||
.align 4
|
||||
_RGB565.we_do:
|
||||
add #-1, r7
|
||||
|
||||
START()
|
||||
movs.w @r3+, x0
|
||||
movs.w x0, @r5+
|
||||
|
||||
2: movs.l @r3+, x0
|
||||
3: movs.l x0, @r5+
|
||||
|
||||
movs.w @r3+, x0
|
||||
movs.w x0, @r5+
|
||||
END()
|
||||
|
||||
.align 4
|
||||
_RGB565.wo:
|
||||
tst r0, r5
|
||||
bf _RGB565.wo_do
|
||||
|
||||
_RGB565.wo_de:
|
||||
START()
|
||||
2: movs.l @r3+, x0
|
||||
3: movs.l x0, @r5+
|
||||
|
||||
movs.w @r3+, x0
|
||||
movs.w x0, @r5+
|
||||
END()
|
||||
|
||||
.align 4
|
||||
_RGB565.wo_do:
|
||||
START()
|
||||
movs.w @r3+, x0
|
||||
movs.w x0, @r5+
|
||||
|
||||
2: movs.l @r3+, x0
|
||||
3: movs.l x0, @r5+
|
||||
END()
|
||||
|
||||
/* Naive method for small widths and opposite source/destination parity */
|
||||
.align 4
|
||||
_RGB565.naive:
|
||||
START()
|
||||
2: movs.w @r3+, x0
|
||||
3: movs.w x0, @r5+
|
||||
END()
|
||||
|
||||
/* [Rendering strategy for the RGB565A format]
|
||||
|
||||
Since we have to check for the alpha value in each pixel, there's really no
|
||||
longword-based optimization. Instead, we just go as fast as possible with
|
||||
each pixel, using DSP instructions because conditional execution is pretty
|
||||
damn good. This takes 4 cycles/pixel. I tried a number of reductions to
|
||||
3 cycles/pixel but could not get any of them to work. */
|
||||
.align 4
|
||||
_RGB565A:
|
||||
shll16 r6
|
||||
mov #0x0004, r0 /* DC Zero mode */
|
||||
|
||||
sub r7, r9
|
||||
|
||||
shll r9
|
||||
|
||||
lds r6, y0
|
||||
|
||||
lds r0, dsr
|
||||
|
||||
START()
|
||||
2: movs.w @r3+, x0
|
||||
pcmp x0, y0 movx.w @r5, x1
|
||||
dct pcopy x1, x0
|
||||
3: movx.w x0, @r5+
|
||||
END()
|
||||
|
||||
/* [Rendering strategy for the P8_RGB565A format]
|
||||
|
||||
The work needed for each pixel gets more difficult as we go, with alpha
|
||||
being the major culprit due to its additional comparisons, jumps, and
|
||||
limited optimization opportunities when unrolling due to conditionally-
|
||||
executed code.
|
||||
|
||||
Because arithmetic is unavoidable and there are 1-cycle delays between both
|
||||
loading-arithmetic, and arithmetic-indexing pairs, the loop has 2-unrolled
|
||||
iterations with a 2-stage pipeline structure. This fills the stall cycles
|
||||
and increases parallelism significantly. Pure loop optimization handbook.
|
||||
|
||||
Dealing with odd widths is a major pain as usual. Instead of adding logic to
|
||||
handle the extra pixel separately, this routine lets the loop overwrite it,
|
||||
then restores its original value afterwards - a delightfully elegant trick.
|
||||
|
||||
The P8 format is actually so bad that spending precious time grinding cycles
|
||||
felt completely inappropriate without first refining it. This led to two new
|
||||
variations, P8_RGB565 and P8_RGB565A, which fix the following problems.
|
||||
|
||||
-> First there is alpha for all images, which is the most costly feature,
|
||||
single-handedly accounting for half of the work per pixel. P8_RGB565
|
||||
does not support alpha, which basically doubles performance.
|
||||
|
||||
-> Then, there is the alpha value itself. In P8 it is a variable (and fxconv
|
||||
sets it to 0xff), which burns a register for the comparison and enforces
|
||||
a fixed order between comparison and left-shift. P8_RGB565A always sets
|
||||
an alpha value of 0x00 which lifts both constraints.
|
||||
|
||||
-> Then, there are palette indices. In P8 they are unsigned, which requires
|
||||
an extu.b. In P8_RGB565 and P8_RGB565A they are signed, so the sign-
|
||||
extended value of the mov.b can be used directly (once doubled). The
|
||||
palette base is simply offset by 128 entries, with colors numbered
|
||||
-128..-1 first and only then 0..127.
|
||||
|
||||
-> Finally, there's the palette itself. In P8 it always has 256 entries,
|
||||
even when only a few are used. For small images this is a huge waste, so
|
||||
P8_RGB565 and P8_RGB565A only store colors that are actually used.
|
||||
|
||||
P8_RGB565A achieves 4.5 cycles/pixel asymptotically, which is really good
|
||||
compared to 4 cycles/pixel for RGB565A. */
|
||||
.align 4
|
||||
_P8_RGB565A:
|
||||
mov.l r13, @-r15
|
||||
sub r7, r9
|
||||
|
||||
mov r7, r13
|
||||
add #-2, r9 /* Input stride compensation for pipelining */
|
||||
|
||||
mov.l r12, @-r15
|
||||
shlr r7
|
||||
|
||||
mov.l r10, @-r15
|
||||
movt r6
|
||||
|
||||
mov.w _P8_RGB565A.palette_distance, r0
|
||||
shll r13
|
||||
|
||||
add r6, r7
|
||||
|
||||
sub r6, r9
|
||||
|
||||
sub r6, r4
|
||||
|
||||
sub r6, r4
|
||||
|
||||
add r0, r8
|
||||
|
||||
add r5, r13
|
||||
mov r7, r2
|
||||
|
||||
add #-4, r5 /* Output offset compensation in the loop */
|
||||
|
||||
shll2 r2
|
||||
|
||||
add r4, r2
|
||||
|
||||
START()
|
||||
|
||||
mov.b @r3+, r6
|
||||
|
||||
/* Save next pixel for the odd-width case */
|
||||
mov.w @r13, r12
|
||||
|
||||
mov.b @r3+, r10
|
||||
tst r6, r6
|
||||
|
||||
/* 2-unrolled 2-stage main loop */
|
||||
2: add r6, r6
|
||||
mov r6, r0
|
||||
|
||||
add r10, r10
|
||||
bt.s 5f
|
||||
|
||||
tst r10, r10
|
||||
mov.w @(r0,r8), r0
|
||||
|
||||
mov.w r0, @(4,r5)
|
||||
|
||||
5: mov.b @r3+, r6
|
||||
mov r10, r0
|
||||
|
||||
bt.s 6f
|
||||
add #4, r5
|
||||
|
||||
mov.w @(r0,r8), r0
|
||||
|
||||
mov.w r0, @(2,r5)
|
||||
|
||||
6: mov.b @r3+, r10
|
||||
3: tst r6, r6
|
||||
|
||||
/* Restore last pixel */
|
||||
mov.w r12, @r13
|
||||
add r2, r13
|
||||
|
||||
END_NORET()
|
||||
mov.l @r15+, r10
|
||||
mov.l @r15+, r12
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r9
|
||||
rts
|
||||
mov.l @r15+, r8
|
||||
|
||||
_P8_RGB565A.palette_distance:
|
||||
/* Distance between image pointer and palette array base */
|
||||
.word 260
|
||||
|
||||
/* [Rendering strategy for the P8_RGB565 format]
|
||||
|
||||
See P8_RGB565A for format details. Removing the checks for transparency and
|
||||
the jumps simplifies the instruction sequence and allows superior
|
||||
parallelism because all paths are unconditional. This routine achieves
|
||||
3 cycles/pixel asymptotically. */
|
||||
.align 4
|
||||
_P8_RGB565:
|
||||
mov.l r13, @-r15
|
||||
sub r7, r9
|
||||
|
||||
mov r7, r13
|
||||
add #-2, r9 /* Input stride compensation for pipelining */
|
||||
|
||||
mov.l r12, @-r15
|
||||
shlr r7
|
||||
|
||||
mov.l r10, @-r15
|
||||
movt r6
|
||||
|
||||
mov.w _P8_RGB565.palette_distance, r0
|
||||
shll r13
|
||||
|
||||
add r6, r7
|
||||
|
||||
sub r6, r9
|
||||
|
||||
sub r6, r4
|
||||
|
||||
sub r6, r4
|
||||
|
||||
add r0, r8
|
||||
|
||||
add r5, r13
|
||||
|
||||
add #-4, r5 /* Output offset compensation in the loop */
|
||||
mov r7, r2
|
||||
|
||||
shll2 r2
|
||||
|
||||
add r4, r2
|
||||
|
||||
START()
|
||||
|
||||
mov.b @r3+, r0
|
||||
|
||||
/* Save next pixel for the odd-width case */
|
||||
mov.w @r13, r12
|
||||
|
||||
mov.b @r3+, r10
|
||||
shll r0
|
||||
|
||||
/* 2-unrolled 2-stage main loop */
|
||||
2: mov.b @r3+, r6
|
||||
shll r10
|
||||
|
||||
mov.w @(r0,r8), r0
|
||||
/* This nop is not for show, it actually prevents the loop from slowing
|
||||
down to 7 cycles /i, probably due to instruction reads alignment. */
|
||||
nop
|
||||
|
||||
mov.w r0, @(4,r5)
|
||||
mov r10, r0
|
||||
|
||||
mov.b @r3+, r10
|
||||
add #4, r5
|
||||
|
||||
mov.w @(r0,r8), r0
|
||||
shll r6
|
||||
|
||||
mov.w r0, @(2,r5)
|
||||
3: mov r6, r0
|
||||
|
||||
/* Restore last pixel */
|
||||
mov.w r12, @r13
|
||||
add r2, r13
|
||||
|
||||
END_NORET()
|
||||
mov.l @r15+, r10
|
||||
mov.l @r15+, r12
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r9
|
||||
rts
|
||||
mov.l @r15+, r8
|
||||
|
||||
_P8_RGB565.palette_distance:
|
||||
/* Distance between image pointer and palette array base */
|
||||
.word 260
|
||||
|
||||
/* [Rendering strategy for the P4_RGB565A format]
|
||||
|
||||
This is the most complex format. Most of the remarks that apply to
|
||||
P8_RGB565A also apply here, except that there are fewer opportunities to save
|
||||
computation because nibbles must be extracted anyway.
|
||||
|
||||
The P4_RGB565A format is simply bopti's P4, but an additional variation
|
||||
P4_RGB565 is specified to save on transparency handling, which is very
|
||||
expensive.
|
||||
|
||||
The special nature of the nibble packing means the simplest loop form writes
|
||||
2 pixels from a 2-aligned source image position in a single iteration. Other
|
||||
structures don't even come close: selecting nibbles individually is folly,
|
||||
while not unrolling is inefficient. So the whole point of this routine is to
|
||||
forcibly align the subimage on a byte boundary and never break that grid.
|
||||
|
||||
The command builder for P4 does this alignment before submitting the
|
||||
command. Obviously the transform can cause one extra pixel to be overridden
|
||||
on each side of every line. The command is thus extended with two edge
|
||||
offsets indicating pixels to preserve at each end. When overwrites occur,
|
||||
the edge offsets point to the overwritten pixels so they can be restored.
|
||||
Otherwise, they point to the next pixels and the restores are no-ops. See
|
||||
the strategy used for managing unrolling in P8 formats for details.
|
||||
|
||||
The only irregularity is image width, which the command builder cannot
|
||||
modify. It is rounded up to the next multiple of 2, then halved. There is a
|
||||
nice trick for this operation, which is [shlr rX] then adding T to rX. We
|
||||
also need to add -1 for another adjustment, and both are combined into an
|
||||
addc, which saves one add and one movt off the EX critical chain.
|
||||
|
||||
The main loop achieves 5 cycles/pixel. */
|
||||
.align 4
|
||||
_P4_RGB565A:
|
||||
shlr r7
|
||||
mov.w @(6, r2), r0 /* command.edge2 */
|
||||
|
||||
mov.l r12, @-r15
|
||||
add #-15, r2 /* Go back to start of command */
|
||||
|
||||
mov #-1, r12
|
||||
shlr r9
|
||||
|
||||
mov.l r11, @-r15
|
||||
addc r12, r9
|
||||
|
||||
mov r0, r12
|
||||
add r12, r12
|
||||
|
||||
mov.l r10, @-r15
|
||||
sub r7, r9
|
||||
|
||||
mov.b @r2, r11 /* command.edge1 */
|
||||
add #2, r8 /* image.palette */
|
||||
|
||||
mov.l r13, @-r15
|
||||
mov r5, r0
|
||||
|
||||
mov.l r14, @-r15
|
||||
shll r11
|
||||
|
||||
add #-4, r5
|
||||
nop /* 4-alignment */
|
||||
|
||||
START()
|
||||
|
||||
mov.b @r3+, r6
|
||||
mov r0, r10
|
||||
|
||||
mov.w @(r0,r11), r13
|
||||
|
||||
mov.w @(r0,r12), r14
|
||||
shll r6
|
||||
|
||||
/* Main loop with 2 pixels sharing a single byte */
|
||||
2: mov r6, r0
|
||||
and #0x1e, r0
|
||||
|
||||
tst r0, r0
|
||||
|
||||
bt.s 4f
|
||||
shlr2 r6
|
||||
|
||||
mov.w @(r0,r8), r0
|
||||
|
||||
mov.w r0, @(6,r5)
|
||||
4: shlr2 r6
|
||||
|
||||
mov r6, r0
|
||||
and #0x1e, r0
|
||||
|
||||
tst r0, r0
|
||||
mov.b @r3+, r6
|
||||
|
||||
bt.s 5f
|
||||
add #4, r5
|
||||
|
||||
mov.w @(r0,r8), r0
|
||||
|
||||
mov.w r0, @r5
|
||||
3: 5: shll r6
|
||||
|
||||
mov r10, r0
|
||||
mov r7, r10
|
||||
|
||||
shll2 r10
|
||||
|
||||
mov.w r13, @(r0,r11)
|
||||
add r4, r10
|
||||
|
||||
mov.w r14, @(r0,r12)
|
||||
add r0, r10
|
||||
|
||||
mov r10, r0
|
||||
/* Parallelizes with [dt r1] expanded from END_NORET() */
|
||||
|
||||
END_NORET()
|
||||
mov.l @r15+, r14
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r10
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r12
|
||||
mov.l @r15+, r9
|
||||
rts
|
||||
mov.l @r15+, r8
|
||||
|
||||
/* [Rendering strategy for the P4_RGB565 format]
|
||||
Same as P4_RGB565A without transparency checks (fairly straightforward). The
|
||||
core loop runs in 3.5 cycles/pixel. */
|
||||
.align 4
|
||||
_P4_RGB565:
|
||||
shlr r7
|
||||
mov.w @(6, r2), r0 /* command.edge2 */
|
||||
|
||||
mov.l r10, @-r15
|
||||
add #-15, r2 /* Go back to start of command */
|
||||
|
||||
mov.l r12, @-r15
|
||||
shlr r9
|
||||
|
||||
add #2, r8 /* image.palette */
|
||||
mov #-1, r12
|
||||
|
||||
mov.l r11, @-r15
|
||||
addc r12, r9
|
||||
|
||||
mov r0, r12
|
||||
add r12, r12
|
||||
|
||||
mov.b @r2, r11 /* command.edge1 */
|
||||
sub r7, r9
|
||||
|
||||
mov.l r13, @-r15
|
||||
mov #0x1e, r2
|
||||
|
||||
mov.l r14, @-r15
|
||||
shll r11
|
||||
|
||||
mov r5, r0
|
||||
add #-4, r5
|
||||
|
||||
START()
|
||||
|
||||
mov.b @r3+, r6
|
||||
mov #-4, r10
|
||||
|
||||
mov.l r0, @-r15
|
||||
|
||||
mov.w @(r0,r11), r13
|
||||
|
||||
mov.w @(r0,r12), r14
|
||||
shll r6
|
||||
|
||||
/* Main loop with 2 pixels sharing a single byte */
|
||||
2: mov r6, r0
|
||||
and #0x1e, r0
|
||||
|
||||
shld r10, r6
|
||||
|
||||
mov.w @(r0,r8), r0
|
||||
and r2, r6
|
||||
|
||||
mov.w r0, @(6,r5)
|
||||
mov r6, r0
|
||||
|
||||
mov.b @r3+, r6
|
||||
add #4, r5
|
||||
|
||||
mov.w @(r0,r8), r0
|
||||
|
||||
mov.w r0, @r5
|
||||
3: shll r6
|
||||
|
||||
mov.l @r15+, r0
|
||||
mov r7, r10
|
||||
|
||||
shll2 r10
|
||||
|
||||
mov.w r13, @(r0,r11)
|
||||
add r4, r10
|
||||
|
||||
mov.w r14, @(r0,r12)
|
||||
add r0, r10
|
||||
|
||||
mov r10, r0
|
||||
/* Parallelizes with [dt r1] expanded from END_NORET() */
|
||||
|
||||
END_NORET()
|
||||
mov.l @r15+, r14
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r12
|
||||
mov.l @r15+, r10
|
||||
mov.l @r15+, r9
|
||||
rts
|
||||
mov.l @r15+, r8
|
||||
|
||||
/* [Unsupported formats]
|
||||
P8 is unsupported, use P8_RGB565 and P8_RGB565A. */
|
||||
_NOP:
|
||||
mov.l @r15+, r9
|
||||
rts
|
||||
mov.l @r15+, r8
|
|
@ -1,88 +1,45 @@
|
|||
#include <azur/gint/render.h>
|
||||
#include <gint/defs/util.h>
|
||||
|
||||
uint8_t AZRP_SHADER_IMAGE = -1;
|
||||
|
||||
__attribute__((constructor))
|
||||
static void register_shader(void)
|
||||
void azrp_queue_image(struct gint_image_box *box, image_t const *img,
|
||||
struct gint_image_cmd *cmd)
|
||||
{
|
||||
extern azrp_shader_t azrp_shader_image;
|
||||
AZRP_SHADER_IMAGE = azrp_register_shader(azrp_shader_image);
|
||||
}
|
||||
|
||||
void azrp_shader_image_configure(void)
|
||||
{
|
||||
azrp_set_uniforms(AZRP_SHADER_IMAGE, (void *)(2 * azrp_width));
|
||||
}
|
||||
|
||||
//---
|
||||
|
||||
/* Profile IDs */
|
||||
#define RGB565 0
|
||||
#define RGB565A 1
|
||||
#define P4_RGB565A 3
|
||||
#define P8_RGB565 4
|
||||
#define P8_RGB565A 5
|
||||
#define P4_RGB565 6
|
||||
|
||||
void azrp_image(int x, int y, bopti_image_t const *image)
|
||||
{
|
||||
azrp_subimage(x, y, image, 0, 0, image->width, image->height, 0);
|
||||
}
|
||||
|
||||
void azrp_subimage(int x, int y, bopti_image_t const *image,
|
||||
int left, int top, int width, int height, int flags)
|
||||
{
|
||||
prof_enter(azrp_perf_cmdgen);
|
||||
|
||||
if(!(flags & DIMAGE_NOCLIP)) {
|
||||
/* TODO: image: clip function */
|
||||
}
|
||||
|
||||
struct azrp_shader_image_command cmd;
|
||||
cmd.shader_id = AZRP_SHADER_IMAGE;
|
||||
cmd.columns = width;
|
||||
cmd.image = image;
|
||||
|
||||
int row_stride;
|
||||
|
||||
if(image->profile == P8_RGB565 || image->profile == P8_RGB565A) {
|
||||
row_stride = image->width;
|
||||
cmd.input = (void *)image->data + (image->data[0] * 2) + 2 +
|
||||
top * row_stride + left;
|
||||
}
|
||||
else if(image->profile == P4_RGB565 || image->profile == P4_RGB565A) {
|
||||
row_stride = (image->width + 1) >> 1;
|
||||
cmd.input = (void *)image->data + 32 + top * row_stride + (left >> 1);
|
||||
|
||||
int odd_left = left & 1;
|
||||
int odd_right = (left + width) & 1;
|
||||
|
||||
cmd.edge1 = -1 + odd_left;
|
||||
cmd.edge2 = width + odd_left;
|
||||
cmd.columns += odd_left + odd_right;
|
||||
x -= odd_left;
|
||||
}
|
||||
else {
|
||||
row_stride = image->width << 1;
|
||||
cmd.input = (void *)image->data + top * row_stride + (left << 1);
|
||||
}
|
||||
/* TODO: Ironically, this loads all 3 entry points */
|
||||
int p = img->profile;
|
||||
if(p == IMAGE_RGB565 || p == IMAGE_RGB565A)
|
||||
cmd->shader_id = AZRP_SHADER_IMAGE_RGB16;
|
||||
else if(p == IMAGE_P8_RGB565 || p == IMAGE_P8_RGB565A)
|
||||
cmd->shader_id = AZRP_SHADER_IMAGE_P8;
|
||||
else
|
||||
cmd->shader_id = AZRP_SHADER_IMAGE_P4;
|
||||
|
||||
/* This divides by azrp_frag_height */
|
||||
int fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
|
||||
/* TODO: Have a proper way to do optimized-division by azrp_frag_height */
|
||||
int fragment_id = (azrp_scale == 1) ? (box->y >> 4) : (box->y >> 4);
|
||||
|
||||
/* These settings only apply to the first fragment */
|
||||
int first_y = (y + azrp_frag_offset) & (azrp_frag_height - 1);
|
||||
cmd.lines = azrp_frag_height - first_y;
|
||||
cmd.output = 2 * (azrp_width * first_y + x);
|
||||
int first_y = (box->y + azrp_frag_offset) & (azrp_frag_height - 1);
|
||||
cmd->lines = min(box->h, azrp_frag_height - first_y);
|
||||
cmd->output = (void *)azrp_frag + (azrp_width * first_y + cmd->x) * 2;
|
||||
|
||||
/* Settings for further updates */
|
||||
cmd.height = height;
|
||||
cmd.row_stride = row_stride;
|
||||
cmd.x = x;
|
||||
|
||||
int n = 1 + (height - cmd.lines + azrp_frag_height - 1) / azrp_frag_height;
|
||||
azrp_queue_command(&cmd, sizeof cmd, fragment_id, n);
|
||||
|
||||
prof_leave(azrp_perf_cmdgen);
|
||||
int n = 1 + (box->h - cmd->lines + azrp_frag_height-1) / azrp_frag_height;
|
||||
azrp_queue_command(cmd, sizeof *cmd, fragment_id, n);
|
||||
}
|
||||
|
||||
/* azrp_subimage(): Queue part of an image, whatever its format

   Looks at the image profile and hands the call over, with every argument
   unchanged, to the RGB16, P8 or P4 front-end. An image whose profile matches
   none of the six tested values is ignored.

   @x @y            Forwarded unchanged to the front-end
   @img             Image to render; only img->profile is read here
   @left @top       Forwarded unchanged to the front-end
   @width @height   Forwarded unchanged to the front-end
   @flags           Forwarded unchanged as the front-end's last argument */
void azrp_subimage(int x, int y, image_t const *img,
    int left, int top, int width, int height, int flags)
{
    int p = img->profile;

    /* The front-ends return void, so they are called as plain statements:
       ISO C does not allow `return expr;` in a function returning void. */
    if(p == IMAGE_RGB565 || p == IMAGE_RGB565A) {
        azrp_subimage_rgb16(x, y, img, left, top, width, height, flags);
    }
    else if(p == IMAGE_P8_RGB565 || p == IMAGE_P8_RGB565A) {
        azrp_subimage_p8(x, y, img, left, top, width, height, flags);
    }
    else if(p == IMAGE_P4_RGB565 || p == IMAGE_P4_RGB565A) {
        azrp_subimage_p4(x, y, img, left, top, width, height, flags);
    }
}
|
||||
|
||||
/* azrp_image(): Queue a whole image
   Shorthand for azrp_subimage() with a zero origin, the image's own width
   and height, and no flags. */
void azrp_image(int x, int y, image_t const *img)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage(x, y, img, 0, 0, full_w, full_h, 0);
}
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
/* mov.wv: Move at a variable offset. This macro is functionally identical to
|
||||
mov.w \SRC, @(\OFF, \DST)
|
||||
except that when OFF=0 it simplifies into [mov.w \SRC, @\DST] so that SRC is
|
||||
not constrained to be r0. */
|
||||
.macro mov.wv SRC, OFF, DST
|
||||
.if (\OFF == 0)
|
||||
mov.w \SRC, @\DST
|
||||
.else
|
||||
mov.w \SRC, @(\OFF, \DST)
|
||||
.endif
|
||||
.endm
|
||||
|
||||
/* START: Sets up the inner and outer loop. The outer loop is anything between
|
||||
the calls to macros START and END, while the inner loop is the code between
|
||||
labels 2: and 3: (both *INCLUDED*). */
|
||||
.macro START
|
||||
ldrs 2f
|
||||
ldre 3f
|
||||
1: ldrc r2
|
||||
nop
|
||||
.endm
|
||||
|
||||
/* END: Finishes the outer loop and adds strides. */
|
||||
.macro END
|
||||
dt r1
|
||||
add r4, r3
|
||||
bf.s 1b
|
||||
add r6, r5
|
||||
.endm
|
||||
|
||||
/* EPILOGUE: Finishes the call by reloading registers saved in the prologue. */
|
||||
.macro EPILOGUE
|
||||
mov.l @r15+, r9
|
||||
mov r3, r0
|
||||
rts
|
||||
mov.l @r15+, r8
|
||||
.endm
|
|
@ -0,0 +1,70 @@
|
|||
#include <azur/gint/render.h>
|
||||
#include <gint/defs/util.h>
|
||||
|
||||
uint8_t AZRP_SHADER_IMAGE_P4 = -1;
|
||||
|
||||
static void shader_p4(void *uniforms, void *command, void *fragment)
|
||||
{
|
||||
struct gint_image_cmd *cmd = (void *)command;
|
||||
cmd->input = gint_image_p4_loop((int)uniforms, cmd);
|
||||
cmd->height -= cmd->lines;
|
||||
cmd->lines = min(cmd->height, azrp_frag_height);
|
||||
cmd->output = fragment + cmd->x * 2;
|
||||
}
|
||||
|
||||
__attribute__((constructor))
|
||||
static void register_shader(void)
|
||||
{
|
||||
AZRP_SHADER_IMAGE_P4 = azrp_register_shader(shader_p4);
|
||||
}
|
||||
|
||||
void azrp_shader_image_p4_configure(void)
|
||||
{
|
||||
azrp_set_uniforms(AZRP_SHADER_IMAGE_P4, (void *)azrp_width);
|
||||
}
|
||||
|
||||
/* azrp_image_p4(): Queue a whole P4 image
   Shorthand for azrp_subimage_p4() with a zero origin and the image's own
   width and height; @eff is forwarded unchanged. */
void azrp_image_p4(int x, int y, image_t const *img, int eff)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_p4(x, y, img, 0, 0, full_w, full_h, eff);
}
|
||||
|
||||
/* azrp_subimage_p4(): Queue part of a P4 image

   Images with the IMAGE_P4_RGB565A profile are redirected to
   azrp_subimage_p4_clearbg() with img->alpha as the background color. Other
   images get a command built by gint_image_mkcmd() and, if that call
   succeeds, queued with the azrp_image_shader_p4_normal loop.

   @x @y @w @h @left @top  Stored in this order into the struct gint_image_box
   @img                    Image to render
   @eff                    Effect flags, passed to gint_image_mkcmd() */
void azrp_subimage_p4(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff)
{
    if(img->profile == IMAGE_P4_RGB565A) {
        /* Called as a statement: ISO C does not allow returning a void
           expression from a function returning void */
        azrp_subimage_p4_clearbg(x, y, img, left, top, w, h, eff, img->alpha);
        return;
    }

    prof_enter(azrp_perf_cmdgen);
    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    /* Nothing is queued when gint_image_mkcmd() returns false */
    if(gint_image_mkcmd(&box, img, eff, true, true, &cmd, azrp_width,
        azrp_height)) {
        cmd.loop = azrp_image_shader_p4_normal;
        azrp_queue_image(&box, img, &cmd);
    }
    prof_leave(azrp_perf_cmdgen);
}
|
||||
|
||||
/* azrp_image_p4_clearbg(): Whole-image form of azrp_subimage_p4_clearbg()
   Uses a zero origin and the image's own size; @eff and @bg are forwarded. */
void azrp_image_p4_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_p4_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
}
|
||||
|
||||
/* azrp_subimage_p4_clearbg(): Queue part of a P4 image with the CLEARBG loop

   Builds the command with gint_image_mkcmd(); if that succeeds, stores
   @bg_color in color_1, selects gint_image_p4_clearbg_alt as the loop and
   queues the command. Command generation is timed in azrp_perf_cmdgen. */
void azrp_subimage_p4_clearbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, azrp_width,
        azrp_height)) {
        /* No command was produced: nothing to queue */
        prof_leave(azrp_perf_cmdgen);
        return;
    }

    cmd.loop = gint_image_p4_clearbg_alt;
    cmd.color_1 = bg_color;
    /* NOTE(review): the +4 presumably selects this effect's group of loop
       variants -- confirm against the loop's dispatch */
    cmd.effect += 4;

    azrp_queue_image(&box, img, &cmd);
    prof_leave(azrp_perf_cmdgen);
}
|
|
@ -0,0 +1,26 @@
|
|||
#include <azur/gint/render.h>
|
||||
|
||||
/* azrp_image_p4_dye(): Whole-image form of azrp_subimage_p4_dye()
   Uses a zero origin and the image's own size; @eff and @dye_color are
   forwarded unchanged. */
void azrp_image_p4_dye(int x, int y, image_t const *img, int eff,
    int dye_color)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_p4_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
}
|
||||
|
||||
/* azrp_subimage_p4_dye(): Queue part of a P4 image with the DYE loop

   Builds the command with gint_image_mkcmd(); if that succeeds, stores
   img->alpha in color_1 and @dye_color in color_2, selects gint_image_p4_dye
   as the loop and queues the command. Timed in azrp_perf_cmdgen. */
void azrp_subimage_p4_dye(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int dye_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, azrp_width,
        azrp_height)) {
        /* No command was produced: nothing to queue */
        prof_leave(azrp_perf_cmdgen);
        return;
    }

    cmd.loop = gint_image_p4_dye;
    cmd.color_1 = img->alpha;
    cmd.color_2 = dye_color;
    /* NOTE(review): the +4 presumably selects this effect's group of loop
       variants -- confirm against the loop's dispatch */
    cmd.effect += 4;

    azrp_queue_image(&box, img, &cmd);
    prof_leave(azrp_perf_cmdgen);
}
|
|
@ -0,0 +1,31 @@
|
|||
#include <azur/gint/render.h>
|
||||
|
||||
/* azrp_subimage_p4_effect(): Variadic front-end for P4 effects

   Tests the color-effect bits of @eff in a fixed order (CLEARBG, SWAPCOLOR,
   ADDBG, DYE) and calls the matching specialized function for the first one
   that is set, reading its extra arguments as ints:
     IMAGE_CLEARBG    one int (background)
     IMAGE_SWAPCOLOR  two ints (source, then replacement)
     IMAGE_ADDBG      one int (background)
     IMAGE_DYE        one int (dye color)
   With none of these bits set, azrp_subimage_p4() is called and no variadic
   argument is read. */
void azrp_subimage_p4_effect(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, ...)
{
    va_list ap;
    va_start(ap, eff);

    if(eff & IMAGE_CLEARBG) {
        int const background = va_arg(ap, int);
        azrp_subimage_p4_clearbg(x, y, img, left, top, w, h, eff, background);
    }
    else if(eff & IMAGE_SWAPCOLOR) {
        /* Two separate statements so the read order is fixed */
        int const source = va_arg(ap, int);
        int const replacement = va_arg(ap, int);
        azrp_subimage_p4_swapcolor(x, y, img, left, top, w, h, eff, source,
            replacement);
    }
    else if(eff & IMAGE_ADDBG) {
        int const background = va_arg(ap, int);
        azrp_subimage_p4_addbg(x, y, img, left, top, w, h, eff, background);
    }
    else if(eff & IMAGE_DYE) {
        int const dye_color = va_arg(ap, int);
        azrp_subimage_p4_dye(x, y, img, left, top, w, h, eff, dye_color);
    }
    else {
        azrp_subimage_p4(x, y, img, left, top, w, h, eff);
    }

    va_end(ap);
}
|
|
@ -0,0 +1,119 @@
|
|||
.global _azrp_image_shader_p4_normal
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P4 Opaque rendering, Azur version: trivial with loop transforms.
|
||||
|
||||
This is a pretty direct loop with no difficult tricks involved; it expands
|
||||
on P8 by adding another edge pointer. The main change is the decoding logic
|
||||
which now only involves a single byte to load for every two pixels, but more
|
||||
arithmetic to extract the nibbles.
|
||||
|
||||
All the loops in Azur's P4 functions are obvious EX chains and thus any
|
||||
optimization would need to simplify the arithmetic to gain any half-cycles.
|
||||
|
||||
r0: [temporary]
|
||||
r7: Right edge pointer
|
||||
r8: Right edge value
|
||||
r9: Palette
|
||||
r10: Left edge pointer
|
||||
r11: Left edge value
|
||||
r12: Edge stride
|
||||
r13: [temporary]
|
||||
r14: [temporary] */
|
||||
|
||||
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
|
||||
shlr r2
|
||||
nop
|
||||
|
||||
add r10, r10
|
||||
nop
|
||||
|
||||
mov.l @r8+, r9 /* cmd.palette */
|
||||
mov r2, r0
|
||||
|
||||
mov.w @r8+, r7 /* cmd.edge_2 */
|
||||
shll2 r0
|
||||
|
||||
mov.l r12, @-r15
|
||||
shll r7
|
||||
|
||||
mov.l r11, @-r15
|
||||
add r5, r7
|
||||
|
||||
mov r0, r12
|
||||
add r6, r12
|
||||
|
||||
mov.l r13, @-r15
|
||||
add r5, r10
|
||||
|
||||
mov.l r14, @-r15
|
||||
add #-4, r5
|
||||
|
||||
add #-1, r4 /* Input stride compensation for pipelining */
|
||||
nop
|
||||
|
||||
.if \HFLIP
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
.endif
|
||||
|
||||
START
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
mov #-4, \TMP2
|
||||
|
||||
mov.w @r7, r8 /* Save right edge */
|
||||
nop
|
||||
|
||||
mov.w @r10, r11 /* Save left edge */
|
||||
shll \TMP1
|
||||
|
||||
2: mov \TMP1, r0
|
||||
and #0x1e, r0
|
||||
|
||||
shld \TMP2, \TMP1
|
||||
mov #0x1e, \TMP2
|
||||
|
||||
mov.w @(r0,r9), r0
|
||||
and \TMP2, \TMP1
|
||||
|
||||
mov.w r0, @(\OFF1,r5)
|
||||
mov \TMP1, r0
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
add #\OUT_DIR, r5
|
||||
|
||||
mov.w @(r0,r9), r0
|
||||
mov #-4, \TMP2
|
||||
|
||||
mov.w r0, @(\OFF2,r5)
|
||||
3: shll \TMP1
|
||||
|
||||
mov.w r8, @r7 /* Restore right edge */
|
||||
add r12, r7
|
||||
|
||||
mov.w r11, @r10 /* Restore left edge */
|
||||
add r12, r10
|
||||
|
||||
END
|
||||
|
||||
mov.l @r15+, r14
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r12
|
||||
mov.l @r15+, r10
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
_azrp_image_shader_p4_normal:
|
||||
tst #1, r0
|
||||
bf 9f
|
||||
|
||||
GEN_NORMAL_LOOP 0, 4, r13, r14, 6, 0
|
||||
9: GEN_NORMAL_LOOP 1, -4, r13, r14, 0, 6
|
|
@ -0,0 +1,51 @@
|
|||
#include <azur/gint/render.h>
|
||||
|
||||
/* azrp_image_p4_swapcolor(): Whole-image form of azrp_subimage_p4_swapcolor()
   Uses a zero origin and the image's own size. @old_color is forwarded as
   that function's `old_index` parameter and @new_color as `new_color`. */
void azrp_image_p4_swapcolor(int x, int y, image_t const *img, int eff,
    int old_color, int new_color)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_p4_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
        old_color, new_color);
}
|
||||
|
||||
/* azrp_subimage_p4_swapcolor(): Queue part of a P4 image with SWAPCOLOR

   Builds the command with gint_image_mkcmd(); if that succeeds, stores
   @old_index in color_1 and @new_color in color_2, selects
   gint_image_p4_swapcolor as the loop and queues the command. Timed in
   azrp_perf_cmdgen. */
void azrp_subimage_p4_swapcolor(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int old_index, int new_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, azrp_width,
        azrp_height)) {
        /* No command was produced: nothing to queue */
        prof_leave(azrp_perf_cmdgen);
        return;
    }

    cmd.loop = gint_image_p4_swapcolor;
    cmd.color_1 = old_index;
    cmd.color_2 = new_color;
    /* NOTE(review): the +8 presumably selects this effect's group of loop
       variants -- confirm against the loop's dispatch */
    cmd.effect += 8;

    azrp_queue_image(&box, img, &cmd);
    prof_leave(azrp_perf_cmdgen);
}
|
||||
|
||||
/* azrp_image_p4_addbg(): Whole-image form of azrp_subimage_p4_addbg()
   Uses a zero origin and the image's own size; @eff and @bg_color are
   forwarded unchanged. */
void azrp_image_p4_addbg(int x, int y, image_t const *img, int eff,
    int bg_color)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_p4_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
}
|
||||
|
||||
/* azrp_subimage_p4_addbg(): Queue part of a P4 image with ADDBG

   Implemented with the SWAPCOLOR loop: color_1 is img->alpha and color_2 is
   @bg_color. The command is built with gint_image_mkcmd() and queued only if
   that call succeeds. Timed in azrp_perf_cmdgen. */
void azrp_subimage_p4_addbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    if(!gint_image_mkcmd(&box, img, eff, true, true, &cmd, azrp_width,
        azrp_height)) {
        /* No command was produced: nothing to queue */
        prof_leave(azrp_perf_cmdgen);
        return;
    }

    cmd.loop = gint_image_p4_swapcolor;
    cmd.color_1 = img->alpha;
    cmd.color_2 = bg_color;
    /* NOTE(review): the +8 presumably selects this effect's group of loop
       variants -- confirm against the loop's dispatch */
    cmd.effect += 8;

    azrp_queue_image(&box, img, &cmd);
    prof_leave(azrp_perf_cmdgen);
}
|
|
@ -0,0 +1,71 @@
|
|||
#include <azur/gint/render.h>
|
||||
#include <gint/defs/util.h>
|
||||
|
||||
uint8_t AZRP_SHADER_IMAGE_P8 = -1;
|
||||
|
||||
static void shader_p8(void *uniforms, void *command, void *fragment)
|
||||
{
|
||||
struct gint_image_cmd *cmd = (void *)command;
|
||||
cmd->input = gint_image_p8_loop((int)uniforms, cmd);
|
||||
cmd->height -= cmd->lines;
|
||||
cmd->lines = min(cmd->height, azrp_frag_height);
|
||||
cmd->output = fragment + cmd->x * 2;
|
||||
}
|
||||
|
||||
__attribute__((constructor))
|
||||
static void register_shader(void)
|
||||
{
|
||||
AZRP_SHADER_IMAGE_P8 = azrp_register_shader(shader_p8);
|
||||
}
|
||||
|
||||
void azrp_shader_image_p8_configure(void)
|
||||
{
|
||||
azrp_set_uniforms(AZRP_SHADER_IMAGE_P8, (void *)azrp_width);
|
||||
}
|
||||
|
||||
/* azrp_image_p8(): Queue a whole P8 image
   Shorthand for azrp_subimage_p8() with a zero origin and the image's own
   width and height; @eff is forwarded unchanged. */
void azrp_image_p8(int x, int y, image_t const *img, int eff)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_p8(x, y, img, 0, 0, full_w, full_h, eff);
}
|
||||
|
||||
/* azrp_subimage_p8(): Queue part of a P8 image

   Images with the IMAGE_P8_RGB565A profile are redirected to
   azrp_subimage_p8_clearbg() with img->alpha as the background color. Other
   images get a command built by gint_image_mkcmd() and, if that call
   succeeds, queued with the azrp_image_shader_p8_normal loop.

   @x @y @w @h @left @top  Stored in this order into the struct gint_image_box
   @img                    Image to render
   @eff                    Effect flags, passed to gint_image_mkcmd() */
void azrp_subimage_p8(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff)
{
    if(img->profile == IMAGE_P8_RGB565A) {
        /* Called as a statement: ISO C does not allow returning a void
           expression from a function returning void */
        azrp_subimage_p8_clearbg(x, y, img, left, top, w, h, eff, img->alpha);
        return;
    }

    prof_enter(azrp_perf_cmdgen);
    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    /* Nothing is queued when gint_image_mkcmd() returns false */
    if(gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width,
        azrp_height)) {
        cmd.loop = azrp_image_shader_p8_normal;
        azrp_queue_image(&box, img, &cmd);
    }
    prof_leave(azrp_perf_cmdgen);
}
|
||||
|
||||
/* azrp_image_p8_clearbg(): Whole-image form of azrp_subimage_p8_clearbg()
   Uses a zero origin and the image's own size; @eff and @bg are forwarded. */
void azrp_image_p8_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_p8_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
}
|
||||
|
||||
/* azrp_subimage_p8_clearbg(): Queue part of a P8 image with the CLEARBG loop

   Builds the command with gint_image_mkcmd(); if that succeeds, stores
   @bg_color in color_1, selects gint_image_p8_clearbg as the loop and queues
   the command. Command generation is timed in azrp_perf_cmdgen. */
void azrp_subimage_p8_clearbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    if(!gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width,
        azrp_height)) {
        /* No command was produced: nothing to queue */
        prof_leave(azrp_perf_cmdgen);
        return;
    }

    cmd.loop = gint_image_p8_clearbg;
    cmd.color_1 = bg_color;
    /* NOTE(review): the +4 presumably selects this effect's group of loop
       variants -- confirm against the loop's dispatch */
    cmd.effect += 4;

    azrp_queue_image(&box, img, &cmd);
    prof_leave(azrp_perf_cmdgen);
}
|
|
@ -0,0 +1,26 @@
|
|||
#include <azur/gint/render.h>
|
||||
|
||||
/* azrp_image_p8_dye(): Whole-image form of azrp_subimage_p8_dye()
   Uses a zero origin and the image's own size; @eff and @dye_color are
   forwarded unchanged. */
void azrp_image_p8_dye(int x, int y, image_t const *img, int eff,
    int dye_color)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_p8_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
}
|
||||
|
||||
/* azrp_subimage_p8_dye(): Queue part of a P8 image with the DYE loop

   Builds the command with gint_image_mkcmd(); if that succeeds, stores
   img->alpha in color_1 and @dye_color in color_2, selects gint_image_p8_dye
   as the loop and queues the command. Timed in azrp_perf_cmdgen. */
void azrp_subimage_p8_dye(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int dye_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    if(!gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width,
        azrp_height)) {
        /* No command was produced: nothing to queue */
        prof_leave(azrp_perf_cmdgen);
        return;
    }

    cmd.loop = gint_image_p8_dye;
    cmd.color_1 = img->alpha;
    cmd.color_2 = dye_color;
    /* NOTE(review): the +4 presumably selects this effect's group of loop
       variants -- confirm against the loop's dispatch */
    cmd.effect += 4;

    azrp_queue_image(&box, img, &cmd);
    prof_leave(azrp_perf_cmdgen);
}
|
|
@ -0,0 +1,31 @@
|
|||
#include <azur/gint/render.h>
|
||||
|
||||
/* azrp_subimage_p8_effect(): Variadic front-end for P8 effects

   Tests the color-effect bits of @eff in a fixed order (CLEARBG, SWAPCOLOR,
   ADDBG, DYE) and calls the matching specialized function for the first one
   that is set, reading its extra arguments as ints:
     IMAGE_CLEARBG    one int (background)
     IMAGE_SWAPCOLOR  two ints (source, then replacement)
     IMAGE_ADDBG      one int (background)
     IMAGE_DYE        one int (dye color)
   With none of these bits set, azrp_subimage_p8() is called and no variadic
   argument is read. */
void azrp_subimage_p8_effect(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, ...)
{
    va_list ap;
    va_start(ap, eff);

    if(eff & IMAGE_CLEARBG) {
        int const background = va_arg(ap, int);
        azrp_subimage_p8_clearbg(x, y, img, left, top, w, h, eff, background);
    }
    else if(eff & IMAGE_SWAPCOLOR) {
        /* Two separate statements so the read order is fixed */
        int const source = va_arg(ap, int);
        int const replacement = va_arg(ap, int);
        azrp_subimage_p8_swapcolor(x, y, img, left, top, w, h, eff, source,
            replacement);
    }
    else if(eff & IMAGE_ADDBG) {
        int const background = va_arg(ap, int);
        azrp_subimage_p8_addbg(x, y, img, left, top, w, h, eff, background);
    }
    else if(eff & IMAGE_DYE) {
        int const dye_color = va_arg(ap, int);
        azrp_subimage_p8_dye(x, y, img, left, top, w, h, eff, dye_color);
    }
    else {
        azrp_subimage_p8(x, y, img, left, top, w, h, eff);
    }

    va_end(ap);
}
|
|
@ -0,0 +1,100 @@
|
|||
.global _azrp_image_shader_p8_normal
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P8 Opaque rendering, Azur version: trivial with loop transforms.
|
||||
|
||||
This is fairly straightforward, with no particular tricks; just index the
|
||||
palette as fast as possible in a 2-unrolled 2-stage-pipeline loop that maxes
|
||||
out CPU speed.
|
||||
|
||||
r0: [temporary]
|
||||
r7: Right edge pointer
|
||||
r8: Right edge value
|
||||
r9: Palette
|
||||
r10: [temporary]
|
||||
r11: [temporary]
|
||||
r12: Right edge stride */
|
||||
|
||||
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
|
||||
mov.l @r8+, r9 /* cmd.palette */
|
||||
shlr r2
|
||||
|
||||
mov.w @r8+, r7 /* cmd.edge_2 */
|
||||
mov r2, r0
|
||||
|
||||
mov.l r12, @-r15
|
||||
shll2 r0
|
||||
|
||||
mov.l r10, @-r15
|
||||
shll r7
|
||||
|
||||
mov.l r11, @-r15
|
||||
add r5, r7
|
||||
|
||||
mov r0, r12
|
||||
add r6, r12
|
||||
|
||||
add #-4, r5
|
||||
nop
|
||||
|
||||
add #-2, r4 /* Input stride compensation for pipelining */
|
||||
nop
|
||||
|
||||
.if \HFLIP
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
.endif
|
||||
|
||||
START
|
||||
|
||||
mov.b @r3+, r0
|
||||
nop
|
||||
|
||||
mov.w @r7, r8 /* Save right edge */
|
||||
nop
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
shll r0
|
||||
|
||||
2: mov.b @r3+, \TMP2
|
||||
shll \TMP1
|
||||
|
||||
mov.w @(r0,r9), r0
|
||||
/* Fun fact: omitting this nop slows the loop to 7 cycles/i */
|
||||
nop
|
||||
|
||||
mov.w r0, @(\OFF1,r5)
|
||||
mov \TMP1, r0
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
add #\OUT_DIR, r5
|
||||
|
||||
mov.w @(r0,r9), r0
|
||||
shll \TMP2
|
||||
|
||||
mov.w r0, @(\OFF2,r5)
|
||||
3: mov \TMP2, r0
|
||||
|
||||
mov.w r8, @r7 /* Restore right edge */
|
||||
add r12, r7
|
||||
|
||||
END
|
||||
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r10
|
||||
mov.l @r15+, r12
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
_azrp_image_shader_p8_normal:
|
||||
tst #1, r0
|
||||
bf 9f
|
||||
|
||||
GEN_NORMAL_LOOP 0, 4, r10, r11, 4, 2
|
||||
9: GEN_NORMAL_LOOP 1, -4, r10, r11, 2, 4
|
|
@ -0,0 +1,142 @@
|
|||
.global _azrp_image_shader_p8_swapcolor
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P8 SWAPCOLOR, Azur version: by branchless xor selection.
|
||||
|
||||
This is essentially the same logic as gint's P8 SWAPCOLOR version, but with
|
||||
a 2-unrolled 2-stage-pipeline since the bottleneck on RAM is now on the CPU.
|
||||
|
||||
r0: [temporary]
|
||||
r7: Right edge pointer
|
||||
r8: palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
|
||||
r9: Palette
|
||||
r10: Holds (x ^ y) & -(c == x) during selection
|
||||
r11: cmd.color_1
|
||||
r12: Right edge stride
|
||||
r13: [temporary]
|
||||
r14: [temporary]
|
||||
|
||||
Spilled to stack:
|
||||
@(-4,r15): Right edge value */
|
||||
|
||||
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
|
||||
mov.l @r8+, r9 /* cmd.palette */
|
||||
shlr r2
|
||||
|
||||
mov.w @r8+, r7 /* cmd.edge_2 */
|
||||
mov r2, r0
|
||||
|
||||
mov.l r12, @-r15
|
||||
shll2 r0
|
||||
|
||||
mov.l r11, @-r15
|
||||
shll r7
|
||||
|
||||
mov.w @r8+, r11 /* cmd.color_1 */
|
||||
add r5, r7
|
||||
|
||||
mov.l r10, @-r15
|
||||
add #-4, r5
|
||||
|
||||
mov.l r13, @-r15
|
||||
exts.b r11, r11
|
||||
|
||||
mov r11, r13
|
||||
add r13, r13
|
||||
|
||||
mov.w @r8, r8 /* cmd.color_2 */
|
||||
add r9, r13
|
||||
|
||||
mov r0, r12
|
||||
add r6, r12
|
||||
|
||||
mov.w @r13, r13
|
||||
add #-2, r4 /* Input stride compensation for pipelining */
|
||||
|
||||
mov.l r14, @-r15
|
||||
nop
|
||||
|
||||
xor r13, r8
|
||||
nop
|
||||
|
||||
.if \HFLIP
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
.endif
|
||||
|
||||
START
|
||||
|
||||
mov.b @r3+, \TMP2
|
||||
nop
|
||||
|
||||
mov.w @r7, r0 /* Save right edge */
|
||||
nop
|
||||
|
||||
mov.l r0, @-r15
|
||||
cmp/eq \TMP2, r11
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
add \TMP2, \TMP2
|
||||
|
||||
2: subc r10, r10
|
||||
mov \TMP2, r0
|
||||
|
||||
cmp/eq \TMP1, r11
|
||||
mov.w @(r0, r9), r0
|
||||
|
||||
and r8, r10
|
||||
nop
|
||||
|
||||
xor r10, r0
|
||||
nop
|
||||
|
||||
mov.w r0, @(\OFF1, r5)
|
||||
add #\OUT_DIR, r5
|
||||
|
||||
mov.b @r3+, \TMP2
|
||||
subc r10, r10
|
||||
|
||||
add \TMP1, \TMP1
|
||||
mov \TMP1, r0
|
||||
|
||||
mov.w @(r0, r9), r0
|
||||
cmp/eq \TMP2, r11
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
and r8, r10
|
||||
|
||||
xor r10, r0
|
||||
nop
|
||||
|
||||
mov.w r0, @(\OFF2, r5)
|
||||
3: add \TMP2, \TMP2
|
||||
|
||||
/* TODO: Use x0 as temporary storage by moving the main registers */
|
||||
mov.l @r15+, r0
|
||||
nop
|
||||
|
||||
mov.w r0, @r7 /* Restore right edge */
|
||||
add r12, r7
|
||||
|
||||
END
|
||||
|
||||
mov.l @r15+, r14
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r10
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r12
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
_azrp_image_shader_p8_swapcolor:
|
||||
tst #1, r0
|
||||
bf 9f
|
||||
|
||||
GEN_SWAPCOLOR_LOOP 0, 4, r13, r14, 4, 2
|
||||
9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r14, 2, 4
|
|
@ -0,0 +1,51 @@
|
|||
#include <azur/gint/render.h>
|
||||
|
||||
/* azrp_image_p8_swapcolor(): Whole-image form of azrp_subimage_p8_swapcolor()
   Uses a zero origin and the image's own size. @old_color is forwarded as
   that function's `old_index` parameter and @new_color as `new_color`. */
void azrp_image_p8_swapcolor(int x, int y, image_t const *img, int eff,
    int old_color, int new_color)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_p8_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
        old_color, new_color);
}
|
||||
|
||||
/* azrp_subimage_p8_swapcolor(): Queue part of a P8 image with SWAPCOLOR

   Builds the command with gint_image_mkcmd(); if that succeeds, stores
   @old_index in color_1 and @new_color in color_2, selects
   azrp_image_shader_p8_swapcolor as the loop and queues the command. Timed
   in azrp_perf_cmdgen. */
void azrp_subimage_p8_swapcolor(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int old_index, int new_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    if(!gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width,
        azrp_height)) {
        /* No command was produced: nothing to queue */
        prof_leave(azrp_perf_cmdgen);
        return;
    }

    cmd.loop = azrp_image_shader_p8_swapcolor;
    cmd.color_1 = old_index;
    cmd.color_2 = new_color;
    /* NOTE(review): the +8 presumably selects this effect's group of loop
       variants -- confirm against the loop's dispatch */
    cmd.effect += 8;

    azrp_queue_image(&box, img, &cmd);
    prof_leave(azrp_perf_cmdgen);
}
|
||||
|
||||
/* azrp_image_p8_addbg(): Whole-image form of azrp_subimage_p8_addbg()
   Uses a zero origin and the image's own size; @eff and @bg_color are
   forwarded unchanged. */
void azrp_image_p8_addbg(int x, int y, image_t const *img, int eff,
    int bg_color)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_p8_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
}
|
||||
|
||||
/* azrp_subimage_p8_addbg(): Queue part of a P8 image with ADDBG

   Implemented with the SWAPCOLOR loop: color_1 is img->alpha and color_2 is
   @bg_color. The command is built with gint_image_mkcmd() and queued only if
   that call succeeds. Timed in azrp_perf_cmdgen. */
void azrp_subimage_p8_addbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    if(!gint_image_mkcmd(&box, img, eff, false, true, &cmd, azrp_width,
        azrp_height)) {
        /* No command was produced: nothing to queue */
        prof_leave(azrp_perf_cmdgen);
        return;
    }

    cmd.loop = azrp_image_shader_p8_swapcolor;
    cmd.color_1 = img->alpha;
    cmd.color_2 = bg_color;
    /* NOTE(review): the +8 presumably selects this effect's group of loop
       variants -- confirm against the loop's dispatch */
    cmd.effect += 8;

    azrp_queue_image(&box, img, &cmd);
    prof_leave(azrp_perf_cmdgen);
}
|
|
@ -0,0 +1,71 @@
|
|||
#include <azur/gint/render.h>
|
||||
#include <gint/defs/util.h>
|
||||
|
||||
/* ID of the RGB16 image shader. Starts as -1 (that is, 255 once converted to
   uint8_t) and is overwritten by register_shader() with the value returned
   by azrp_register_shader(). */
uint8_t AZRP_SHADER_IMAGE_RGB16 = -1;
|
||||
|
||||
static void shader_rgb16(void *uniforms, void *command, void *fragment)
|
||||
{
|
||||
struct gint_image_cmd *cmd = (void *)command;
|
||||
cmd->input = gint_image_rgb16_loop((int)uniforms, cmd);
|
||||
cmd->height -= cmd->lines;
|
||||
cmd->lines = min(cmd->height, azrp_frag_height);
|
||||
cmd->output = fragment + cmd->x * 2;
|
||||
}
|
||||
|
||||
__attribute__((constructor))
|
||||
static void register_shader(void)
|
||||
{
|
||||
AZRP_SHADER_IMAGE_RGB16 = azrp_register_shader(shader_rgb16);
|
||||
}
|
||||
|
||||
void azrp_shader_image_rgb16_configure(void)
|
||||
{
|
||||
azrp_set_uniforms(AZRP_SHADER_IMAGE_RGB16, (void *)azrp_width);
|
||||
}
|
||||
|
||||
/* azrp_image_rgb16(): Queue a whole RGB16 image

   Forwards to azrp_subimage_rgb16() with a source rectangle that covers the
   entire image (origin (0, 0), size img->width by img->height). */
void azrp_image_rgb16(int x, int y, image_t const *img, int eff)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_rgb16(x, y, img, 0, 0, full_w, full_h, eff);
}
|
||||
|
||||
/* azrp_subimage_rgb16(): Queue part of an RGB16 image

   Images whose profile is IMAGE_RGB565A are delegated to
   azrp_subimage_rgb16_clearbg() with img->alpha as the background color.
   Other images get a command built by gint_image_mkcmd() from the box
   { x, y, w, h, left, top } and are queued with the plain RGB16 loop.
   Nothing is queued when gint_image_mkcmd() returns false. */
void azrp_subimage_rgb16(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff)
{
    if(img->profile == IMAGE_RGB565A) {
        /* Call, then return: ISO C does not allow "return <expression>;" in
           a function returning void, even if the expression is void. The
           callee does its own profiling, so prof_enter() is not needed */
        azrp_subimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
            img->alpha);
        return;
    }

    prof_enter(azrp_perf_cmdgen);
    struct gint_image_box box = { x, y, w, h, left, top };
    struct gint_image_cmd cmd;

    if(gint_image_mkcmd(&box, img, eff, false, false, &cmd, azrp_width,
        azrp_height)) {
        cmd.loop = azrp_image_shader_rgb16_normal;
        azrp_queue_image(&box, img, &cmd);
    }
    prof_leave(azrp_perf_cmdgen);
}
|
||||
|
||||
/* azrp_image_rgb16_clearbg(): Queue a whole RGB16 image, skipping one color

   Forwards to azrp_subimage_rgb16_clearbg() with a source rectangle that
   covers the entire image (origin (0, 0), size img->width by img->height). */
void azrp_image_rgb16_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_rgb16_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
}
|
||||
|
||||
/* azrp_subimage_rgb16_clearbg(): Queue part of an RGB16 image with CLEARBG

   Builds an image command from the box { x, y, w, h, left, top } with
   gint_image_mkcmd() and, if that call succeeds, queues it with the RGB16
   CLEARBG loop; bg_color is stored in cmd.color_1. Nothing is queued when
   gint_image_mkcmd() returns false. The whole call is accounted to the
   azrp_perf_cmdgen profiling context. */
void azrp_subimage_rgb16_clearbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box region = { x, y, w, h, left, top };
    struct gint_image_cmd command;
    int const drawable = gint_image_mkcmd(&region, img, eff, false, true,
        &command, azrp_width, azrp_height);

    if(drawable) {
        /* NOTE(review): +4 presumably moves the effect code to the CLEARBG
           variants expected by the loop -- confirm against gint */
        command.effect += 4;
        command.loop = azrp_image_shader_rgb16_clearbg;
        command.color_1 = bg_color;
        azrp_queue_image(&region, img, &command);
    }

    prof_leave(azrp_perf_cmdgen);
}
|
|
@ -0,0 +1,135 @@
|
|||
.global _azrp_image_shader_rgb16_clearbg
|
||||
#include "image_macros.S"
|
||||
|
||||
/* RGB16 CLEARBG and DYE, Azur version: by NULL canceling.
|
||||
|
||||
This function handles both CLEARBG and DYE, which happen to work identically
|
||||
on RGB16, save for the fact that the DYE loop ignores the value of opaque
|
||||
pixels and uses the dye color instead. It's one of the standard 2-unrolled
|
||||
2-stage-pipeline loops with a right edge, using NULL canceling for
|
||||
transparency.
|
||||
|
||||
r0: [temporary] (CLEARBG) or dye value (DYE)
|
||||
r7: Right edge pointer
|
||||
r8: Right edge value
|
||||
r9: Background color
|
||||
r10: Nullable output pointer
|
||||
r11: 0 (to neutralize addc during NULL-cancelling)
|
||||
r12: Right edge stride
|
||||
r13: [temporary] (one of the pixels)
|
||||
r14: [temporary] (one of the pixels in DYE)
|
||||
|
||||
The GEN_CLEARBG_DYE_LOOP macro parameters are as follows. All of them except for
|
||||
SRC1 and SRC2 are determined by HFLIP; it's just simpler to set their values
|
||||
on the macro's call site than have .if statements everywhere. This set of
|
||||
parameters is used for virtually all the functions of all the formats.
|
||||
|
||||
SRC1 and SRC2 are used in DYE mode to replace the pixel values read from
|
||||
memory with a constant register.
|
||||
|
||||
HFLIP: Whether to enable HFLIP
|
||||
OUT_DIR: Variation of r5 at each loop, either 4 or -4
|
||||
TMP1: Temporary register for first pixel
|
||||
TMP2: Temporary register for second pixel
|
||||
OFF1: Offset for first pixel write
|
||||
OFF2: Offset for second pixel write
|
||||
SRC1: Source of first write (here either TMP1 or r0)
|
||||
SRC2: Source of second write (here either TMP2 or r0) */
|
||||
|
||||
/* Emits one complete CLEARBG/DYE loop: setup, the loop body placed between
   START and END, register restore, and EPILOGUE. START, END, EPILOGUE and
   mov.wv are not defined in this file (presumably in image_macros.S). */
.macro GEN_CLEARBG_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2, SRC1, SRC2
|
||||
mov.w @r8+, r7 /* cmd.edge_2 */
|
||||
shlr r2
|
||||
|
||||
/* r11, r10, r12, r13 and r14 are pushed in that order, interleaved with the
   setup below, and popped in reverse order before EPILOGUE */
mov.l r11, @-r15
|
||||
mov #0, r11
|
||||
|
||||
mov.w @r8+, r9 /* cmd.color_1 */
|
||||
shll r7
|
||||
|
||||
mov.l r10, @-r15
|
||||
/* r7 = r5 + 2 * cmd.edge_2 */
add r5, r7
|
||||
|
||||
mov.l r12, @-r15
|
||||
add #-2, r5 /* Pre-decrement, see output logic */
|
||||
|
||||
/* r12 = 4 * r2 + r6, where r2 was already halved by the shlr above */
mov r2, r12
|
||||
shll2 r12
|
||||
|
||||
mov.l r13, @-r15
|
||||
add r6, r12
|
||||
|
||||
mov.l r14, @-r15
|
||||
add #-2, r4 /* Input stride compensation for pipelining */
|
||||
|
||||
/* HFLIP only: r5 += 4 * r2 and r6 += 8 * r2, using r0 as scratch (r0 is only
   loaded with cmd.color_2 afterwards) */
.if \HFLIP
|
||||
mov r2, r0
|
||||
shll2 r0
|
||||
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
.endif
|
||||
|
||||
mov.w @r8+, r0 /* cmd.color_2 */
|
||||
nop
|
||||
|
||||
START
|
||||
|
||||
mov.w @r3+, \TMP1
|
||||
nop
|
||||
|
||||
mov.w @r7, r8 /* Save right edge */
|
||||
nop
|
||||
|
||||
cmp/eq \TMP1, r9
|
||||
nop
|
||||
|
||||
/* addc computes r10 = -1 + r11 + T. With r11 = 0 this gives 0 when the
   previous cmp/eq matched (pixel equals r9) and -1 otherwise; the following
   AND with r5 then leaves either 0 or r5 in r10 */
2: mov #-1, r10
|
||||
addc r11, r10
|
||||
|
||||
mov.w @r3+, \TMP2
|
||||
and r5, r10
|
||||
|
||||
add #\OUT_DIR, r5
|
||||
nop
|
||||
|
||||
/* NOTE(review): mov.wv presumably stores the 16-bit value SRC1 at offset OFF1
   from r10, so that a zeroed r10 redirects the write away from the output
   (the "NULL canceling" of the header comment) -- confirm in image_macros.S */
mov.wv \SRC1, \OFF1, r10
|
||||
cmp/eq \TMP2, r9
|
||||
|
||||
mov #-1, r10
|
||||
addc r11, r10
|
||||
|
||||
/* Next iteration's first pixel is loaded and compared here, which is why the
   first load and cmp/eq also appear before label 2: */
mov.w @r3+, \TMP1
|
||||
and r5, r10
|
||||
|
||||
cmp/eq \TMP1, r9
|
||||
3: mov.wv \SRC2, \OFF2, r10
|
||||
|
||||
mov.w r8, @r7 /* Restore right edge */
|
||||
add r12, r7
|
||||
|
||||
END
|
||||
|
||||
mov.l @r15+, r14
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r12
|
||||
mov.l @r15+, r10
|
||||
mov.l @r15+, r11
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
/* The entry point is skipped when AZRP_RGB16_DYE is defined, which is what
   image_rgb16_dye.S does before including this file to reuse the macro. */
#ifndef AZRP_RGB16_DYE
|
||||
|
||||
_azrp_image_shader_rgb16_clearbg:
|
||||
/* tst sets T when bit 0 of r0 is clear; bf branches when T is clear, so the
   HFLIP = 1 instance at 9: runs when bit 0 of r0 is set.
   NOTE(review): r0 presumably holds cmd.effect on entry -- confirm */
tst #1, r0
|
||||
bf 9f
|
||||
|
||||
/* In both instances SRC1/SRC2 are the same registers as TMP1/TMP2, so the
   value stored is the pixel that was read. The register paired with offset 2
   is r0 in both cases */
GEN_CLEARBG_DYE_LOOP 0, 4, r0, r13, 2, 0, r0, r13
|
||||
9: GEN_CLEARBG_DYE_LOOP 1, -4, r13, r0, 0, 2, r13, r0
|
||||
|
||||
#endif
|
|
@ -0,0 +1,12 @@
|
|||
.global _azrp_image_shader_rgb16_dye
|
||||
#define AZRP_RGB16_DYE
|
||||
#include "image_rgb16_clearbg.S"
|
||||
|
||||
/* See image_rgb16_clearbg.S for details on this function. */
|
||||
|
||||
/* Entry point of the RGB16 DYE loop, built from the GEN_CLEARBG_DYE_LOOP
   macro of image_rgb16_clearbg.S. */
_azrp_image_shader_rgb16_dye:
|
||||
/* tst sets T when bit 0 of r0 is clear; bf branches when T is clear, so the
   HFLIP = 1 instance at 9: runs when bit 0 of r0 is set.
   NOTE(review): r0 presumably holds cmd.effect on entry -- confirm */
tst #1, r0
|
||||
bf 9f
|
||||
|
||||
/* SRC1 = SRC2 = r0: every store writes r0, which the macro loads with
   cmd.color_2 before START. Pixels are read into r13/r14 only to be compared
   with r9, so r0 is left untouched by the loop body */
GEN_CLEARBG_DYE_LOOP 0, 4, r14, r13, 2, 0, r0, r0
|
||||
9: GEN_CLEARBG_DYE_LOOP 1, -4, r13, r14, 0, 2, r0, r0
|
|
@ -0,0 +1,26 @@
|
|||
#include <azur/gint/render.h>
|
||||
|
||||
/* azrp_image_rgb16_dye(): Queue a whole RGB16 image with a dye color

   Forwards to azrp_subimage_rgb16_dye() with a source rectangle that covers
   the entire image (origin (0, 0), size img->width by img->height). */
void azrp_image_rgb16_dye(int x, int y, image_t const *img, int eff,
    int dye_color)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_rgb16_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
}
|
||||
|
||||
/* azrp_subimage_rgb16_dye(): Queue part of an RGB16 image with a dye color

   Builds an image command from the box { x, y, w, h, left, top } with
   gint_image_mkcmd() and, if that call succeeds, queues it with the RGB16
   DYE loop. img->alpha is stored in cmd.color_1 and dye_color in
   cmd.color_2. Nothing is queued when gint_image_mkcmd() returns false.
   The whole call is accounted to the azrp_perf_cmdgen profiling context. */
void azrp_subimage_rgb16_dye(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int dye_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box region = { x, y, w, h, left, top };
    struct gint_image_cmd command;
    int const drawable = gint_image_mkcmd(&region, img, eff, false, true,
        &command, azrp_width, azrp_height);

    if(drawable) {
        /* NOTE(review): +12 presumably moves the effect code to the DYE
           variants expected by the loop -- confirm against gint */
        command.effect += 12;
        command.loop = azrp_image_shader_rgb16_dye;
        command.color_1 = img->alpha;
        command.color_2 = dye_color;
        azrp_queue_image(&region, img, &command);
    }

    prof_leave(azrp_perf_cmdgen);
}
|
|
@ -0,0 +1,31 @@
|
|||
#include <azur/gint/render.h>
|
||||
|
||||
/* azrp_subimage_rgb16_effect(): Queue part of an RGB16 image, effect chosen
   at runtime

   Dispatches to the dedicated function for the color effect requested in
   eff, reading that effect's parameters from the variadic arguments:

     IMAGE_CLEARBG:    int bg
     IMAGE_SWAPCOLOR:  int old_color, int new_color
     IMAGE_ADDBG:      int bg
     IMAGE_DYE:        int dye
     none of these:    no extra argument, plain azrp_subimage_rgb16()

   The flags are tested in the order listed above and only the first match is
   applied. */
void azrp_subimage_rgb16_effect(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, ...)
{
    va_list extra;
    va_start(extra, eff);

    if(eff & IMAGE_CLEARBG) {
        int const background = va_arg(extra, int);
        azrp_subimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
            background);
    }
    else if(eff & IMAGE_SWAPCOLOR) {
        /* Two separate statements so that the arguments are read in order */
        int const from = va_arg(extra, int);
        int const to = va_arg(extra, int);
        azrp_subimage_rgb16_swapcolor(x, y, img, left, top, w, h, eff,
            from, to);
    }
    else if(eff & IMAGE_ADDBG) {
        int const background = va_arg(extra, int);
        azrp_subimage_rgb16_addbg(x, y, img, left, top, w, h, eff,
            background);
    }
    else if(eff & IMAGE_DYE) {
        int const dye_color = va_arg(extra, int);
        azrp_subimage_rgb16_dye(x, y, img, left, top, w, h, eff, dye_color);
    }
    else {
        azrp_subimage_rgb16(x, y, img, left, top, w, h, eff);
    }

    va_end(extra);
}
|
|
@ -0,0 +1,124 @@
|
|||
.global _azrp_image_shader_rgb16_normal
|
||||
#include "image_macros.S"
|
||||
|
||||
/* RGB16 Opaque rendering, Azur version: by straightforward copy.
|
||||
|
||||
This function of the image renderer is designed for Azur's streaming model
|
||||
only. Unlike its RAM-model counterpart which is bottlenecked by its writing
|
||||
speed, this function is entirely limited by the CPU's ability to output the
|
||||
data in the required format.
|
||||
|
||||
In the simple case where there is no color effect and no HFLIP, the task of
|
||||
rendering a 16-bit opaque image boils down to a 2-dimensional memcpy. This
|
||||
task can be optimized by moving longwords if the source and destination are
|
||||
co-4-aligned, with four variations depending on the width and initial
|
||||
position, identified by the following parameters:
|
||||
|
||||
* w1 / w2 denotes the parity of the command width;
|
||||
* o2 / o4 denotes the alignment of the output.
|
||||
|
||||
It is easy to see that when input and output are not co-aligned, any attempt
|
||||
to combine two word reads into a single long write requires at least 3
|
||||
cycles per 2 pixels and needs parallelism over several pixels to not get
|
||||
immediately shut down by the LS-to-EX delay. Here we decide to naively copy
|
||||
by words, which achieves 4 cycles per 2 pixels, mainly because large RGB16
|
||||
images are very quickly bottlenecked in reading by their own size anyway.
|
||||
|
||||
The HFLIP version also needs to rearrange pixels, and is thus performed with
|
||||
word-based copies in all situations, which is a straightforward process. */
|
||||
|
||||
/* Entry point: selects one of the copy routines below, then falls through to
   _FORWARD_WORD_COPY when no branch is taken.
   NOTE(review): on entry r0 presumably holds cmd.effect (bit 0 = HFLIP), r2
   the width, r3 the input pointer and r5 the output pointer -- confirm
   against the caller. START, END and EPILOGUE are not defined in this file
   (presumably in image_macros.S). */
_azrp_image_shader_rgb16_normal:
|
||||
/* Not a single cycle */
|
||||
/* tst sets T when bit 0 of r0 is clear; bf branches when T is clear, so the
   backward copy is used when bit 0 is set */
tst #1, r0
|
||||
bf _BACKWARD_WORD_COPY
|
||||
|
||||
mov #8, r0 /* Use the naive method for width ≤ 8 */
|
||||
/* T = (r0 >= r2), signed, that is 8 >= r2 */
cmp/ge r2, r0
|
||||
|
||||
/* bt.s has a delay slot, filled by the nop */
bt.s _FORWARD_WORD_COPY
|
||||
nop
|
||||
|
||||
mov r5, r0 /* Check if r3 and r5 are co-aligned */
|
||||
xor r3, r0
|
||||
|
||||
/* Not a single cycle */
|
||||
/* T is set when r3 and r5 agree on bit 1; otherwise fall through to the
   word-by-word copy */
tst #2, r0
|
||||
bt _FORWARD_LONG_COPY
|
||||
|
||||
/* Word-by-word copy from @r3+ to @r5+ through x0 */
_FORWARD_WORD_COPY:
|
||||
START
|
||||
2: movs.w @r3+, x0
|
||||
3: movs.w x0, @r5+
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Longword copy, reached when r3 and r5 agree on bit 1. Dispatches on the
   parity of r2 and on bit 1 of r3 to one of four variants; this part holds
   the dispatch and the two even-width variants.
   NOTE(review): the instructions between labels 2: and 3: are presumably the
   part that START/END repeat, with the surrounding word copies executed once
   per row -- confirm in image_macros.S. */
_FORWARD_LONG_COPY:
|
||||
/* shlr halves r2 and moves its low bit into T */
shlr r2 /* Test width parity */
|
||||
mov #2, r0
|
||||
|
||||
bt .w1
|
||||
nop
|
||||
|
||||
/* Even width. T is set when bit 1 of r3 is clear; bf branches when it is set */
.w2: tst r0, r3 /* Test alignment of input */
|
||||
bf .w2d2
|
||||
|
||||
/* Even width, bit 1 of r3 clear: longwords only */
.w2d4: START
|
||||
2: movs.l @r3+, x0
|
||||
3: movs.l x0, @r5+
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Even width, bit 1 of r3 set: one word, then longwords, then one word. r2
   (already halved) is decremented by one to account for the two words */
.w2d2: add #-1, r2
|
||||
nop
|
||||
|
||||
START
|
||||
movs.w @r3+, x0
|
||||
movs.w x0, @r5+
|
||||
|
||||
2: movs.l @r3+, x0
|
||||
3: movs.l x0, @r5+
|
||||
|
||||
movs.w @r3+, x0
|
||||
movs.w x0, @r5+
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Odd-width variants of the longword copy. Reached from _FORWARD_LONG_COPY
   with r0 = 2 and r2 already halved by shlr.
   NOTE(review): the instructions between labels 2: and 3: are presumably the
   part that START/END repeat, with the extra word copy executed once per
   row -- confirm in image_macros.S. */
/* T is set when bit 1 of r3 is clear; bf branches when it is set */
.w1: tst r0, r3 /* Test alignment of input */
|
||||
bf .w1d2
|
||||
|
||||
/* Odd width, bit 1 of r3 clear: longwords first, then one trailing word */
.w1d4: START
|
||||
2: movs.l @r3+, x0
|
||||
3: movs.l x0, @r5+
|
||||
|
||||
movs.w @r3+, x0
|
||||
movs.w x0, @r5+
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Odd width, bit 1 of r3 set: one leading word, then longwords */
.w1d2: START
|
||||
movs.w @r3+, x0
|
||||
movs.w x0, @r5+
|
||||
|
||||
2: movs.l @r3+, x0
|
||||
3: movs.l x0, @r5+
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Word-by-word copy with reversed output order, selected by the entry point
   when bit 0 of r0 is set. The input is read forwards (@r3+) while the output
   is written backwards (@-r5). */
_BACKWARD_WORD_COPY:
|
||||
/* r5 += 2 * r2, then r6 += 4 * r2, using r0 as scratch */
mov r2, r0
|
||||
shll r0
|
||||
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
|
||||
START
|
||||
2: movs.w @r3+, x0
|
||||
3: movs.w x0, @-r5
|
||||
END
|
||||
EPILOGUE
|
|
@ -0,0 +1,116 @@
|
|||
.global _azrp_image_shader_rgb16_swapcolor
|
||||
#include "image_macros.S"
|
||||
|
||||
/* RGB16 SWAPCOLOR, Azur version: by branchless xor selection.
|
||||
|
||||
The xor selection is explained in gint's version of P8 SWAPCOLOR. This
|
||||
version's selection is slightly simpler because we don't have to index the
|
||||
palette to find the source color. We use a 2-unrolled 2-stage-pipeline loop
|
||||
to optimize for CPU speed.
|
||||
|
||||
r7: Right edge pointer
|
||||
r8: Right edge value
|
||||
r9: cmd.color_1
|
||||
r10: Holds (x ^ y) & -(c == x) during selection
|
||||
r11: cmd.color_1 ^ cmd.color_2 (ie. x ^ y)
|
||||
r12: Right edge stride
|
||||
r13: [temporary] */
|
||||
|
||||
/* Emits one complete RGB16 SWAPCOLOR loop: setup, the loop body placed
   between START and END, register restore, and EPILOGUE. START, END, EPILOGUE
   and mov.wv are not defined in this file (presumably in image_macros.S). */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
|
||||
mov.w @r8+, r7 /* cmd.edge_2 */
|
||||
shlr r2
|
||||
|
||||
/* r11, r10, r12 and r13 are pushed in that order, interleaved with the setup
   below, and popped in reverse order before EPILOGUE */
mov.l r11, @-r15
|
||||
add #-2, r4 /* Input stride compensation for pipelining */
|
||||
|
||||
mov.w @r8+, r9 /* cmd.color_1 */
|
||||
shll r7
|
||||
|
||||
mov.l r10, @-r15
|
||||
/* r7 = r5 + 2 * cmd.edge_2 */
add r5, r7
|
||||
|
||||
mov.l r12, @-r15
|
||||
add #-2, r5 /* Predecrement, see output logic */
|
||||
|
||||
mov.w @r8+, r11 /* cmd.color_2 */
|
||||
/* r12 = 4 * r2 + r6, where r2 was already halved by the shlr above */
mov r2, r12
|
||||
|
||||
mov.l r13, @-r15
|
||||
shll2 r12
|
||||
|
||||
add r6, r12
|
||||
nop
|
||||
|
||||
/* r11 = cmd.color_1 ^ cmd.color_2 */
xor r9, r11
|
||||
nop
|
||||
|
||||
/* HFLIP only: r5 += 4 * r2 and r6 += 8 * r2, using r0 as scratch */
.if \HFLIP
|
||||
mov r2, r0
|
||||
shll2 r0
|
||||
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
.endif
|
||||
|
||||
START
|
||||
|
||||
mov.w @r3+, \TMP1
|
||||
nop
|
||||
|
||||
mov.w @r7, r8 /* Save right edge */
|
||||
nop
|
||||
|
||||
cmp/eq \TMP1, r9
|
||||
nop
|
||||
|
||||
/* subc computes r10 = r10 - r10 - T, that is -1 when the previous cmp/eq
   matched and 0 otherwise. ANDing with r11 and XORing into the pixel then
   turns a pixel equal to cmd.color_1 into cmd.color_2 and leaves any other
   pixel unchanged */
2: subc r10, r10
|
||||
nop
|
||||
|
||||
and r11, r10
|
||||
mov.w @r3+, \TMP2
|
||||
|
||||
xor r10, \TMP1
|
||||
nop
|
||||
|
||||
/* NOTE(review): mov.wv presumably stores the 16-bit value TMP1 at offset OFF1
   from r5 -- confirm in image_macros.S */
mov.wv \TMP1 \OFF1 r5
|
||||
cmp/eq \TMP2, r9
|
||||
|
||||
add #\OUT_DIR, r5
|
||||
nop
|
||||
|
||||
subc r10, r10
|
||||
nop
|
||||
|
||||
/* Next iteration's first pixel is loaded and compared here, which is why the
   first load and cmp/eq also appear before label 2: */
and r11, r10
|
||||
mov.w @r3+, \TMP1
|
||||
|
||||
xor r10, \TMP2
|
||||
nop
|
||||
|
||||
cmp/eq \TMP1, r9
|
||||
3: mov.wv \TMP2 \OFF2 r5
|
||||
|
||||
mov.w r8, @r7 /* Restore right edge */
|
||||
add r12, r7
|
||||
|
||||
END
|
||||
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r12
|
||||
mov.l @r15+, r10
|
||||
mov.l @r15+, r11
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
/* Entry point of the RGB16 SWAPCOLOR loop. Two instances of
   GEN_SWAPCOLOR_LOOP are emitted back to back and bit 0 of r0 chooses
   between them. */
_azrp_image_shader_rgb16_swapcolor:
|
||||
/* tst sets T when bit 0 of r0 is clear; bf branches when T is clear, so the
   HFLIP = 1 instance at 9: runs when bit 0 of r0 is set.
   NOTE(review): r0 presumably holds cmd.effect on entry -- confirm */
tst #1, r0
|
||||
bf 9f
|
||||
|
||||
/* The register paired with offset 2 is r0 in both instances, while r13 is
   paired with offset 0 */
GEN_SWAPCOLOR_LOOP 0, 4, r0, r13, 2, 0
|
||||
9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r0, 0, 2
|
|
@ -0,0 +1,51 @@
|
|||
#include <azur/gint/render.h>
|
||||
|
||||
/* azrp_image_rgb16_swapcolor(): Queue a whole RGB16 image with a color swap

   Forwards to azrp_subimage_rgb16_swapcolor() with a source rectangle that
   covers the entire image (origin (0, 0), size img->width by img->height). */
void azrp_image_rgb16_swapcolor(int x, int y, image_t const *img, int eff,
    int old_color, int new_color)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_rgb16_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
        old_color, new_color);
}
|
||||
|
||||
/* azrp_subimage_rgb16_swapcolor(): Queue part of an RGB16 image with a color
   swap

   Builds an image command from the box { x, y, w, h, left, top } with
   gint_image_mkcmd() and, if that call succeeds, queues it with the RGB16
   SWAPCOLOR loop. old_color goes to cmd.color_1 and new_color to
   cmd.color_2. Nothing is queued when gint_image_mkcmd() returns false.
   The whole call is accounted to the azrp_perf_cmdgen profiling context. */
void azrp_subimage_rgb16_swapcolor(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int old_color, int new_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box region = { x, y, w, h, left, top };
    struct gint_image_cmd command;
    int const drawable = gint_image_mkcmd(&region, img, eff, false, true,
        &command, azrp_width, azrp_height);

    if(drawable) {
        /* NOTE(review): +8 presumably moves the effect code to the SWAPCOLOR
           variants expected by the loop -- confirm against gint */
        command.effect += 8;
        command.loop = azrp_image_shader_rgb16_swapcolor;
        command.color_1 = old_color;
        command.color_2 = new_color;
        azrp_queue_image(&region, img, &command);
    }

    prof_leave(azrp_perf_cmdgen);
}
|
||||
|
||||
/* azrp_image_rgb16_addbg(): Queue a whole RGB16 image with an added
   background

   Forwards to azrp_subimage_rgb16_addbg() with a source rectangle that
   covers the entire image (origin (0, 0), size img->width by img->height). */
void azrp_image_rgb16_addbg(int x, int y, image_t const *img, int eff,
    int bg_color)
{
    int const full_w = img->width;
    int const full_h = img->height;

    azrp_subimage_rgb16_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
}
|
||||
|
||||
/* azrp_subimage_rgb16_addbg(): Queue part of an RGB16 image with an added
   background

   ADDBG reuses the RGB16 SWAPCOLOR loop: the command is set up to swap
   img->alpha (stored in cmd.color_1) for bg_color (stored in cmd.color_2).
   Nothing is queued when gint_image_mkcmd() returns false. The whole call is
   accounted to the azrp_perf_cmdgen profiling context. */
void azrp_subimage_rgb16_addbg(int x, int y, image_t const *img,
    int left, int top, int w, int h, int eff, int bg_color)
{
    prof_enter(azrp_perf_cmdgen);

    struct gint_image_box region = { x, y, w, h, left, top };
    struct gint_image_cmd command;
    int const drawable = gint_image_mkcmd(&region, img, eff, false, true,
        &command, azrp_width, azrp_height);

    if(drawable) {
        /* NOTE(review): +8 presumably moves the effect code to the SWAPCOLOR
           variants expected by the loop -- confirm against gint */
        command.effect += 8;
        command.loop = azrp_image_shader_rgb16_swapcolor;
        command.color_1 = img->alpha;
        command.color_2 = bg_color;
        azrp_queue_image(&region, img, &command);
    }

    prof_leave(azrp_perf_cmdgen);
}
|
Loading…
Reference in New Issue