azur: progress on tex2d as bopti, custom command sorter
This commit is contained in:
parent
33e6a44578
commit
0fec6da1c4
|
@ -113,6 +113,8 @@ extern int azrp_width, azrp_height;
|
|||
extern int azrp_frag_count;
|
||||
/* Offset of first fragment. */
|
||||
extern int azrp_frag_offset;
|
||||
/* Height of fragments. */
|
||||
extern int azrp_frag_height;
|
||||
|
||||
/* azrp_config_scale(): Select the renderer's super-scaling factor
|
||||
|
||||
|
@ -178,7 +180,11 @@ extern uint8_t AZRP_SHADER_TEX2D;
|
|||
void azrp_clear(uint16_t color);
|
||||
|
||||
/* azrp_image(): Queue image command [AZRP_SHADER_TEX2D] */
|
||||
void azrp_image(int x, int y, uint16_t *pixels, int w, int h, int stride);
|
||||
void azrp_image(int x, int y, bopti_image_t const *image);
|
||||
|
||||
/* azrp_subimage(): Queue image subsection command [AZRP_SHADER_TEX2D] */
|
||||
void azrp_subimage(int x, int y, bopti_image_t const *image,
|
||||
int left, int top, int width, int height, int flags);
|
||||
|
||||
/* Functions to update uniforms for these shaders. You should call them when:
|
||||
* AZRP_SHADER_CLEAR: Changing super-scaling settings.
|
||||
|
@ -254,15 +260,14 @@ struct azrp_shader_tex2d_command {
|
|||
uint8_t fragment_id;
|
||||
/* Pixels per line */
|
||||
int16_t columns;
|
||||
/* Already offset by start row and column */
|
||||
void *input;
|
||||
/* Address of the image structure */
|
||||
bopti_image_t const *image;
|
||||
/* Destination in XRAM (offset) */
|
||||
uint16_t output;
|
||||
/* Number of lines */
|
||||
int16_t lines;
|
||||
/* Distance between two lines (columns excluded) */
|
||||
int16_t stride;
|
||||
|
||||
} GPACKED(2);
|
||||
/* Already offset by start row and column */
|
||||
void const *input;
|
||||
};
|
||||
|
||||
AZUR_END_DECLS
|
||||
|
|
|
@ -17,6 +17,10 @@ int azrp_width, azrp_height;
|
|||
/* Offset of first fragment for alignment, and number of fragments. */
|
||||
int azrp_frag_offset;
|
||||
int azrp_frag_count;
|
||||
/* Height of fragment. */
|
||||
int azrp_frag_height;
|
||||
|
||||
/* TODO: Either make command queue private or use azrp_ prefix */
|
||||
|
||||
/* Number and total size of queued commands. */
|
||||
GXRAM int commands_count = 0, commands_length = 0;
|
||||
|
@ -48,26 +52,45 @@ void azrp_clear_commands(void)
|
|||
commands_length = 0;
|
||||
}
|
||||
|
||||
static int compare_commands(void const *c1, void const *c2)
|
||||
/* Custom quick sort for commands */
|
||||
|
||||
static inline int compare(int8_t *c1, int8_t *c2)
|
||||
{
|
||||
uint16_t offset1 = *(uint16_t *)c1;
|
||||
uint16_t offset2 = *(uint16_t *)c2;
|
||||
int d = c1[1] - c2[1];
|
||||
return (d ? d : c1 - c2);
|
||||
}
|
||||
|
||||
uint8_t *ptr1 = (uint8_t *)(0xe5017000 + offset1);
|
||||
uint8_t *ptr2 = (uint8_t *)(0xe5017000 + offset2);
|
||||
static void cmdsort(int low, int high)
|
||||
{
|
||||
if(low >= high) return;
|
||||
|
||||
int diff_fragments = (int)ptr1[1] - (int)ptr2[1];
|
||||
if(diff_fragments) return diff_fragments;
|
||||
int8_t *pivot = YRAM + commands_array[(low + high) >> 1];
|
||||
|
||||
return (int)offset1 - (int)offset2;
|
||||
int i = low - 1;
|
||||
int j = high + 1;
|
||||
|
||||
while(1) {
|
||||
do i++;
|
||||
while(compare(YRAM + commands_array[i], pivot) < 0);
|
||||
|
||||
do j--;
|
||||
while(compare(YRAM + commands_array[j], pivot) > 0);
|
||||
|
||||
if(i >= j) break;
|
||||
|
||||
uint16_t tmp = commands_array[i];
|
||||
commands_array[i] = commands_array[j];
|
||||
commands_array[j] = tmp;
|
||||
}
|
||||
|
||||
cmdsort(low, j);
|
||||
cmdsort(j+1, high);
|
||||
}
|
||||
|
||||
void azrp_sort_commands(void)
|
||||
{
|
||||
prof_enter(azrp_perf_sort);
|
||||
/* TODO: azrp_sort_commands: Use a custom sorter */
|
||||
qsort(commands_array, commands_count, sizeof commands_array[0],
|
||||
compare_commands);
|
||||
cmdsort(0, commands_count - 1);
|
||||
prof_leave(azrp_perf_sort);
|
||||
}
|
||||
|
||||
|
@ -95,6 +118,7 @@ void azrp_render_fragments(void)
|
|||
}
|
||||
else {
|
||||
prof_enter(azrp_perf_r61524);
|
||||
/* TODO: Consider xram_frame() by DMA in parallel? */
|
||||
xram_frame(azrp_frag, 396 * 8);
|
||||
prof_leave(azrp_perf_r61524);
|
||||
frag++;
|
||||
|
@ -129,11 +153,11 @@ static void update_frag_count(void)
|
|||
static void update_size(void)
|
||||
{
|
||||
if(azrp_scale == 1)
|
||||
azrp_width = 396, azrp_height = 198;
|
||||
azrp_width = 396, azrp_height = 198, azrp_frag_height = 8;
|
||||
else if(azrp_scale == 2)
|
||||
azrp_width = 198, azrp_height = 112;
|
||||
azrp_width = 198, azrp_height = 112, azrp_frag_height = 16;
|
||||
else if(azrp_scale == 3)
|
||||
azrp_width = 132, azrp_height = 75;
|
||||
azrp_width = 132, azrp_height = 75, azrp_frag_height = 16;
|
||||
}
|
||||
|
||||
void azrp_config_scale(int scale)
|
||||
|
|
|
@ -11,15 +11,7 @@ static void register_shader(void)
|
|||
|
||||
void azrp_shader_clear_configure(void)
|
||||
{
|
||||
int longs_in_fragment = 0;
|
||||
|
||||
if(azrp_scale == 1)
|
||||
longs_in_fragment = (396 * 2) * 8 / 4;
|
||||
else if(azrp_scale == 2)
|
||||
longs_in_fragment = (198 * 2) * 16 / 4;
|
||||
else if(azrp_scale == 3)
|
||||
longs_in_fragment = (132 * 2) * 16 / 4;
|
||||
|
||||
int longs_in_fragment = (azrp_width * azrp_frag_height / 2);
|
||||
azrp_set_uniforms(AZRP_SHADER_CLEAR, (void *)longs_in_fragment);
|
||||
}
|
||||
|
||||
|
@ -35,6 +27,8 @@ struct command {
|
|||
|
||||
void azrp_clear(uint16_t color)
|
||||
{
|
||||
prof_enter(azrp_perf_cmdgen);
|
||||
|
||||
struct command cmd;
|
||||
cmd.shader_id = AZRP_SHADER_CLEAR;
|
||||
cmd.color = color;
|
||||
|
@ -43,4 +37,6 @@ void azrp_clear(uint16_t color)
|
|||
cmd.fragment_id = i;
|
||||
azrp_queue_command(&cmd, sizeof cmd);
|
||||
}
|
||||
|
||||
prof_leave(azrp_perf_cmdgen);
|
||||
}
|
||||
|
|
|
@ -1,58 +1,70 @@
|
|||
.global _azrp_shader_tex2d
|
||||
.align 4
|
||||
|
||||
/* Profile values from bopti */
|
||||
#define PX_RGB565 0
|
||||
#define PX_RGB565A 1
|
||||
#define PX_P8 2
|
||||
#define PX_P4 3
|
||||
|
||||
/* Register assignment
|
||||
r0: (temporary)
|
||||
r1: Lines
|
||||
r2: Output
|
||||
r3: Input
|
||||
r4: Output stride (initially uniform: azrp_width*2)
|
||||
r5: Command queue; (temporary)
|
||||
r6: (temporary) (initially azrp_frag)
|
||||
r4: [parameter] azrp_width*2; output stride
|
||||
r5: [parameter] Command queue; (temporary)
|
||||
r6: [parameter] azrp_frag; (temporary)
|
||||
r7: Columns
|
||||
r8: Input stride */
|
||||
r8: Input stride
|
||||
r9: Image profile */
|
||||
_azrp_shader_tex2d:
|
||||
mov.l r8, @-r15
|
||||
add #2, r5
|
||||
|
||||
mov.w @r5+, r7 /* Columns */
|
||||
mov.l r9, @-r15
|
||||
|
||||
mov.l r8, @-r15
|
||||
mov.w @r5+, r7 /* command.columns */
|
||||
|
||||
mov.w @r5+, r0 /* Input (1/2) */
|
||||
mov.l @r5+, r8 /* command.image */
|
||||
|
||||
mov.w @r5+, r2 /* command.output (offset) */
|
||||
sub r7, r4
|
||||
|
||||
mov.w @r5+, r3 /* Input (2/2) */
|
||||
mov.w @r5+, r1 /* command.lines */
|
||||
sub r7, r4
|
||||
|
||||
mov.w @r5+, r2 /* Output offset */
|
||||
|
||||
mov.w @r5+, r1 /* Lines */
|
||||
shll16 r3
|
||||
|
||||
xtrct r0, r3
|
||||
|
||||
mov.w @r5+, r8 /* Input stride */
|
||||
mov #8, r0 /* Maximum width for naive method */
|
||||
|
||||
mov.w @r8+, r0 /* image.profile */
|
||||
add r6, r2
|
||||
cmp/ge r7, r0
|
||||
|
||||
bt.s .naive
|
||||
mov #2, r0
|
||||
mov.w @r8+, r6 /* image.alpha */
|
||||
cmp/eq #PX_P4, r0
|
||||
|
||||
/* The following variations are named based on the parity of each parameter:
|
||||
* w[eo] (width even, width odd)
|
||||
* d[eo] (data even, data odd)
|
||||
where even/odd means 4-aligned/2-aligned in terms of pointers.
|
||||
mov.w @r8, r8 /* image.width */
|
||||
|
||||
When the destination and source have identical parity, the copy is pretty
|
||||
direct and takes 2 cycles to copy 4 bytes. When they have opposite parity
|
||||
however, longwords need to be rearranged, which is a problem: arithmetic
|
||||
operations under a RAW dependency take 3 cycles, so there's no way to
|
||||
complete the 4-byte copy in less than 4 cycles unless iterations are opened
|
||||
and weaved, which would add too many sub-cases. So in this case the naive
|
||||
method that copies 4 bytes in 4 cycles is used. A very heavy image renderer
|
||||
like a tileset shader should consider the optimized route though. */
|
||||
mov.l @r5+, r3 /* command.input (pointer) */
|
||||
|
||||
sub r7, r8
|
||||
|
||||
bt.s .format_P4
|
||||
shll r8
|
||||
|
||||
cmp/eq #PX_P8, r0
|
||||
|
||||
bt .format_P8
|
||||
cmp/eq #PX_RGB565A, r0
|
||||
|
||||
bt .format_RGB565A
|
||||
|
||||
/* Default below is .format_RGB565 */
|
||||
|
||||
/* [Loop macros]
|
||||
|
||||
The following macros implement the main loop of the image renderer.
|
||||
* Each line is rendered in the tight loop between 2: and 3: (both included).
|
||||
* r2 is the output (with stride r4, in bytes)
|
||||
* r3 is the input (with stride r8, in bytes)
|
||||
* There are r1 rows with r7 iterations each */
|
||||
|
||||
#define TEX2D_START() \
|
||||
ldrs 2f; \
|
||||
|
@ -66,10 +78,41 @@ _azrp_shader_tex2d:
|
|||
bf.s 1b; \
|
||||
add r8, r3; \
|
||||
\
|
||||
mov.l @r15+, r9; \
|
||||
rts; \
|
||||
mov.l @r15+, r8
|
||||
|
||||
.case_analysis:
|
||||
/* [Rendering strategy for the RGB565 format]
|
||||
|
||||
In RGB565, all pixels are copied verbatim. This is a 2D memcpy, which we can
|
||||
optimize by moving longwords. Since longwords are pairs of pixels, there are
|
||||
variations and subcases based on the parity of each parameter:
|
||||
|
||||
* w[eo] denotes whether the width of the image is even or odd;
|
||||
* d[eo] denotes whether the memory accesses to the source and destination
|
||||
are even (4-aligned) or odd (2-aligned).
|
||||
|
||||
When the destination and source have identical parity, the d[eo] variation
|
||||
can be defined. In this case the copy is pretty direct, it's a longword copy
|
||||
and it takes 2 cycles to copy 4 bytes, plus some extra at the edges if the
|
||||
start or end address is 2-aligned.
|
||||
|
||||
However, when they have opposite parity, each longword read matches up with
|
||||
a 2-aligned write (or vice-versa). Rearranging words with arithmetic does
|
||||
not help because of the stall cycle between loading a register and using it
|
||||
in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as
|
||||
the word-based copy). Weaving iterations could help but would be too complex
|
||||
here (adding sub-cases); a super-heavy renderer with more hypotheses (like a
|
||||
tileset shader) should aim for that route though. Also, movua.l followed by
|
||||
mov.l is even slower (5 cycles). */
|
||||
|
||||
.format_RGB565:
|
||||
mov #8, r0 /* Maximum width for naive method */
|
||||
cmp/ge r7, r0
|
||||
|
||||
bt.s .naive
|
||||
mov #2, r0
|
||||
|
||||
/* Use naive method for opposite source/destination parity */
|
||||
mov r2, r6
|
||||
xor r3, r6
|
||||
|
@ -131,3 +174,37 @@ _azrp_shader_tex2d:
|
|||
2: movs.w @r3+, x0
|
||||
3: movs.w x0, @r2+
|
||||
TEX2D_END()
|
||||
|
||||
/* [Rendering strategy for the RGB565A format]
|
||||
|
||||
Since we have to check for the alpha value in each pixel, there's really no
|
||||
longword-based optimization. Instead, we just go as fast as possible with
|
||||
each pixel, using DSP instructions. Branchless jump is pretty useful.
|
||||
|
||||
TODO: Opening iterations will definitely save at least 1 cycle per pixel; it
|
||||
just requires a subcase for extremely small images (width = 1). */
|
||||
|
||||
.format_RGB565A:
|
||||
mov r2, r5
|
||||
|
||||
TEX2D_START()
|
||||
/* In the comparison, DC=1 if x0 == image.alpha */
|
||||
2: movs.w @r3+, x0
|
||||
pcmp x0, y0 movx.w @r5, x1
|
||||
dct pcopy x1, x0
|
||||
3: movx.w x0, @r5+
|
||||
TEX2D_END()
|
||||
|
||||
/* [Rendering strategy for the P8 format] */
|
||||
.format_P8:
|
||||
TEX2D_START()
|
||||
2:
|
||||
3:
|
||||
TEX2D_END()
|
||||
|
||||
/* [Rendering strategy for the P4 format] */
|
||||
.format_P4:
|
||||
TEX2D_START()
|
||||
2:
|
||||
3:
|
||||
TEX2D_END()
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#include <azur/gint/render.h>
|
||||
#include <gint/defs/util.h>
|
||||
|
||||
uint8_t AZRP_SHADER_TEX2D = -1;
|
||||
|
||||
|
@ -15,3 +16,54 @@ void azrp_shader_tex2d_configure(void)
|
|||
}
|
||||
|
||||
//---
|
||||
|
||||
/* Profile values from bopti */
|
||||
#define PX_RGB565 0
|
||||
#define PX_RGB565A 1
|
||||
#define PX_P8 2
|
||||
#define PX_P4 3
|
||||
|
||||
void azrp_image(int x, int y, bopti_image_t const *image)
|
||||
{
|
||||
azrp_subimage(x, y, image, 0, 0, image->width, image->height, 0);
|
||||
}
|
||||
|
||||
void azrp_subimage(int x, int y, bopti_image_t const *image,
|
||||
int left, int top, int width, int height, int flags)
|
||||
{
|
||||
prof_enter(azrp_perf_cmdgen);
|
||||
|
||||
if(!(flags & DIMAGE_NOCLIP)) {
|
||||
/* TODO: tex2d: clip function */
|
||||
}
|
||||
|
||||
struct azrp_shader_tex2d_command cmd;
|
||||
cmd.shader_id = AZRP_SHADER_TEX2D;
|
||||
cmd.columns = width;
|
||||
cmd.image = image;
|
||||
|
||||
int input_multiplier = 1;
|
||||
if(image->profile == PX_P8) input_multiplier = 0;
|
||||
if(image->profile == PX_P4) input_multiplier = -1;
|
||||
|
||||
/* This divides by azrp_frag_height */
|
||||
cmd.fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
|
||||
|
||||
while(height > 0) {
|
||||
cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1)));
|
||||
|
||||
int input_offset = (image->width * top + left) << input_multiplier;
|
||||
cmd.input = (void *)image->data + input_offset;
|
||||
|
||||
cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x);
|
||||
|
||||
y += cmd.lines;
|
||||
top += cmd.lines;
|
||||
height -= cmd.lines;
|
||||
|
||||
azrp_queue_command(&cmd, sizeof cmd);
|
||||
cmd.fragment_id++;
|
||||
}
|
||||
|
||||
prof_leave(azrp_perf_cmdgen);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue