forked from Lephenixnoir/Azur

commit 911cc8e5ac (parent c5cdb2b885)
azur: documentation and optimization on rendering
@@ -10,7 +10,7 @@ if("${FXSDK_PLATFORM_LONG}" STREQUAL fxCG50)
 endif()
 
 # General options
-add_compile_options(-Wall -Wextra -O2
+add_compile_options(-Wall -Wextra -O3
   -fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=)
 
 set(CMAKE_C_STANDARD 11)
@@ -60,7 +60,7 @@ if(FACETS_PLATFORM STREQUAL emscripten)
     set(PORTS -sUSE_SDL=2 -sUSE_SDL_IMAGE=2 -sSDL2_IMAGE_FORMATS=["png"])
     add_compile_options(${PORTS})
-    add_link_options(${PORTS} -O2)
+    add_link_options(${PORTS} -O3)
 endif()
 
 #---
@@ -49,7 +49,7 @@ extern uint16_t azrp_frag[];
 
 /* Maximum number of commands that can be queued. (This is only one of two
    limits, the other being the size of the command data.) */
-#define AZRP_MAX_COMMANDS 512
+#define AZRP_MAX_COMMANDS 256
 
 /* Maximum number of shaders that can be defined. (This is a loose limit). */
 #define AZRP_MAX_SHADERS 32
@@ -212,8 +212,8 @@ extern prof_t azrp_perf_shaders;
 /* This counter runs during CPU transfers to the R61524 display. */
 extern prof_t azrp_perf_r61524;
 
-/* This counter runs during the whole azrp_update() operation; it is the sum of
-   sort, shaders, r61524, plus some logic overhead. */
+/* This counter runs during rendering; it is the sum of shaders and r61524,
+   plus some logic overhead. */
 extern prof_t azrp_perf_render;
 
 /* azrp_perf_clear(): Clear all performance counters
@@ -243,12 +243,10 @@ void azrp_set_uniforms(int shader_id, void *uniforms);
 
 /* azrp_queue_command(): Add a new command to be rendered next frame
 
-   The command must be a structure starting with an 8-bit shader ID and an
-   8-bit fragment ID.
-
-   Returns true on success, false if the maximum amount of commands or command
-   memory is exceeded. */
-bool azrp_queue_command(void *command, size_t size);
+   The command must be a structure starting with an 8-bit shader ID. Returns
+   true on success, false if the maximum amount of commands or command memory
+   is exceeded. */
+bool azrp_queue_command(void *command, size_t size, int fragment);
 
 //---
 // Internal shader definitions (for reference; no API guarantee)
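To make the API change concrete, here is a minimal sketch of queueing a command
under the new signature. The command structure, its fields, and the header path
are hypothetical; the only constraints taken from the declaration above are the
leading 8-bit shader ID and the explicit fragment argument:

    #include <stdint.h>
    #include <stdbool.h>
    #include <azur/gint/render.h>   /* assumed header location for azrp_* */

    /* Hypothetical command: only the leading uint8_t shader ID is required. */
    typedef struct {
        uint8_t shader_id;
        uint8_t color_index;        /* illustrative shader-specific payload */
    } my_cmd_t;

    bool queue_example(int shader_id, int fragment)
    {
        my_cmd_t cmd = { .shader_id = shader_id, .color_index = 7 };
        /* The target fragment is now a separate argument rather than an
           8-bit field inside the command structure. */
        return azrp_queue_command(&cmd, sizeof cmd, fragment);
    }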
@@ -8,7 +8,8 @@
 #define YRAM ((void *)0xe5017000)
 
-/* 8 rows of video memory, occupying 6338/8192 bytes of XRAM. */
+/* 8 rows of video memory, occupying 6338/8192 bytes of XRAM.
+   TODO: Extend this to 16 rows, and move the rest to RAM */
 GXRAM GALIGNED(32) uint16_t azrp_frag[DWIDTH * 8];
 
 /* Super-scaling factor, width and height of output. */
@@ -26,11 +27,11 @@ int azrp_frag_height;
 GXRAM int commands_count = 0, commands_length = 0;
 
 /* Array of pointers to queued commands (stored as an offset into YRAM). */
-GXRAM uint16_t commands_array[AZRP_MAX_COMMANDS];
+GXRAM uint32_t commands_array[AZRP_MAX_COMMANDS];
 
 /* Array of shader programs and uniforms. */
-static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
-static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
+GXRAM static azrp_shader_t *shaders[AZRP_MAX_SHADERS] = { NULL };
+GXRAM static void *shader_uniforms[AZRP_MAX_SHADERS] = { NULL };
 
 /* Next free index in the shader program array. */
 GXRAM static uint16_t shaders_next = 0;
@@ -54,31 +55,25 @@ void azrp_clear_commands(void)
 
 /* Custom quick sort for commands */
 
-static inline int compare(int8_t *c1, int8_t *c2)
-{
-    int d = c1[1] - c2[1];
-    return (d ? d : c1 - c2);
-}
-
 static void cmdsort(int low, int high)
 {
     if(low >= high) return;
 
-    int8_t *pivot = YRAM + commands_array[(low + high) >> 1];
+    uint32_t pivot = commands_array[(low + high) >> 1];
 
     int i = low - 1;
     int j = high + 1;
 
     while(1) {
         do i++;
-        while(compare(YRAM + commands_array[i], pivot) < 0);
+        while(commands_array[i] < pivot);
 
         do j--;
-        while(compare(YRAM + commands_array[j], pivot) > 0);
+        while(commands_array[j] > pivot);
 
         if(i >= j) break;
 
-        uint16_t tmp = commands_array[i];
+        uint32_t tmp = commands_array[i];
         commands_array[i] = commands_array[j];
         commands_array[j] = tmp;
     }
@@ -89,44 +84,48 @@ static void cmdsort(int low, int high)
 
 void azrp_sort_commands(void)
 {
-    prof_enter(azrp_perf_sort);
+    prof_enter_norec(azrp_perf_sort);
     cmdsort(0, commands_count - 1);
-    prof_leave(azrp_perf_sort);
+    prof_leave_norec(azrp_perf_sort);
 }
 
+int azrp_commands_total;
+
 void azrp_render_fragments(void)
 {
-    prof_enter(azrp_perf_render);
+    prof_enter_norec(azrp_perf_render);
+
+    azrp_commands_total = 0;
 
     int i = 0;
     int frag = 0;
+    uint32_t next_frag_threshold = (frag + 1) << 16;
+    uint32_t cmd = commands_array[i];
 
-    uint8_t *cmd = (uint8_t *)YRAM + commands_array[i];
-
-    prof_enter(azrp_perf_r61524);
+    prof_enter_norec(azrp_perf_r61524);
     r61524_start_frame(0, 244);
-    prof_leave(azrp_perf_r61524);
+    prof_leave_norec(azrp_perf_r61524);
 
     while(1) {
-        if(cmd[1] == frag) {
-            if(shaders[cmd[0]]) {
-                prof_enter(azrp_perf_shaders);
-                shaders[cmd[0]](shader_uniforms[cmd[0]], cmd, azrp_frag);
-                prof_leave(azrp_perf_shaders);
-            }
-            cmd = YRAM + commands_array[++i];
-        }
-        else {
-            prof_enter(azrp_perf_r61524);
-            /* TODO: Consider xram_frame() by DMA in parallel? */
-            xram_frame(azrp_frag, 396 * 8);
-            prof_leave(azrp_perf_r61524);
-            frag++;
-            if(frag >= azrp_frag_count) break;
-        }
+        while(cmd < next_frag_threshold && i < commands_count) {
+            azrp_commands_total++;
+            uint8_t *data = (uint8_t *)YRAM + (cmd & 0xffff);
+            prof_enter_norec(azrp_perf_shaders);
+            shaders[data[0]](shader_uniforms[data[0]], data, azrp_frag);
+            prof_leave_norec(azrp_perf_shaders);
+            cmd = commands_array[++i];
+        }
+
+        /* TODO: Consider xram_frame() by DMA in parallel? */
+        prof_enter_norec(azrp_perf_r61524);
+        xram_frame(azrp_frag, 396 * 8);
+        prof_leave_norec(azrp_perf_r61524);
+
+        if(++frag >= azrp_frag_count) break;
+        next_frag_threshold += (1 << 16);
     }
 
-    prof_leave(azrp_perf_render);
+    prof_leave_norec(azrp_perf_render);
 }
 
 void azrp_update(void)
@@ -210,7 +209,7 @@ void azrp_set_uniforms(int shader_id, void *uniforms)
     shader_uniforms[shader_id] = uniforms;
 }
 
-bool azrp_queue_command(void *command, size_t size)
+bool azrp_queue_command(void *command, size_t size, int fragment)
 {
     if(commands_count >= AZRP_MAX_COMMANDS)
         return false;
@@ -223,7 +222,8 @@ bool azrp_queue_command(void *command, size_t size)
     for(size_t i = 0; i < size; i++)
         dst[i] = src[i];
 
-    commands_array[commands_count++] = commands_length;
+    commands_array[commands_count++] =
+        (fragment << 16) | commands_length;
     commands_length += size;
 
     return true;
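The packed 32-bit entries are what let the quicksort above compare raw
integers: the fragment occupies the upper 16 bits and the YRAM offset the
lower 16, so ordering by key is ordering by (fragment, submission order),
since offsets only grow as commands are queued. A small worked illustration
(values invented, helper names mine):

    #include <stdint.h>

    /* Key layout: [fragment : 16 | YRAM offset : 16] */
    static inline uint32_t make_key(int fragment, uint16_t offset)
    {
        return ((uint32_t)fragment << 16) | offset;
    }

    /* Example: make_key(3, 0x0120) == 0x00030120. Every key of fragment 2
       is below 0x00030000, and later commands for fragment 3 have larger
       offsets, hence larger keys: sorting keys as plain integers sorts by
       fragment first and preserves submission order within a fragment,
       with no tie-breaking comparison needed. Decoding mirrors the render
       loop, which pops all keys below (frag + 1) << 16: */
    static inline uint16_t key_offset(uint32_t key)   { return key & 0xffff; }
    static inline int      key_fragment(uint32_t key) { return key >> 16;    }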
@@ -1,3 +1,18 @@
+/* Azur's built-in shaders: <clear>
+
+   By far the easiest of all. The clear shader is a good benchmark for how fast
+   the rendering pipeline is. And it's pretty damn fast, clocking in at 400 µs
+   for a full-resolution 396x224 clear (compared to 6.1 ms for a VRAM clear by
+   CPU or 2.5 ms by DMA).
+
+   Because this is performed by CPU and therefore versatile, there are many
+   variations with more complex patterns that will perform at the same speed.
+   A gray tiled background for transparency in image viewing comes to mind, for
+   example.
+
+   Affected region: full-screen
+   Asymptotic performance: 0.5 cycle/pixel */
+
 .global _azrp_shader_clear
 .align 4
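The 400 µs figure is consistent with the stated asymptotic cost:

    396 × 224 pixels × 0.5 cycle/pixel = 44,352 cycles

which, assuming the fx-CG50's CPU clock of roughly 116 MHz (an assumption on
my part, not stated in the source), comes out to about 0.38 ms, with the
remainder of the budget going to per-fragment overhead.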
@@ -35,7 +35,7 @@ void azrp_clear(uint16_t color)
 
     for(int i = 0; i < azrp_frag_count; i++) {
         cmd.fragment_id = i;
-        azrp_queue_command(&cmd, sizeof cmd);
+        azrp_queue_command(&cmd, sizeof cmd, i);
     }
 
     prof_leave(azrp_perf_cmdgen);
@@ -1,3 +1,77 @@
+/* Azur's built-in shaders: <image>
+
+   If there ever was a fantastic piece of assembler engineering in my work up
+   to this point, this would be it. Every trick in the book is used here, from
+   clever instruction combinations, pipeline flow and tricky DSP abuse all the
+   way up to memory layout planning, transforms on loop structures, and most
+   critically superscalar parallelism.
+
+   While the performance of the shader is not *strictly* proportional to the
+   speed of the tightest loop, it's very close. The use of operand-bus XRAM for
+   graphics data, systematic alignment, and detailed pipeline stalling
+   measurements for common instruction sequences in gintctl allow very accurate
+   speed predictions to be made based on the tightness of the code.
+
+   The palette formats of bopti have been refined for the purpose of this
+   shader, with P8 being split into P8_RGB565A and P8_RGB565 with big changes,
+   and P4 being renamed P4_RGB565A with minimal changes along with a variation
+   aptly named P4_RGB565.
+
+   The asymptotic performance for each format is as follows:
+   * RGB565:     1 cycle/pixel if source and destination align
+                 2 cycles/pixel otherwise
+   * RGB565A:    4 cycles/pixel
+   * P8_RGB565A: 4.5 cycles/pixel
+   * P8_RGB565:  3 cycles/pixel
+   * P4_RGB565A: 5 cycles/pixel
+   * P4_RGB565:  3.5 cycles/pixel
+
+   Entirely documenting this code would take me hours, but some elements are
+   provided in the comments. Superscalar parallelism is most easily appreciated
+   by reading the two-page section 4.2 of the SH4AL-DSP manual. The other main
+   structural technique at play in this code is loop transforms.
+
+   Basically, a loop that loads a pixel, performs computations with it, and
+   writes the result is inefficient because of the RAW dependencies on most
+   operations (with full stall cycles between loads and computations, and
+   between computations and uses as addresses). Well-established loop
+   optimization literature has lots of techniques to help with this problem,
+   and I use two here:
+
+   * _Pipelining_ the loop consists in handling a single pixel over several
+     iterations by doing a little bit of work in each iteration. The data for
+     the pixel would move from register to register at each iteration, with the
+     loop code doing one stage's worth of computation on each register. (You
+     can view it as a diagonal iteration pattern in the pixel*instruction grid
+     if you like such visualizations.)
+
+     By increasing the number of pixels in the pipeline, a lot of independent
+     data can be obtained, reducing dependency pressure and allowing for
+     greater parallelism at the cost of more registers being used.
+
+     The use of pipelining in this shader is very modest, with 2 stages at
+     most, and usually only a couple of instructions being performed in advance
+     for the next pixel while the current one finishes processing. Register
+     assignments have some subtleties though since pressure is high overall.
+
+   * _Unrolling_ iterations of the loop consists in loading two (or more)
+     pixels at the start of each iteration so that we can work on one while
+     waiting for stalls and dependencies on the other.
+
+     Unlike pipelining, a loop iteration starts and ends with full pixels and
+     no work carries between iterations. Unrolling allows different pixels to
+     use different registers and generally better optimize the instruction
+     sequence, at the cost of only supporting pixel counts that are multiples
+     of the unrolling level.
+
+     Handling non-multiple sizes is the everlasting bane of unrolled loops,
+     sometimes requiring duplicate code. Smart maneuvers are used in P8 and P4
+     to only handle even sizes and neutralize unwanted pixels after the fact.
+
+   Both techniques are used simultaneously, with 2-unrolled 2-stage loops for
+   almost all formats (except RGB565A which performs DSP trickery).
+*/
+
 .global _azrp_shader_image
 .align 4
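As a rough C-level illustration of the unrolling idea described above (a
sketch under assumed semantics; the real shaders do this at the instruction
level in assembly, and both functions below are hypothetical):

    #include <stdint.h>

    /* Naive P8-style loop: each palette lookup depends on the byte just
       loaded, so the CPU stalls between the load and its use. */
    void blit_p8_naive(uint16_t *dst, uint8_t const *src,
        uint16_t const *palette, int n)
    {
        for(int k = 0; k < n; k++)
            dst[k] = palette[src[k]];
    }

    /* 2-unrolled version: two independent pixels per iteration, so work on
       one can fill the load-use stall cycles of the other. Odd widths need
       a fixup, the "everlasting bane" mentioned above. */
    void blit_p8_unrolled2(uint16_t *dst, uint8_t const *src,
        uint16_t const *palette, int n)
    {
        int k = 0;
        for(; k + 2 <= n; k += 2) {
            int a = src[k];
            int b = src[k + 1];        /* independent of a: no stall */
            dst[k]     = palette[a];
            dst[k + 1] = palette[b];
        }
        if(k < n)                      /* leftover pixel for odd n */
            dst[k] = palette[src[k]];
    }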
@@ -64,19 +138,20 @@ _azrp_shader_image:
  * r3 is the input (with stride r9, in bytes)
  * There are r1 rows with r7 iterations each */
 
-#define START()                 \
+#define START()                         \
+    nop;        /* 4-alignment */       \
     ldrs 2f;                            \
     ldre 3f;                            \
 1:  ldrc r7
 
-#define END_NORET()             \
+#define END_NORET()                     \
     dt r1;                              \
     add r4, r5;                         \
     bf.s 1b;                            \
     add r9, r3
 
-#define END()                   \
-    END_NORET();                \
+#define END()                           \
+    END_NORET();                        \
     mov.l @r15+, r9;                    \
     rts;                                \
     mov.l @r15+, r8
|
|||
a 2-aligned write (or vice-versa). Rearranging words with arithmetic does
|
||||
not help because of the stall cycle between loading a register and using it
|
||||
in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as
|
||||
the word-based copy). Weaving iterations could help but would be too complex
|
||||
here (adding sub-cases); a super-heavy renderer with more hypotheses (like a
|
||||
tileset shader) should aim for that route though. Also, movua.l followed by
|
||||
mov.l is even slower (5 cycles). */
|
||||
the word-based copy). Unrolling iterations could help but would be too
|
||||
complex here (adding sub-cases); a super-heavy renderer with more hypotheses
|
||||
(like a tileset shader) should aim for that route though. Also, movua.l
|
||||
followed by mov.l is even slower (5 cycles). */
|
||||
.align 4
|
||||
_RGB565:
|
||||
mov #8, r0 /* Maximum width for naive method */
|
||||
|
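In C terms, the dispatch the RGB565 routine performs might look like the
following sketch (logic inferred from the comments above: longword copies
require source and destination to share 2-byte parity, and short or
mismatched rows fall back to word-sized moves; the function is mine, not
the shader):

    #include <stdint.h>

    void copy_rgb565_row(uint16_t *dst, uint16_t const *src, int width)
    {
        int same_parity = ((((uintptr_t)src ^ (uintptr_t)dst) & 2) == 0);

        if(width <= 8 || !same_parity) {
            /* Naive path: one 16-bit move per pixel, ~2 cycles/pixel. */
            for(int i = 0; i < width; i++)
                dst[i] = src[i];
            return;
        }

        /* Peel one leading pixel if needed so both pointers are 4-aligned. */
        if(width > 0 && ((uintptr_t)dst & 2)) {
            *dst++ = *src++;
            width--;
        }

        /* Main path: 2 pixels per 32-bit access, ~1 cycle/pixel in asm. */
        uint32_t *d = (uint32_t *)dst;
        uint32_t const *s = (uint32_t const *)src;
        for(int i = 0; i < width / 2; i++)
            d[i] = s[i];

        if(width & 1)                   /* trailing odd pixel */
            dst[width - 1] = src[width - 1];
    }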
@@ -130,12 +205,14 @@ _RGB565.we:
     tst r0, r5
     bf _RGB565.we_do
 
+/* This is 4-aligned */
 _RGB565.we_de:
     START()
 2:  movs.l @r3+, x0
 3:  movs.l x0, @r5+
     END()
 
+.align 4
 _RGB565.we_do:
     add #-1, r7
@@ -150,6 +227,7 @@ _RGB565.we_do:
     movs.w x0, @r5+
     END()
 
+.align 4
 _RGB565.wo:
     tst r0, r5
     bf _RGB565.wo_do
@@ -163,6 +241,7 @@ _RGB565.wo_de:
     movs.w x0, @r5+
     END()
 
+.align 4
 _RGB565.wo_do:
     START()
     movs.w @r3+, x0
@@ -173,6 +252,7 @@ _RGB565.wo_do:
     END()
 
 /* Naive method for small widths and opposite source/destination parity */
+.align 4
 _RGB565.naive:
     START()
 2:  movs.w @r3+, x0
@@ -210,12 +290,13 @@ _RGB565A:
 
    The work needed for each pixel gets more difficult as we go, with alpha
    being the major culprit due to its additional comparisons, jumps, and
-   limited interweaving opportunities due to conditionally-executed code.
+   limited optimization opportunities when unrolling due to conditionally-
+   executed code.
 
    Because arithmetic is unavoidable and there are 1-cycle delays between both
-   loading-arithmetic, and arithmetic-indexing pairs, the loop has 2 interwoven
-   iterations with an open structure. This fills the stall cycles and increases
-   parallelism significantly. Pure interweaving handbook.
+   loading-arithmetic, and arithmetic-indexing pairs, the loop has 2-unrolled
+   iterations with a 2-stage pipeline structure. This fills the stall cycles
+   and increases parallelism significantly. Pure loop optimization handbook.
 
    Dealing with odd widths is a major pain as usual. Instead of adding logic to
    handle the extra pixel separately, this routine lets the loop overwrite it,
|
|||
sub r7, r9
|
||||
|
||||
mov r7, r13
|
||||
add #-2, r9 /* Input stride compensation for openness */
|
||||
add #-2, r9 /* Input stride compensation for pipelining */
|
||||
|
||||
mov.l r12, @-r15
|
||||
shlr r7
|
||||
|
@@ -281,7 +362,6 @@ _P8_RGB565A:
     shll2 r2
 
     add r4, r2
-    nop             /* 4-alignment */
 
     START()
@@ -293,7 +373,7 @@ _P8_RGB565A:
     mov.b @r3+, r10
     tst r6, r6
 
-    /* 2-interwoven open main loop */
+    /* 2-unrolled 2-stage main loop */
 2:  add r6, r6
     mov r6, r0
@@ -346,7 +426,7 @@ _P8_RGB565:
     sub r7, r9
 
     mov r7, r13
-    add #-2, r9     /* Input stride compensation for openness */
+    add #-2, r9     /* Input stride compensation for pipelining */
 
     mov.l r12, @-r15
     shlr r7
@@ -375,7 +455,6 @@ _P8_RGB565:
     shll2 r2
 
     add r4, r2
-    nop             /* 4-alignment */
 
     START()
@@ -387,11 +466,14 @@ _P8_RGB565:
     mov.b @r3+, r10
     shll r0
 
-    /* 2-interwoven open main loop */
+    /* 2-unrolled 2-stage main loop */
 2:  mov.b @r3+, r6
     shll r10
 
     mov.w @(r0,r8), r0
+    /* This nop is not for show, it actually prevents the loop from slowing
+       down to 7 cycles /i, probably due to instruction reads alignment. */
+    nop
 
     mov.w r0, @(4,r5)
     mov r10, r0
|
|||
The special nature of the nibble packing means the simplest loop form writes
|
||||
2 pixels from a 2-aligned source image position in a single iteration. Other
|
||||
structures don't even come close: selecting nibbles individually is folly,
|
||||
while not interweaving is inefficient. So the whole point of this routine is
|
||||
to forcibly align the subimage on a byte-aligned and never break that grid.
|
||||
while not unrolling is inefficient. So the whole point of this routine is to
|
||||
forcibly align the subimage on a byte-aligned and never break that grid.
|
||||
|
||||
The command builder for P4 does this alignment before submitting the
|
||||
command. Obviously the transform can cause one extra pixel to be overridden
|
||||
|
@@ -443,7 +525,7 @@ _P8_RGB565.palette_distance:
    offsets indicating pixels to preserve at each end. When overwrites occur,
    the edge offsets point to the overwritten pixels so they can be restored.
    Otherwise, they point to the next pixels and the restores are no-ops. See
-   the strategy used for managing interweaving in P8 formats for details.
+   the strategy used for managing unrolling in P8 formats for details.
 
    The only irregularity is image width, which the command builder cannot
    modify. It is rounded up to the next multiple of 2, then halved. There is a
|
|||
mov.l r12, @-r15
|
||||
sub r7, r9
|
||||
|
||||
mov.w @r2+, r11 /* command.edge1 */
|
||||
add #2, r8 /* image.palette */
|
||||
mov.w @r2+, r11 /* command.edge1 */
|
||||
add #2, r8 /* image.palette */
|
||||
|
||||
mov.w @r2+, r12 /* command.edge2 */
|
||||
mov.w @r2+, r12 /* command.edge2 */
|
||||
mov r5, r0
|
||||
|
||||
mov.l r13, @-r15
|
||||
|
@@ -479,6 +561,7 @@ _P4_RGB565A:
     shll r12
 
     add #-4, r5
+    nop             /* 4-alignment */
 
     START()
@@ -559,10 +642,10 @@ _P4_RGB565:
     mov.l r12, @-r15
     sub r7, r9
 
-    mov.w @r2+, r11 /* command.edge1 */
-    add #2, r8 /* image.palette */
+    mov.w @r2+, r11     /* command.edge1 */
+    add #2, r8          /* image.palette */
 
-    mov.w @r2+, r12 /* command.edge2 */
+    mov.w @r2+, r12     /* command.edge2 */
     mov r5, r0
 
     mov.l r13, @-r15
@@ -73,18 +73,19 @@ void azrp_subimage(int x, int y, bopti_image_t const *image,
     /* This divides by azrp_frag_height */
     cmd.fragment_id = (azrp_scale == 1) ? (y >> 3) : (y >> 4);
 
-    /* These settings only apply to the first fragment */
-    int first_y = (y + azrp_frag_offset) & (azrp_frag_height - 1);
-    cmd.lines = azrp_frag_height - first_y;
-    cmd.output = 2 * (azrp_width * first_y + x);
-
     while(height > 0) {
+        cmd.lines = min(height, azrp_frag_height - (y & (azrp_frag_height-1)));
+        azrp_queue_command(&cmd, cmd_size, cmd.fragment_id);
+
+        cmd.output = 2 * (azrp_width * (y & (azrp_frag_height-1)) + x);
+
         y += cmd.lines;
         top += cmd.lines;
         height -= cmd.lines;
 
-        azrp_queue_command(&cmd, cmd_size);
         cmd.fragment_id++;
         cmd.input += row_stride * cmd.lines;
         cmd.lines = min(height, azrp_frag_height);
         cmd.output = 2 * x;
     }
 
     prof_leave(azrp_perf_cmdgen);
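A worked example of the splitting arithmetic (assuming azrp_scale == 1, so
azrp_frag_height is 8): drawing a 10-line subimage at y = 12 starts in
fragment 12 >> 3 = 1. The first command covers min(10, 8 − (12 & 7)) = 4
lines; after y and height are updated, the second command lands in fragment 2
with min(6, 8) = 6 lines, and the loop terminates. Each command now passes its
fragment index to azrp_queue_command() explicitly, which is what builds the
packed sort keys described earlier.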