722 lines
17 KiB
ArmAsm
722 lines
17 KiB
ArmAsm
/* Azur's built-in shaders: <image>
|
|
|
|
If there ever was a fantastic piece of assembler engineering in my work up
|
|
to this point, this would be it. Every trick in the book is used here, from
|
|
clever instruction combinations, pipeline flow and tricky DSP abuse all the
|
|
way up to memory layout planning, transforms on loop structures, and most
|
|
critically superscalar parallelism.
|
|
|
|
While the performance of the shader is not *strictly* proportional to the
|
|
speed of the tightest loop, it's very close. The use of operand-bus XRAM for
|
|
graphics data, systematic alignment, and detailed pipeline stalling
|
|
measurements for common instruction sequences in gintctl allow very accurate
|
|
speed predictions to be made based on the tightness of the code.
|
|
|
|
The palette formats of bopti have been refined for the purpose of this
|
|
shader, with P8 being split into P8_RGB565A and P8_RGB565 with big changes,
|
|
and P4 being renamed P4_RGB565A with minimal changes along with a variation
|
|
aptly named P4_RGB565.
|
|
|
|
The asymptotic performance for each format is as follows:
|
|
* RGB565: 1 cycle/pixel if source and destination align
|
|
2 cycles/pixel otherwise
|
|
* RGB565A: 4 cycles/pixel
|
|
* P8_RGB565A: 4.5 cycles/pixel
|
|
* P8_RGB565: 3 cycles/pixel
|
|
* P4_RGB565A: 5 cycles/pixel
|
|
* P4_RGB565: 3.5 cycles/pixel
|
|
|
|
Entirely documenting this code would take me hours, but some elements are
|
|
provided in the comments. Superscalar parallelism is most easily appreciated
|
|
by reading the two-page section 4.2 of the SH4AL-DSP manual. The other main
|
|
structural technique at play in this code is loop transforms.
|
|
|
|
Basically, a loop that loads a pixel, performs computations with it, and
|
|
writes the result is inefficient because of the RAW dependencies on most
|
|
operations (with full stall cycles between loads and computations, and
|
|
between computations and uses as addresses). Well-established loop
|
|
optimization literature has lots of techniques to help with this problem,
|
|
and I use two here:
|
|
|
|
* _Pipelining_ the loop consists in handling a single pixel over several
|
|
iterations by doing a little bit of work in each iteration. The data for
|
|
the pixel would move from register to register at each iteration, with the
|
|
loop code doing one stage's worth of computation on each register. (You
|
|
can view it as a diagonal iteration pattern in the pixel*instruction grid
|
|
if you like such visualizations.)
|
|
|
|
By increasing the number of pixels in the pipeline, a lot of independent
|
|
data can be obtained, reducing dependency pressure and allowing for
|
|
greater parallelism at the cost of more registers being used.
|
|
|
|
The use of pipelining in this shader is very modest, with 2 stages at
|
|
most, and usually only a couple of instructions being performed in advance
|
|
for the next pixel while the current one finishes processing. Register
|
|
assignments have some subtleties though since pressure is high overall.
|
|
|
|
* _Unrolling_ iterations of the loop consists in loading two (or more)
|
|
pixels at the start of each iteration so that we can work on one while
|
|
waiting for stalls and dependencies on the other.
|
|
|
|
Unlike pipelining, a loop iteration starts and ends with full pixels and
|
|
no work carries between iterations. Unrolling allows different pixels to
|
|
use different registers and generally better optimize the instruction
|
|
sequence, at the cost of only supporting pixel counts that are multiples of
|
|
the unrolling level.
|
|
|
|
Handling non-multiple sizes is the everlasting bane of unrolled loops,
|
|
sometimes requiring duplicate code. Smart maneuvers are used in P8 and P4
|
|
to only handle even sizes and neutralize unwanted pixels after the fact.
|
|
|
|
Both techniques are used simultaneously, with 2-unrolled 2-stage loops for
|
|
almost all formats (except RGB565A which performs DSP trickery).
|
|
*/
|
|
|
|
.global _azrp_shader_image
.align 4

/* Shader entry point: decodes one image command from the command queue and
   tail-jumps to the format-specific renderer through the .formats table.

   Register assignment
   r0: (temporary)
   r1: Lines
   r2: Command queue; (temporary)
   r3: Input
   r4: [parameter] azrp_width*2; output stride
   r5: [parameter] Command queue; Output
   r6: [parameter] azrp_frag; alpha value; (temporary)
   r7: Columns
   r8: Image pointer; (temporary)
   r9: Input stride */

_azrp_shader_image:
    mov.l   r8, @-r15
    add     #2, r5                  /* skip first 2 bytes of the command
                                       (presumably its header - TODO confirm) */

    mov.l   r9, @-r15
    mov     r5, r2                  /* r2 = command read cursor */

    mov.w   @r2+, r7                /* command.columns */

    mov.l   @r2+, r8                /* command.image */

    mov.w   @r2+, r5                /* command.output (offset) */
    sub     r7, r4                  /* r4 -= columns ... */

    mov.w   @r8+, r9                /* image.profile */
    sub     r7, r4                  /* ... twice: r4 = 2*(azrp_width-columns),
                                       i.e. output stride minus row bytes */

    mov.w   @r2+, r1                /* command.lines */
    add     r6, r5                  /* r5 = azrp_frag + offset = output ptr */

    mov.l   @r2+, r3                /* command.input (pointer) */
    shll2   r9                      /* profile -> longword table offset */

    mova    .formats, r0            /* PC-relative table base (needs the
                                       .align 4 above) */

    mov.w   @r8+, r6                /* image.alpha */

    mov.l   @(r0,r9), r0            /* r0 = renderer for this profile */

    mov.w   @r8+, r9                /* image.width (renderers turn this into
                                       the input stride) */

    jmp     @r0                     /* tail call into the renderer */
    nop
|
|
|
.align 4
/* Renderer dispatch table, indexed by image.profile (one longword each).
   The order must match the profile numbering used by the image format. */
.formats:
    .long   _RGB565                 /* profile 0 */
    .long   _RGB565A                /* profile 1 */
    .long   _NOP                    /* profile 2: P8 (unsupported) */
    .long   _P4_RGB565A             /* profile 3: =P4 */
    .long   _P8_RGB565              /* profile 4 */
    .long   _P8_RGB565A             /* profile 5 */
    .long   _P4_RGB565              /* profile 6 */
|
|
|
/* [Loop macros]

   The following macros implement the main loop of the image renderer.
   * Each line is rendered in the tight loop between 2: and 3: (both included).
   * r5 is the output (with stride r4, in bytes)
   * r3 is the input (with stride r9, in bytes)
   * There are r1 rows with r7 iterations each

   ldrs/ldre/ldrc are the SH-DSP zero-overhead repeat-loop registers: repeat
   start address, repeat end address, and repeat count. START() arms a repeat
   of r7 iterations over the 2:..3: section; label 1: is the per-row re-entry
   point branched to by END_NORET(). These are C preprocessor macros, so this
   file must be run through cpp before assembly. */

#define START() \
    nop; /* 4-alignment */ \
    ldrs 2f; \
    ldre 3f; \
1:  ldrc r7

/* Advance to the next row: decrement the row counter r1, bump the output and
   input pointers by their strides, and loop back to 1: while rows remain.
   [add r9, r3] executes in the bf.s delay slot either way. */
#define END_NORET() \
    dt r1; \
    add r4, r5; \
    bf.s 1b; \
    add r9, r3

/* Row loop plus standard function return: restores r9 and r8 (saved at
   entry); the final pop executes in the rts delay slot. */
#define END() \
    END_NORET(); \
    mov.l @r15+, r9; \
    rts; \
    mov.l @r15+, r8
|
|
|
|
/* [Rendering strategy for the RGB565 format]
|
|
|
|
In RGB565, all pixels are copied verbatim. This is a 2D memcpy, which we can
|
|
optimize by moving longwords. Since longwords are pairs of pixels, there are
|
|
variations and subcases based on the parity of each parameter:
|
|
|
|
* w[eo] denotes whether the width of the image is even or odd;
|
|
* d[eo] denotes whether the memory accesses to the source and destination
|
|
are even (4-aligned) or odd (2-aligned).
|
|
|
|
When the destination and source have identical parity, the d[eo] variation
|
|
can be defined. In this case the copy is pretty direct, it's a longword copy
|
|
and it takes 2 cycles to copy 4 bytes, plus some extra at the edges if the
|
|
start or end address is 2-aligned.
|
|
|
|
However, when they have opposite parity, each longword read matches up with
|
|
a 2-aligned write (or vice-versa). Rearranging words with arithmetic does
|
|
not help because of the stall cycle between loading a register and using it
|
|
in the ALU, which makes the minimum time 4 cycles for 2 pixels (the same as
|
|
the word-based copy). Unrolling iterations could help but would be too
|
|
complex here (adding sub-cases); a super-heavy renderer with more hypotheses
|
|
(like a tileset shader) should aim for that route though. Also, movua.l
|
|
followed by mov.l is even slower (5 cycles). */
|
|
.align 4
/* RGB565: 2D memcpy by longwords (pixel pairs) when source and destination
   parity match, word-by-word otherwise. Variants: w[eo] = even/odd width,
   d[eo] = 4-aligned/2-aligned accesses. */
_RGB565:
    mov     #8, r0                  /* Maximum width for naive method */
    sub     r7, r9                  /* r9 = image.width - columns ... */

    cmp/ge  r7, r0                  /* T = (columns <= 8) */

    shll    r9                      /* ... doubled: input stride remainder */

    bt.s    _RGB565.naive           /* small widths: per-word copy */
    mov     #2, r0                  /* (delay slot) bit 1 = parity mask */

    /* Use naive method for opposite source/destination parity */
    mov     r5, r6
    xor     r3, r6                  /* r6 bit 1 set iff parities differ */

    tst     r0, r6
    bf      _RGB565.naive

    shlr    r7                      /* r7 = longword pairs; T = odd width */
    bt      _RGB565.wo

_RGB565.we:
    tst     r0, r5                  /* T = output 4-aligned */
    bf      _RGB565.we_do

/* This is 4-aligned */
/* Even width, 4-aligned: pure longword repeat loop, 1 cycle per movs. */
_RGB565.we_de:
    START()
2:  movs.l  @r3+, x0
3:  movs.l  x0, @r5+
    END()

.align 4
/* Even width, 2-aligned: one word at each edge re-aligns the longword copy;
   r7-1 longwords in between (2*(r7-1)+2 = columns words total). */
_RGB565.we_do:
    add     #-1, r7

    START()
    movs.w  @r3+, x0                /* leading word reaches 4-alignment */
    movs.w  x0, @r5+

2:  movs.l  @r3+, x0
3:  movs.l  x0, @r5+

    movs.w  @r3+, x0                /* trailing word */
    movs.w  x0, @r5+
    END()

.align 4
_RGB565.wo:
    tst     r0, r5                  /* T = output 4-aligned */
    bf      _RGB565.wo_do

/* Odd width, 4-aligned: r7 longwords then the odd trailing word. */
_RGB565.wo_de:
    START()
2:  movs.l  @r3+, x0
3:  movs.l  x0, @r5+

    movs.w  @r3+, x0
    movs.w  x0, @r5+
    END()

.align 4
/* Odd width, 2-aligned: leading word re-aligns, then r7 longwords. */
_RGB565.wo_do:
    START()
    movs.w  @r3+, x0
    movs.w  x0, @r5+

2:  movs.l  @r3+, x0
3:  movs.l  x0, @r5+
    END()

/* Naive method for small widths and opposite source/destination parity */
.align 4
_RGB565.naive:
    START()
2:  movs.w  @r3+, x0
3:  movs.w  x0, @r5+
    END()
|
|
|
|
/* [Rendering strategy for the RGB565A format]
|
|
|
|
Since we have to check for the alpha value in each pixel, there's really no
|
|
longword-based optimization. Instead, we just go as fast as possible with
|
|
each pixel, using DSP instructions because conditional execution is pretty
|
|
damn good. This takes 4 cycles/pixel. I tried a number of reductions to
|
|
3 cycles/pixel but could not get any of them to work. */
|
|
.align 4
/* RGB565A: per-pixel transparency via DSP conditional execution. The alpha
   value sits in the top word of y0; pcmp sets DC (Zero mode) when the source
   pixel equals it, and the conditional pcopy then substitutes the existing
   destination pixel, making the write a no-op for transparent pixels. */
_RGB565A:
    shll16  r6                      /* alpha value -> top word for pcmp */
    mov     #0x0004, r0             /* DC Zero mode */

    sub     r7, r9

    shll    r9                      /* input stride = 2*(width - columns) */

    lds     r6, y0                  /* y0 = alpha in comparison position */

    lds     r0, dsr                 /* DC = zero flag of DSP ALU results */

    START()
2:  movs.w  @r3+, x0                /* x0 = source pixel (top word) */
    pcmp    x0, y0   movx.w @r5, x1 /* compare with alpha; parallel-load the
                                       destination pixel into x1 */
    dct     pcopy x1, x0            /* if transparent: keep destination */
3:  movx.w  x0, @r5+
    END()
|
|
|
|
/* [Rendering strategy for the P8_RGB565A format]
|
|
|
|
The work needed for each pixel gets more difficult as we go, with alpha
|
|
being the major culprit due to its additional comparisons, jumps, and
|
|
limited optimization opportunities when unrolling due to conditionally-
|
|
executed code.
|
|
|
|
Because arithmetic is unavoidable and there are 1-cycle delays between both
|
|
loading-arithmetic, and arithmetic-indexing pairs, the loop has 2-unrolled
|
|
iterations with a 2-stage pipeline structure. This fills the stall cycles
|
|
and increases parallelism significantly. Pure loop optimization handbook.
|
|
|
|
Dealing with odd widths is a major pain as usual. Instead of adding logic to
|
|
handle the extra pixel separately, this routine lets the loop overwrite it,
|
|
then restores its original value afterwards - a delightfully elegant trick.
|
|
|
|
The P8 format is actually so bad that spending precious time grinding cycles
|
|
felt completely inappropriate without first refining it. This led to two new
|
|
variations, P8_RGB565 and P8_RGB565A, which fix the following problems.
|
|
|
|
-> First there is alpha for all images, which is the most costly feature,
|
|
single-handedly accounting for half of the work per pixel. P8_RGB565
|
|
does not support alpha, which basically doubles performance.
|
|
|
|
-> Then, there is the alpha value itself. In P8 it is a variable (and fxconv
|
|
sets it to 0xff), which burns a register for the comparison and enforces
|
|
a fixed order between comparison and left-shift. P8_RGB565A always sets
|
|
an alpha value of 0x00 which lifts both constraints.
|
|
|
|
-> Then, there are palette indices. In P8 they are unsigned, which requires
|
|
an extu.b. In P8_RGB565 and P8_RGB565A they are signed, so the sign-
|
|
extended value of the mov.b can be used directly (once doubled). The
|
|
palette base is simply offset by 128 entries, with colors numbered
|
|
-128..-1 first and only then 0..127.
|
|
|
|
-> Finally, there's the palette itself. In P8 it always has 256 entries,
|
|
even when only a few are used. For small images this is a huge waste, so
|
|
P8_RGB565 and P8_RGB565A only store colors that are actually used.
|
|
|
|
P8_RGB565A achieves 4.5 cycles/pixel asymptotically, which is really good
|
|
compared to 4 cycles/pixel for RGB565A. */
|
|
.align 4
/* P8_RGB565A: 2-unrolled, 2-stage pipelined palette lookup with alpha.
   Palette indices are signed bytes; index 0 is the transparent color. The
   loop always writes an even number of pixels; for odd widths the pixel just
   past the row end (address r13) is saved before the row and restored after. */
_P8_RGB565A:
    mov.l   r13, @-r15
    sub     r7, r9                  /* r9 = width - columns */

    mov     r7, r13
    add     #-2, r9 /* Input stride compensation for pipelining */

    mov.l   r12, @-r15
    shlr    r7                      /* r7 = pixel pairs; T = odd width */

    mov.l   r10, @-r15
    movt    r6                      /* r6 = 1 if width odd, else 0 */

    mov.w   _P8_RGB565A.palette_distance, r0
    shll    r13                     /* r13 = columns*2 (row bytes) */

    add     r6, r7                  /* round pair count up for odd widths */

    sub     r6, r9                  /* odd width: compensate the strides ... */

    sub     r6, r4

    sub     r6, r4                  /* ... (r4 twice: it is in bytes) */

    add     r0, r8                  /* r8 = palette base, biased so signed
                                       indices -128..127 resolve directly */

    add     r5, r13                 /* r13 = one past row end = clobbered
                                       pixel when the width is odd */
    mov     r7, r2

    add     #-4, r5 /* Output offset compensation in the loop */

    shll2   r2                      /* r2 = 4*r7 = bytes written per row */

    add     r4, r2                  /* r2 = r13 advance per row */

    START()

    mov.b   @r3+, r6                /* stage 1: pixel 1 index (signed) */

    /* Save next pixel for the odd-width case */
    mov.w   @r13, r12

    mov.b   @r3+, r10               /* stage 1: pixel 2 index */
    tst     r6, r6                  /* T = pixel 1 transparent (index 0) */

    /* 2-unrolled 2-stage main loop */
2:  add     r6, r6                  /* index1 *= 2: 16-bit palette offset */
    mov     r6, r0

    add     r10, r10                /* index2 *= 2 */
    bt.s    5f                      /* skip write if pixel 1 transparent */

    tst     r10, r10                /* (delay slot) T = pixel 2 transparent */
    mov.w   @(r0,r8), r0            /* color1 = palette[index1] */

    mov.w   r0, @(4,r5)

5:  mov.b   @r3+, r6                /* stage 1 of next iteration: pixel 1 */
    mov     r10, r0

    bt.s    6f                      /* skip write if pixel 2 transparent */
    add     #4, r5                  /* (delay slot) advance output */

    mov.w   @(r0,r8), r0            /* color2 = palette[index2] */

    mov.w   r0, @(2,r5)

6:  mov.b   @r3+, r10               /* stage 1 of next iteration: pixel 2 */
3:  tst     r6, r6                  /* T for next iteration's pixel 1 */

    /* Restore last pixel */
    mov.w   r12, @r13
    add     r2, r13                 /* advance to next row's spare pixel */

    END_NORET()
    mov.l   @r15+, r10
    mov.l   @r15+, r12
    mov.l   @r15+, r13
    mov.l   @r15+, r9
    rts
    mov.l   @r15+, r8

_P8_RGB565A.palette_distance:
    /* Distance between image pointer and palette array base (r8 has already
       advanced 6 bytes past profile/alpha/width when this is added). */
    .word   260
|
|
|
|
/* [Rendering strategy for the P8_RGB565 format]
|
|
|
|
See P8_RGB565A for format details. Removing the checks for transparency and
|
|
the jumps simplifies the instruction sequence and allows superior
|
|
parallelism because all paths are unconditional. This routine achieves
|
|
3 cycles/pixel asymptotically. */
|
|
.align 4
/* P8_RGB565: same structure as P8_RGB565A (2-unrolled, 2-stage pipeline,
   odd-width pixel saved/restored around each row) but with no transparency,
   so every path is unconditional and parallelizes fully. */
_P8_RGB565:
    mov.l   r13, @-r15
    sub     r7, r9                  /* r9 = width - columns */

    mov     r7, r13
    add     #-2, r9 /* Input stride compensation for pipelining */

    mov.l   r12, @-r15
    shlr    r7                      /* r7 = pixel pairs; T = odd width */

    mov.l   r10, @-r15
    movt    r6                      /* r6 = 1 if width odd, else 0 */

    mov.w   _P8_RGB565.palette_distance, r0
    shll    r13                     /* r13 = columns*2 (row bytes) */

    add     r6, r7                  /* round pair count up for odd widths */

    sub     r6, r9                  /* odd width: compensate the strides ... */

    sub     r6, r4

    sub     r6, r4                  /* ... (r4 twice: it is in bytes) */

    add     r0, r8                  /* r8 = palette base, biased so signed
                                       indices -128..127 resolve directly */

    add     r5, r13                 /* r13 = one past row end = clobbered
                                       pixel when the width is odd */

    add     #-4, r5 /* Output offset compensation in the loop */
    mov     r7, r2

    shll2   r2                      /* r2 = 4*r7 = bytes written per row */

    add     r4, r2                  /* r2 = r13 advance per row */

    START()

    mov.b   @r3+, r0                /* stage 1: pixel 1 index (signed) */

    /* Save next pixel for the odd-width case */
    mov.w   @r13, r12

    mov.b   @r3+, r10               /* stage 1: pixel 2 index */
    shll    r0                      /* index1 *= 2: 16-bit palette offset */

    /* 2-unrolled 2-stage main loop */
2:  mov.b   @r3+, r6                /* stage 1 of next iteration: pixel 1 */
    shll    r10                     /* index2 *= 2 */

    mov.w   @(r0,r8), r0            /* color1 = palette[index1] */
    /* This nop is not for show, it actually prevents the loop from slowing
       down to 7 cycles /i, probably due to instruction read alignment. */
    nop

    mov.w   r0, @(4,r5)
    mov     r10, r0

    mov.b   @r3+, r10               /* stage 1 of next iteration: pixel 2 */
    add     #4, r5

    mov.w   @(r0,r8), r0            /* color2 = palette[index2] */
    shll    r6

    mov.w   r0, @(2,r5)
3:  mov     r6, r0                  /* hand index1 to the next iteration */

    /* Restore last pixel */
    mov.w   r12, @r13
    add     r2, r13                 /* advance to next row's spare pixel */

    END_NORET()
    mov.l   @r15+, r10
    mov.l   @r15+, r12
    mov.l   @r15+, r13
    mov.l   @r15+, r9
    rts
    mov.l   @r15+, r8

_P8_RGB565.palette_distance:
    /* Distance between image pointer and palette array base (r8 has already
       advanced 6 bytes past profile/alpha/width when this is added). */
    .word   260
|
|
|
|
/* [Rendering strategy for the P4_RGB565A format]
|
|
|
|
This is the most complex format. Most of the remarks that apply to
|
|
P8_RGB565A also apply here, except that there are less opportunities to save
|
|
computation because nibbles must be extracted anyway.
|
|
|
|
The P4_RGB565A format is simply bopti's P4, but an additional variation
|
|
P4_RGB565 is specified to save on transparency handling, which is very
|
|
expensive.
|
|
|
|
The special nature of the nibble packing means the simplest loop form writes
|
|
2 pixels from a 2-aligned source image position in a single iteration. Other
|
|
structures don't even come close: selecting nibbles individually is folly,
|
|
while not unrolling is inefficient. So the whole point of this routine is to
|
|
forcibly align the subimage on a byte boundary and never break that grid.
|
|
|
|
The command builder for P4 does this alignment before submitting the
|
|
command. Obviously the transform can cause one extra pixel to be overridden
|
|
on each side of every line. The command is thus extended with two edge
|
|
offsets indicating pixels to preserve at each end. When overwrites occur,
|
|
the edge offsets point to the overwritten pixels so they can be restored.
|
|
Otherwise, they point to the next pixels and the restores are no-ops. See
|
|
the strategy used for managing unrolling in P8 formats for details.
|
|
|
|
The only irregularity is image width, which the command builder cannot
|
|
modify. It is rounded up to the next multiple of 2, then halved. There is a
|
|
nice trick for this operation, which is [shlr rX] then adding T to rX. We
|
|
also need to add -1 for another adjustment, and both are combined into an
|
|
addc, which saves one add and one movt off the EX critical chain.
|
|
|
|
The main loop achieves 5 cycles/pixel. */
|
|
.align 4
/* P4_RGB565A: renders 2 pixels per iteration from each byte (two nibbles),
   with transparency (palette index 0). The edge pixels possibly clobbered by
   the byte alignment (offsets edge1/edge2 in the command) are saved in
   r13/r14 before each row and restored after it. */
_P4_RGB565A:
    shlr    r9                      /* r9 = width/2, T = width odd ... */
    mov     #-1, r0

    mov.l   r10, @-r15
    addc    r0, r9                  /* ... rounded up, then -1, in one addc */

    mov.l   r11, @-r15
    shlr    r7                      /* r7 = pixel pairs (bytes) per row */

    mov.l   r12, @-r15
    sub     r7, r9                  /* r9 = input stride remainder (bytes) */

    mov.w   @r2+, r11 /* command.edge1 */
    add     #2, r8 /* image.palette */

    mov.w   @r2+, r12 /* command.edge2 */
    mov     r5, r0                  /* r0 = row output base (edge anchor) */

    mov.l   r13, @-r15
    shll    r11                     /* edge offsets -> byte offsets */

    mov.l   r14, @-r15
    shll    r12

    add     #-4, r5                 /* output offset compensation in loop */
    nop /* 4-alignment */

    START()

    mov.b   @r3+, r6                /* stage 1: byte = 2 pixel nibbles */
    mov     r0, r10                 /* keep row base for edge restore */

    mov.w   @(r0,r11), r13          /* save edge pixel 1 */

    mov.w   @(r0,r12), r14          /* save edge pixel 2 */
    shll    r6                      /* pre-double: high nibble*2 reachable
                                       through the 0x1e mask */

    /* Main loop with 2 pixels sharing a single byte */
2:  mov     r6, r0
    and     #0x1e, r0               /* r0 = high-nibble index * 2 */

    tst     r0, r0                  /* T = pixel 1 transparent (index 0) */

    bt.s    4f                      /* skip write if transparent */
    shlr2   r6                      /* (delay slot) shift toward low nibble */

    mov.w   @(r0,r8), r0            /* color1 = palette[index1] */

    mov.w   r0, @(6,r5)
4:  shlr2   r6                      /* low nibble now in masked position */

    mov     r6, r0
    and     #0x1e, r0               /* r0 = low-nibble index * 2 */

    tst     r0, r0                  /* T = pixel 2 transparent */
    mov.b   @r3+, r6                /* stage 1 of next iteration */

    bt.s    5f                      /* skip write if transparent */
    add     #4, r5                  /* (delay slot) advance output */

    mov.w   @(r0,r8), r0            /* color2 = palette[index2] */

    mov.w   r0, @r5
3: 5: shll  r6                      /* pre-double next byte (repeat end) */

    mov     r10, r0                 /* r0 = this row's base for restores */
    mov     r7, r10

    shll2   r10                     /* 4*r7 = bytes written per row */

    mov.w   r13, @(r0,r11)          /* restore edge pixel 1 */
    add     r4, r10

    mov.w   r14, @(r0,r12)          /* restore edge pixel 2 */
    add     r0, r10                 /* r10 = next row's base */

    mov     r10, r0
    /* Parallelizes with [dt r1] expanded from END_NORET() */

    END_NORET()
    mov.l   @r15+, r14
    mov.l   @r15+, r13
    mov.l   @r15+, r12
    mov.l   @r15+, r11
    mov.l   @r15+, r10
    mov.l   @r15+, r9
    rts
    mov.l   @r15+, r8
|
|
|
|
/* [Rendering strategy for the P4_RGB565 format]
|
|
Same as P4_RGB565A without transparency checks (fairly straightforward). The
|
|
core loop runs in 3.5 cycles/pixel. */
|
|
.align 4
/* P4_RGB565: P4_RGB565A without transparency checks, so both nibble lookups
   and writes are unconditional. r10 holds the -4 shift amount and r2 the
   0x1e mask during the loop, so the row base is kept on the stack instead
   of in a register. */
_P4_RGB565:
    shlr    r9                      /* r9 = width/2, T = width odd ... */
    mov     #-1, r0

    mov.l   r10, @-r15
    addc    r0, r9                  /* ... rounded up, then -1, in one addc */

    mov.l   r11, @-r15
    shlr    r7                      /* r7 = pixel pairs (bytes) per row */

    mov.l   r12, @-r15
    sub     r7, r9                  /* r9 = input stride remainder (bytes) */

    mov.w   @r2+, r11 /* command.edge1 */
    add     #2, r8 /* image.palette */

    mov.w   @r2+, r12 /* command.edge2 */
    mov     r5, r0                  /* r0 = row output base (edge anchor) */

    mov.l   r13, @-r15
    shll    r11                     /* edge offsets -> byte offsets */

    mov.l   r14, @-r15
    shll    r12

    add     #-4, r5                 /* output offset compensation in loop */
    mov     #0x1e, r2               /* nibble*2 mask kept in a register */

    START()

    mov.b   @r3+, r6                /* stage 1: byte = 2 pixel nibbles */
    mov     #-4, r10                /* shld shift amount (right by 4) */

    mov.l   r0, @-r15               /* park row base for the edge restore */

    mov.w   @(r0,r11), r13          /* save edge pixel 1 */

    mov.w   @(r0,r12), r14          /* save edge pixel 2 */
    shll    r6                      /* pre-double: high nibble*2 reachable
                                       through the 0x1e mask */

    /* Main loop with 2 pixels sharing a single byte */
2:  mov     r6, r0
    and     #0x1e, r0               /* r0 = high-nibble index * 2 */

    shld    r10, r6                 /* shift low nibble into position */

    mov.w   @(r0,r8), r0            /* color1 = palette[index1] */
    and     r2, r6                  /* r6 = low-nibble index * 2 */

    mov.w   r0, @(6,r5)
    mov     r6, r0

    mov.b   @r3+, r6                /* stage 1 of next iteration */
    add     #4, r5

    mov.w   @(r0,r8), r0            /* color2 = palette[index2] */

    mov.w   r0, @r5
3:  shll    r6                      /* pre-double next byte (repeat end) */

    mov.l   @r15+, r0               /* r0 = this row's base for restores */
    mov     r7, r10

    shll2   r10                     /* 4*r7 = bytes written per row */

    mov.w   r13, @(r0,r11)          /* restore edge pixel 1 */
    add     r4, r10

    mov.w   r14, @(r0,r12)          /* restore edge pixel 2 */
    add     r0, r10                 /* r10 = next row's base */

    mov     r10, r0
    /* Parallelizes with [dt r1] expanded from END_NORET() */

    END_NORET()
    mov.l   @r15+, r14
    mov.l   @r15+, r13
    mov.l   @r15+, r12
    mov.l   @r15+, r11
    mov.l   @r15+, r10
    mov.l   @r15+, r9
    rts
    mov.l   @r15+, r8
|
|
|
|
/* [Unsupported formats]
   P8 is unsupported, use P8_RGB565 and P8_RGB565A. */
_NOP:
    /* Render nothing: undo the two register saves made at the entry point
       (r9, r8) and return; the final pop executes in the rts delay slot. */
    mov.l   @r15+, r9
    rts
    mov.l   @r15+, r8
|