Azur/azur/src/gint/shaders/image_rgb16_normal.S

125 lines
2.6 KiB
ArmAsm

/* Export the shader entry point defined in this file. */
.global _azrp_image_shader_rgb16_normal
/* Presumably provides the START, END and EPILOGUE macros used by every copy
   loop in this file -- TODO confirm. */
#include "image_macros.S"
/* RGB16 Opaque rendering, Azur version: by straightforward copy.
This function of the image renderer is designed for Azur's streaming model
only. Unlike its RAM-model counterpart which is bottlenecked by its writing
speed, this function is entirely limited by the CPU's ability to output the
data in the required format.
In the simple case where there is no color effect and no HFLIP, the task of
rendering a 16-bit opaque image boils down to a 2-dimensional memcpy. This
task can be optimized by moving longwords if the source and destination are
co-4-aligned, with four variations depending on the width and initial
position, identified by the following parameters:
* w1 / w2 denotes the parity of the command width (w1: odd, w2: even);
* o2 / o4 denotes the alignment of the output (o2: 2-aligned only, o4:
  4-aligned); the labels in the code spell this d2 / d4.
It is easy to see that when input and output are not co-aligned, any attempt
to combine two word reads into a single long write requires at least 3
cycles per 2 pixels and needs parallelism over several pixels to not get
immediately shut down by the LS-to-EX delay. Here we decide to naively copy
by words, which achieves 4 cycles per 2 pixels, mainly because large RGB16
images are very quickly bottlenecked in reading by their own size anyway.
The HFLIP version also needs to rearrange pixels, and is thus performed with
word-based copies in all situations, which is a straightforward process. */
/* azrp_image_shader_rgb16_normal: selects one of several copy loops for an
   opaque RGB16 image and runs it.

   Registers, as used by the code visible here (the calling convention itself
   is established outside this file -- TODO confirm against the caller and
   image_macros.S):
   * r0: bit 0 selects the backward copy on entry (presumably the HFLIP
     flag); r0 is then reused as a scratch register;
   * r2: command width in pixels (halved in the longword variants);
   * r3: input pointer, always read with post-increment;
   * r5: output pointer, written with post-increment, or with pre-decrement
     in the backward copy;
   * r6: increased by 4 * width in the backward copy; presumably an output
     stride consumed by END -- TODO confirm;
   * x0: DSP register holding the pixel(s) in flight.

   START, END and EPILOGUE are macros that are not defined in this file. The
   numeric labels 2 and 3 inside each loop body are presumably the markers
   these macros use to delimit the repeated part, with r2 as the repeat
   count -- TODO confirm. The pixel counts given in the comments below rely
   on that assumption. */
_azrp_image_shader_rgb16_normal:
/* Not a single cycle */
tst #1, r0 /* T = 1 iff bit 0 of r0 is clear */
bf _BACKWARD_WORD_COPY /* Bit 0 set: use the backward copy */
mov #8, r0 /* Use the naive method for width <= 8 */
cmp/ge r2, r0 /* T = (8 >= r2), signed comparison */
bt.s _FORWARD_WORD_COPY
nop /* Delay slot of bt.s */
mov r5, r0 /* Check if r3 and r5 are co-aligned */
xor r3, r0
/* Not a single cycle */
tst #2, r0 /* T = 1 iff r3 and r5 agree on address bit 1 */
bt _FORWARD_LONG_COPY
/* Otherwise fall through to the word-by-word copy. */

/* Forward copy, one 16-bit pixel per iteration. Reached for widths up to 8
   and when input and output do not share the same alignment modulo 4. */
_FORWARD_WORD_COPY:
START
2: movs.w @r3+, x0
3: movs.w x0, @r5+
END
EPILOGUE

/* Forward copy by longwords, two pixels per iteration. Only reached when
   the width exceeds 8 and r3 and r5 agree on address bit 1. Dispatches on
   the width parity, then on the alignment of the pointers. */
_FORWARD_LONG_COPY:
shlr r2 /* Test width parity; r2 becomes width / 2 (rounded down) */
mov #2, r0 /* Mask for address bit 1; mov leaves T untouched */
bt .w1 /* T = 1: the width is odd */
nop
/* Even width. */
.w2: tst r0, r3 /* Test alignment of input */
bf .w2d2 /* Bit 1 set: the input is only 2-aligned */
/* Even width, 4-aligned pointers: width / 2 longwords. */
.w2d4: START
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END
EPILOGUE
/* Even width, 2-aligned pointers: one leading word moves both pointers to a
   4-byte boundary, then width / 2 - 1 longwords, then one trailing word. */
.w2d2: add #-1, r2 /* One longword fewer; its two pixels go as single words */
nop
START
movs.w @r3+, x0 /* Leading pixel */
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r5+
movs.w @r3+, x0 /* Trailing pixel */
movs.w x0, @r5+
END
EPILOGUE
/* Odd width; r2 = (width - 1) / 2 at this point. */
.w1: tst r0, r3 /* Test alignment of input */
bf .w1d2 /* Bit 1 set: the input is only 2-aligned */
/* Odd width, 4-aligned pointers: longwords first, then one trailing word. */
.w1d4: START
2: movs.l @r3+, x0
3: movs.l x0, @r5+
movs.w @r3+, x0 /* Trailing pixel */
movs.w x0, @r5+
END
EPILOGUE
/* Odd width, 2-aligned pointers: one leading word moves both pointers to a
   4-byte boundary, then longwords. */
.w1d2: START
movs.w @r3+, x0 /* Leading pixel */
movs.w x0, @r5+
2: movs.l @r3+, x0
3: movs.l x0, @r5+
END
EPILOGUE

/* Backward copy: the input is read forwards while the output is written
   from the end of the span towards its start, which reverses the pixel
   order (presumably the HFLIP case mentioned in the file's header comment).
   Always proceeds one 16-bit pixel at a time. */
_BACKWARD_WORD_COPY:
mov r2, r0
shll r0 /* r0 = 2 * width, the byte size of one span of pixels */
add r0, r5 /* r5 now points just past the last output pixel */
nop
shll r0 /* r0 = 4 * width */
nop
add r0, r6 /* Presumably compensates for r5 moving backwards -- TODO confirm in END */
nop
START
2: movs.w @r3+, x0
3: movs.w x0, @-r5 /* Pre-decrement store: fills the span from its end */
END
EPILOGUE