gint/src/render-cg/image/image_p8_swapcolor.S

78 lines
2.0 KiB
ArmAsm

.global _gint_image_p8_swapcolor
#include "image_macros.S"
/* P8 SWAPCOLOR, RAM version: by branchless xor selection.
The core action of this loop is to render full pixels while replacing any
occurrence of cmd.color_1 (x) with the value cmd.color_2 (y). As is often the
case, branching is too slow, so instead we use the fact that both x and y are
fixed to apply the identity
c ^ ((x ^ y) & -(c == x)) = (c == x ? y : c)
We materialize -(c == x) by subtracting a register from itself with subc
after the comparison (which is delightfully elegant), while (x ^ y) is pre-
computed. This way, the selection is performed in one [subc], one [and] and
one [xor] for a total of 3 EX slots. This is slower than NULL-cancelling
(which only takes 2 EX slots) but still better than symmetric alternatives.
Since we have a palette, we go one step further and compare against the index
but select against the palette entry, ie. we compute
palette[c] ^ ((palette[x] ^ y) & -(c == x)) = (c == x ? y : palette[c])
which allows the computation to occur in parallel with the palette access
and does not require the replacement value to be located at a valid index.
r0: [temporary]
r7: cmd.color_1
r8: palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
r9: Palette
r10: Holds (x ^ y) & -(c == x) during selection */
/* GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR
   Expands to the complete body of one P8 swapcolor variant: parameter setup,
   per-pixel loop and exit.
     HFLIP:   assemble-time flag; when non-zero, the extra r5/r6 adjustments
              in the .if block below are emitted.
     OUT_DIR: signed immediate added to the output pointer r5 after each
              pixel (the entry point passes 2 or -2, ie. one 16-bit pixel
              forward or backward).
   On entry r8 is read as a pointer to the consecutive fields cmd.palette
   (32-bit), then cmd.edge_2, cmd.color_1 and cmd.color_2 (16-bit each), as
   named by the comments below. START, END and EPILOGUE are not defined in
   this file; they come from image_macros.S, and the local labels 2: and 3:
   are presumably the loop anchors they expect (TODO confirm there). */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR
mov.l @r8+, r9 /* cmd.palette */
mov.w @r8+, r0 /* cmd.edge_2 (don't care) */
mov.w @r8+, r7 /* cmd.color_1 */
/* r10 is used as scratch by the loop: save it on the stack (r15) here and
   restore it after END. */
mov.l r10, @-r15
/* Reduce color_1 to its sign-extended low byte so that it compares equal to
   the pixel bytes read with mov.b in the loop (mov.b loads sign-extend). */
exts.b r7, r7
mov r7, r0
mov.w @r8, r8 /* cmd.color_2 */
/* r0 = 2 * color_1: byte offset of the 16-bit palette entry */
add r0, r0
mov.w @(r0, r9), r0 /* r0 = palette[color_1] */
xor r0, r8 /* r8 = palette[color_1] ^ color_2, constant for the whole loop */
.if \HFLIP
/* Flipped variant (the only HFLIP expansion in this file uses OUT_DIR = -2):
   r5 is moved by 2*r2 - 2 bytes and r6 is increased by 4*r2.
   NOTE(review): this reads as "start r5 on the last pixel of the row and
   compensate the stride kept in r6", assuming r2 is the row width in pixels
   and r6 the output stride prepared by image_macros.S -- confirm there. */
add #-2, r5
mov r2, r0
shll r0 /* r0 = 2 * r2 */
add r0, r5
shll r0 /* r0 = 4 * r2 */
add r0, r6
.endif
START
/* Per-pixel body. r3 is the input pointer (post-incremented by one byte per
   pixel) and r5 the output pointer (one 16-bit store per pixel). */
2: mov.b @r3+, r0 /* r0 = c, the next palette index (sign-extended) */
cmp/eq r0, r7 /* T = (c == color_1) */
add r0, r0 /* r0 = 2 * c; add does not modify T */
subc r10, r10 /* r10 = r10 - r10 - T = -(c == color_1): 0 or all ones */
mov.w @(r0, r9), r0 /* r0 = palette[c] */
and r8, r10 /* r10 = (palette[x] ^ y) & -(c == x) */
xor r10, r0 /* low 16 bits of r0 = (c == x) ? y : palette[c] */
mov.w r0, @r5 /* write the pixel; only the low 16 bits are stored */
3: add #\OUT_DIR, r5 /* step the output pointer by one pixel */
END
mov.l @r15+, r10 /* restore the r10 saved during setup */
EPILOGUE
.endm
/* Entry point: selects one of the two expansions of GEN_SWAPCOLOR_LOOP from
   bit 0 of r0. When the bit is clear, execution falls into the first
   expansion (HFLIP = 0, r5 advances by +2 per pixel); when it is set, bf jumps
   to 9: for the second expansion (HFLIP = 1, r5 advances by -2 per pixel).
   NOTE(review): the meaning of r0 and of the other registers on entry is
   defined by the caller and image_macros.S, not by this file. Each expansion
   ends with EPILOGUE, which presumably returns so that the first expansion
   does not run on into 9: -- confirm in image_macros.S. */
_gint_image_p8_swapcolor:
tst #1, r0 /* T = ((r0 & 1) == 0) */
bf 9f /* T clear, ie. bit 0 set: use the HFLIP variant */
GEN_SWAPCOLOR_LOOP 0, 2
9: GEN_SWAPCOLOR_LOOP 1, -2