gint/src/render-cg/image/image_p4_swapcolor.S

176 lines
2.5 KiB
ArmAsm

.global _gint_image_p4_swapcolor
#include "image_macros.S"
/* P4 SWAPCOLOR, RAM version: by branchless xor selection.
I'm not sure whether this is the most optimized version for RAM. But it's
about 7-8% slower than bopti, and the effort of writing yet another
variation of P4's arduous loops doesn't seem worth it for a rare dynamic
effect. This is Azur's version.
See P8 SWAPCOLOR for an explanation of branchless xor selection.
r0: [temporary]
r7: Right edge pointer
r8: palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
r9: Palette
r10: Left edge pointer
r11: Holds (x ^ y) & -(c == x) during selection
r12: cmd.color_1
r13: [temporary]
r14: [temporary] (in outer loop: edge stride)
Spilled to stack:
@(-4,r15): Right edge value (pushed first)
@(-8,r15): Left edge value
@(-12,r15): Edge stride (pushed last)
(offsets are relative to r15 as it stands before these three pushes) */
/* GEN_SWAPCOLOR_LOOP: emits one complete variant of the P4 SWAPCOLOR
   renderer: setup, pixel loop, edge restore, register restore, EPILOGUE.

   Parameters:
     HFLIP    Non-zero to assemble the extra setup inside the .if block
     OUT_DIR  Immediate added to r5 after each input byte (pair of pixels)
     TMP1     Register receiving input bytes; each byte holds two 4-bit
              palette indices
     TMP2     Scratch register holding the shift count (-4), then the
              mask 0x1e
     OFF1     Displacement from r5 for the store of the low-nibble pixel
     OFF2     Displacement from r5 for the store of the high-nibble pixel,
              applied after r5 has moved by OUT_DIR

   For a palette index c, the value stored is
     palette[c] ^ (r8 & -(c == color_1))
   with r8 = palette[color_1] ^ color_2, which gives color_2 when
   c == color_1 and palette[c] otherwise, without branching.

   START, END and EPILOGUE come from image_macros.S and are not visible
   here. NOTE(review): local labels 2: and 3: presumably delimit the loop
   body for START/END; confirm against image_macros.S.
   NOTE(review): the nop instructions do no work; presumably they are
   padding for instruction pairing; confirm before removing any.
   NOTE(review): r2 looks like the width in pixels, r3 the input pointer,
   r4 the input stride, r5 the output pointer, r6 the output stride, r8 a
   pointer into the command structure and r10 the first edge offset on
   entry; all are set up by a caller that is not visible here. */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
shlr r2 /* r2 >>= 1 */
nop
add r10, r10 /* r10 *= 2: byte offset for 16-bit pixels */
nop
mov.l @r8+, r9 /* cmd.palette */
mov r2, r0
mov.w @r8+, r7 /* cmd.edge_2 */
shll2 r0 /* r0 = 4 * r2 */
mov.l r12, @-r15 /* Save r12 */
shll r7 /* r7 = edge_2 * 2: byte offset */
mov.l r13, @-r15 /* Save r13 */
add r5, r7 /* r7 = r5 + 2 * edge_2: right edge pointer */
mov.w @r8+, r13 /* cmd.color_1 */
add r5, r10 /* r10 = r5 + byte offset: left edge pointer */
mov.l r11, @-r15 /* Save r11 */
add #-4, r5 /* Pre-bias r5 by -4 bytes; OFF1/OFF2 account for it */
mov r13, r12 /* r12 = color_1 (doubled further down) */
shll r13 /* r13 = color_1 * 2: byte offset into the palette */
mov.l r14, @-r15 /* Save r14 */
add r9, r13 /* r13 = address of palette[color_1] */
mov.w @r8, r8 /* cmd.color_2 */
add #-1, r4 /* Input stride compensation for pipelining */
mov.w @r13, r13 /* r13 = palette[color_1] */
mov r0, r14
add r6, r14 /* r14 = r0 + r6: edge stride */
nop
xor r13, r8 /* r8 = palette[color_1] ^ color_2 */
nop
/* HFLIP only: advance r5 by r0 bytes and add 2 * r0 to r6.
   NOTE(review): presumably this starts each row at its far end because
   OUT_DIR is negative in the HFLIP instantiation; confirm. */
.if \HFLIP
add r0, r5
nop
shll r0
nop
add r0, r6
nop
.endif
shll r12 /* Compare color_1 * 2 with shifted values */
nop
START
mov.b @r3+, \TMP1 /* Pre-load the first input byte */
nop
/* The pixels under both edge pointers are saved here and written back
   after the loop. NOTE(review): presumably because the loop always writes
   whole pixel pairs and may overwrite them; confirm. */
mov.w @r7, r0 /* Save right edge */
nop
mov.l r0, @-r15
shll \TMP1 /* TMP1 = byte << 1, so that index * 2 sits under mask 0x1e */
mov.w @r10, r0 /* Save left edge */
nop
mov.l r0, @-r15
nop
/* Spill the edge stride: the invocations in this file pass r14 as TMP2,
   which the loop overwrites */
mov.l r14, @-r15
nop
/* Loop body: one input byte in, two 16-bit pixels out */
2: mov \TMP1, r0
and #0x1e, r0 /* r0 = 2 * (low nibble) */
cmp/eq r0, r12 /* T = (index == color_1) */
mov #-4, \TMP2 /* Shift count: negative means right shift for shld */
subc r11, r11 /* r11 = -T: all ones on a match, 0 otherwise */
nop
mov.w @(r0,r9), r0 /* r0 = palette[index] */
and r8, r11 /* r11 = r8 on a match, 0 otherwise */
shld \TMP2, \TMP1 /* TMP1 >>= 4: bring the high nibble under the mask */
mov #0x1e, \TMP2
xor r11, r0 /* r0 = color_2 on a match, palette[index] otherwise */
mov.w r0, @(\OFF1,r5) /* Store the low-nibble pixel */
and \TMP2, \TMP1 /* TMP1 = 2 * (high nibble) */
nop
cmp/eq \TMP1, r12 /* Same selection for the high nibble */
nop
subc r11, r11
mov \TMP1, r0
add #\OUT_DIR, r5 /* Move the output pointer by OUT_DIR bytes */
mov.b @r3+, \TMP1 /* Pre-load the next input byte */
and r8, r11
mov.w @(r0,r9), r0
shll \TMP1 /* Prepare the next byte as above */
nop
xor r11, r0
3: mov.w r0, @(\OFF2,r5) /* Store the high-nibble pixel */
mov.l @r15+, r14 /* Reload the edge stride */
nop
mov.l @r15+, r0 /* Saved left edge value */
nop
mov.w r0, @r10 /* Restore left edge */
add r14, r10 /* Advance the left edge pointer by the edge stride */
mov.l @r15+, r0 /* Saved right edge value */
nop
mov.w r0, @r7 /* Restore right edge */
add r14, r7 /* Advance the right edge pointer by the edge stride */
END
/* Restore registers. r14, r11, r13 and r12 were pushed during setup in
   this macro; r10 is not pushed here. NOTE(review): presumably the caller
   pushed r10 before transferring control; confirm. */
mov.l @r15+, r14
mov.l @r15+, r11
mov.l @r15+, r13
mov.l @r15+, r12
mov.l @r15+, r10
EPILOGUE
.endm
/* Entry point of the P4 SWAPCOLOR renderer. Bit 0 of r0 selects which of
   the two expansions of GEN_SWAPCOLOR_LOOP runs: when the bit is clear,
   execution falls through into the HFLIP=0 expansion; when it is set, bf
   branches to label 9, the HFLIP=1 expansion.
   NOTE(review): r0 presumably carries the horizontal-flip flag from the
   caller, and EPILOGUE (image_macros.S) presumably returns so the first
   expansion never runs into the second; confirm both. */
_gint_image_p4_swapcolor:
tst #1, r0 /* T = 1 when bit 0 of r0 is clear */
bf 9f /* Bit set (T = 0): take the HFLIP variant */
/* Arguments: HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2 */
GEN_SWAPCOLOR_LOOP 0, 4, r13, r14, 6, 0
9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r14, 0, 6