gint/src/render-cg/image/image_p8_clearbg.S

.global _gint_image_p8_clearbg

#include "image_macros.S"

/* P8 CLEARBG, RAM version: by NULL canceling.

   This function is one of the few that can still be bottlenecked by the CPU
   in the RAM model. This is because transparent pixels can be skipped over
   as fast as the CPU allows, without worrying about the writing speed of
   the RAM.

   For some reason that I have yet to uncover, branches are way slower than
   the SH4AL-DSP manual suggests, and even slower inside DSP loops. This
   completely favors branchless methods, and the one used here is one I call
   "NULL canceling".

   The idea is that a write can be turned into a no-op either by writing the
   value that is already in memory, or by writing somewhere else. The first
   option is pretty slow, especially because it requires a selection
   operation (rn = condition ? rn : rm), which is essentially the most
   general branchless trick.

   NULL canceling abuses the fact that NULL is mapped read-only on the
   platform to turn the target pointer into NULL with the following identity:

     target & -(condition) = (condition ? target : NULL)

   The term -(condition) is materialized with an [addc] (adding -1, 0 and
   the T bit) right after the test, then the result is applied onto the
   target pointer with [and], completing the trick in only 2 EX instructions.
   It does take more registers, and prevents us from using pre-decrement on
   the target.

   r0:  [temporary]
   r7:  Right edge pointer
   r8:  Alpha value
   r9:  Palette
   r10: Nullable output pointer
   r11: 0 (to neutralize addc during NULL canceling)
   r12: Right edge stride
   r13: [temporary]
   r14: [temporary]

   Spilled to stack:
   @(-4,r15): Right edge value */
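
/* Illustration only (not part of the original source): the NULL-canceling
   store could be sketched in C roughly as follows, assuming stores to the
   read-only NULL page are silently dropped. The names (out, pixel, alpha,
   palette) are placeholders for this sketch, not identifiers from the
   renderer.

     uint16_t *out;                          // current output position
     int opaque = (pixel != alpha);          // 1 if the pixel must be drawn
     uintptr_t mask = -(uintptr_t)opaque;    // all-ones if opaque, 0 if not
     uint16_t *target = (uint16_t *)((uintptr_t)out & mask);
     *target = palette[pixel];               // lands in the NULL page when
                                             // transparent, so nothing changes
*/
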
.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
    mov.l   @r8+, r9        /* cmd.palette */
    shlr    r2
    mov.w   @r8+, r7        /* cmd.edge_2 */
    mov     r2, r0
    mov.l   r12, @-r15      /* Save callee-saved registers */
    shll2   r0
    mov.l   r10, @-r15
    shll    r7
    mov.l   r11, @-r15
    add     r5, r7          /* r7 = right edge pointer */
    mov     r0, r12
    add     r6, r12         /* r12 = right edge stride */
    mov.l   r13, @-r15
    add     #-4, r5
    mov.l   r14, @-r15
    add     #-2, r4         /* Input stride compensation for pipelining */
    mov.w   @r8, r8         /* cmd.color_1 ≤ 255, thus zero-extended */
    mov     #0, r11         /* r11 = 0, feeds addc with just the T bit */

    /* With HFLIP, start each row at its right end and adjust the output
       stride so the next row is still reached when walking backwards */
    .if \HFLIP
    add     r0, r5
    nop
    shll    r0
    nop
    add     r0, r6
    nop
    .endif

    exts.b  r8, r8          /* Sign-extend alpha: pixels are loaded sign-extended by mov.b */
    nop
    START

    /* Pipeline prologue: load the first two pixels of the row */
    mov.b   @r3+, \TMP2
    nop
    mov.w   @r7, r0         /* Save right edge */
    nop
    mov.l   r0, @-r15
    cmp/eq  \TMP2, r8       /* T = 1 if the pixel is transparent */
    mov.b   @r3+, \TMP1
    add     \TMP2, \TMP2    /* Index ×2: palette entries are 2 bytes */

2:  mov     #-1, r10
    addc    r11, r10        /* r10 is now the mask */
    and     r5, r10         /* r10 = r5 if opaque, NULL if transparent */
    mov     \TMP2, r0
    cmp/eq  \TMP1, r8
    mov.w   @(r0, r9), r0   /* Palette lookup */
    mov.w   r0, @(\OFF1, r10)  /* Write, or hit the read-only NULL page */
    add     #\OUT_DIR, r5
    mov.b   @r3+, \TMP2
    nop
    mov     #-1, r10
    addc    r11, r10
    add     \TMP1, \TMP1
    mov     \TMP1, r0
    mov.b   @r3+, \TMP1
    and     r5, r10
    mov.w   @(r0, r9), r0
    cmp/eq  \TMP2, r8
    mov.w   r0, @(\OFF2, r10)
3:  add     \TMP2, \TMP2

    mov.l   @r15+, r0
    nop
    mov.w   r0, @r7         /* Restore right edge */
    add     r12, r7         /* Step to the next row's right edge */

    END
    mov.l   @r15+, r14      /* Restore callee-saved registers */
    mov.l   @r15+, r13
    mov.l   @r15+, r11
    mov.l   @r15+, r10
    mov.l   @r15+, r12
    EPILOGUE
.endm
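
/* The two expansions below differ only in scan direction: the first renders
   each row left-to-right (OUT_DIR=+4, writes at offsets 4 then 2 from the
   nullable pointer), the second right-to-left (OUT_DIR=-4, offsets 2 then 4).
   The offsets compensate for the output pointer moving between the two
   writes of each pixel pair. */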
_gint_image_p8_clearbg:
    tst     #1, r0          /* Bit 0 of r0 selects the horizontally flipped variant */
    bf      9f
    GEN_CLEARBG_LOOP 0, 4, r13, r14, 4, 2
9:  GEN_CLEARBG_LOOP 1, -4, r13, r14, 2, 4