gint/src/render-cg/image/image_p8_clearbg.S

.global _gint_image_p8_clearbg

#include "image_macros.S"

/* P8 CLEARBG, RAM version: by NULL canceling.

   This function is one of the few that can still be bottlenecked by the CPU
   in the RAM model. This is because transparent pixels can be skipped over
   as fast as the CPU allows, without worrying about the writing speed of
   the RAM.

   For some reason that I have yet to uncover, branches are way slower than
   the SH4AL-DSP manual suggests, and even slower inside DSP loops. This
   completely favors branchless methods, and the one used here is one I call
   "NULL canceling".

   The idea is that a write can be turned into a no-op either by writing the
   value that is already in memory, or by writing somewhere else. The first
   option is pretty slow, especially because it requires a selection
   operation (rn = condition ? rn : rm), which is essentially the most
   general branchless trick.

   NULL canceling abuses the fact that NULL is mapped read-only on the
   platform to turn the target pointer into NULL with the following identity:

     target & -(condition) = (condition ? target : NULL)

   The term -(condition) is materialized with an [addc] (adding -1, 0 and
   the T bit) right after the test, then the result is applied onto the
   target pointer with [and], completing the trick in only 2 EX instructions.
   It does take more registers, and prevents us from using pre-decrement on
   the target.

   r0:  [temporary]
   r7:  Right edge pointer
   r8:  Alpha value
   r9:  Palette
   r10: Nullable output pointer
   r11: 0 (to neutralize addc during NULL canceling)
   r12: Right edge stride
   r13: [temporary]
   r14: [temporary]

   Spilled to stack:
   @(-4,r15): Right edge value */
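
/* Illustration only (not part of the original source): the NULL-canceling
   store could be sketched in C roughly as follows, assuming stores to the
   read-only NULL page are silently dropped. The names (out, pixel, alpha,
   palette) are placeholders for this sketch, not identifiers from the
   renderer.

     uint16_t *out;                          // current output position
     int opaque = (pixel != alpha);          // 1 if the pixel must be drawn
     uintptr_t mask = -(uintptr_t)opaque;    // all-ones if opaque, 0 if not
     uint16_t *target = (uint16_t *)((uintptr_t)out & mask);
     *target = palette[pixel];               // lands in the NULL page when
                                             // transparent, so nothing changes
*/
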
.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
    mov.l   @r8+, r9        /* cmd.palette */
    shlr    r2
    mov.w   @r8+, r7        /* cmd.edge_2 */
    mov     r2, r0
    mov.l   r12, @-r15      /* Save callee-saved registers */
    shll2   r0
    mov.l   r10, @-r15
    shll    r7
    mov.l   r11, @-r15
    add     r5, r7          /* r7 = right edge pointer */
    mov     r0, r12
    add     r6, r12         /* r12 = right edge stride */
    mov.l   r13, @-r15
    add     #-4, r5
    mov.l   r14, @-r15
    add     #-2, r4         /* Input stride compensation for pipelining */
    mov.w   @r8, r8         /* cmd.color_1 ≤ 255, thus zero-extended */
    mov     #0, r11         /* r11 = 0, feeds addc with just the T bit */

    /* With HFLIP, start each row at its right end and adjust the output
       stride so the next row is still reached when walking backwards */
    .if \HFLIP
    add     r0, r5
    nop
    shll    r0
    nop
    add     r0, r6
    nop
    .endif

    exts.b  r8, r8          /* Sign-extend alpha: pixels are loaded sign-extended by mov.b */
    nop
    START

    /* Pipeline prologue: load the first two pixels of the row */
    mov.b   @r3+, \TMP2
    nop
    mov.w   @r7, r0         /* Save right edge */
    nop
    mov.l   r0, @-r15
    cmp/eq  \TMP2, r8       /* T = 1 if the pixel is transparent */
    mov.b   @r3+, \TMP1
    add     \TMP2, \TMP2    /* Index ×2: palette entries are 2 bytes */

2:  mov     #-1, r10
    addc    r11, r10        /* r10 is now the mask */
    and     r5, r10         /* r10 = r5 if opaque, NULL if transparent */
    mov     \TMP2, r0
    cmp/eq  \TMP1, r8
    mov.w   @(r0, r9), r0   /* Palette lookup */
    mov.w   r0, @(\OFF1, r10)  /* Write, or hit the read-only NULL page */
    add     #\OUT_DIR, r5
    mov.b   @r3+, \TMP2
    nop
    mov     #-1, r10
    addc    r11, r10
    add     \TMP1, \TMP1
    mov     \TMP1, r0
    mov.b   @r3+, \TMP1
    and     r5, r10
    mov.w   @(r0, r9), r0
    cmp/eq  \TMP2, r8
    mov.w   r0, @(\OFF2, r10)
3:  add     \TMP2, \TMP2

    mov.l   @r15+, r0
    nop
    mov.w   r0, @r7         /* Restore right edge */
    add     r12, r7         /* Step to the next row's right edge */

    END
    mov.l   @r15+, r14      /* Restore callee-saved registers */
    mov.l   @r15+, r13
    mov.l   @r15+, r11
    mov.l   @r15+, r10
    mov.l   @r15+, r12
    EPILOGUE
.endm
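
/* The two expansions below differ only in scan direction: the first renders
   each row left-to-right (OUT_DIR=+4, writes at offsets 4 then 2 from the
   nullable pointer), the second right-to-left (OUT_DIR=-4, offsets 2 then 4).
   The offsets compensate for the output pointer moving between the two
   writes of each pixel pair. */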
_gint_image_p8_clearbg:
    tst     #1, r0          /* Bit 0 of r0 selects the horizontally flipped variant */
    bf      9f
    GEN_CLEARBG_LOOP 0, 4, r13, r14, 4, 2
9:  GEN_CLEARBG_LOOP 1, -4, r13, r14, 2, 4