gint/src/render-cg/image/image_rgb16_normal.S

.global _gint_image_rgb16_normal
#include "image_macros.S"

/* RGB16 Opaque rendering, RAM version: by longword access.

   This function of the image renderer is designed for the RAM model only. At
   default overclock levels, the RAM can register a write every 13-14 cycles,
   regardless of size. Since this amount of time is more than enough to build a
   target longword regardless of alignment and geometry considerations, the
   main and only focus of this function is to only write longwords.

   Since longwords can only be written at 4-aligned addresses and always make
   pairs of pixels, there are variations on the loop depending on the rendered
   width and destination. These are marked with the following convention:

   * w1 / w2 denotes the parity of the command width;
   * o2 / o4 denotes the alignment of the output.

   There is a forward and a backward variation for all four combinations of
   these parameters, noted F_ and B_ in label names. Some word-based variations
   are provided for width ≤ 8, which is just a way to ensure that the longword-
   based loops always have at least one iteration, since they're implemented as
   do/while.

   The loops themselves are nowhere near tight on the CPU side and entirely
   bottlenecked by the RAM, hence the simplicity and complete disregard for
   superscalar parallelism. */

/* Entry point of the opaque RGB16 renderer: selects one of the four copy
   routines (forward/backward x word/longword) and branches to it. Every path
   below ends in a branch, so control never falls through into the loops.

   Registers as read by this dispatcher:
     r0  bit 0 selects the backward variants (presumably a horizontal-flip
         flag prepared by the caller -- TODO confirm)
     r2  width of the rendered area, in pixels
     r5  output pointer
     r6  only adjusted on the backward path; presumably the output stride
         that the END macro applies after each line -- TODO confirm */
_gint_image_rgb16_normal:
	/* We use word copy for width ≤ 8; this is to ensure that there is at
	   least one longword in the non-trivial loop, simplifying checks */
	/* T = 1 iff bit 0 of r0 is clear, i.e. forward rendering */
	tst	#1, r0
	mov	#8, r0

	/* bf.s decides on the T bit left by tst. Its delay slot then replaces T
	   with (8 >= width), which both directions test right afterwards. */
	bf.s	.BACKWARD
	cmp/ge	r2, r0

.FORWARD:
	/* T = 1 here means width <= 8: use the word-by-word loop */
	bt	_FORWARD_WORD_COPY
	nop

	bra	_FORWARD_LONG_COPY
	nop

.BACKWARD:
	/* r0 = 2 * width, the size of one line of 16-bit pixels in bytes.
	   Adding it to r5 moves the output pointer to the end of the line, which
	   is where the backward loops start since they store with @-r5. */
	mov	r2, r0
	add	r0, r0
	add	r0, r5
	/* r0 = 4 * width. None of these add instructions modify T, so the
	   branch below still tests (8 >= width) from the delay slot above. */
	add	r0, r0

	/* Delay slot: r6 += 4 * width. Presumably this compensates for r5
	   moving back by one line width where the forward loops move forward
	   by one -- TODO confirm against the END macro. */
	bt.s	_BACKWARD_WORD_COPY
	add	r0, r6

	bra	_BACKWARD_LONG_COPY
	nop

/* Forward copy for narrow areas; the dispatcher only branches here when
   width <= 8. Each inner iteration moves a single 16-bit pixel from the input
   (r3, post-incremented) to the output (r5, post-incremented), using the DSP
   register x0 as the temporary.

   START, END and EPILOGUE are macros from image_macros.S, which is not
   visible from this file. Labels 2: and 3: presumably mark the first and last
   instruction of the inner loop for START -- TODO confirm. */
_FORWARD_WORD_COPY:
	START
2:	movs.w	@r3+, x0
3:	movs.w	x0, @r5+
	END
	EPILOGUE

/* Backward copy for narrow areas; the dispatcher only branches here when
   width <= 8, after moving r5 to the end of the output line. The input is
   still read in increasing order (@r3+) but each pixel is stored with
   pre-decrement (@-r5), so the line comes out mirrored.

   START, END and EPILOGUE are macros from image_macros.S, which is not
   visible from this file. Labels 2: and 3: presumably mark the first and last
   instruction of the inner loop for START -- TODO confirm. */
_BACKWARD_WORD_COPY:
	START
2:	movs.w	@r3+, x0
3:	movs.w	x0, @-r5
	END
	EPILOGUE

/* Forward longword copy, reached from the dispatcher when width > 8.
   This first stage splits on the parity of the width:
   - shlr moves bit 0 of the width into T and leaves r2 = width / 2 (rounded
     down), i.e. the number of whole pixel pairs;
   - r0 = 2 is the mask used by the alignment tests at .F_w1 / .F_w2 (mov
     does not modify T).
   Odd widths go to .F_w1; even widths fall through to .F_w2. */
_FORWARD_LONG_COPY:
	shlr	r2		/* Test width parity */
	mov	#2, r0

	/* bt has no delay slot; the nop below is not one (presumably padding --
	   TODO confirm) */
	bt	.F_w1
	nop

/* Even width, forward. With r0 = 2, tst sets T = 1 iff bit 1 of the output
   pointer is clear; a set bit means the output is only 2-aligned and is
   handled by .F_w2o2. */
.F_w2:	tst	r0, r5		/* Test alignment of output */
	bf	.F_w2o2

/* Even width, 4-aligned output: the whole line is made of longwords.
   r2 = width / 2 at this point, presumably the inner-loop count consumed by
   START -- TODO confirm against image_macros.S. */
.F_w2o4:
	START
	/* Load two consecutive input pixels: first in r0, second in r7 */
2:	mov.w	@r3+, r0
	mov.w	@r3+, r7
	/* Pack them as r7 = (first pixel << 16) | second pixel: shll16 moves the
	   second pixel to the upper half, then xtrct brings it back down while
	   inserting the low 16 bits of r0 above it */
	shll16	r7
	xtrct	r0, r7
	/* Single longword store for both pixels, then advance the output */
	mov.l	r7, @r5
3:	add	#4, r5
	END
	EPILOGUE

/* Even width, output only 2-aligned (forward). Each line is written as:
     one leading word, width / 2 - 1 longwords, one trailing word.
   The leading word brings r5 to a 4-aligned address for the longword loop.
   r2 is lowered by one before START because one pixel pair is split between
   the leading and trailing words. */
.F_w2o2:
	add	#-1, r2
	START
	/* Leading pixel, written as a single word */
	mov.w	@r3+, r0
	mov.w	r0, @r5
	add	#2, r5
	/* Inner loop: pack two pixels as (first << 16) | second and store them
	   with one longword write */
2:	mov.w	@r3+, r0
	mov.w	@r3+, r7
	shll16	r7
	xtrct	r0, r7
	mov.l	r7, @r5
3:	add	#4, r5
	/* Trailing pixel, written as a single word */
	mov.w	@r3+, r0
	mov.w	r0, @r5
	add	#2, r5
	END
	EPILOGUE

/* Odd width, forward. With r0 = 2, tst sets T = 1 iff bit 1 of the output
   pointer is clear; a set bit means the output is only 2-aligned and is
   handled by .F_w1o2. */
.F_w1:	tst	r0, r5		/* Test alignment of output */
	bf	.F_w1o2

/* Odd width, 4-aligned output (forward). Each line is written as:
     (width - 1) / 2 longwords, then one trailing word for the last pixel.
   r2 already holds (width - 1) / 2 from the shlr in _FORWARD_LONG_COPY. */
.F_w1o4:
	START
	/* Inner loop: pack two pixels as (first << 16) | second and store them
	   with one longword write */
2:	mov.w	@r3+, r0
	mov.w	@r3+, r7
	shll16	r7
	xtrct	r0, r7
	mov.l	r7, @r5
3:	add	#4, r5
	/* Trailing pixel, written as a single word */
	mov.w	@r3+, r0
	mov.w	r0, @r5
	add	#2, r5
	END
	EPILOGUE

/* Odd width, output only 2-aligned (forward). Each line is written as:
     one leading word, then (width - 1) / 2 longwords.
   The leading word brings r5 to a 4-aligned address, and the remaining even
   number of pixels fills whole longwords. */
.F_w1o2:
	START
	/* Leading pixel, written as a single word */
	mov.w	@r3+, r0
	mov.w	r0, @r5
	add	#2, r5
	/* Inner loop: pack two pixels as (first << 16) | second and store them
	   with one longword write */
2:	mov.w	@r3+, r0
	mov.w	@r3+, r7
	shll16	r7
	xtrct	r0, r7
	mov.l	r7, @r5
3:	add	#4, r5
	END
	EPILOGUE

/* Backward longword copy, reached from the dispatcher when width > 8. On
   entry r5 has already been moved to the end of the output line by
   .BACKWARD. This first stage splits on the parity of the width:
   - shlr moves bit 0 of the width into T and leaves r2 = width / 2 (rounded
     down), i.e. the number of whole pixel pairs;
   - r0 = 2 is the mask used by the alignment tests at .B_w1 / .B_w2 (mov
     does not modify T).
   Odd widths go to .B_w1; even widths fall through to .B_w2. */
_BACKWARD_LONG_COPY:
	shlr	r2		/* Test width parity */
	mov	#2, r0

	/* bt has no delay slot; the nop below is not one (presumably padding --
	   TODO confirm) */
	bt	.B_w1
	nop

/* Even width, backward. r5 is the end-of-line pointer here, so this tests
   the alignment of the end of the line: T = 1 iff bit 1 of r5 is clear. A
   set bit means a 2-aligned end and is handled by .B_w2o2. */
.B_w2:	tst	r0, r5		/* Test alignment of output */
	bf	.B_w2o2

/* Even width, 4-aligned end of line: the whole line is made of longwords
   stored with pre-decrement. r2 = width / 2 at this point, presumably the
   inner-loop count consumed by START -- TODO confirm. */
.B_w2o4:
	START
	/* Load two consecutive input pixels: first in r0, second in r7 */
2:	mov.w	@r3+, r0
	mov.w	@r3+, r7
	/* Pack them as r0 = (second pixel << 16) | first pixel, i.e. in the
	   opposite order from the forward loops, so the pair is mirrored */
	shll16	r0
	xtrct	r7, r0
	/* Move r5 back by 4, then store both pixels at once */
3:	mov.l	r0, @-r5
	END
	EPILOGUE

/* Even width, end of line only 2-aligned (backward). Going down from the
   end of the line, each line is written as:
     one leading word, width / 2 - 1 longwords, one trailing word.
   The leading word brings r5 to a 4-aligned address for the longword loop.
   r2 is lowered by one before START because one pixel pair is split between
   the leading and trailing words. */
.B_w2o2:
	add	#-1, r2
	START
	/* Leading pixel, written as a single word with pre-decrement */
	mov.w	@r3+, r0
	mov.w	r0, @-r5
	/* Inner loop: pack two pixels as (second << 16) | first and store them
	   with one pre-decremented longword write */
2:	mov.w	@r3+, r0
	mov.w	@r3+, r7
	shll16	r0
	xtrct	r7, r0
3:	mov.l	r0, @-r5
	/* Trailing pixel, written as a single word with pre-decrement */
	mov.w	@r3+, r0
	mov.w	r0, @-r5
	END
	EPILOGUE

/* Odd width, backward. r5 is the end-of-line pointer here, so this tests
   the alignment of the end of the line: T = 1 iff bit 1 of r5 is clear. A
   set bit means a 2-aligned end and is handled by .B_w1o2. */
.B_w1:	tst	r0, r5		/* Test alignment of output */
	bf	.B_w1o2

/* Odd width, 4-aligned end of line (backward). Going down from the end of
   the line, each line is written as:
     (width - 1) / 2 longwords, then one trailing word for the last pixel.
   r2 already holds (width - 1) / 2 from the shlr in _BACKWARD_LONG_COPY. */
.B_w1o4:
	START
	/* Inner loop: pack two pixels as (second << 16) | first and store them
	   with one pre-decremented longword write */
2:	mov.w	@r3+, r0
	mov.w	@r3+, r7
	shll16	r0
	xtrct	r7, r0
3:	mov.l	r0, @-r5
	/* Trailing pixel, written as a single word with pre-decrement */
	mov.w	@r3+, r0
	mov.w	r0, @-r5
	END
	EPILOGUE

/* Odd width, end of line only 2-aligned (backward). Going down from the end
   of the line, each line is written as:
     one leading word, then (width - 1) / 2 longwords.
   The leading word brings r5 to a 4-aligned address, and the remaining even
   number of pixels fills whole longwords. */
.B_w1o2:
	START
	/* Leading pixel, written as a single word with pre-decrement */
	mov.w	@r3+, r0
	mov.w	r0, @-r5
	/* Inner loop: pack two pixels as (second << 16) | first and store them
	   with one pre-decremented longword write */
2:	mov.w	@r3+, r0
	mov.w	@r3+, r7
	shll16	r0
	xtrct	r7, r0
3:	mov.l	r0, @-r5
	END
	EPILOGUE