MandAzur/src/iteration.S

/* Iteration of a quadratic function z -> z²+c on complex inputs. This version
   is by Lephe (2023) and fairly optimized.

   Notes for optimizers:
   - The entire loop is an obvious EX chain, the only stuff that parallelizes
     are the sts and the branches.
   - I gained 15% speed from the original by getting as many sts to parallelize
     as possible, which required tightening up dmuls.l -> delay -> sts chains.
   - The code is strongly affected by alignment, is somehow very sensitive to
     the initialization code, and does not behave as I expect it to in most
     ways. Probably none of it executes as I envision. */

/* Export the routine and assemble it into the code section, starting on a
   4-byte boundary. */
.global _quadratic_iteration_32
.text
.balign 4

/* Register aliases for the iteration state of _quadratic_iteration_32.
   r0-r3 are rewritten on every pass of the loop; r4 and r5 are only read.
   Registers the routine uses without an alias:
     r6  initial Re(z) on entry, then reloaded from the stack with the value
         that |z|² is compared against
     r7  initial Im(z) on entry, then reloaded from the stack with the
         counter that is decremented once per pass
     r8  scratch for the high word (MACH) of a product; saved/restored
     r9  scratch for the high word (MACH) of zy²; saved/restored */
#define _zx    r0   /* Re(z) */
#define _zy    r1   /* Im(z) */
#define _zx2   r2   /* _zx² */
#define _zy2   r3   /* _zy² */
#define _cx    r4   /* Re(c) */
#define _cy    r5   /* Im(c) */

/* _quadratic_iteration_32: iterate z -> z²+c until |z|² reaches a threshold
   or the iteration counter runs out.

   Inputs, as read by the code below:
     r4, r5              Re(c), Im(c) (never written)
     r6, r7              initial Re(z), Im(z)
     1st word on stack   threshold; compared (signed) against the sum of the
                         high words of zx² and zy²
     2nd word on stack   iteration counter, decremented once per pass
   Output:
     r0                  value left in the counter on exit
   Saved and restored: r8, r9. Overwritten: r0-r3, r6, r7, MACH, MACL, T.

   NOTE(review): presumably called from C as something like
     int quadratic_iteration_32(int32_t cx, int32_t cy, int32_t zx,
                                int32_t zy, int32_t t2, int steps);
   confirm against the caller.

   Number format: every product is reduced with xtrct, which keeps bits
   16..47 of the 64-bit MACH:MACL result. That preserves the format of the
   operands when they carry 16 fractional bits (NOTE(review): 16:16 fixed
   point assumed, confirm with the caller). MACH alone is then the integer
   part of the product, which is what the escape test uses.

   Return value: the counter is decremented in the delay slot of the escape
   branch, so it is decremented on the escaping pass as well. A result of 0
   therefore means either that the counter ran out or that the threshold
   was reached on the last allowed pass.

   NOTE(review): the counter is tested only after being decremented, so a
   counter of 0 on entry wraps around instead of returning immediately. */
_quadratic_iteration_32:
	/* Initialize zx and zy from arguments, r6/r7 from stack. Compute the
	   initial values of zx² and zy². Start the loop pipeline by preparing
	   the multiplication zx⋅zy.
	   Instructions are laid out in pairs separated by blank lines;
	   presumably these are the intended dual-issue pairs (see the notes
	   at the top of the file). Do not reorder without measuring. */

	mov.l	r8, @-r15		/* save r8 (scratch below) */
	dmuls.l	r6, r6			/* MACH:MACL = zx * zx */

	mov.l	r9, @-r15		/* save r9 (scratch below) */
	mov	r6, _zx

	sts	mach, r8		/* r8 = high word of zx² */
	mov	r7, _zy

	sts	macl, _zx2		/* low word of zx² */
	dmuls.l	r7, r7			/* MACH:MACL = zy * zy */

	/* Two words were pushed above, so the first stack argument is now at
	   offset 8 and the second at offset 12. */
	mov.l	@(8,r15), r6		/* r6 = threshold */
	xtrct	r8, _zx2		/* _zx2 = bits 16..47 of zx² */

	sts	mach, r9		/* r9 = high word of zy² */
	nop				/* filler; NOTE(review): presumably
					   kept for pairing/alignment */

	sts	macl, _zy2		/* low word of zy² */
	dmuls.l	_zx, _zy		/* start zx * zy for the first pass */

	mov.l	@(12,r15), r7		/* r7 = iteration counter */
	xtrct	r9, _zy2		/* _zy2 = bits 16..47 of zy² */

	/* Loop invariant on entry to .loop: _zx, _zy hold z; _zx2, _zy2 hold
	   zx², zy²; the product zx * zy has been started in MAC. */
.loop:
	/* We want to compute z²+c:
	     (zx + i·zy)(zx + i·zy) + (cx + i⋅cy)
	     = (zx² - zy² + cx) + i(2⋅zx⋅zy + cy)
	   Due to pipelining, zx * zy is currently being computed in MAC. */

	/* Update zx while retrieving zx⋅zy and preparing the next zx² */
	sts	mach, r8		/* r8 = high word of zx * zy */
	sub	_zy2, _zx2		/* _zx2 = zx² - zy² */

	sts	macl, _zy		/* low word of zx * zy; the old zy is
					   no longer needed */
	add	_cx, _zx2		/* _zx2 = zx² - zy² + cx = new Re(z) */

	mov	_zx2, _zx		/* _zx = new Re(z) */
	dmuls.l	_zx, _zx		/* start the new zx² */

	/* Update zy and zx² while preparing the next zy² */
	xtrct	r8, _zy			/* _zy = bits 16..47 of zx * zy */
	sts	mach, r8		/* r8 = high word of the new zx² */

	shll	_zy			/* _zy = 2 * zx * zy */
	sts	macl, _zx2		/* low word of the new zx² */

	add	_cy, _zy		/* _zy = 2 * zx * zy + cy = new Im(z) */

	/* r1 and r2 below are _zy and _zx2 written without their aliases. */
	dmuls.l	r1, r1			/* start the new zy² */
	xtrct	r8, r2			/* _zx2 = bits 16..47 of the new zx²;
					   r8 itself is left unchanged */

	/* Update zy² and compute |z|² = zx² + zy² in 32:0 format. Compare |z|²
	   to t² and return if the threshold is reached.
	   The 32:0 value is the sum of the two MACH words, i.e. each square
	   is truncated to its integer part before the addition. */
	sts	mach, r9		/* r9 = high word of the new zy² */
	sts	macl, _zy2		/* low word of the new zy² */
	add	r9, r8			/* r8 = high(zx²) + high(zy²) */
	cmp/ge	r6, r8			/* T = (r8 >= r6), signed compare */
	bt.s	.end			/* leave if the threshold is reached */
	dt	r7			/* delay slot, runs on both paths:
					   r7 -= 1, T = (r7 == 0) */

	/* Start zx⋅zy early for next frame. */
	dmuls.l	_zx, _zy

	/* Continue looping */
	bf.s	.loop			/* T comes from dt: loop while r7 != 0 */
	xtrct	r9, _zy2		/* delay slot: _zy2 = bits 16..47 of
					   the new zy² */

	/* Reached from the escape branch or by falling out of the loop when
	   the counter hits zero. Restore the saved registers in reverse order
	   of the pushes and return the counter. */
.end:
	mov.l	@r15+, r9
	mov.l	@r15+, r8
	rts
	mov	r7, r0			/* delay slot: r0 = remaining counter */