MandAzur/src/iteration.S

102 lines
2.2 KiB
ArmAsm

/* Iteration of a quadratic function z -> z²+c on complex inputs. This version
is by Lephe (2023) and fairly optimized.
Notes for optimizers:
- The entire loop is an obvious EX chain; the only instructions that
parallelize are the sts and the branches.
- I gained 15% speed from the original by getting as many sts to parallelize
as possible, which required tightening up dmuls.l -> delay -> sts chains.
- The code is strongly affected by alignment, is somehow very sensitive to
the initialization code, and does not behave as I expect it to in most
ways. Probably none of it executes as I envision. */
/* Export the routine, place it in .text and start it on a 4-byte boundary. */
.global _quadratic_iteration_32
.text
.balign 4
/* Register aliases for the iteration state. r0-r3 are working registers that
the routine rewrites on every pass; r4/r5 carry c and are only read. */
#define _zx r0 /* Re(z) */
#define _zy r1 /* Im(z) */
#define _zx2 r2 /* _zx² */
#define _zy2 r3 /* _zy² */
#define _cx r4 /* Re(c) */
#define _cy r5 /* Im(c) */
/* Iterate z -> z²+c in fixed point until |z|² reaches a threshold or the
   iteration counter runs out.

   Inputs, as consumed by the code below:
     r4, r5      Re(c), Im(c); only read
     r6, r7      initial Re(z), Im(z)
     @(0,r15)    on entry: threshold t², compared (signed) against the
                 integer part of |z|²
     @(4,r15)    on entry: iteration counter
   Output:
     r0          the counter after one decrement per loop pass; 0 when the
                 counter is exhausted
   r8 and r9 are saved on the stack and restored. r0-r7, MACH and MACL are
   overwritten.

   Number format: every product is a 64-bit dmuls.l result from which xtrct
   keeps bits 16..47, so operands and results are 16:16 fixed point, and MACH
   alone is the 32:0 integer part of a product.

   NOTE(review): presumably called from C with (cx, cy, zx, zy, t², count);
   confirm against the prototype on the C side.
   NOTE(review): a counter of 0 on entry is not special-cased; dt would wrap
   it to 0xFFFFFFFF instead of stopping. Confirm callers pass at least 1.

   The instruction order is deliberate (see the optimizer notes at the top of
   the file); do not reschedule without measuring. */
_quadratic_iteration_32:
    /* Initialize zx and zy from arguments, r6/r7 from stack. Compute the
       initial values of zx² and zy². Start the loop pipeline by preparing
       the multiplication zxzy. The stack arguments are read at offsets 8
       and 12 because the two pushes have already moved r15 down by 8. */
    mov.l   r8, @-r15
    dmuls.l r6, r6              /* MAC = zx⋅zx */
    mov.l   r9, @-r15
    mov     r6, _zx
    sts     mach, r8
    mov     r7, _zy
    sts     macl, _zx2
    dmuls.l r7, r7              /* MAC = zy⋅zy */
    mov.l   @(8,r15), r6        /* r6 = threshold t² from now on */
    xtrct   r8, _zx2            /* _zx2 = zx² as 16:16 */
    sts     mach, r9
    nop                         /* NOTE(review): presumably scheduling padding */
    sts     macl, _zy2
    dmuls.l _zx, _zy            /* MAC = zx⋅zy, picked up at .loop */
    mov.l   @(12,r15), r7       /* r7 = iteration counter from now on */
    xtrct   r9, _zy2            /* _zy2 = zy² as 16:16 */

.loop:
    /* We want to compute z²+c:
         (zx + i·zy)(zx + i·zy) + (cx + icy)
       = (zx² - zy² + cx) + i(2zxzy + cy)
       Due to pipelining, zx * zy is currently being computed in MAC. */

    /* Update zx while retrieving zx⋅zy and preparing the next zx² */
    sts     mach, r8
    sub     _zy2, _zx2          /* zx² - zy² */
    sts     macl, _zy           /* old zy is no longer needed */
    add     _cx, _zx2           /* zx² - zy² + cx */
    mov     _zx2, _zx           /* new zx */
    dmuls.l _zx, _zx

    /* Update zy and zx² while preparing the next zy² */
    xtrct   r8, _zy             /* zx⋅zy as 16:16 */
    sts     mach, r8            /* integer part of the new zx² */
    shll    _zy                 /* 2zxzy */
    sts     macl, _zx2
    add     _cy, _zy            /* new zy */
    dmuls.l _zy, _zy
    xtrct   r8, _zx2            /* new zx² as 16:16 */

    /* Update zy² and compute |z|² = zx² + zy² in 32:0 format. Compare |z|²
       to t² and return if the threshold is reached. */
    sts     mach, r9            /* integer part of the new zy² */
    sts     macl, _zy2
    add     r9, r8              /* r8 = integer part of |z|² */
    cmp/ge  r6, r8              /* T = (r8 >= r6), signed */
    bt.s    .end
    dt      r7                  /* delay slot, runs on both paths: T = (--r7 == 0) */

    /* Start zx⋅zy early for next frame. */
    dmuls.l _zx, _zy

    /* Continue looping while the counter has not reached zero */
    bf.s    .loop
    xtrct   r9, _zy2            /* delay slot: new zy² as 16:16 */

.end:
    /* Restore the saved registers and return the counter. */
    mov.l   @r15+, r9
    mov.l   @r15+, r8
    rts
    mov     r7, r0              /* delay slot: return value */