/* SuperH assembly (GNU as syntax), run through the C preprocessor. */
/* Iteration of a quadratic function z -> z²+c on complex inputs. This version
   is by Lephe (2023) and fairly optimized.

   Notes for optimizers:
   - The entire loop is an obvious EX chain; the only instructions that
     parallelize are the sts and the branches.
   - I gained 15% speed over the original by getting as many sts to
     parallelize as possible, which required tightening up the
     dmuls.l -> delay -> sts chains.
   - The code is strongly affected by alignment, is somehow very sensitive to
     the initialization code, and does not behave as I expect it to in most
     ways. Probably none of it executes as I envision. */

.global _quadratic_iteration_32
.text
.balign 4

/* Register aliases used throughout _quadratic_iteration_32. r4 and r5 are
   only ever read by the routine; r0-r3 are working registers that it
   overwrites. */
#define _zx  r0 /* Re(z) */
#define _zy  r1 /* Im(z) */
#define _zx2 r2 /* _zx² */
#define _zy2 r3 /* _zy² */
#define _cx  r4 /* Re(c) */
#define _cy  r5 /* Im(c) */

/* _quadratic_iteration_32: iterate z -> z² + c until |z|² reaches a threshold
   or the iteration budget runs out.

   Every 64-bit dmuls.l product is reduced to its middle 32 bits (bits 16..47)
   with xtrct, so operands behave as 16:16 fixed-point values.

   In:   r4        Re(c)
         r5        Im(c)
         r6        initial Re(z)
         r7        initial Im(z)
         @(0,r15)  threshold t² (offset at entry). It is compared, as a signed
                   integer, against the sum of the high product words of zx²
                   and zy², i.e. the integer parts only; carries out of the
                   fractional parts are not taken into account.
         @(4,r15)  iteration budget n (offset at entry). Must be at least 1:
                   the counter is decremented before it is tested for zero.
   Out:  r0        what is left of the budget: n - k when the threshold was
                   reached on iteration k, 0 when the budget ran out. Reaching
                   the threshold on the very last iteration also gives 0.
   Uses: r0-r3, r6, r7, mach, macl and T. r8 and r9 are used as scratch for
         the high product words and are saved/restored on the stack.

   The threshold is first checked after one full update of z; the initial z
   itself is never tested.

   NOTE(review): the instruction order and the two-instruction groups are kept
   exactly as hand-scheduled by the original author, who reports that the
   timing is sensitive to ordering and alignment. Do not reorder without
   measuring. */
_quadratic_iteration_32:
        /* Initialize zx and zy from arguments, r6/r7 from stack. Compute the
           initial values of zx² and zy². Start the loop pipeline by preparing
           the multiplication zx⋅zy. */

        mov.l   r8, @-r15               /* save r8 */
        dmuls.l r6, r6                  /* start zx * zx */

        mov.l   r9, @-r15               /* save r9 */
        mov     r6, _zx

        sts     mach, r8
        mov     r7, _zy

        sts     macl, _zx2
        dmuls.l r7, r7                  /* start zy * zy */

        mov.l   @(8,r15), r6            /* r6 = t² (two words were pushed) */
        xtrct   r8, _zx2                /* _zx2 = middle 32 bits of zx * zx */

        sts     mach, r9
        nop

        sts     macl, _zy2
        dmuls.l _zx, _zy                /* start zx * zy for the first pass */

        mov.l   @(12,r15), r7           /* r7 = iteration budget */
        xtrct   r9, _zy2                /* _zy2 = middle 32 bits of zy * zy */

.loop:
        /* We want to compute z²+c:
             (zx + i·zy)(zx + i·zy) + (cx + i⋅cy)
             = (zx² - zy² + cx) + i(2⋅zx⋅zy + cy)
           Due to pipelining, zx * zy is currently being computed in MAC. */

        /* Update zx while retrieving zx⋅zy and preparing the next zx² */
        sts     mach, r8                /* r8 = high word of zx * zy */
        sub     _zy2, _zx2

        sts     macl, _zy               /* _zy = low word of zx * zy */
        add     _cx, _zx2

        mov     _zx2, _zx               /* zx = zx² - zy² + cx */
        dmuls.l _zx, _zx                /* start the next zx² */

        /* Update zy and zx² while preparing the next zy² */
        xtrct   r8, _zy                 /* zx * zy; must read r8 before ... */
        sts     mach, r8                /* ... it is reloaded with high(zx²) */

        shll    _zy                     /* 2 * zx * zy */
        sts     macl, _zx2

        add     _cy, _zy                /* zy = 2 * zx * zy + cy */

        dmuls.l _zy, _zy                /* start the next zy² */
        xtrct   r8, _zx2                /* _zx2 = middle 32 bits of zx² */

        /* Update zy² and compute |z|² = zx² + zy² in 32:0 format. Compare |z|²
           to t² and return if the threshold is reached. */
        sts     mach, r9
        sts     macl, _zy2
        add     r9, r8                  /* r8 = high(zx²) + high(zy²) */
        cmp/ge  r6, r8                  /* T = (r8 >= t²), signed */
        bt.s    .end                    /* branch decided on T from cmp/ge */
        dt      r7                      /* delay slot: r7--, T = (r7 == 0) */

        /* Start zx⋅zy early for next frame. */
        dmuls.l _zx, _zy

        /* Continue looping while the budget is not exhausted (T from dt) */
        bf.s    .loop
        xtrct   r9, _zy2                /* delay slot: finish _zy2 */

.end:
        mov.l   @r15+, r9               /* restore in reverse push order */
        mov.l   @r15+, r8
        rts
        mov     r7, r0                  /* delay slot: return remaining budget */