/* Iteration of a quadratic function z -> z²+c on complex inputs.

   This version is by Lephe (2023) and fairly optimized.

   Notes for optimizers:
   - The entire loop is an obvious EX chain; the only things that
     parallelize are the sts instructions and the branches.
   - I gained 15% speed over the original by getting as many sts as
     possible to parallelize, which required tightening up the
     dmuls.l -> delay -> sts chains.
   - The code is strongly affected by alignment, is somehow very sensitive
     to the initialization code, and does not behave as I expect it to in
     most ways. Probably none of it executes as I envision. */

/* Interface, as read from the code below:

   Inputs
     r4              Re(c)   (only read, never modified)
     r5              Im(c)   (only read, never modified)
     r6              initial Re(z)
     r7              initial Im(z)
     @(0,r15) at entry   threshold t², compared (signed) against the sum of
                         the upper 32 bits of zx² and zy²
     @(4,r15) at entry   iteration counter, decremented once per iteration

   Output
     r0              value of the iteration counter when the loop exits

   Number format
     Every product is formed with dmuls.l (signed 32x32 -> 64) and reduced
     with xtrct to bits 47..16 of MACH:MACL, so operands behave as signed
     fixed-point values with 16 fractional bits. MACH alone is then the
     integer part of a product, which is what the threshold test uses.

   Registers
     r8 and r9 are saved on the stack on entry and restored at .end.
     r0-r3, r6, r7, MACH, MACL and the T bit are overwritten.

   NOTE(review): presumably declared in C along the lines of
     int quadratic_iteration_32(int32_t cx, int32_t cy, int32_t zx,
                                int32_t zy, int32_t t2, int steps);
   -- confirm against the actual prototype.

   NOTE(review): the counter is expected to be >= 1 on entry. With 0, the
   first dt wraps it to 0xffffffff with T=0, so the loop keeps running until
   the threshold is reached or the counter wraps all the way back to 0.

   NOTE(review): a return value of 0 is produced both when the threshold is
   reached on the very last iteration and when the counter runs out without
   the threshold being reached. */

.global _quadratic_iteration_32
.text
.balign 4

/* Register aliases used throughout the routine. */
#define _zx r0 /* Re(z) */
#define _zy r1 /* Im(z) */
#define _zx2 r2 /* _zx² */
#define _zy2 r3 /* _zy² */
#define _cx r4 /* Re(c) */
#define _cy r5 /* Im(c) */

_quadratic_iteration_32:
	/* Initialize zx and zy from the register arguments, then reload r6/r7
	   from the stack (threshold and iteration counter). Compute the
	   initial values of zx² and zy². Start the loop pipeline by launching
	   the multiplication zx⋅zy.

	   The two pushes move r15 down by 8 bytes, so the caller's stack words
	   are found at offsets 8 and 12. The register saves, loads and copies
	   are interleaved with the multiplications rather than grouped. */
	mov.l r8, @-r15
	dmuls.l r6, r6
	mov.l r9, @-r15
	/* Copy zx out of r6 before r6 is reused for the threshold. */
	mov r6, _zx
	sts mach, r8
	/* Copy zy out of r7 before r7 is reused for the counter. */
	mov r7, _zy
	sts macl, _zx2
	dmuls.l r7, r7
	/* r6 = threshold t² (first stack word of the caller). */
	mov.l @(8,r15), r6
	/* _zx2 = bits 47..16 of zx⋅zx */
	xtrct r8, _zx2
	sts mach, r9
	/* NOTE(review): the purpose of this nop is not stated; presumably
	   scheduling/alignment padding (see header notes). Do not remove
	   without benchmarking. */
	nop
	sts macl, _zy2
	/* Launch zx⋅zy; its result is consumed at the top of .loop. */
	dmuls.l _zx, _zy
	/* r7 = iteration counter (second stack word of the caller). */
	mov.l @(12,r15), r7
	/* _zy2 = bits 47..16 of zy⋅zy */
	xtrct r9, _zy2

.loop:
	/* We want to compute z²+c:

	     (zx + i⋅zy)(zx + i⋅zy) + (cx + i⋅cy)
	       = (zx² - zy² + cx) + i(2⋅zx⋅zy + cy)

	   Due to pipelining, zx⋅zy is currently being computed in MAC.
	   On entry to each iteration: _zx2 = zx², _zy2 = zy², and MACH:MACL
	   holds (or will hold) zx⋅zy. */

	/* Update zx while retrieving zx⋅zy and preparing the next zx²:
	   zx = zx² - zy² + cx, built in _zx2 and then copied to _zx.
	   r8 and _zy temporarily hold the two halves of zx⋅zy. */
	sts mach, r8
	sub _zy2, _zx2
	sts macl, _zy
	add _cx, _zx2
	mov _zx2, _zx
	dmuls.l _zx, _zx

	/* Update zy and zx² while preparing the next zy²:
	   xtrct merges the halves of zx⋅zy, shll doubles it, then cy is added,
	   giving zy = 2⋅zx⋅zy + cy. r8 is reused for the upper half of the new
	   zx². The last two instructions are written with raw register names:
	   r1 is _zy and r2 is _zx2. */
	xtrct r8, _zy
	sts mach, r8
	shll _zy
	sts macl, _zx2
	add _cy, _zy
	dmuls.l r1, r1
	xtrct r8, r2

	/* Update zy² and compute |z|² = zx² + zy² in 32:0 format, as the sum
	   of the upper 32 bits of each square (r8 + r9, accumulated in r8).
	   Compare |z|² to t² (signed, T = r8 >= r6) and leave the loop if the
	   threshold is reached.

	   bt.s takes its decision from the T bit set by cmp/ge; the dt in its
	   delay slot runs whether or not the branch is taken, so the counter is
	   decremented on the exit path too, and T is then replaced by
	   (r7 == 0) for the bf.s below. */
	sts mach, r9
	sts macl, _zy2
	add r9, r8
	cmp/ge r6, r8
	bt.s .end
	dt r7

	/* Start zx⋅zy early for the next iteration. */
	dmuls.l _zx, _zy

	/* Continue looping while the counter has not reached zero. The xtrct
	   in the delay slot finishes _zy2 = bits 47..16 of zy⋅zy and executes
	   whether or not the branch is taken. */
	bf.s .loop
	xtrct r9, _zy2

.end:
	/* Restore r9 and r8 in reverse push order and return the counter.
	   The mov sits in the delay slot of rts, so r0 is set before control
	   reaches the caller. */
	mov.l @r15+, r9
	mov.l @r15+, r8
	rts
	mov r7, r0