improve speed of z->z²+c iteration and also parameterize z

This commit is contained in:
Lephenixnoir 2023-01-19 09:20:09 +01:00
parent 768028d45f
commit 07964396e0
Signed by untrusted user: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
6 changed files with 132 additions and 108 deletions

View File

@ -13,9 +13,8 @@ find_package(LibProf 2.4 REQUIRED)
set(SOURCES
src/main.cpp
src/mandelbrot.s
src/iteration.S
src/mandelbrotshader.cpp
src/julia.s
src/juliashader.cpp
src/utilities.cpp
# ...

101
src/iteration.S Normal file
View File

@ -0,0 +1,101 @@
/* Iteration of a quadratic function z -> z²+c on complex inputs. This version
is by Lephe (2023) and fairly optimized.
Notes for optimizers:
- The entire loop is an obvious EX chain, the only stuff that parallelizes
are the sts and the branches.
- I gained 15% speed from the original by getting as many sts to parallelize
as possible, which required tightening up dmuls.l -> delay -> sts chains.
- The code is strongly affected by alignment, is somehow very sensitive to
the initialization code, and does not behave as I expect it to in most
ways. Probably none of it executes as I envision. */
.global _quadratic_iteration_32
.text
.balign 4
#define _zx r0 /* Re(z) */
#define _zy r1 /* Im(z) */
#define _zx2 r2 /* _zx² */
#define _zy2 r3 /* _zy² */
#define _cx r4 /* Re(c) */
#define _cy r5 /* Im(c) */
_quadratic_iteration_32:
/* Initialize zx and zy from arguments, r6/r7 from stack. Compute the
initial values of zx² and zy². Start the loop pipeline by preparing
the multiplication zxzy. */
mov.l r8, @-r15
dmuls.l r6, r6
mov.l r9, @-r15
mov r6, _zx
sts mach, r8
mov r7, _zy
sts macl, _zx2
dmuls.l r7, r7
mov.l @(8,r15), r6
xtrct r8, _zx2
sts mach, r9
nop
sts macl, _zy2
dmuls.l _zx, _zy
mov.l @(12,r15), r7
xtrct r9, _zy2
.loop:
/* We want to compute z²+c:
(zx + i·zy)(zx + i·zy) + (cx + icy)
= (zx² - zy² + cx) + i(2zxzy + cy)
Due to pipelining, zx * zy is currently being computed in MAC. */
/* Update zx while retrieving zx⋅zy and preparing the next zx² */
sts mach, r8
sub _zy2, _zx2
sts macl, _zy
add _cx, _zx2
mov _zx2, _zx
dmuls.l _zx, _zx
/* Update zy and zx² while preparing the next zy² */
xtrct r8, _zy
sts mach, r8
shll _zy
sts macl, _zx2
add _cy, _zy
dmuls.l r1, r1
xtrct r8, r2
/* Update zy² and compute |z|² = zx² + zy² in 32:0 format. Compare |z|²
to t² and return if the threshold is reached. */
sts mach, r9
sts macl, _zy2
add r9, r8
cmp/ge r6, r8
bt.s .end
dt r7
/* Start zx⋅zy early for next frame. */
dmuls.l _zx, _zy
/* Continue looping */
bf.s .loop
xtrct r9, _zy2
.end:
mov.l @r15+, r9
mov.l @r15+, r8
rts
mov r7, r0

28
src/iteration.h Normal file
View File

@ -0,0 +1,28 @@
#ifndef ITERATION_H
#define ITERATION_H
#ifdef __cplusplus
extern "C" {
#endif
/* Quadratic iteration. This function iterates z -> z²+c for z, c complex
inputs, until a fixed number of steps is reached or |z| becomes larger than
a predefined threshold t. Computation is carried in 16:16 fixed-point.
@Re_c @Im_c Constant c [num32]
@Re_z @Im_z Initial value of z [num32]
@t_squared Squared threshold t² [num32]
@steps Maximum number of steps [int]
Returns the number of steps remaining at the time the threshold was crossed,
or 0 if it was never reached. */
int quadratic_iteration_32(
int32_t Re_c, int32_t Im_c,
int32_t Re_z, int32_t Im_z,
uint32_t t_squared, int steps);
#ifdef __cplusplus
}
#endif
#endif /* ITERATION_H */

View File

@ -1,31 +0,0 @@
/*
int julia(int32_t cr, int32_t ci, uint32_t t_squared, int steps, int32_t zr, int32_t zi);
(Pour moi : t_squared = (2 * 2) << 16, steps = 50)
Si renvoie 0 -> dans la fractale
Sinon renvoie un nombre entre 1...steps indiquant la "distance" (pour le dégradé)
*/
.global _julia
.text
/* Iteration of z = z²+c with threshold checks
@r4 Real part of c, cr
@r5 Imaginary part of c, ci
@r6 Squared threshold t²
@r7 Number of steps
If c is in the set, returns 0. Otherwise, returns the remaining number of
iterations to be checked before the process finised. */
_julia:
mov.l r8, @-r15
mov.l r9, @-r15
mov.l r10, @-r15
mov.l r11, @-r15
.loop:
.end:
rts
nop

View File

@ -1,74 +0,0 @@
/*
int mandelbrot(int32_t cr, int32_t ci, uint32_t t_squared, int steps);
(Pour moi : t_squared = (2 * 2) << 16, steps = 50)
Si renvoie 0 -> dans la fractale
Sinon renvoie un nombre entre 1...steps indiquant la "distance" (pour le dégradé)
*/
.global _mandelbrot
.text
/* Iteration of z = z²+c with threshold checks
@r4 Real part of c, cr
@r5 Imaginary part of c, ci
@r6 Squared threshold t²
@r7 Number of steps
If c is in the set, returns 0. Otherwise, returns the remaining number of
iterations to be checked before the process finised. */
_mandelbrot:
mov.l r8, @-r15
mov.l r9, @-r15
/* Start with z=z²=0 (z=x+iy) */
mov #0, r0
mov #0, r1
mov #0, r2
mov #0, r3
.loop:
/* Start xy */
dmuls.l r0, r1
/* Update x = x²-y²+cr */
mov r2, r0
sub r3, r0
add r4, r0
/* Get y = 2xy+ci while starting x²
TODO: Hope there's no overflow in that [shll r1]. It might be
TODO: provable because 2xy overflowing probably implies a lower
TODO: bound on x²+y² at the previous iteration. */
sts mach, r8
sts macl, r1
dmuls.l r0, r0
xtrct r8, r1
shll r1
add r5, r1
/* Get x² and keep the integer part at full precision for z²; at the
same time, start y² */
sts mach, r8
sts macl, r2
dmuls.l r1, r1
xtrct r8, r2
/* Get y² and compute ∥z∥² = x²+y² in 32:0 format */
sts mach, r9
sts macl, r3
add r9, r8
xtrct r9, r3
/* Compare ∥z∥² to t², return if the threshold is reached */
cmp/ge r6, r8
bt .end
/* Continue looping */
dt r7
bf .loop
.end:
mov.l @r15+, r9
mov.l @r15+, r8
rts
mov r7, r0

View File

@ -1,5 +1,6 @@
#include <azur/gint/render.h>
#include "fractalshaders.h"
#include "iteration.h"
#include <cstdlib>
#include <cstdio>
@ -110,7 +111,7 @@ void azrp_shader_mandelbrot( void *uniforms, void *comnd, void *fragment )
zr = zrsave;
zi = zisave;
u = mandelbrot( zr.v, zi.v, (2*2)<<16, maxiter );
u = quadratic_iteration_32( zr.v, zi.v, 0, 0, (2*2)<<16, maxiter );
if (u==0) frag[offset + i] = C_RGB(0,0,0);
else