Improve speed of z->z²+c iteration and parameterize z #1
|
@ -13,9 +13,8 @@ find_package(LibProf 2.4 REQUIRED)
|
|||
|
||||
set(SOURCES
|
||||
src/main.cpp
|
||||
src/mandelbrot.s
|
||||
src/iteration.S
|
||||
src/mandelbrotshader.cpp
|
||||
src/julia.s
|
||||
src/juliashader.cpp
|
||||
src/utilities.cpp
|
||||
# ...
|
||||
|
|
|
@ -0,0 +1,101 @@
|
|||
/* Iteration of a quadratic function z -> z²+c on complex inputs. This version
|
||||
is by Lephe (2023) and fairly optimized.
|
||||
|
||||
Notes for optimizers:
|
||||
- The entire loop is an obvious EX chain, the only stuff that parallelizes
|
||||
are the sts and the branches.
|
||||
- I gained 15% speed from the original by getting as many sts to parallelize
|
||||
as possible, which required tightening up dmuls.l -> delay -> sts chains.
|
||||
- The code is strongly affected by alignment, is somehow very sensitive to
|
||||
the initialization code, and does not behave as I expect it to in most
|
||||
ways. Probably none of it executes as I envision. */
|
||||
|
||||
.global _quadratic_iteration_32
|
||||
.text
|
||||
.balign 4
|
||||
|
||||
#define _zx r0 /* Re(z) */
|
||||
#define _zy r1 /* Im(z) */
|
||||
#define _zx2 r2 /* _zx² */
|
||||
#define _zy2 r3 /* _zy² */
|
||||
#define _cx r4 /* Re(c) */
|
||||
#define _cy r5 /* Im(c) */
|
||||
|
||||
_quadratic_iteration_32:
|
||||
/* Initialize zx and zy from arguments, r6/r7 from stack. Compute the
|
||||
initial values of zx² and zy². Start the loop pipeline by preparing
|
||||
the multiplication zx⋅zy. */
|
||||
|
||||
mov.l r8, @-r15
|
||||
dmuls.l r6, r6
|
||||
|
||||
mov.l r9, @-r15
|
||||
mov r6, _zx
|
||||
|
||||
sts mach, r8
|
||||
mov r7, _zy
|
||||
|
||||
sts macl, _zx2
|
||||
dmuls.l r7, r7
|
||||
|
||||
mov.l @(8,r15), r6
|
||||
xtrct r8, _zx2
|
||||
|
||||
sts mach, r9
|
||||
nop
|
||||
|
||||
sts macl, _zy2
|
||||
dmuls.l _zx, _zy
|
||||
|
||||
mov.l @(12,r15), r7
|
||||
xtrct r9, _zy2
|
||||
|
||||
.loop:
|
||||
/* We want to compute z²+c:
|
||||
(zx + i·zy)(zx + i·zy) + (cx + i⋅cy)
|
||||
= (zx² - zy² + cx) + i(2⋅zx⋅zy + cy)
|
||||
Due to pipelining, zx * zy is currently being computed in MAC. */
|
||||
|
||||
/* Update zx while retrieving zx⋅zy and preparing the next zx² */
|
||||
sts mach, r8
|
||||
sub _zy2, _zx2
|
||||
|
||||
sts macl, _zy
|
||||
add _cx, _zx2
|
||||
|
||||
mov _zx2, _zx
|
||||
dmuls.l _zx, _zx
|
||||
|
||||
/* Update zy and zx² while preparing the next zy² */
|
||||
xtrct r8, _zy
|
||||
sts mach, r8
|
||||
|
||||
shll _zy
|
||||
sts macl, _zx2
|
||||
|
||||
add _cy, _zy
|
||||
|
||||
dmuls.l r1, r1
|
||||
xtrct r8, r2
|
||||
|
||||
/* Update zy² and compute |z|² = zx² + zy² in 32:0 format. Compare |z|²
|
||||
to t² and return if the threshold is reached. */
|
||||
sts mach, r9
|
||||
sts macl, _zy2
|
||||
add r9, r8
|
||||
cmp/ge r6, r8
|
||||
bt.s .end
|
||||
dt r7
|
||||
|
||||
/* Start zx⋅zy early for next frame. */
|
||||
dmuls.l _zx, _zy
|
||||
|
||||
/* Continue looping */
|
||||
bf.s .loop
|
||||
xtrct r9, _zy2
|
||||
|
||||
.end:
|
||||
mov.l @r15+, r9
|
||||
mov.l @r15+, r8
|
||||
rts
|
||||
mov r7, r0
|
|
@ -0,0 +1,28 @@
|
|||
#ifndef ITERATION_H
|
||||
#define ITERATION_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Quadratic iteration. This function iterates z -> z²+c for z, c complex
|
||||
inputs, until a fixed number of steps is reached or |z| becomes larger than
|
||||
a predefined threshold t. Computation is carried in 16:16 fixed-point.
|
||||
|
||||
@Re_c @Im_c Constant c [num32]
|
||||
@Re_z @Im_z Initial value of z [num32]
|
||||
@t_squared Squared threshold t² [num32]
|
||||
@steps Maximum number of steps [int]
|
||||
|
||||
Returns the number of steps remaining at the time the threshold was crossed,
|
||||
or 0 if it was never reached. */
|
||||
int quadratic_iteration_32(
|
||||
int32_t Re_c, int32_t Im_c,
|
||||
int32_t Re_z, int32_t Im_z,
|
||||
uint32_t t_squared, int steps);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* ITERATION_H */
|
31
src/julia.s
31
src/julia.s
|
@ -1,31 +0,0 @@
|
|||
|
||||
/*
|
||||
int julia(int32_t cr, int32_t ci, uint32_t t_squared, int steps, int32_t zr, int32_t zi);
|
||||
(Pour moi : t_squared = (2 * 2) << 16, steps = 50)
|
||||
Si renvoie 0 -> dans la fractale
|
||||
Sinon renvoie un nombre entre 1...steps indiquant la "distance" (pour le dégradé)
|
||||
*/
|
||||
|
||||
.global _julia
|
||||
.text
|
||||
|
||||
/* Iteration of z = z²+c with threshold checks
|
||||
@r4 Real part of c, cr
|
||||
@r5 Imaginary part of c, ci
|
||||
@r6 Squared threshold t²
|
||||
@r7 Number of steps
|
||||
If c is in the set, returns 0. Otherwise, returns the remaining number of
|
||||
iterations to be checked before the process finised. */
|
||||
|
||||
_julia:
|
||||
mov.l r8, @-r15
|
||||
mov.l r9, @-r15
|
||||
mov.l r10, @-r15
|
||||
mov.l r11, @-r15
|
||||
|
||||
.loop:
|
||||
|
||||
.end:
|
||||
|
||||
rts
|
||||
nop
|
|
@ -1,74 +0,0 @@
|
|||
|
||||
/*
|
||||
int mandelbrot(int32_t cr, int32_t ci, uint32_t t_squared, int steps);
|
||||
(Pour moi : t_squared = (2 * 2) << 16, steps = 50)
|
||||
Si renvoie 0 -> dans la fractale
|
||||
Sinon renvoie un nombre entre 1...steps indiquant la "distance" (pour le dégradé)
|
||||
*/
|
||||
|
||||
.global _mandelbrot
|
||||
.text
|
||||
|
||||
/* Iteration of z = z²+c with threshold checks
|
||||
@r4 Real part of c, cr
|
||||
@r5 Imaginary part of c, ci
|
||||
@r6 Squared threshold t²
|
||||
@r7 Number of steps
|
||||
If c is in the set, returns 0. Otherwise, returns the remaining number of
|
||||
iterations to be checked before the process finised. */
|
||||
_mandelbrot:
|
||||
mov.l r8, @-r15
|
||||
mov.l r9, @-r15
|
||||
|
||||
/* Start with z=z²=0 (z=x+iy) */
|
||||
mov #0, r0
|
||||
mov #0, r1
|
||||
mov #0, r2
|
||||
mov #0, r3
|
||||
|
||||
.loop:
|
||||
/* Start xy */
|
||||
dmuls.l r0, r1
|
||||
|
||||
/* Update x = x²-y²+cr */
|
||||
mov r2, r0
|
||||
sub r3, r0
|
||||
add r4, r0
|
||||
|
||||
/* Get y = 2xy+ci while starting x²
|
||||
TODO: Hope there's no overflow in that [shll r1]. It might be
|
||||
TODO: provable because 2xy overflowing probably implies a lower
|
||||
TODO: bound on x²+y² at the previous iteration. */
|
||||
sts mach, r8
|
||||
sts macl, r1
|
||||
dmuls.l r0, r0
|
||||
xtrct r8, r1
|
||||
shll r1
|
||||
add r5, r1
|
||||
|
||||
/* Get x² and keep the integer part at full precision for ∥z∥²; at the
|
||||
same time, start y² */
|
||||
sts mach, r8
|
||||
sts macl, r2
|
||||
dmuls.l r1, r1
|
||||
xtrct r8, r2
|
||||
|
||||
/* Get y² and compute ∥z∥² = x²+y² in 32:0 format */
|
||||
sts mach, r9
|
||||
sts macl, r3
|
||||
add r9, r8
|
||||
xtrct r9, r3
|
||||
|
||||
/* Compare ∥z∥² to t², return if the threshold is reached */
|
||||
cmp/ge r6, r8
|
||||
bt .end
|
||||
|
||||
/* Continue looping */
|
||||
dt r7
|
||||
bf .loop
|
||||
|
||||
.end:
|
||||
mov.l @r15+, r9
|
||||
mov.l @r15+, r8
|
||||
rts
|
||||
mov r7, r0
|
|
@ -1,5 +1,6 @@
|
|||
#include <azur/gint/render.h>
|
||||
#include "fractalshaders.h"
|
||||
#include "iteration.h"
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
|
||||
|
@ -110,7 +111,7 @@ void azrp_shader_mandelbrot( void *uniforms, void *comnd, void *fragment )
|
|||
zr = zrsave;
|
||||
zi = zisave;
|
||||
|
||||
u = mandelbrot( zr.v, zi.v, (2*2)<<16, maxiter );
|
||||
u = quadratic_iteration_32( zr.v, zi.v, 0, 0, (2*2)<<16, maxiter );
|
||||
|
||||
if (u==0) frag[offset + i] = C_RGB(0,0,0);
|
||||
else
|
||||
|
|
Loading…
Reference in New Issue