From 07964396e091ab199e2622c2a541c92e583675d8 Mon Sep 17 00:00:00 2001 From: Lephenixnoir Date: Thu, 19 Jan 2023 09:20:09 +0100 Subject: [PATCH] =?UTF-8?q?improve=20speed=20of=20z->z=C2=B2+c=20iteration?= =?UTF-8?q?=20and=20also=20parameterize=20z?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 3 +- src/iteration.S | 101 +++++++++++++++++++++++++++++++++++++++ src/iteration.h | 28 +++++++++++ src/julia.s | 31 ------------ src/mandelbrot.s | 74 ---------------------------- src/mandelbrotshader.cpp | 3 +- 6 files changed, 132 insertions(+), 108 deletions(-) create mode 100644 src/iteration.S create mode 100644 src/iteration.h delete mode 100644 src/julia.s delete mode 100644 src/mandelbrot.s diff --git a/CMakeLists.txt b/CMakeLists.txt index 0a8f2e4..8007086 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,9 +13,8 @@ find_package(LibProf 2.4 REQUIRED) set(SOURCES src/main.cpp - src/mandelbrot.s + src/iteration.S src/mandelbrotshader.cpp - src/julia.s src/juliashader.cpp src/utilities.cpp # ... diff --git a/src/iteration.S b/src/iteration.S new file mode 100644 index 0000000..f284837 --- /dev/null +++ b/src/iteration.S @@ -0,0 +1,101 @@ +/* Iteration of a quadratic function z -> z²+c on complex inputs. This version + is by Lephe (2023) and fairly optimized. + + Notes for optimizers: + - The entire loop is an obvious EX chain, the only stuff that parallelizes + are the sts and the branches. + - I gained 15% speed from the original by getting as many sts to parallelize + as possible, which required tightening up dmuls.l -> delay -> sts chains. + - The code is strongly affected by alignment, is somehow very sensitive to + the initialization code, and does not behave as I expect it to in most + ways. Probably none of it executes as I envision. */ + +.global _quadratic_iteration_32 +.text +.balign 4 + +#define _zx r0 /* Re(z) */ +#define _zy r1 /* Im(z) */ +#define _zx2 r2 /* _zx² */ +#define _zy2 r3 /* _zy² */ +#define _cx r4 /* Re(c) */ +#define _cy r5 /* Im(c) */ + +_quadratic_iteration_32: + /* Initialize zx and zy from arguments, r6/r7 from stack. Compute the + initial values of zx² and zy². Start the loop pipeline by preparing + the multiplication zx⋅zy. */ + + mov.l r8, @-r15 + dmuls.l r6, r6 + + mov.l r9, @-r15 + mov r6, _zx + + sts mach, r8 + mov r7, _zy + + sts macl, _zx2 + dmuls.l r7, r7 + + mov.l @(8,r15), r6 + xtrct r8, _zx2 + + sts mach, r9 + nop + + sts macl, _zy2 + dmuls.l _zx, _zy + + mov.l @(12,r15), r7 + xtrct r9, _zy2 + +.loop: + /* We want to compute z²+c: + (zx + i·zy)(zx + i·zy) + (cx + i⋅cy) + = (zx² - zy² + cx) + i(2⋅zx⋅zy + cy) + Due to pipelining, zx * zy is currently being computed in MAC. */ + + /* Update zx while retrieving zx⋅zy and preparing the next zx² */ + sts mach, r8 + sub _zy2, _zx2 + + sts macl, _zy + add _cx, _zx2 + + mov _zx2, _zx + dmuls.l _zx, _zx + + /* Update zy and zx² while preparing the next zy² */ + xtrct r8, _zy + sts mach, r8 + + shll _zy + sts macl, _zx2 + + add _cy, _zy + + dmuls.l r1, r1 + xtrct r8, r2 + + /* Update zy² and compute |z|² = zx² + zy² in 32:0 format. Compare |z|² + to t² and return if the threshold is reached. */ + sts mach, r9 + sts macl, _zy2 + add r9, r8 + cmp/ge r6, r8 + bt.s .end + dt r7 + + /* Start zx⋅zy early for next frame. */ + dmuls.l _zx, _zy + + /* Continue looping */ + bf.s .loop + xtrct r9, _zy2 + +.end: + mov.l @r15+, r9 + mov.l @r15+, r8 + rts + mov r7, r0 diff --git a/src/iteration.h b/src/iteration.h new file mode 100644 index 0000000..3b3fc4b --- /dev/null +++ b/src/iteration.h @@ -0,0 +1,28 @@ +#ifndef ITERATION_H +#define ITERATION_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* Quadratic iteration. This function iterates z -> z²+c for z, c complex + inputs, until a fixed number of steps is reached or |z| becomes larger than + a predefined threshold t. Computation is carried in 16:16 fixed-point. + + @Re_c @Im_c Constant c [num32] + @Re_z @Im_z Initial value of z [num32] + @t_squared Squared threshold t² [num32] + @steps Maximum number of steps [int] + + Returns the number of steps remaining at the time the threshold was crossed, + or 0 if it was never reached. */ +int quadratic_iteration_32( + int32_t Re_c, int32_t Im_c, + int32_t Re_z, int32_t Im_z, + uint32_t t_squared, int steps); + +#ifdef __cplusplus +} +#endif + +#endif /* ITERATION_H */ diff --git a/src/julia.s b/src/julia.s deleted file mode 100644 index d96b5da..0000000 --- a/src/julia.s +++ /dev/null @@ -1,31 +0,0 @@ - -/* -int julia(int32_t cr, int32_t ci, uint32_t t_squared, int steps, int32_t zr, int32_t zi); -(Pour moi : t_squared = (2 * 2) << 16, steps = 50) -Si renvoie 0 -> dans la fractale -Sinon renvoie un nombre entre 1...steps indiquant la "distance" (pour le dégradé) -*/ - -.global _julia -.text - -/* Iteration of z = z²+c with threshold checks - @r4 Real part of c, cr - @r5 Imaginary part of c, ci - @r6 Squared threshold t² - @r7 Number of steps - If c is in the set, returns 0. Otherwise, returns the remaining number of - iterations to be checked before the process finised. */ - -_julia: - mov.l r8, @-r15 - mov.l r9, @-r15 - mov.l r10, @-r15 - mov.l r11, @-r15 - -.loop: - -.end: - - rts - nop \ No newline at end of file diff --git a/src/mandelbrot.s b/src/mandelbrot.s deleted file mode 100644 index 826b0e3..0000000 --- a/src/mandelbrot.s +++ /dev/null @@ -1,74 +0,0 @@ - -/* -int mandelbrot(int32_t cr, int32_t ci, uint32_t t_squared, int steps); -(Pour moi : t_squared = (2 * 2) << 16, steps = 50) -Si renvoie 0 -> dans la fractale -Sinon renvoie un nombre entre 1...steps indiquant la "distance" (pour le dégradé) -*/ - -.global _mandelbrot -.text - -/* Iteration of z = z²+c with threshold checks - @r4 Real part of c, cr - @r5 Imaginary part of c, ci - @r6 Squared threshold t² - @r7 Number of steps - If c is in the set, returns 0. Otherwise, returns the remaining number of - iterations to be checked before the process finised. */ -_mandelbrot: - mov.l r8, @-r15 - mov.l r9, @-r15 - - /* Start with z=z²=0 (z=x+iy) */ - mov #0, r0 - mov #0, r1 - mov #0, r2 - mov #0, r3 - -.loop: - /* Start xy */ - dmuls.l r0, r1 - - /* Update x = x²-y²+cr */ - mov r2, r0 - sub r3, r0 - add r4, r0 - - /* Get y = 2xy+ci while starting x² - TODO: Hope there's no overflow in that [shll r1]. It might be - TODO: provable because 2xy overflowing probably implies a lower - TODO: bound on x²+y² at the previous iteration. */ - sts mach, r8 - sts macl, r1 - dmuls.l r0, r0 - xtrct r8, r1 - shll r1 - add r5, r1 - - /* Get x² and keep the integer part at full precision for ∥z∥²; at the - same time, start y² */ - sts mach, r8 - sts macl, r2 - dmuls.l r1, r1 - xtrct r8, r2 - - /* Get y² and compute ∥z∥² = x²+y² in 32:0 format */ - sts mach, r9 - sts macl, r3 - add r9, r8 - xtrct r9, r3 - - /* Compare ∥z∥² to t², return if the threshold is reached */ - cmp/ge r6, r8 - bt .end - - /* Continue looping */ - dt r7 - bf .loop - -.end: - mov.l @r15+, r9 - mov.l @r15+, r8 - rts - mov r7, r0 \ No newline at end of file diff --git a/src/mandelbrotshader.cpp b/src/mandelbrotshader.cpp index 177765a..4300ba6 100644 --- a/src/mandelbrotshader.cpp +++ b/src/mandelbrotshader.cpp @@ -1,5 +1,6 @@ #include #include "fractalshaders.h" +#include "iteration.h" #include #include @@ -110,7 +111,7 @@ void azrp_shader_mandelbrot( void *uniforms, void *comnd, void *fragment ) zr = zrsave; zi = zisave; - u = mandelbrot( zr.v, zi.v, (2*2)<<16, maxiter ); + u = quadratic_iteration_32( zr.v, zi.v, 0, 0, (2*2)<<16, maxiter ); if (u==0) frag[offset + i] = C_RGB(0,0,0); else