clip floor, avoiding inner loop checks - ASM now slower

I'm guessing the function is so short that not inlining it and having to
pass parameters is breaking the deal. I expect this to revert back when we
switch to Azur, handle entire fragments at once, and use a denser
branch-free texture-lookup inner loop.
This commit is contained in:
Lephenixnoir 2024-03-15 08:48:49 +01:00
parent 15f65e724c
commit 45396c599e
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
5 changed files with 183 additions and 13 deletions

View File

@ -2,7 +2,7 @@
# toolchain file and module path of the fxSDK
cmake_minimum_required(VERSION 3.15)
project(MyAddin LANGUAGES C CXX)
project(MyAddin LANGUAGES C CXX ASM)
include(GenerateG1A)
include(GenerateG3A)
@ -15,6 +15,7 @@ set(CMAKE_CXX_STANDARD 20)
set(SOURCES
src/azuray.cc
src/azuray.S
src/main.cc)
set(ASSETS
assets/map1.txt)

View File

@ -1,7 +1,7 @@
#########
## ######
# # #
###
#
# #
## ##
#########
###### ##

113
src/azuray.S Normal file
View File

@ -0,0 +1,113 @@
.global _azuray_floor_line
/* azuray_floor_line: draw one horizontal run of checkerboard floor pixels.
   For each of the xlen pixels it stores color1 or color2 at *vram depending
   on bit 16 of (floor_x ^ floor_z), then advances floor_x by ortho_dir.x,
   floor_z by ortho_dir.z and vram by one 16-bit pixel.
   The loop body runs once before the counter is tested, so xlen must be at
   least 1 on entry.
   NOTE(review): bit 16 being the integer parity assumes num is a 16.16
   fixed-point value; confirm against the num type. */
# Parameters:
# vram, floor_x, floor_z, ortho_dir.x (read from r4-r7)
# ortho_dir.z, xlen, 0, color1, color2 (read from the stack)
# Register allocation:
# r0: (temp)
# r1: (temp, holds the color chosen for the current pixel)
# r2: color1
# r3: color2
# r4: vram
# r5: floor_x
# r6: floor_z
# r7: ortho_dir.x
# r8: ortho_dir.z
# r9: xlen (this is the actual loop counter, decremented by dt)
# r10: _ (loaded from the stack but not used afterwards)
# r11: -16 (for num -> int conversion)
# r12: listed as loop counter in an earlier note, but not touched below
_azuray_floor_line:
/* Prologue: save r8-r10, then fetch the five stack arguments. After the
   three pushes the first stack argument sits at offset 12 from r15.
   NOTE(review): the interleaved nop instructions look like scheduling or
   alignment padding; confirm before removing any of them. */
mov.l r8, @-r15
nop
mov.l r9, @-r15
nop
mov.l r10, @-r15
nop
mov.l @(12, r15), r8
nop
mov.l @(16, r15), r9
nop
mov.l @(20, r15), r10
nop
mov.l @(24, r15), r2
nop
mov.l @(28, r15), r3
nop
/* r11 is saved only after the stack arguments are read, so the offsets
   above do not account for it. */
mov.l r11, @-r15
mov #-16, r11
# ---
/* Main loop. r0 = floor_x ^ floor_z; shad with the negative count in r11
   shifts it right by 16 bits, and shlr moves the resulting low bit into T.
   r1 is preloaded with color1 and replaced by color2 when T is set. */
1: mov r5, r0
xor r6, r0
mov r2, r1
shad r11, r0
shlr r0
nop
/* T clear: keep color1 and skip the override. The add in the delay slot
   (floor_x += ortho_dir.x) executes on both paths. */
bf.s 3f
add r7, r5
mov r3, r1
nop
/* Store the pixel, then count down; dt sets T once r9 reaches zero. */
3: mov.w r1, @r4
dt r9
/* Sneaky vram++ */
/* The post-increment load advances r4 by one pixel; the value loaded into
   r0 is discarded (r0 is rewritten at label 1). */
mov.w @r4+, r0
nop
/* Loop while the counter is non-zero; the delay slot does
   floor_z += ortho_dir.z. */
bf.s 1b
add r8, r6
# ---
/* Jump over the alternative ending straight to the epilogue. */
bra 4f
nop
# --- alternative loop ending ---
/* No branch in this file targets label 2, so this ending is currently
   unreachable. It advances floor_x, floor_z and vram without storing a
   pixel. */
2: dt r9
nop
/* Sneaky vram++ */
add r7, r5
mov.w @r4+, r0
bf.s 1b
add r8, r6
# ---
/* Epilogue: restore r11, r10, r9 and r8 in reverse push order; the last
   pop sits in the rts delay slot. */
4: mov.l @r15+, r11
mov.l @r15+, r10
mov.l @r15+, r9
rts
mov.l @r15+, r8
/*
for(int x = 0; x < DWIDTH; x++) {
int wx = floor_x.ifloor();
int wz = floor_z.ifloor();
if((uint)wx < MI.width() && (uint)wz < MI.height()) {
*vram = (wx ^ wz) & 1 ? color2 : color1;
}
floor_x += ortho_dir.x;
floor_z += ortho_dir.z;
vram++;
}
*/

View File

@ -45,6 +45,9 @@ bool raycast(MapInterface &MI, vec3 start, vec3 end, vec3 *collision,
int current_x = num_ifloor_along(x, u.x);
int current_z = num_ifloor_along(z, u.z);
if((uint)current_x > MI.width() || (uint)current_z > MI.height())
break;
/* Distance to the next horizontal, and vertical line */
num dist_z = (u.z >= 0) ? num(1) - z.frac() : -num_frac_roundup(z);
num dist_x = (u.x >= 0) ? num(1) - x.frac() : -num_frac_roundup(x);
@ -173,24 +176,69 @@ void render_floor(MapInterface &MI, vec3 pos, vec3 dir, num depth)
screen position for one line, because depth is constant. */
vec3 ortho_dir(dir.z, 0, -dir.x);
/* Apply depth factor */
num depthFactor = dist / num(DWIDTH / 2 * depth);
ortho_dir *= depthFactor;
ortho_dir *= dist / num(DWIDTH / 2 * depth);
vec3 floorPos = floor + (-DWIDTH/2) * ortho_dir;
num floor_x = floor.x + (-DWIDTH/2) * ortho_dir.x;
num floor_z = floor.z + (-DWIDTH/2) * ortho_dir.z;
num floor_x_end = floor.x + (DWIDTH/2) * ortho_dir.x;
num floor_z_end = floor.z + (DWIDTH/2) * ortho_dir.z;
uint16_t *vram = gint_vram + y * DWIDTH;
int color1 = C_RGB(15, 12, 4);
int color2 = C_RGB(12, 8, 2);
for(int x = 0; x < DWIDTH; x++) {
int wx = floorPos.x.ifloor();
int wz = floorPos.z.ifloor();
/* Determine the intersection between display and map for this line */
int xmin = 0;
int xmax = DWIDTH-1;
if((uint)wx < MI.width() && (uint)wz < MI.height()) {
*vram = (wx ^ wz) & 1 ? color2 : color1;
}
if(ortho_dir.x > 0) {
if(floor_x < 0)
xmin = max((i32)xmin, -floor_x.v / ortho_dir.x.v + 1);
num Mx = floor_x_end - num((int)MI.width());
if(Mx > 0)
xmax = min((i32)xmax, DWIDTH - 1 - Mx.v / ortho_dir.x.v);
}
else {
if(floor_x_end < 0)
xmax = min((i32)xmax, DWIDTH - 1 - floor_x_end.v / ortho_dir.x.v + 1);
num mx = num((int)MI.width()) - floor_x;
if(mx < 0)
xmin = max((i32)xmin, mx.v / ortho_dir.x.v);
}
floorPos += ortho_dir;
if(ortho_dir.z > 0) {
if(floor_z < 0)
xmin = max((i32)xmin, -floor_z.v / ortho_dir.z.v + 1);
num Mz = floor_z_end - num((int)MI.height());
if(Mz > 0)
xmax = min((i32)xmax, DWIDTH - 1 - Mz.v / ortho_dir.z.v);
}
else {
if(floor_z_end < 0)
xmax = min((i32)xmax, DWIDTH - 1 - floor_z_end.v / ortho_dir.z.v + 1);
num mz = num((int)MI.height()) - floor_z;
if(mz < 0)
xmin = max((i32)xmin, mz.v / ortho_dir.z.v);
}
if(xmin) {
vram += xmin;
floor_x += xmin * ortho_dir.x;
floor_z += xmin * ortho_dir.z;
}
/* if(xmin <= xmax)
azuray_floor_line(vram, floor_x, floor_z, ortho_dir.x, ortho_dir.z,
xmax-xmin+1, 0, color1, color2);
*/
for(int x = xmin; x <= xmax; x++) {
int wx = floor_x.ifloor();
int wz = floor_z.ifloor();
*vram = (wx ^ wz) & 1 ? color2 : color1;
floor_x += ortho_dir.x;
floor_z += ortho_dir.z;
vram++;
}
}

View File

@ -29,3 +29,11 @@ void render(MapInterface &MI, vec3 pos, vec3 dir, num depth);
void render_floor(MapInterface &MI, vec3 pos, vec3 dir, num depth);
} /* namespace azuray */
extern "C" {
/* Assembly floor-line renderer; the symbol is exported by src/azuray.S.
   Draws xlen consecutive pixels starting at vram, choosing color1 or color2
   per pixel from the parity of the floor cell, and stepping the floor
   position by (ortho_dir_x, ortho_dir_z) after every pixel.
   - vram: first pixel to write (16-bit pixels)
   - floor_x, floor_z: floor position of the first pixel
   - ortho_dir_x, ortho_dir_z: per-pixel increment of the floor position
   - xlen: number of pixels to draw; the assembly loop writes one pixel
     before testing the counter, so it must be at least 1
   - unnamed int: placeholder; the assembly loads it but does not use it,
     and its header comment lists it as 0
   - color1, color2: the two checkerboard colors */
void azuray_floor_line(uint16_t *vram, num floor_x, num floor_z,
num ortho_dir_x, num ortho_dir_z, int xlen, int, int color1,
int color2);
} /* extern "C" */