gint/src/render-cg/bopti-asm.s
lephe 2e17b77e56 bopti: first fxcg50 version with r5g6b5 and r5g6b5a
This commit introduces bopti for fx-CG 50. Currently the only
interfaces are the bopti_render_{clip,noclip} functions, and the
only supported formats are r5g6b5 and r5g6b5a.

The algorithm for r5g6b5 is optimized to perform longword accesses
using movua.l, whereas the algorithm for r5g6b5a uses plain word
accesses because transparency checks feel more difficult than one
more loop iteration.

These algorithms are still slow for large surfaces and struggle to
sustain 25 FPS in full-screen, so possible improvements using the
DMA should definitely be tested before resorting to overclocking.
2019-08-04 13:59:35 +02:00

139 lines
1.7 KiB
ArmAsm

.global _bopti_r5g6b5
.global _bopti_r5g6b5a
# _bopti_r5g6b5: copy a width x height rectangle of 16-bit pixels from data
# to target, using longword stores on the 4-byte-aligned part of each row.
#
# Arguments (as read by the code below):
#   r4 = data, r5 = target, r6 = width (pixels), r7 = height (rows)
#   stack: in_stride, out_stride (bytes added to data/target between rows)
# target is assumed to be at least 2-byte aligned; data may have any
# alignment since it is read with movua.l. Clobbers r0-r7; r8-r10 are saved.
#
# Each row is copied in three steps: the first pixel and the last pixel are
# copied as words, then every longword that lies between the first 4-aligned
# target address and the end of the row is copied. The word copies cover the
# pixels that the longword loop cannot reach.
#
# NOTE(review): the data pointer moves by 4 * (width >> 1) + in_stride bytes
# per row (same for target/out_stride). For odd widths this is two bytes
# less than a full row of pixels plus in_stride, so the strides passed by the
# caller presumably account for it - confirm against the C caller.
#
# REGISTER ALLOCATION:
# r0: (tmp)
# r1: (tmp)
# r2: byte offset of the last pixel, (width - 1) * 2
# r3: target & 2
# ---
# r4: data
# r5: target
# r6: width; then, the number of longword operations
# r7: height
# ---
# r8: in_stride, adjusted for the row-end fixup
# r9: out_stride, adjusted for the row-end fixup
# r10: x counter
# ---
# @12: in_stride
# @16: out_stride
.align 4
_bopti_r5g6b5:
# Nothing to draw for an empty box. This also protects the dt-based loops
# below, which would wrap around if they started from a count of zero.
	cmp/pl	r6
	bf	.r5g6b5_empty
	cmp/pl	r7
	bf	.r5g6b5_empty

	mov.l	r8, @-r15
	mov.l	r9, @-r15
	mov.l	r10, @-r15

# Target alignment, either 0 (4-aligned) or 2 (2-aligned)
	mov	#2, r3
	and	r5, r3

# Byte offset of the last pixel of a row, used for the trailing word copy
	mov	r6, r2
	add	#-1, r2
	shll	r2

# r1 = width >> 1, the longword count that the row stepping is based on
	mov	r6, r1
	shlr	r1

# r6 = (2 * width - alignment) >> 2, the number of whole longwords between
# the aligned start and the end of the row. It is one less than r1 when the
# target is 2-aligned and the width is even; copying r1 longwords in that
# case would write one pixel past the end of the row.
	shll	r6
	sub	r3, r6
	shlr2	r6

# r1 = bytes that the shortened loop does not step over (0 or 4)
	sub	r6, r1
	shll2	r1

# Input and output strides, minus the alignment step taken at the start of
# each row, plus the bytes skipped by the shortened loop. This keeps the
# start address of every row where the caller's strides put it.
	mov.l	@(12, r15), r8
	mov.l	@(16, r15), r9
	sub	r3, r8
	sub	r3, r9
	add	r1, r8
	add	r1, r9

.r5g6b5_y:
# First pixel of the row (not covered by the loop on a 2-aligned target)
	mov.w	@r4, r1
	mov	r2, r0
	mov.w	r1, @r5

# Last pixel of the row, then move both pointers so that target is 4-aligned
	mov.w	@(r0, r4), r1
	add	r3, r4
	mov.w	r1, @(r0, r5)
	add	r3, r5

# Rows that hold no whole aligned longword are done after the word copies
	tst	r6, r6
	bt	.r5g6b5_next
	mov	r6, r10

.r5g6b5_x:
# Copy longwords; the add sits in the delay slot and runs on every iteration
	movua.l	@r4+, r0
	mov.l	r0, @r5
	dt	r10
	bf.s	.r5g6b5_x
	add	#4, r5

.r5g6b5_next:
# Move to the next row; the target update sits in the delay slot
	add	r8, r4
	dt	r7
	bf.s	.r5g6b5_y
	add	r9, r5

# Restore callee-saved registers; the last pop sits in the rts delay slot
	mov.l	@r15+, r10
	mov.l	@r15+, r9
	rts
	mov.l	@r15+, r8

.r5g6b5_empty:
	rts
	nop
# _bopti_r5g6b5a: copy a width x height rectangle of 16-bit pixels from data
# to target one word at a time, skipping every source pixel that is equal to
# the alpha value.
#
# Arguments (as read by the code below):
#   r4 = data, r5 = target, r6 = width (pixels), r7 = height (rows)
#   stack: in_stride, out_stride (bytes added to data/target after each
#          row of width pixels), alpha (only the low 16 bits are used)
# Clobbers r0-r7; r8 is saved and restored.
#
# REGISTER ALLOCATION:
# r0: (tmp)
# r1: in_stride
# r2: out_stride
# r3: x counter
# ---
# r4: data
# r5: target
# r6: width
# r7: height
# ---
# r8: alpha, sign-extended from 16 bits
# ---
# @4: in_stride
# @8: out_stride
# @12: alpha
.align 4
_bopti_r5g6b5a:
# Nothing to draw for an empty box. This also protects the dt-based loops
# below, which would wrap around if they started from a count of zero.
	cmp/pl	r6
	bf	.r5g6b5a_empty
	cmp/pl	r7
	bf	.r5g6b5a_empty

# Load alpha value. Pixels are loaded with mov.w, which sign-extends them to
# 32 bits, so alpha is sign-extended the same way; otherwise a zero-extended
# alpha with bit 15 set could never compare equal to any pixel.
	mov.l	r8, @-r15
	mov.l	@(12, r15), r8
	exts.w	r8, r8

# Load input and output strides
	mov.l	@(4, r15), r1
	mov.l	@(8, r15), r2

.r5g6b5a_y:
	mov	r6, r3

.r5g6b5a_x:
# Read one pixel and store it unless it is the transparent color
	mov.w	@r4+, r0
	cmp/eq	r0, r8
	bt	.r5g6b5a_alpha
	mov.w	r0, @r5

.r5g6b5a_alpha:
# The target pointer moves on in the delay slot whether or not a pixel was
# written
	dt	r3
	bf.s	.r5g6b5a_x
	add	#2, r5

# Move to the next row; the target update sits in the delay slot
	add	r1, r4
	dt	r7
	bf.s	.r5g6b5a_y
	add	r2, r5

# Restore r8 in the rts delay slot
	rts
	mov.l	@r15+, r8

.r5g6b5a_empty:
	rts
	nop