BosonX/src/fsblend.S

101 lines
1.7 KiB
ArmAsm

/* Export the shader entry point and align the code that follows. */
.global _bosonx_shader_fsblend
.align 4
/* Register aliases used by _bosonx_shader_fsblend (expanded by the C
   preprocessor). r2, r3 and r7 are filled in by the routine itself; r4, r5
   and r6 are read as inputs on entry. */
#define _fg r2 /* Overlay color, loaded from the command (color_expanded) */
#define _alpha r3 /* Opacity of the colored overlay (0..31) */
#define _N r4 /* Number of pixels in fragment */
/* NOTE: the routine never uses the _cmd alias; it addresses the command as
   plain r5 and later reuses r5 as a scratch register inside the loop. */
#define _cmd r5 /* Command */
#define _frag r6 /* Fragment */
#define _mask r7 /* Pixel expansion mask */
/* Possible optimizations:
- Remove the huge sts -> EX delay by tiling. This is the only way. */
/* _bosonx_shader_fsblend: blend the pixels of a fragment, in place, towards
   the overlay color _fg with weight _alpha/32.

   Inputs, as read by the code below:
     r4 (_N)     pixel count. It is halved and used as the repeat count of
                 each of the two passes, so 2*(_N/2) pixels are processed;
                 with an odd count the last pixel is left untouched.
                 NOTE(review): presumably callers always pass an even count
                 whose half fits in the repeat counter -- confirm.
     r5          command pointer: a 16-bit alpha is read at offset 2 and a
                 32-bit color (color_expanded) at offset 4.
     r6 (_frag)  pointer to the 16-bit pixels.

   Per pixel, with P = the pixel duplicated in both halves of a longword:
     x   = P & _mask
     x   = (x + (((_fg - x) * _alpha) >> 5)) & _mask
     out = low 16 bits of (x | swap.w(x))

   Register and memory effects:
     - r8, r9 and r10 are saved on the stack and restored before returning.
     - r0-r7, MACL, the T bit and the repeat-loop registers are overwritten.
     - The longword at the address stored in .xram is used as scratch.
     - The first iteration performs a dummy load of the halfword just before
       the fragment, and the last iteration preloads the halfword just after
       the last processed pixel. Both values are discarded, but the memory
       must be readable. */
_bosonx_shader_fsblend:
/* Preload and duplicate the first pixel while we're setting up */
mov.w @_frag, r0 /* r0 = first pixel */
add #2, r5 /* skip the first 2 bytes of the command */
mov.l r8, @-r15 /* save r8 */
add #-2, _frag /* step back one pixel; the loop's @_frag+ load undoes this */
mov.w @r5+, _alpha /* alpha */
swap.w r0, r8 /* r8 = r0 with its 16-bit halves exchanged */
mov.l .mask, _mask
xtrct r0, r8 /* r8 = pixel:pixel (same 16 bits in both halves) */
mov.l @r5+, _fg /* color_expanded */
mov r8, r0 /* r0 = duplicated first pixel, as the loop expects */
mov.l .xram, r8 /* r8 = scratch address used to duplicate later pixels */
nop
mov.l r9, @-r15 /* save r9 */
mov #-5, r9 /* shld count: negative means shift right, here by 5 */
/* --- */
/* We use a DSP loop to avoid spending EX cycles on decrementing the
counter. However the DSP loop only supports a repeat count below 4096
iterations and we have more pixels than that. Do two passes of _N/2
iterations each. */
mov.l r10, @-r15 /* save r10 */
shlr _N /* _N = iterations per pass (pixel count / 2) */
mov #2, r10 /* r10 = number of passes left */
ldrs 2f /* hardware loop starts at label 2 */
ldre 3f /* hardware loop ends at label 3 */
1: ldrc _N /* (re)load the repeat counter for this pass */
nop
/* --- */
/* Loop invariant at label 2: r0 = current pixel duplicated in both halves,
   _frag = address of the previous pixel. */
2: mov r0, r1 /* x */
and _mask, r1 /* x = duplicated pixel & mask */
mov _fg, r0
sub r1, r0 /* r0 = _fg - x */
mul.l r0, _alpha /* MACL = (_fg - x) * _alpha */
mov.w @_frag+, r5 /* LS-based add #2, _frag (loaded value is discarded) */
sts macl, r5 /* r5 = product */
nop
/* 2 cycles lost to the sts -> EX delay!
Not much can be done without tiling because this loop is already a
strict EX chain; everything else parallelizes with it. */
shld r9, r5 /* r5 = product >> 5 */
mov.w @(2, _frag), r0 /* (next pixel) load it */
add r5, r1 /* x += scaled difference */
mov.w r0, @r8 /* (next pixel) write it to the first scratch halfword */
and _mask, r1 /* drop bits that spilled outside the masked fields */
mov.w r0, @(2, r8) /* (next pixel) write it to the second scratch halfword */
swap.w r1, r5 /* r5 = x with its halves exchanged */
mov.l @r8, r0 /* (next pixel) read back both halves: r0 = pixel:pixel */
or r5, r1 /* merge both halves; low 16 bits = blended pixel */
3: mov.w r1, @_frag /* store the result over the source pixel */
/* --- */
dt r10 /* one pass done; T = 1 once no pass is left */
bf 1b /* otherwise run the second pass */
mov.l @r15+, r10 /* restore r10 */
mov.l @r15+, r9 /* restore r9 */
rts
mov.l @r15+, r8 /* (delay slot) restore r8 */
/* Literal pool for the PC-relative mov.l loads in _bosonx_shader_fsblend. */
.balign 4
/* Mask applied to a pixel duplicated in both halves of a longword: the
   upper half keeps bits 5-10 and the lower half keeps bits 0-4 and 11-15,
   so each kept field has zeroed bits above it.
   NOTE(review): for RGB565 pixels this is G in the upper half and R/B in
   the lower half -- presumably the fragment's pixel format; confirm. */
.mask:
.long 0x07e0f81f
/* Address of the scratch longword the loop uses to duplicate each pixel
   (two halfword stores followed by one longword load).
   NOTE(review): the label suggests on-chip X RAM -- confirm against the
   target's memory map and check that nothing else uses this location. */
.xram:
.long 0xe500e000