101 lines
1.7 KiB
ArmAsm
101 lines
1.7 KiB
ArmAsm
/* Export the routine's symbol so other objects can reference it. */
.global _bosonx_shader_fsblend
|
|
/* NOTE(review): whether the operand of .align is a byte count or a power of
   two depends on the assembler target - confirm the intended alignment. */
.align 4
|
|
|
|
/* Register aliases used by _bosonx_shader_fsblend. r4, r5 and r6 are read as
   inputs by the routine; r2, r3 and r7 are filled in during its setup. */
#define _fg r2 /* Overlay color in expanded form, loaded from the command (color_expanded) */
|
|
#define _alpha r3 /* Opacity of the colored overlay (0..31), loaded from the command */
|
|
#define _N r4 /* Number of pixels in fragment (input; halved before the loop) */
|
|
#define _cmd r5 /* Command pointer (input); the routine addresses it as r5 directly and later reuses r5 as scratch */
|
|
#define _frag r6 /* Fragment: pointer to the 16-bit pixels, rewritten in place */
|
|
#define _mask r7 /* Pixel expansion mask, loaded from the .mask literal */
|
|
|
|
/* Possible optimizations:
|
|
- Remove the huge sts -> EX delay by tiling. This is the only way. */
|
|
|
|
/* _bosonx_shader_fsblend: blend a constant overlay color over every pixel of
   a fragment, in place.

   Inputs (register aliases from the #defines earlier in this file):
     r4 (_N)     pixel count; it is halved below and the loop is run twice,
                 so 2 * (N >> 1) pixels are processed
     r5          command pointer: 2 bytes are skipped, then a 16-bit alpha
                 and a 32-bit expanded color are read
     r6 (_frag)  pointer to the 16-bit pixels

   Per pixel: the 16-bit value is duplicated into both halves of a register
   and masked with .mask to give x; then
       x' = (x + (((fg - x) * alpha) >> 5)) & mask
   and the two halves of x' are OR-ed together to form the stored pixel.

   r8, r9 and r10 are saved on the stack and restored before returning.
   r0-r7, MACL, T and the repeat-loop registers are overwritten.

   NOTE(review): when N is odd the low bit is dropped by shlr, so the last
   pixel is left unblended - presumably callers always pass an even count;
   confirm.
   NOTE(review): the first iteration's dummy load reads the 2 bytes located
   before the fragment, and every iteration preloads the following pixel, so
   the final iteration reads one pixel past the last one it writes. */
_bosonx_shader_fsblend:
|
|
/* Preload and duplicate the first pixel while we're setting up */
|
|
mov.w @_frag, r0 /* r0 = first pixel */
|
|
add #2, r5 /* skip the first 2 bytes of the command */
|
|
|
|
mov.l r8, @-r15 /* save r8 */
|
|
add #-2, _frag /* compensate for the post-increment at the top of the loop */
|
|
|
|
mov.w @r5+, _alpha /* alpha: 16-bit word at command offset 2 */
|
|
swap.w r0, r8 /* r8 = r0 with its 16-bit halves exchanged */
|
|
|
|
mov.l .mask, _mask /* PC-relative load of the expansion mask */
|
|
xtrct r0, r8 /* r8 = (low half of r0 << 16) | (r8 >> 16): pixel in both halves */
|
|
|
|
mov.l @r5+, _fg /* color_expanded: 32-bit word at command offset 4 */
|
|
mov r8, r0 /* r0 = duplicated first pixel, as the loop expects */
|
|
|
|
mov.l .xram, r8 /* r8 = scratch address from the .xram literal */
|
|
nop
|
|
|
|
mov.l r9, @-r15 /* save r9 */
|
|
mov #-5, r9 /* shld count: negative means logical right shift by 5 */
|
|
|
|
/* --- */
|
|
|
|
/* We use a DSP loop to avoid spending EX cycles on decrementing the
|
|
counter. However the DSP loop supports only up to 4096 cycles and we
|
|
have more than that. Do two passes. */
|
|
|
|
mov.l r10, @-r15 /* save r10 */
|
|
shlr _N /* N >>= 1: iterations per pass (low bit is dropped) */
|
|
|
|
mov #2, r10 /* r10 = number of passes left */
|
|
ldrs 2f /* repeat start = label 2 */
|
|
|
|
ldre 3f /* repeat end = label 3 */
|
|
|
|
|
|
1: ldrc _N /* (re)load the repeat counter for this pass */
|
|
nop /* NOTE(review): presumably required spacing between ldrc and the loop start - confirm */
|
|
|
|
/* --- */
|
|
|
|
/* Loop body. On entry r0 holds the current pixel duplicated in both halves. */
2: mov r0, r1 /* x */
|
|
and _mask, r1 /* x = expanded pixel */
|
|
|
|
mov _fg, r0
|
|
sub r1, r0 /* r0 = fg - x */
|
|
|
|
mul.l r0, _alpha /* MACL = (fg - x) * alpha */
|
|
mov.w @_frag+, r5 /* LS-based add #2, _frag (the loaded value is discarded) */
|
|
|
|
sts macl, r5 /* r5 = (fg - x) * alpha */
|
|
nop
|
|
|
|
/* 2 cycles lost to the sts -> EX delay!
|
|
Not much can be done without tiling because this loop is already a
|
|
strict EX chain; everything else parallelizes with it. */
|
|
|
|
shld r9, r5 /* r5 >>= 5 (logical) */
|
|
mov.w @(2, _frag), r0 /* (next pixel) load it from the fragment */
|
|
|
|
add r5, r1 /* x += ((fg - x) * alpha) >> 5 */
|
|
mov.w r0, @r8 /* (next pixel) write it to the first scratch halfword */
|
|
|
|
and _mask, r1 /* keep only the bits selected by the mask */
|
|
mov.w r0, @(2, r8) /* (next pixel) write it to the second scratch halfword */
|
|
|
|
swap.w r1, r5 /* r5 = x with its halves exchanged */
|
|
mov.l @r8, r0 /* (next pixel) read both halfwords back: duplicated pixel */
|
|
|
|
or r5, r1 /* low half of r1 = both halves of x merged */
|
|
3: mov.w r1, @_frag /* store the blended pixel over the original */
|
|
|
|
/* --- */
|
|
|
|
dt r10 /* one pass done; T = 1 when no pass is left */
|
|
bf 1b /* otherwise reload the counter and run the loop again */
|
|
|
|
mov.l @r15+, r10 /* restore r10 */
|
|
mov.l @r15+, r9 /* restore r9 */
|
|
rts
|
|
mov.l @r15+, r8 /* restore r8 (executes in the rts delay slot) */
|
|
|
|
/* Literal pool, 4-byte aligned for the PC-relative mov.l loads above. */
.balign 4
|
|
/* Expansion mask: 0x07e0 in the high half, 0xf81f in the low half.
   NOTE(review): presumably the pixels are RGB565, making this green in the
   high half and red/blue in the low half - confirm. */
.mask:
|
|
.long 0x07e0f81f
|
|
/* Scratch address used to duplicate the next pixel through memory.
   NOTE(review): presumably on-chip X memory, going by the label name -
   confirm against the platform memory map. */
.xram:
|
|
.long 0xe500e000
|