/*
 * _bosonx_shader_fsblend: blend a single overlay color onto every pixel of
 * a fragment, in place.
 *
 * Inputs (per the register aliases below):
 *   r4 (_N)     Number of pixels in the fragment.
 *   r5 (_cmd)   Pointer to the command. As read by this routine: the first
 *               2 bytes are skipped, then a 16-bit alpha, then a 32-bit
 *               "color_expanded" value.
 *   r6 (_frag)  Pointer to the fragment's 16-bit pixels.
 *
 * Per pixel, with x = pixel duplicated into both 16-bit halves then ANDed
 * with the expansion mask:
 *   y     = (x + ((alpha * (fg - x)) >> 5)) & mask
 *   pixel = low 16 bits of (y | swap.w(y))
 *
 * Registers: r0-r3, r5-r7 and MACL are used as scratch and overwritten
 * (r6 is advanced through the fragment); r4 is halved. r8, r9 and r10 are
 * saved on the stack and restored before returning.
 *
 * NOTE(review): N is halved and the loop is run twice, so an odd N leaves
 * the last pixel unblended -- presumably callers always pass an even N;
 * confirm against the caller.
 * NOTE(review): the last iteration prefetches the 16-bit word just past
 * the final pixel (the value is never written back) -- confirm that this
 * read is always within valid memory.
 */
.global _bosonx_shader_fsblend
.align 4

/* Register aliases (resolved by the C preprocessor). */
#define _fg r2 /* Overlay color in expanded form ("color_expanded") */
#define _alpha r3 /* Opacity of the colored overlay (0..31) */
#define _N r4 /* Number of pixels in fragment */
#define _cmd r5 /* Command */
#define _frag r6 /* Fragment */
#define _mask r7 /* Pixel expansion mask */

/* Possible optimizations:
   - Remove the huge sts -> EX delay by tiling. This is the only way. */

_bosonx_shader_fsblend:
	/* Preload and duplicate the first pixel while we're setting up */
	mov.w @_frag, r0		/* r0 = first pixel */
	add #2, r5			/* Skip the first 2 bytes of the command */
	mov.l r8, @-r15			/* Save r8 */
	add #-2, _frag			/* Pre-bias: the loop post-increments
					   _frag before storing each pixel */
	mov.w @r5+, _alpha /* alpha */
	swap.w r0, r8			/* swap.w + xtrct leave the pixel in */
	mov.l .mask, _mask
	xtrct r0, r8			/* both 16-bit halves of r8 */
	mov.l @r5+, _fg /* color_expanded */
	mov r8, r0			/* r0 = duplicated first pixel */
	mov.l .xram, r8			/* r8 = scratch address used below to
					   duplicate each following pixel */
	nop
	mov.l r9, @-r15			/* Save r9 */
	mov #-5, r9			/* shld by -5 = logical shift right by 5,
					   i.e. divide by 32 (alpha is 0..31) */

	/* --- */

	/* We use a DSP loop to avoid spending EX cycles on decrementing the
	   counter. However the DSP loop supports only up to 4096 cycles and
	   we have more than that. Do two passes. */
	mov.l r10, @-r15		/* Save r10 */
	shlr _N				/* _N = pixels per pass (N / 2) */
	mov #2, r10			/* r10 = number of passes left */
	ldrs 2f				/* Repeat start: label 2 */
	ldre 3f				/* Repeat end: label 3 */

	/* Start of one pass; the repeat count is reloaded for each pass */
1:	ldrc _N
	nop

	/* --- */

	/* Loop body. On entry r0 holds the current pixel duplicated in both
	   16-bit halves; _frag points one pixel before the current one. */
2:	mov r0, r1 /* x */
	and _mask, r1			/* r1 = x in expanded form */
	mov _fg, r0
	sub r1, r0			/* r0 = fg - x */
	mul.l r0, _alpha		/* MACL = alpha * (fg - x) */
	mov.w @_frag+, r5 /* LS-based add #2, _frag */
					/* (the loaded value is discarded: r5 is
					   overwritten by the sts right below) */
	sts macl, r5
	nop
	/* 2 cycles lost to the sts -> EX delay! Not much can be done without
	   tiling because this loop is already a strict EX chain; everything
	   else parallelizes with it. */
	shld r9, r5			/* r5 = (alpha * (fg - x)) >> 5 */
	mov.w @(2, _frag), r0 /* (next pixel) */
	add r5, r1			/* r1 = x + scaled difference */
	mov.w r0, @r8 /* (next pixel) */
	and _mask, r1			/* Drop bits outside the mask */
	mov.w r0, @(2, r8) /* (next pixel) */
	swap.w r1, r5			/* r5 = r1 with its halves exchanged */
	mov.l @r8, r0 /* (next pixel) */
					/* r0 = next pixel in both halves, read
					   back from the two 16-bit stores */
	or r5, r1			/* Merge both halves of the result */
3:	mov.w r1, @_frag		/* Store the blended pixel in place */

	/* --- */

	/* Run the second pass, if any */
	dt r10
	bf 1b

	/* Restore saved registers and return */
	mov.l @r15+, r10
	mov.l @r15+, r9
	rts
	mov.l @r15+, r8			/* Executed in the rts delay slot */

/* Literal pool */
.balign 4
.mask:	.long 0x07e0f81f	/* Pixel expansion mask: bits 5..10 in the upper
				   half, bits 0..4 and 11..15 in the lower half
				   (presumably G / R+B of RGB565 -- confirm) */
.xram:	.long 0xe500e000	/* Scratch address for pixel duplication
				   (presumably on-chip X RAM, per the label
				   name -- confirm) */