.global _gint_image_p8_swapcolor
#include "image_macros.S"

/* P8 SWAPCOLOR, RAM version: by branchless xor selection.

   The core action of this loop is to render full pixels while replacing any
   occurrence of cmd.color_1 (x) with the value cmd.color_2 (y). As is often
   the case, branching is too slow, so instead we use the fact that both x and
   y are fixed to apply the identity

     c ^ ((x ^ y) & -(c == x)) = (c == x ? y : c)

   We materialize -(c == x) by subtracting a register from itself with subc
   after the comparison (which is delightfully elegant), while (x ^ y) is
   pre-computed before the loop.

   This way, the selection is performed in one [subc], one [and] and one [xor]
   for a total of 3 EX slots. This is slower than NULL-cancelling (which only
   takes 2 EX slots) but still better than symmetric alternatives.

   Since we have a palette, we use one further trick: we compare against the
   index but select against the palette entry, ie. we do

     palette[c] ^ ((palette[x] ^ y) & -(c == x)) = (c == x ? y : palette[c])

   which allows the computation to occur in parallel with the palette access
   and does not require the replacement value to be located at a valid index.

   Register roles inside the loop:
   r0:  [temporary]
   r7:  cmd.color_1 (sign-extended from 8 bits)
   r8:  palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
   r9:  Palette
   r10: Holds (x ^ y) & -(c == x) during selection

   Registers r1..r6 are not set up in this file.
   NOTE(review): from their use below, r3 is the input pixel pointer and r5
   the output pixel pointer; r2 looks like the row width in pixels and r6 like
   an output stride consumed by END -- confirm against image_macros.S and the
   caller. */

/* GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR
   Emits the complete setup + pixel loop + exit sequence for one variant.

   HFLIP:   assemble-time flag; when non-zero, the output pointer and r6 are
            adjusted before the loop (see the .if block below).
   OUT_DIR: immediate added to r5 after each pixel is stored (2 or -2 in the
            expansions at the bottom of this file, ie. one 16-bit pixel
            forward or backward).

   On entry r8 points to the command fields read below, in the order
   palette (32-bit), edge_2 (16-bit), color_1 (16-bit), color_2 (16-bit).
   r10 is pushed on the stack for the duration of the loop and popped before
   EPILOGUE.

   NOTE(review): START, END and EPILOGUE come from image_macros.S and are not
   visible here. Local labels 2 and 3 are not referenced in this file, so they
   are presumably consumed by those macros to delimit the inner loop; do not
   move or rename them without checking. */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR
	mov.l	@r8+, r9	/* cmd.palette */
	mov.w	@r8+, r0	/* cmd.edge_2 (don't care); read only to advance r8 */

	mov.w	@r8+, r7	/* cmd.color_1 */
	mov.l	r10, @-r15	/* Save r10, which the loop uses as the mask */

	/* Sign-extend color_1 from 8 bits so that it compares equal to the
	   pixel indices, which mov.b loads sign-extended in the loop */
	exts.b	r7, r7
	mov	r7, r0

	mov.w	@r8, r8		/* cmd.color_2 */
	add	r0, r0		/* r0 = 2 * color_1: byte offset of a 16-bit entry */

	mov.w	@(r0, r9), r0	/* r0 = palette[color_1] */
	xor	r0, r8		/* r8 = palette[color_1] ^ color_2 (ie. x ^ y) */

	.if \HFLIP
	/* Net effect: r5 += 2 * r2 - 2 and r6 += 4 * r2.
	   NOTE(review): if r2 is the row width in pixels, this moves r5 to
	   the last 16-bit pixel of the output row and compensates r6 for
	   writing each row backwards -- confirm against END. */
	add	#-2, r5
	mov	r2, r0
	shll	r0		/* r0 = 2 * r2 */
	add	r0, r5
	shll	r0		/* r0 = 4 * r2 */
	add	r0, r6
	.endif

	START

	/* Per-pixel body: c = next input byte, output palette[c] unless
	   c == color_1, in which case output color_2 */
2:	mov.b	@r3+, r0	/* r0 = c (sign-extended), advance input */
	cmp/eq	r0, r7		/* T = (c == color_1) */

	add	r0, r0		/* r0 = 2 * c; plain add leaves T untouched */
	subc	r10, r10	/* r10 = r10 - r10 - T = -(c == color_1) */

	mov.w	@(r0, r9), r0	/* r0 = palette[c] */
	and	r8, r10		/* r10 = (x ^ y) & -(c == x) */

	xor	r10, r0		/* r0 = (c == x ? y : palette[c]) */
	mov.w	r0, @r5		/* Store the output pixel */

3:	add	#\OUT_DIR, r5	/* Step the output pointer by OUT_DIR bytes */
	END

	mov.l	@r15+, r10	/* Restore the caller's r10 */
	EPILOGUE
.endm

/* Entry point: selects the loop variant from bit 0 of r0.
   tst sets T when (r0 & 1) == 0 and bf branches when T is clear, so a set
   bit 0 selects the HFLIP variant at label 9 (output written backwards) and
   a clear bit 0 falls into the forward variant.
   NOTE(review): r0 is presumably a flags value provided by the caller, and
   EPILOGUE is expected to return so that the first expansion does not fall
   through into the second -- confirm both in image_macros.S / the caller. */
_gint_image_p8_swapcolor:
	tst	#1, r0
	bf	9f

	GEN_SWAPCOLOR_LOOP 0, 2
9:	GEN_SWAPCOLOR_LOOP 1, -2