.global _gint_image_p4_normal
#include "image_macros.S"

/* P4 Opaque rendering, VRAM version: by unrolling without edge pixels.

   This is the most unique function in the renderer, Azur included. A P4 image
   cannot reasonably be decoded on a per-pixel basis because extracting half-
   bytes is too slow. But using edge pixels results in extra write surface
   that makes us slower than bopti in gint 2.7.

   This loop is thus the only one to implement 2-unrolling (no pipeline) while
   manually avoiding the writes that a pair of edge pixels usually fix. Subtle
   adjustments to strides are involved, making this function one of the most
   tricky.

   A slight change is made to the command for the purpose of this function;
   cmd.edge_1 (which is r10) is set to indicate whether the [left] side of the
   box is even (r10=0) or odd (r10=1). This allows us to enter the loop at the
   correct position.

   Registers as used by the code below:

   r0:  [temporary]; on entry to _gint_image_p4_normal, bit 0 selects the
        HFLIP variant of the loop
   r2:  Width in pixels (copied into the column counter for every row)
   r3:  Input pointer, read one byte (two pixels) at a time
   r4:  Adjusted by the stride correction below
        NOTE(review): presumably the input stride applied by END, which is
        defined in image_macros.S and not visible here -- confirm
   r5:  Output pointer, written one 16-bit pixel at a time
   r6:  Adjusted when HFLIP is set
        NOTE(review): presumably the output stride applied by END -- confirm
   r7:  [temporary]
   r8:  On entry, pointer to the command fields read with post-increment
        (palette, then edge_2); afterwards reused as the column counter
   r9:  Palette
   r10: box->left & 1
   r11: [temporary]; saved on the stack by the macro before use */

/* GEN_NORMAL_LOOP: generate the complete rendering loop for one direction.

   HFLIP:   When non-zero, assemble the extra setup that moves r5 forward by
            2*r2 bytes and r6 forward by 4*r2 bytes.
   OUT_DIR: Immediate added to r5 after each pixel (2 or -2 below).
   TMP1:    Scratch register holding the current input byte times two.
   TMP2:    Scratch register holding the shift count, then the nibble mask.
   OFF1:    Displacement from r5 for the first pixel stored from a byte
            (also used by the half-iteration).
   OFF2:    Displacement from r5 for the second pixel stored from a byte.

   The numeric labels 1, 2 and 3 are local labels, so instantiating the macro
   twice is fine. */
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
	mov.l	@r8+, r9	/* cmd.palette */
	add	#-4, r5		/* Better positioning for @(OFF[12], r5) */

	/* The following arithmetic is to decrease r4 if the width is even
	   ((r2 & 1) == 0) and left is odd (r10 = 1), since that means both
	   the first and last pixel load a full byte but use only half.
	   r0 = (r2 ^ 1) & r10 is 1 exactly in that case and 0 otherwise
	   (given that r10 is 0 or 1, as described in the header). */
	mov	r2, r0
	xor	#1, r0

	mov.w	@r8+, r7	/* cmd.edge_2 (don't care) */
	and	r10, r0

	mov.l	r11, @-r15	/* Save r11, which serves as TMP2 below */
	sub	r0, r4

 .if \HFLIP
	/* r0 = 2*r2, then 4*r2: r5 advances by one row of 16-bit pixels so
	   that the row can be written with a negative OUT_DIR.
	   NOTE(review): the r6 adjustment presumably compensates the output
	   stride for the backwards walk -- confirm against END. */
	mov	r2, r0
	shll	r0

	add	r0, r5
	nop

	shll	r0
	nop

	add	r0, r6
	nop
 .endif

	/* Start of one row: reload the column counter with the width.
	   NOTE(review): END presumably branches back to this label for each
	   remaining row -- confirm in image_macros.S. */
1:	mov	r2, r8
	tst	r10, r10	/* Check whether to do an extra half iter. */
	bt	2f
	nop

	/* Additional half-iteration if box->left is odd (r10 = 1): only the
	   low nibble of the first byte is drawn. The palette index is
	   doubled because palette entries are read as 16-bit words. */
	mov.b	@r3+, r0
	shll	r0
	and	#0x1e, r0	/* r0 = 2 * (low nibble) */
	mov.w	@(r0, r9), r0
	dt	r8
	mov.w	r0, @(\OFF1, r5)
	bt.s	3f		/* Row done if that was the only pixel */
	add	#\OUT_DIR, r5	/* (delay slot, executed either way) */

	/* The main loop needs to load pixels in output order. This is not
	   ideal for CPU usage, but we have some margins */
2:	mov.b	@r3+, \TMP1
	mov	#-4, \TMP2	/* Stall */

	shll	\TMP1		/* TMP1 = 2 * byte */
	mov	\TMP1, r0

	shld	\TMP2, r0	/* Negative count: shift right by 4 */
	nop

	and	#0x1e, r0	/* r0 = 2 * (high nibble) */
	mov	#0x1e, \TMP2	/* Stall */

	mov.w	@(r0,r9), r0
	and	\TMP2, \TMP1	/* TMP1 = 2 * (low nibble) */

	dt	r8
	mov.w	r0, @(\OFF1,r5)

	bt.s	3f		/* Odd pixel count left: skip the low nibble */
	add	#\OUT_DIR, r5	/* (delay slot, executed either way) */

	mov	\TMP1, r0
	add	#\OUT_DIR, r5

	dt	r8
	mov.w	@(r0,r9), r0

	bf.s	2b		/* Loop while pixels remain in this row */
	mov.w	r0, @(\OFF2,r5)	/* (delay slot) store the second pixel */

3:	END

	/* Restore r11 (pushed above), then r10.
	   NOTE(review): r10 is not pushed anywhere in this file; presumably
	   it is saved by the caller before jumping here -- confirm. */
	mov.l	@r15+, r11
	mov.l	@r15+, r10
	EPILOGUE
.endm

/* Entry point: bit 0 of r0 clear selects the forward loop (HFLIP=0, output
   pointer moving by +2); bit 0 set branches to the flipped loop (HFLIP=1,
   output pointer moving by -2, with OFF1 and OFF2 swapped accordingly).
   NOTE(review): the first expansion is expected not to fall through into
   label 9, which relies on EPILOGUE returning -- confirm in image_macros.S. */
_gint_image_p4_normal:
	tst	#1, r0
	bf	9f

	GEN_NORMAL_LOOP 0, 2, r7, r11, 4, 2
9:	GEN_NORMAL_LOOP 1, -2, r7, r11, 2, 4