.global _gint_image_p4_loop

/* gint's image renderer: 4-bit indexed entry point

   P4 compacts pixel data further than P8 by restricting values to a 16-color
   palette and packing 2 pixels in each byte. This severely restricts our
   ability to use sub-images because odd positions land within bytes.
   Fortunately, we can solve this by using more edge pixels.

   The simplest way to write a P4 loop is to process 2 pixels from a 2-aligned
   source image position in a single iteration. Other structures don't even
   come close in terms of CPU performance (which, as a reminder, is the main
   bottleneck in Azur but not in gint): selecting nibbles individually is too
   long, while not unrolling is still clearly inefficient. So it becomes very
   important to forcibly align the sub-image on byte-aligned input boundaries
   and stick to that grid.

   Obviously, this approach causes up to one extra pixel to be overwritten on
   each side of every line. We solve this problem by adding *another* edge
   pixel on the left side. In the renderer this is called the left edge or
   edge_1, while the standard one is called right edge or edge_2.

   Register allocation:

   r0:  - (initially: cmd.effect)
   r1:  Number of lines remaining to draw
   r2:  Number of columns per line
   r3:  Input pointer
   r4:  Input stride
   r5:  Output pointer
   r6:  Output stride
   r7:  Right edge pointer
   r8:  - (initially: cmd)
   r9:  - (initially: cmd.loop)
   r10: Left edge pointer

   What this entry point actually hands over when it jumps to cmd.loop:

   r0:  cmd.effect shifted right by one (the VFLIP bit has been consumed)
   r1:  cmd.lines, zero-extended from 8 bits
   r2:  cmd.columns
   r3:  cmd.input
   r4:  cmd.input_stride (negated if VFLIP) minus ceil(columns / 2)
   r5:  cmd.output
   r6:  2 * (output_width - columns)
   r7:  columns / 2 (rounded down)
   r8:  cmd + 20, i.e. just past cmd.input
   r9:  cmd.loop (the jump target)
   r10: the cmd.edge_1 byte, sign-extended

   NOTE(review): r7 and r10 are listed above as edge *pointers*, but on exit
   from this prologue they hold columns/2 and the raw edge_1 byte. Presumably
   the loop at cmd.loop derives the pointers itself -- confirm against the
   loop implementations.

   NOTE(review): r8, r9 and r10 are pushed on the stack here and are not
   popped in this file section; since control leaves through jmp @r9, the
   loop is presumably responsible for restoring them -- confirm.

   Layout note: instructions are written in pairs and several nops are used as
   fillers. This looks like deliberate scheduling for a dual-issue pipeline
   (TODO confirm), so the order should not be changed casually. */

_gint_image_p4_loop:
	/* r4: int output_width (pixels)
	   r5: struct gint_image_cmd *cmd */

	/* Field offsets used below (as implied by the loads): effect at +1,
	   columns at +2, input_stride at +4, lines at +6, edge_1 at +7,
	   loop at +8, output at +12, input at +16. */

	mov.b	@(1,r5), r0	/* cmd.effect (byte at cmd+1) */
	add	#2, r5		/* Skip to cmd.columns */

	mov.w	@r5+, r2	/* cmd.columns */
	mov	r4, r6		/* r6 = output_width */

	mov.l	r8, @-r15	/* Save r8 */
	mov	r5, r8		/* From here on the command is r8 */

	mov.l	r9, @-r15	/* Save r9 */
	sub	r2, r6		/* r6 = output_width - columns */

	mov.w	@r8+, r4	/* cmd.input_stride */
	add	r6, r6		/* Double r6: output stride (presumably in
				   bytes for 2-byte pixels; TODO confirm) */

	mov.b	@r8+, r1	/* cmd.lines */
	nop

	mov.l	r10, @-r15	/* Save r10 */
	extu.b	r1, r1		/* mov.b sign-extends; treat lines as
				   an unsigned 8-bit count */

	mov.b	@r8+, r10	/* cmd.edge_1 */
	nop

	mov.l	@r8+, r9	/* cmd.loop */
	shlr	r0		/* T bit is now VFLIP */

	mov.l	@r8+, r5	/* cmd.output */
	nop

	/* None of the instructions since shlr r0 touch T, so the branch
	   still tests the VFLIP bit. The load below sits in the delay slot
	   and runs on both paths. */
	bf.s	_NO_VFLIP
	mov.l	@r8+, r3	/* cmd.input */

_VFLIP:
	/* Vertical flip: walk the input with a negative stride */
	neg	r4, r4
	nop

_NO_VFLIP:
	/* r7 = columns / 2 and T = columns & 1, so the subc in the jump's
	   delay slot computes r4 -= ceil(columns / 2): the input stride
	   becomes relative to the end of the bytes consumed on a line. */
	mov	r2, r7
	shlr	r7
	jmp	@r9		/* Enter the rendering loop (cmd.loop) */
	subc	r7, r4		/* Delay slot: r4 = r4 - r7 - T */