gint/src/render-cg/image/image_p4.S

84 lines
2.1 KiB
ArmAsm

.global _gint_image_p4_loop
/* gint's image renderer: 4-bit indexed entry point
P4 compacts pixel data further than P8 by restricting values to a 16-color
palette and packing 2 pixels in each byte. This severely restricts our
ability to use sub-images because odd positions land within bytes.
Fortunately, we can solve this by using more edge pixels. The simplest way
to write a P4 loop is to process 2 pixels from a 2-aligned source image
position in a single iteration. Other structures don't even come close in
terms of CPU performance (which, as a reminder, is the main bottleneck in
Azur but not in gint): selecting nibbles individually is too long, while not
unrolling is still clearly inefficient. So it becomes very important to
forcibly align the sub-image on byte-aligned input boundaries and stick to
that grid.
Obviously, this approach causes up to one extra pixel to be overwritten on
each side of every line. We solve this problem by adding *another* edge
pixel on the left side. In the renderer this is called the left edge or
edge_1, while the standard one is called right edge or edge_2.
r0: - (initially: cmd.effect)
r1: Number of lines remaining to draw
r2: Number of columns per line
r3: Input pointer
r4: Input stride
r5: Output pointer
r6: Output stride
r7: Right edge pointer
r8: - (initially: cmd)
r9: - (initially: cmd.loop)
r10: Left edge pointer

What this entry point itself does: it unpacks the command structure into the
registers above, pushes r8, r9 and r10 on the stack (in that order), and then
jumps to the routine whose address was loaded from the command into r9.
Nothing is popped here.
NOTE(review): presumably the routine reached through r9 restores r8-r10 and
returns to the caller -- confirm against the loop implementations.
NOTE(review): at the time of the jump r7 holds columns >> 1 and r10 holds the
sign-extended cmd.edge_1 byte, not pointers; the "edge pointer" roles listed
above are presumably established later by the loop -- confirm.

Field offsets as read by the code below (relative to the cmd pointer passed
in r5): +1 effect (byte), +2 columns (word), +4 input_stride (word),
+6 lines (byte), +7 edge_1 (byte), +8 loop (long), +12 output (long),
+16 input (long). */
_gint_image_p4_loop:
/* r4: int output_width (pixels)
r5: struct gint_image_cmd *cmd */
mov.b @(1,r5), r0 /* r0 = cmd.effect (byte at cmd+1, sign-extended) */
add #2, r5 /* r5 = cmd+2, so the post-increment loads below walk the struct */
mov.w @r5+, r2 /* r2 = cmd.columns (word at cmd+2, sign-extended) */
mov r4, r6 /* r6 = output_width; r4 is reused for the input stride below */
mov.l r8, @-r15 /* push r8 (first of three pushes: r8, r9, r10) */
mov r5, r8 /* r8 = cmd+4: r8 becomes the struct cursor */
/* From here on the command is r8; r5 is free and later receives cmd.output */
mov.l r9, @-r15 /* push r9 */
sub r2, r6 /* r6 = output_width - columns */
mov.w @r8+, r4 /* r4 = cmd.input_stride (word at cmd+4, sign-extended) */
add r6, r6 /* r6 = 2 * (output_width - columns); NOTE(review): the doubling
presumably converts pixels to bytes for 2-byte output pixels -- confirm */
mov.b @r8+, r1 /* r1 = cmd.lines (byte at cmd+6) */
nop /* NOTE(review): the nops here look like padding for instruction
pairing/alignment -- confirm before removing any of them */
mov.l r10, @-r15 /* push r10 */
extu.b r1, r1 /* mov.b sign-extends: zero-extend so lines is an unsigned 8-bit count */
mov.b @r8+, r10 /* r10 = cmd.edge_1 (byte at cmd+7, sign-extended) */
nop
mov.l @r8+, r9 /* r9 = cmd.loop (long at cmd+8): target of the final jmp */
shlr r0 /* T bit is now VFLIP (bit 0 of cmd.effect); r0 = effect >> 1 */
mov.l @r8+, r5 /* r5 = cmd.output (long at cmd+12) */
nop
bf.s _NO_VFLIP /* T clear: no vertical flip, skip the stride negation */
mov.l @r8+, r3 /* cmd.input (long at cmd+16); delay slot of bf.s, so this
load runs whether or not the branch is taken */
_VFLIP:
neg r4, r4 /* r4 = -input_stride, only reached when VFLIP is set */
nop
_NO_VFLIP:
mov r2, r7 /* r7 = columns */
shlr r7 /* r7 = columns >> 1; T = columns & 1 (set for an odd column count) */
jmp @r9 /* tail-jump to cmd.loop; the three saved registers are still on the stack */
subc r7, r4 /* delay slot of jmp: r4 = r4 - r7 - T, i.e. the (possibly negated)
input stride minus (columns >> 1) minus one more when columns is odd */