# gint/src/render-cg/bopti-asm.s — SuperH (SH-4) assembly, not ARM
# (the "ArmAsm" tag in the original listing is a misdetection; this file
# uses SH mnemonics such as movua.l, bf.s and dt).
# bopti image rendering routines; full file is 314 lines, 4.1 KiB.

# Exported entry points: one blitting routine per bopti image format
# (opaque 16-bit, 16-bit with a transparent color, 8-bit paletted,
# 4-bit paletted). All render to a 16-bit R5G6B5 target.
.global _bopti_r5g6b5
.global _bopti_r5g6b5a
.global _bopti_p8
.global _bopti_p4
# _bopti_r5g6b5: opaque blit of a 16-bit R5G6B5 image.
#
# In:  r4 = data (source pixels, 2 bytes each)
#      r5 = target (destination, 2- or 4-aligned)
#      r6 = width in pixels, r7 = height in rows
#      Stack (offsets are AFTER the three pushes below):
#      @12: in_stride, @16: out_stride — presumably the inter-row gaps
#      in bytes rather than full row pitches (TODO confirm against caller).
#
# Each row is copied mostly with longword (4-byte) stores aligned on the
# TARGET; the first and last pixels of the row are copied up front as
# 16-bit words so the longword loop can start and stop on 4-byte target
# boundaries. movua.l tolerates an unaligned source address.
#
# REGISTER ALLOCATION:
# r0: (tmp)
# r1: (tmp)
# r2: width - 1 (kept as the byte offset 2*(width-1) of the last pixel)
# r3: target & 2 (leading misalignment of the output row: 0 or 2)
# ---
# r4: data
# r5: target
# r6: width; then, the number of longword operations
# r7: height
# ---
# r8: in_stride
# r9: out_stride
# r10: x counter
# ---
# @12: in_stride
# @16: out_stride
.align 4
_bopti_r5g6b5:
# Target alignment, either 0 (4-aligned) or 2 (2-aligned).
# r8/r9/r10 are callee-saved, hence the pushes (interleaved with the
# ALU work to fill otherwise-wasted slots).
mov.l r8, @-r15
mov r5, r3
mov.l r9, @-r15
mov #2, r0
mov.l r10, @-r15
and r0, r3
# width-1, as a byte offset (2*(width-1)); used to copy the last pixel
mov r6, r2
add #-1, r2
shll r2
# Input and output strides. Add ending alignment (the 0 or 2 bytes left
# over past the last full longword of the row) because there is no
# corresponding increment in the y-loop.
mov.l @(12, r15), r8
mov.l @(16, r15), r9
mov r5, r0
shll r6
add r6 ,r0
and #2, r0
add r0, r8
add r0, r9
# Number of longword operations per row:
# (2*width - leading alignment) / 4, rounding down — a trailing 2-byte
# remainder is covered by the last-pixel word copy in the y-loop
sub r3, r6
shlr2 r6
.r5g6b5_y:
# First pixel of the row, copied as a 16-bit word (needed when the
# target is only 2-aligned; harmlessly redundant when it is 4-aligned)
mov.w @r4, r1
mov r2, r0
mov.w r1, @r5
# Last pixel of the row (same reasoning for the trailing edge); then
# advance r4/r5 past the leading misalignment so the longword loop
# starts 4-aligned on the target
mov.w @(r0, r4), r1
add r3, r4
mov.w r1, @(r0, r5)
add r3, r5
mov r6, r10
.r5g6b5_x:
# Copy longwords; movua.l allows the SOURCE to be unaligned
movua.l @r4+, r0
mov.l r0, @r5
dt r10
bf.s .r5g6b5_x
add #4, r5
# - end of row: skip input stride; delay slot adds the output stride
add r8, r4
dt r7
bf.s .r5g6b5_y
add r9, r5
# - restore callee-saved registers; last pop sits in the rts delay slot
mov.l @r15+, r10
mov.l @r15+, r9
rts
mov.l @r15+, r8
# _bopti_r5g6b5a: blit of a 16-bit R5G6B5 image with one transparent
# color. Each pixel is written to the target unless it equals the
# alpha value, in which case the destination is left untouched.
#
# In:  r4 = data, r5 = target, r6 = width (pixels), r7 = height (rows)
#
# REGISTER ALLOCATION:
# r0: current pixel (tmp)
# r1: x counter
# r2: out_stride
# r3: in_stride
# ---
# r4: data
# r5: target
# r6: width
# r7: height
# ---
# r8: alpha (transparent color value)
# ---
# Stack arguments (offsets are AFTER the r8 push below):
# @4: in_stride
# @8: out_stride
# @12: alpha
.align 4
_bopti_r5g6b5a:
# Save callee-saved r8, then fetch the transparent color value
mov.l r8, @-r15
mov.l @(12, r15), r8
# Fetch output and input strides
mov.l @(8, r15), r2
mov.l @(4, r15), r3
.r5g6b5a_row:
mov r6, r1
.r5g6b5a_pixel:
# Read one pixel; store it only if it is not the transparent color
mov.w @r4+, r0
cmp/eq r0, r8
bt .r5g6b5a_skip
mov.w r0, @r5
.r5g6b5a_skip:
# Advance the target pointer in the delay slot while looping over x
dt r1
bf.s .r5g6b5a_pixel
add #2, r5
# End of row: apply both strides (out_stride in the delay slot)
add r3, r4
dt r7
bf.s .r5g6b5a_row
add r2, r5
# Restore r8 in the rts delay slot and return
rts
mov.l @r15+, r8
# _bopti_p8: blit of an 8-bit paletted image with one transparent index.
# Each source byte is a palette index; the matching 16-bit palette entry
# is written to the target, except for pixels equal to the alpha index,
# which leave the destination untouched.
#
# In:  r4 = data, r5 = target, r6 = width (pixels), r7 = height (rows)
#
# REGISTER ALLOCATION:
# r0: current pixel / palette offset (tmp)
# r1: x counter
# r2: in_stride
# r3: out_stride
# ---
# r4: data
# r5: target
# r6: width
# r7: height
# ---
# r8: alpha (transparent palette index)
# r9: palette (array of 16-bit entries)
# ---
# Stack arguments (offsets are AFTER the two pushes below):
# @8: in_stride
# @12: out_stride
# @16: palette
# @20: alpha
.align 4
_bopti_p8:
# Save callee-saved registers
mov.l r8, @-r15
mov.l r9, @-r15
# Fetch stack arguments: alpha, palette, then the two strides
mov.l @(20, r15), r8
mov.l @(16, r15), r9
mov.l @(8, r15), r2
mov.l @(12, r15), r3
.p8_row:
mov r6, r1
.p8_pixel:
# Read one 8-bit pixel index and zero-extend it
mov.b @r4+, r0
extu.b r0, r0
# Transparent pixels are skipped entirely
cmp/eq r0, r8
bt .p8_skip
# Translate through the palette (2 bytes per entry) and write out
shll r0
mov.w @(r0, r9), r0
mov.w r0, @r5
.p8_skip:
# Advance the target pointer in the delay slot while looping over x
dt r1
bf.s .p8_pixel
add #2, r5
# End of row: apply both strides (out_stride in the delay slot)
add r2, r4
dt r7
bf.s .p8_row
add r3, r5
# Restore callee-saved registers; the last pop sits in the rts delay slot
mov.l @r15+, r9
rts
mov.l @r15+, r8
# _bopti_p4: blit of a 4-bit paletted image with one transparent index.
# Pixels are packed two per byte, high nibble first: the pixel at even
# offset lives in bits 4-7 of its byte, the one at odd offset in bits
# 0-3. Pixels are addressed by an absolute pixel offset (r9) into the
# data so that rows can start on either nibble.
#
# In:  r4 = data, r5 = target, r6 = width (pixels), r7 = height (rows)
#
# REGISTER ALLOCATION:
# r0: (tmp)
# r1: in_stride (in pixels)
# r2: out_stride
# r3: x counter
# ---
# r4: data
# r5: target
# r6: width
# r7: height
# ---
# r8: palette
# r9: number of pixels of offset into data (r4)
# r10: alpha, pre-shifted left by 1 so it compares directly against the
#      palette byte offset (index * 2) computed for each pixel below
# ---
# Stack arguments (offsets are AFTER the three pushes below):
# @12: in_stride
# @16: out_stride
# @20: palette
# @24: alpha
# @28: initial offset (in pixels)
.align 4
_bopti_p4:
mov.l r8, @-r15
mov.l r9, @-r15
mov.l r10, @-r15
# Load palette and in/out strides
mov.l @(20, r15), r8
mov.l @(12, r15), r1
mov.l @(16, r15), r2
# Load initial offset
mov.l @(28, r15), r9
# Load alpha value, doubled to match the 000xxxx0 pixel form below
mov.l @(24, r15), r10
shll r10
.p4_y:
mov r6, r3
.p4_x:
# Load 4 bits from offset r9 (in pixels) within image data (r4). The
# byte offset is r9/2, and [shlr] puts bit 0 of the shifted register
# (the nibble parity) in T. The mov.b in the delay slot executes
# whether or not the branch is taken, so both paths get the byte.
mov r9, r0
add #1, r9
shlr r0
bt.s .p4_x_unaligned
mov.b @(r0, r4), r0
.p4_x_aligned:
# Even pixel offset: take the 4 bits from the higher half of the byte
# and use them to index the palette. Since the palette has two-byte
# entries, we need the color bits in position 000xxxx0, i.e.
# (byte >> 3) & 0x1e. (shlr is a logical shift, so the mov.b sign
# extension is masked away by the and.)
shlr2 r0
shlr r0
and #0x1e, r0
cmp/eq r0, r10
bt .p4_alpha
# Pass pixel through palette; the store executes in the bra delay slot
mov.w @(r0, r8), r0
bra .p4_alpha
mov.w r0, @r5
.p4_x_unaligned:
# Odd pixel offset: take the 4 bits from the lower half of the byte
# into position 000xxxx0, i.e. (byte << 1) & 0x1e.
shll r0
and #0x1e, r0
cmp/eq r0, r10
bt .p4_alpha
# Pass pixel through palette
mov.w @(r0, r8), r0
mov.w r0, @r5
.p4_alpha:
# Advance the target pointer in the delay slot while looping over x
dt r3
bf.s .p4_x
add #2, r5
.p4_y_end:
# End of row: r9 already advanced by width pixels, so adding the
# pixel-count in_stride lands it on the next row's first pixel;
# out_stride is added in the delay slot
add r1, r9
dt r7
bf.s .p4_y
add r2, r5
# - restore callee-saved registers; last pop sits in the rts delay slot
mov.l @r15+, r10
mov.l @r15+, r9
rts
mov.l @r15+, r8