/* Source: forked from Lephenixnoir/Azur.
   Repository viewer metadata: 144 lines, 2.6 KiB, highlighted as "ArmAsm". */
.global _azrp_shader_tex2d
.align 4

/* TODO [scaling]: Pass the _792 constant and fragment address as uniform */

/* _azrp_shader_tex2d: entry sequence of the tex2d shader.

   Reads the command parameters through r5 (skipping its first 2 bytes),
   computes the output pointer and the output stride, then picks a copy
   variant: widths of 8 columns or less branch to .naive, larger widths fall
   through to .case_analysis. In both cases r0 = 2 when this sequence ends.

   Register assignment
     r0: (temporary)
     r1: Lines
     r2: Columns
     r3: Input
     r4: Output
     r5: Command queue; (temporary)
     r6: (temporary)
     r7: Output stride
     r8: Input stride

   r8 is pushed on the stack here and popped by TEX2D_END() in the delay slot
   of the final rts. */
_azrp_shader_tex2d:
	mov.w	_792, r7
	add	#2, r5		/* Skip 2 bytes that are not read here */

	mov.w	@r5+, r2	/* Columns */

	mov.l	r8, @-r15	/* Save r8; restored by TEX2D_END() */

	mov.w	@r5+, r6	/* Input (1/2) */
	sub	r2, r7

	mov.w	@r5+, r3	/* Input (2/2) */
	sub	r2, r7		/* r7 = 792 - 2 * columns */

	mov.w	@r5+, r4	/* Output offset */

	mov.w	@r5+, r1	/* Lines */
	shll16	r3

	xtrct	r6, r3		/* r3 = (Input 1/2) << 16 | (Input 2/2) */
	mov.l	.fragment, r6	/* r6 = address of _azrp_frag */

	mov.w	@r5+, r8	/* Input stride */
	mov	#8, r0		/* Maximum width for naive method */

	add	r6, r4		/* r4 = _azrp_frag + output offset */
	cmp/ge	r2, r0		/* T = 1 iff columns <= 8 */

	bt.s	.naive
	 mov	#2, r0		/* Delay slot: parity mask for .case_analysis */
/* The following variations are named based on the parity of each parameter:
   * w[eo] (width even, width odd)
   * d[eo] (data even, data odd)
   where even/odd means 4-aligned/2-aligned in terms of pointers.

   When the destination and source have identical parity, the copy is pretty
   direct and takes 2 cycles to copy 4 bytes. When they have opposite parity
   however, longwords need to be rearranged, which is a problem: arithmetic
   operations under a RAW dependency take 3 cycles, so there's no way to
   complete the 4-byte copy in less than 4 cycles unless iterations are opened
   and weaved, which would add too many sub-cases. So in this case the naive
   method that copies 4 bytes in 4 cycles is used. A very heavy image renderer
   like a tileset shader should consider the optimized route though. */
/* TEX2D_START(): opens the per-line loop shared by every copy variant.
   Points the repeat start/end at the next local labels 2: and 3: (which the
   caller must define right after the macro), then defines local label 1:,
   the per-line entry that TEX2D_END() branches back to. At 1: the repeat
   count is loaded from r2 and the line counter r1 is decremented; dt sets T
   when r1 reaches zero, and TEX2D_END() tests that T bit. None of the copy
   instructions used between the two macros in this file modify T. */
#define TEX2D_START() \
	ldrs	2f; \
	ldre	3f; \
1:	ldrc	r2; \
	dt	r1;
/* TEX2D_END(): closes the per-line loop opened by TEX2D_START() and returns.
   Adds the output stride r7 to r4, then branches back to local label 1:
   while T is clear (T comes from the dt in TEX2D_START()). The input stride
   r8 is added to r3 in the bf.s delay slot, so it runs on every line,
   including the last one. Once T is set, returns to the caller and pops the
   saved r8 from the stack in the rts delay slot. */
#define TEX2D_END() \
	add	r7, r4; \
	bf.s	1b; \
	add	r8, r3; \
	rts; \
	mov.l	@r15+, r8
/* Variant selection for widths above 8 columns. Entered by fall-through
   from the entry sequence with r0 = 2. Falls through to .we when the width
   is even. */
.case_analysis:
	/* Use naive method for opposite source/destination parity */
	mov	r4, r6
	xor	r3, r6
	tst	r0, r6		/* T = 1 iff bit 1 of r3 and r4 agree */
	bf	.naive

	shlr	r2		/* r2 = columns / 2; T = 1 iff columns is odd */
	bt	.wo
/* Even width. Here r2 = columns / 2 (longwords per line) and r0 = 2; input
   and output already have the same parity, so only r4 is tested. */
.we:
	tst	r0, r4		/* T = 1 iff bit 1 of r4 is clear */
	bf	.we_do

/* Width even, data even: each line is copied as r2 longwords. */
.we_de:
	TEX2D_START()
2:	movs.l	@r3+, x0
3:	movs.l	x0, @r4+
	TEX2D_END()

/* Width even, data odd: each line is copied as one leading word, r2 - 1
   longwords in the repeat loop, then one trailing word. */
.we_do:
	add	#-1, r2		/* One longword is split into the two words */

	TEX2D_START()
	movs.w	@r3+, x0
	movs.w	x0, @r4+

2:	movs.l	@r3+, x0
3:	movs.l	x0, @r4+

	movs.w	@r3+, x0
	movs.w	x0, @r4+
	TEX2D_END()
/* Odd width. Here r2 = columns / 2 rounded down (longwords per line) and
   r0 = 2; input and output already have the same parity, so only r4 is
   tested. */
.wo:
	tst	r0, r4		/* T = 1 iff bit 1 of r4 is clear */
	bf	.wo_do

/* Width odd, data even: each line is copied as r2 longwords in the repeat
   loop, then one trailing word. */
.wo_de:
	TEX2D_START()
2:	movs.l	@r3+, x0
3:	movs.l	x0, @r4+

	movs.w	@r3+, x0
	movs.w	x0, @r4+
	TEX2D_END()

/* Width odd, data odd: each line is copied as one leading word, then r2
   longwords in the repeat loop. */
.wo_do:
	TEX2D_START()
	movs.w	@r3+, x0
	movs.w	x0, @r4+

2:	movs.l	@r3+, x0
3:	movs.l	x0, @r4+
	TEX2D_END()
/* Naive method for small widths and opposite source/destination parity.
   Both branches to this label are taken before r2 is halved, so r2 is still
   the column count: each line is copied as r2 16-bit words. */
.naive:
	TEX2D_START()
2:	movs.w	@r3+, x0
3:	movs.w	x0, @r4+
	TEX2D_END()
/* Literals loaded by the entry sequence of _azrp_shader_tex2d. */
.align 4
.fragment:
	.long	_azrp_frag	/* Base address added to the output offset */
_792:
	.word	792		/* Output stride base: r7 = 792 - 2 * columns */