CGDoom/src-cg/cgdoom-asm.s

106 lines
2.1 KiB
ArmAsm

.global _CGD_sector_memcmp
.align 4
# A pretty fast memcmp for 512-byte sectors, with equal(0)-different(1) output
# r4: 32-aligned pointer to sector in RAM (preferably 1-cycle operand bus RAM)
# r5: 32-aligned pointer to sector in ROM
# r6: 512 (ignored; for compatibility with memcmp prototype)
#
# Returns (r0): 0 if every compared word matches, 1 as soon as a pair of words
# differs. Unlike memcmp, the result carries no ordering information.
#
# Modifies r0-r3 and r7 as scratch, advances r4 and r5 through the
# post-increment loads, and changes the T bit. Nothing is saved or restored.
#
# Work split: a prologue loop checks 16 words (32 bytes) one at a time, then
# the main loop runs 30 times over 8 words (16 bytes) each; together that is
# the full 512 bytes. The length is hard-coded: r6 is never read.
#
# There are two main ideas in this code:
#
# * Read with words, since such is the affinity of the ROM. (I don't know why.)
# I tested with longwords, the performance is much worse; bytes are somewhere
# in-between, which tormented me as I wondered why the most trivial memcmp()
# with poor assembler from libfxcg was faster than my hand-written function.
#
# * Weave iterations with smart register allocation to exploit superscalar
# parallelism. We read to r0/r1 while comparing r2/r3, then vice-versa. The
# two mov.w (LS) for one comparison execute in parallel with the cmp (EX) and
# bf (BR) of the previous comparison, so overall one comparison takes 2
# cycles (plus any extra cycles in ROM reads if the cache isn't hit or
# doesn't respond immediately, and some loop overhead).
#
_CGD_sector_memcmp:
# For the first 32 bytes, compare as fast as possible to exit early
# when the sectors don't match (this saves a little bit).
# r7 counts the 16 single-word comparisons of this prologue loop.
mov #16, r7
1: mov.w @r5+, r0
mov.w @r4+, r1
cmp/eq r0, r1
bf .fail
dt r7
bf 1b
# Main loop counter: 30 iterations of 8 words cover the remaining 480 bytes.
mov #30, r7
# NOTE(review): `.line` below is a label (it is followed by a colon), but it
# shares its name with an assembler directive; consider renaming it for
# readability.
.line:
# There is a 2-cycle delay for the RAW dependency between each mov.w
# and the corresponding use. Here the delay is honored so there are no
# cycles lost to RAW dependencies.
#
# Prime both register pairs: r0/r1 get the first ROM/RAM words of this
# iteration, r2/r3 the second. The nops are part of the original scheduling;
# presumably they pad the load-use delay described above.
mov.w @r5+, r0
nop
mov.w @r4+, r1
nop
mov.w @r5+, r2
nop
mov.w @r4+, r3
# Steady state (six comparisons): each cmp/eq consumes one pair, and the two
# loads placed around the following bf refill that same pair while the other
# pair is compared next. The load between cmp/eq and bf does not touch the
# comparison result, which bf reads from the T bit.
cmp/eq r0, r1
mov.w @r5+, r0
bf .fail
mov.w @r4+, r1
cmp/eq r2, r3
mov.w @r5+, r2
bf .fail
mov.w @r4+, r3
cmp/eq r0, r1
mov.w @r5+, r0
bf .fail
mov.w @r4+, r1
cmp/eq r2, r3
mov.w @r5+, r2
bf .fail
mov.w @r4+, r3
cmp/eq r0, r1
mov.w @r5+, r0
bf .fail
mov.w @r4+, r1
cmp/eq r2, r3
mov.w @r5+, r2
bf .fail
mov.w @r4+, r3
# Tail of the iteration: the last two pairs are compared without issuing any
# further loads, so no word is read ahead across the loop boundary (and
# therefore nothing is read beyond the end of the sector on the final pass).
cmp/eq r0, r1
# These two can run in parallel (BR/EX)
bf .fail
cmp/eq r2, r3
bf .fail
# One 8-word group done; loop until r7 reaches zero.
dt r7
bf .line
# Reached only by falling through after the last iteration (no branch in this
# routine targets .success): every word matched. The mov sits in the rts delay
# slot, so r0 = 0 is in place when the caller resumes.
.success:
rts
mov #0, r0
.fail:
# We don't specify an order: any mismatch reports 1, regardless of which
# sector holds the larger value. As above, the mov fills the rts delay slot.
rts
mov #1, r0