# Listing metadata (apparently from the page this file was copied from): 106 lines, 2.1 KiB, labelled "ArmAsm". The code itself is SuperH assembly.
.global _CGD_sector_memcmp
.align 4

# A pretty fast memcmp for 512-byte sectors, with equal(0)-different(1) output
#
# In:
#   r4: 32-aligned pointer to sector in RAM (preferably 1-cycle operand bus RAM)
#   r5: 32-aligned pointer to sector in ROM
#   r6: 512 (ignored; for compatibility with memcmp prototype)
# Out:
#   r0: 0 if every compared word matches, 1 as soon as one word differs.
#       Unlike memcmp, no ordering information is returned.
# Clobbers:
#   r0-r3, r7 and the T bit. r4 and r5 are post-incremented by every load,
#   so on return they point past the data that was read.
#
# The length is hard-coded: a head loop of 16 words (32 bytes), followed by
# 30 unrolled "lines" of 8 words (16 bytes) each, i.e. 32 + 480 = 512 bytes.
#
# There are two main ideas in this code:
#
# * Read with words, since such is the affinity of the ROM. (I don't know why.)
#   I tested with longwords, the performance is much worse; bytes are somewhere
#   in-between, which tormented me as I wondered why the most trivial memcmp()
#   with poor assembler from libfxcg was faster than my hand-written function.
#
# * Weave iterations with smart register allocation to exploit superscalar
#   parallelism. We read to r0/r1 while comparing r2/r3, then vice-versa. The
#   two mov.w (LS) for one comparison execute in parallel with the cmp (EX) and
#   bf (BR) of the previous comparison, so overall one comparison takes 2
#   cycles (plus any extra cycles in ROM reads if the cache isn't hit or
#   doesn't respond immediately, and some loop overhead).
#
_CGD_sector_memcmp:
        # For the first 32 bytes, compare as fast as possible to exit early
        # when the sectors don't match (this saves a little bit).
        # r7 counts the 16 words (32 bytes) of this head loop.
        mov     #16, r7
1:      mov.w   @r5+, r0
        mov.w   @r4+, r1
        cmp/eq  r0, r1
        bf      .fail
        dt      r7
        bf      1b

        # r7 counts the 30 remaining lines; each pass through .line loads and
        # compares 8 words (16 bytes) from each sector, 480 bytes in total.
        mov     #30, r7

.line:
        # There is a 2-cycle delay for the RAW dependency between each mov.w
        # and the corresponding use. Here the delay is honored so there are no
        # cycles lost to RAW dependencies.
        #
        # The instructions below are laid out in pairs (one load plus one
        # nop/cmp/bf). Keep this exact order: each cmp/eq consumes the r0/r1
        # or r2/r3 pair loaded just before it, and the next load into that
        # same pair comes only after the compare.

        mov.w   @r5+, r0
        nop

        mov.w   @r4+, r1
        nop

        mov.w   @r5+, r2
        nop

        mov.w   @r4+, r3
        cmp/eq  r0, r1

        mov.w   @r5+, r0
        bf      .fail

        mov.w   @r4+, r1
        cmp/eq  r2, r3

        mov.w   @r5+, r2
        bf      .fail

        mov.w   @r4+, r3
        cmp/eq  r0, r1

        mov.w   @r5+, r0
        bf      .fail

        mov.w   @r4+, r1
        cmp/eq  r2, r3

        mov.w   @r5+, r2
        bf      .fail

        mov.w   @r4+, r3
        cmp/eq  r0, r1

        # These two can run in parallel (BR/EX)
        bf      .fail
        cmp/eq  r2, r3

        bf      .fail

        # Next line, until all 30 have been compared.
        dt      r7
        bf      .line

.success:
        # All 256 words matched. The mov sits in the delay slot of rts, so r0
        # is set before control returns to the caller.
        rts
        mov     #0, r0

.fail:
        # We don't specify an order
        rts
        mov     #1, r0