# CGDoom/src-cg/cgdoom-asm.s

.global _CGD_sector_memcmp
.align 4

# A pretty fast memcmp for 512-byte sectors, with equal(0)-different(1) output
# r4: 32-aligned pointer to sector in RAM (preferably 1-cycle operand bus RAM)
# r5: 32-aligned pointer to sector in ROM
# r6: 512 (ignored; for compatibility with memcmp prototype)
#
# Returns r0 = 0 when all 512 bytes match, or r0 = 1 as soon as a mismatching
# 16-bit word is found. Unlike memcmp, the result carries no ordering
# information.
#
# Clobbers r0-r3 and r7. r4 and r5 are post-incremented past every word that
# was read, and the T bit is overwritten. r6 is never read and the stack is
# not used.
#
# NOTE(review): every access below is a 16-bit mov.w, so the code itself only
# depends on 2-byte alignment; "32-aligned" presumably means 32-bit aligned --
# confirm against the callers.
#
# Byte budget: 16 words (32 bytes) in the early-exit loop, then 30 iterations
# of 8 words (16 bytes) in the woven loop, i.e. 32 + 480 = 512 bytes.
#
# There are two main ideas in this code:
#
# * Read with words, since such is the affinity of the ROM. (I don't know why.)
#   I tested with longwords, the performance is much worse; bytes are somewhere
#   in-between, which tormented me as I wondered why the most trivial memcmp()
#   with poor assembler from libfxcg was faster than my hand-written function.
#
# * Weave iterations with smart register allocation to exploit superscalar
#   parallelism. We read to r0/r1 while comparing r2/r3, then vice-versa. The
#   two mov.w (LS) for one comparison execute in parallel with the cmp (EX) and
#   bf (BR) of the previous comparison, so overall one comparison takes 2
#   cycles (plus any extra cycles in ROM reads if the cache isn't hit or
#   doesn't respond immediately, and some loop overhead).
#
_CGD_sector_memcmp:
	# For the first 32 bytes, compare as fast as possible to exit early
	# when the sectors don't match (this saves a little bit).
	# r7 counts the 16-bit words left in this simple, non-woven loop.
	mov	#16, r7
1:	mov.w	@r5+, r0
	mov.w	@r4+, r1
	cmp/eq	r0, r1
	bf	.fail
	# dt decrements r7 and sets T once it reaches zero; loop while T is
	# still clear.
	dt	r7
	bf	1b

	# The remaining 480 bytes take 30 passes through the woven loop, each
	# pass comparing 8 words (16 bytes).
	mov	#30, r7

.line:
	# There is a 2-cycle delay for the RAW dependency between each mov.w
	# and the corresponding use. Here the delay is honored so there are no
	# cycles lost to RAW dependencies.

	# Pipeline fill: nothing has been loaded yet, so a nop takes the place
	# that cmp/eq or bf occupies next to each load in the steady state.
	mov.w	@r5+, r0
	nop

	mov.w	@r4+, r1
	nop

	mov.w	@r5+, r2
	nop

	# Steady state: each load is followed by either the cmp/eq of the pair
	# that was loaded before it, or the bf that acts on that cmp/eq. The
	# loads leave T untouched, so a bf that comes after a load still tests
	# the result of the preceding cmp/eq. The register pairs alternate
	# between r0/r1 and r2/r3.
	mov.w	@r4+, r3
	cmp/eq	r0, r1

	mov.w	@r5+, r0
	bf	.fail

	mov.w	@r4+, r1
	cmp/eq	r2, r3

	mov.w	@r5+, r2
	bf	.fail

	mov.w	@r4+, r3
	cmp/eq	r0, r1

	mov.w	@r5+, r0
	bf	.fail

	mov.w	@r4+, r1
	cmp/eq	r2, r3

	mov.w	@r5+, r2
	bf	.fail

	mov.w	@r4+, r3
	cmp/eq	r0, r1

	mov.w	@r5+, r0
	bf	.fail

	mov.w	@r4+, r1
	cmp/eq	r2, r3

	mov.w	@r5+, r2
	bf	.fail

	mov.w	@r4+, r3
	cmp/eq	r0, r1

	# Pipeline drain: all 8 words of this pass are loaded; only the checks
	# for the last two pairs (r0/r1, then r2/r3) remain.
	# These two can run in parallel (BR/EX)
	bf	.fail
	cmp/eq	r2, r3

	bf	.fail

	# Next 16-byte line, until r7 has counted down to zero.
	dt	r7
	bf	.line

.success:
	# All 512 bytes matched. The mov sits in the rts delay slot, so r0 is
	# set before control returns to the caller.
	rts
	mov	#0, r0

.fail:
	# We don't specify an order
	# (any mismatch returns 1, set in the rts delay slot; there is no
	# signed less-than/greater-than result as with memcmp).
	rts
	mov	#1, r0