#include <bits/asm/cpucap.h>

.global _memchr
.type _memchr, @function

/*
** void *memchr(void const *s, int c, size_t n)
**
** Registers, as used by the code below:
**   In:    r4 = s, r5 = c, r6 = n
**   Out:   r0 = address of the first byte equal to (unsigned char)c among
**               the first n bytes of s, or 0 if there is none
**   Clobb: r0-r7, T, plus RS/RE/RC on the SH4AL-DSP path
**
** Internal roles:
**   r1 = read cursor (post-incremented, so a match is at r1 - 1)
**   r2 = c replicated in all 4 bytes, for cmp/str
**   r5 = c sign-extended, to compare with the sign-extending mov.b
**   r6 = number of bytes left for the byte-by-byte loop
**   r7 = number of longwords left for the longword loops
*/
_memchr:
	mov	r4, r1
	exts.b	r5, r5

	/* For small inputs (n < 64), simply check bytes individually */
	mov	#64, r2
	cmp/hi	r6, r2
	bt	.naive

	/* Make a 4-byte version of r5 for cmp/str */
	extu.b	r5, r3
	swap.b	r3, r2
	or	r3, r2
	swap.w	r2, r3
	or	r3, r2

	/* Select the method based on the CPU capabilities */
	mov.l	.___cpucap, r0
	mov.l	@r0, r0
	tst	#__CPUCAP_SH4ALDSP, r0
	bf	.sh4aldsp

	/*
	** Fast memchr() method on SH3:
	** -> Align to 4 bytes with single-byte reads
	** -> Then read 4 bytes at a time, and check for r5 with cmp/str
	** -> Use a somewhat tight longword-based loop with dt
	*/
.sh3:
	/* First check 3 bytes to ensure we don't skip bytes when aligning */
	mov.b	@r1+, r0
	cmp/eq	r0, r5
	bt	.end
	mov.b	@r1+, r0
	cmp/eq	r0, r5
	bt	.end
	mov.b	@r1+, r0
	cmp/eq	r0, r5
	bt	.end

	/* Align down to a 4-byte boundary; r6 = bytes left from there */
	shlr2	r1
	shll2	r1
	add	r4, r6
	sub	r1, r6

	/* r7 = full longwords left (non-zero since n >= 64), r6 = tail bytes */
	mov	r6, r7
	shlr2	r7
	mov	#3, r3
	and	r3, r6

	/* Read longwords */
1:	mov.l	@r1+, r0
	cmp/str	r0, r2
	bt	.found
	dt	r7
	bf	1b

	/* Finish the last bytes with a naive method */
	bra	.naive
	nop

	/*
	** Fast memchr() method on SH4AL-DSP:
	** -> Align with an unaligned read
	** -> Then read 4 bytes at a time, and check for r5 with cmp/str
	** -> Use an extremely tight loop with the DSP repeat function
	*/
.sh4aldsp:
	/* Check the first 4 bytes with a single unaligned read */
	movua.l	@r1+, r0
	cmp/str	r0, r2
	bt	.found

	/* Align down to a 4-byte boundary; r6 = bytes left from there */
	shlr2	r1
	shll2	r1
	add	r4, r6
	sub	r1, r6

	/* r7 = full longwords left (non-zero since n >= 64), r6 = tail bytes.
	   r4 is no longer needed and serves as scratch below. */
	mov	r6, r7
	shlr2	r7
	mov	#3, r3
	and	r3, r6

	/* The repeat counter RC is only 12 bits wide, so ldrc would silently
	   truncate larger counts and the loop would stop early. Run the
	   repeat loop in chunks of at most r3 = 4095 longwords instead. */
	mov	#16, r3
	shll8	r3
	add	#-1, r3

	ldrs	2f
	ldre	3f

	/* r4 = min(r7, 4095) longwords for this chunk; always at least 1 */
5:	mov	r7, r4
	cmp/hi	r3, r7
	bf	6f
	mov	r3, r4
6:	ldrc	r4
	/* As before, keep two instructions between ldrc and the loop start */
	sub	r4, r7
	nop

	/* Read longwords super efficiently */
2:	mov.l	@r1+, r0
	cmp/str	r0, r2
3:	bt	.found

	/* Start another chunk if there are still longwords to read */
	tst	r7, r7
	bf	5b

	/* Finish the last few bytes with the naive method */

.naive:	/* Don't read if there are no bytes left */
	tst	r6, r6
	bt	.none

4:	mov.b	@r1+, r0
	cmp/eq	r0, r5
	bt	.end
	dt	r6
	bf	4b

	/* No match: return NULL (set in the delay slot of rts) */
.none:	rts
	mov	#0, r0

.found:	/* Go back to find out which of the last 4 bytes is r5 */
	add	#-4, r1
	bra	4b
	mov	#4, r6

	/* Match: r1 was post-incremented past it, so return r1 - 1 */
.end:	mov	r1, r0
	rts
	add	#-1, r0

	/* PC-relative mov.l requires its literal to be 4-byte aligned */
	.balign	4
.___cpucap:
	.long	___cpucap