diff --git a/src/libc/string/target/sh-generic/memchr.S b/src/libc/string/target/sh-generic/memchr.S
index e9556a7..483079c 100644
--- a/src/libc/string/target/sh-generic/memchr.S
+++ b/src/libc/string/target/sh-generic/memchr.S
@@ -1,38 +1,52 @@
+#include <bits/asm/cpucap.h>
+
 .global _memchr
 .type _memchr, @function
 
 _memchr:
-	mov	r4, r0
+	mov	r4, r1
 	exts.b	r5, r5
 
 	/* For small inputs, simply check bytes individually */
 	mov	#64, r2
 	cmp/hi	r6, r2
-	bt	.last
+	bt	.naive
 
-.large:	/* Make a 4-byte version of r5 for cmp/str */
+	/* Make a 4-byte version of r5 for cmp/str */
 	extu.b	r5, r3
 	swap.b	r3, r2
 	or	r3, r2
 	swap.w	r2, r3
 	or	r3, r2
 
+	mov.l	.___cpucap, r0
+	mov.l	@r0, r0
+	tst	#__CPUCAP_SH4ALDSP, r0
+	bf	.sh4aldsp
+
+	/*
+	** Fast memchr() method on SH3:
+	** -> Align to 4 bytes with single-byte reads
+	** -> Then read 4 bytes at a time, and check for r5 with cmp/str
+	** -> Use a somewhat tight longword-based loop with dt
+	*/
+.sh3:
 	/* First check 3 bytes to ensure we don't skip bytes when aligning */
-	mov.b	@r0+, r1
-	cmp/eq	r1, r5
+	mov.b	@r1+, r0
+	cmp/eq	r0, r5
 	bt	.end
-	mov.b	@r0+, r1
-	cmp/eq	r1, r5
+	mov.b	@r1+, r0
+	cmp/eq	r0, r5
 	bt	.end
-	mov.b	@r0+, r1
-	cmp/eq	r1, r5
+	mov.b	@r1+, r0
+	cmp/eq	r0, r5
 	bt	.end
 
 	/* Align to a 4-byte boundary */
-	shlr2	r0
-	shll2	r0
+	shlr2	r1
+	shll2	r1
 	add	r4, r6
-	sub	r0, r6
+	sub	r1, r6
 
 	mov	r6, r7
 	shlr2	r7
@@ -40,29 +54,70 @@ _memchr:
 	and	r3, r6
 
 	/* Read longwords */
-1:	mov.l	@r0+, r1
-	cmp/str	r1, r2
+1:	mov.l	@r1+, r0
+	cmp/str	r0, r2
 	bt	.found
 	dt	r7
 	bf	1b
 
-.last:	/* Don't read if there are no bytes left */
+	/* Finish the last bytes with a naive method */
+	bra	.naive
+	nop
+
+	/*
+	** Fast memchr() method on SH4AL-DSP:
+	** -> Align with an unaligned read
+	** -> Then read 4 bytes at a time, and check for r5 with cmp/str
+	** -> Use an extremely tight loop with the DSP repeat function
+	*/
+.sh4aldsp:
+	/* Check the first 4 bytes with a single unaligned longword read */
+	movua.l	@r1+, r0
+	cmp/str	r0, r2
+	bt	.found
+
+	/* Align to a 4-byte boundary */
+	shlr2	r1
+	shll2	r1
+	add	r4, r6
+	sub	r1, r6
+
+	mov	r6, r7
+	shlr2	r7
+	ldrs	2f
+	ldre	3f
+	ldrc	r7
+	mov	#3, r3
+	and	r3, r6
+
+	/* Read longwords super efficiently */
+2:	mov.l	@r1+, r0
+	cmp/str	r0, r2
+3:	bt	.found
+
+	/* Finish the last few bytes with the naive method */
+
+.naive:	/* Don't read if there are no bytes left */
 	tst	r6, r6
 	bt	.none
 
-2:	mov.b	@r0+, r1
-	cmp/eq	r1, r5
+4:	mov.b	@r1+, r0
+	cmp/eq	r0, r5
 	bt	.end
 	dt	r6
-	bf	2b
+	bf	4b
 
 .none:	rts
 	mov	#0, r0
 
 .found:	/* Go back to find out which of the last 4 bytes is r5 */
-	add	#-4, r0
-	bra	2b
+	add	#-4, r1
+	bra	4b
 	mov	#4, r6
 
-.end:	rts
+.end:	mov	r1, r0
+	rts
 	add	#-1, r0
+
+.___cpucap:
+	.long	___cpucap