diff --git a/src/libc/string/target/sh-generic/memchr.S b/src/libc/string/target/sh-generic/memchr.S index e9556a7..483079c 100644 --- a/src/libc/string/target/sh-generic/memchr.S +++ b/src/libc/string/target/sh-generic/memchr.S @@ -1,38 +1,52 @@ +#include + .global _memchr .type _memchr, @function _memchr: - mov r4, r0 + mov r4, r1 exts.b r5, r5 /* For small inputs, simply check bytes individually */ mov #64, r2 cmp/hi r6, r2 - bt .last + bt .naive -.large: /* Make a 4-byte version of r5 for cmp/str */ + /* Make a 4-byte version of r5 for cmp/str */ extu.b r5, r3 swap.b r3, r2 or r3, r2 swap.w r2, r3 or r3, r2 + mov.l .___cpucap, r0 + mov.l @r0, r0 + tst #__CPUCAP_SH4ALDSP, r0 + bf .sh4aldsp + + /* + ** Fast memchr() method on SH3: + ** -> Align to 4 bytes with single-byte reads + ** -> Then read 4 bytes at a time, and check for r5 with cmp/str + ** -> Use a somewhat tight longword-based loop with dt + */ +.sh3: /* First check 3 bytes to ensure we don't skip bytes when aligning */ - mov.b @r0+, r1 - cmp/eq r1, r5 + mov.b @r1+, r0 + cmp/eq r0, r5 bt .end - mov.b @r0+, r1 - cmp/eq r1, r5 + mov.b @r1+, r0 + cmp/eq r0, r5 bt .end - mov.b @r0+, r1 - cmp/eq r1, r5 + mov.b @r1+, r0 + cmp/eq r0, r5 bt .end /* Align to a 4-byte boundary */ - shlr2 r0 - shll2 r0 + shlr2 r1 + shll2 r1 add r4, r6 - sub r0, r6 + sub r1, r6 mov r6, r7 shlr2 r7 @@ -40,29 +54,70 @@ _memchr: and r3, r6 /* Read longwords */ -1: mov.l @r0+, r1 - cmp/str r1, r2 +1: mov.l @r1+, r0 + cmp/str r0, r2 bt .found dt r7 bf 1b -.last: /* Don't read if there are no bytes left */ + /* Finish the last bytes with a naive method */ + bra .naive + nop + + /* + ** Fast memchr() method on SH4AL-DSP: + ** -> Align with an unaligned read + ** -> Then read 4 bytes at a time, and check for r5 with cmp/str + ** -> Use an extremely tight loop with the DSP repeat function + */ +.sh4aldsp: + /* Check a couple of unaligned bytes first */ + movua.l @r1+, r0 + cmp/str r0, r2 + bt .found + + /* Align to a 4-byte 
boundary */ + shlr2 r1 + shll2 r1 + add r4, r6 + sub r1, r6 + + mov r6, r7 + shlr2 r7 + ldrs 2f + ldre 3f + ldrc r7 + mov #3, r3 + and r3, r6 + + /* Read longwords super efficiently */ +2: mov.l @r1+, r0 + cmp/str r0, r2 +3: bt .found + + /* Finish the last few bytes with the naive method */ + +.naive: /* Don't read if there are no bytes left */ tst r6, r6 bt .none -2: mov.b @r0+, r1 - cmp/eq r1, r5 +4: mov.b @r1+, r0 + cmp/eq r0, r5 bt .end dt r6 - bf 2b + bf 4b .none: rts mov #0, r0 .found: /* Go back to find out which of the last 4 bytes is r5 */ - add #-4, r0 - bra 2b + add #-4, r1 + bra 4b mov #4, r6 -.end: rts +.end: mov r1, r0 + rts add #-1, r0 + +.___cpucap: + .long ___cpucap