/* NOTE(review): the header name after #include was missing from this copy of
   the file. <bits/asm/cpucap.h> is presumed because the code below relies on
   __CPUCAP_SH4ALDSP and ___cpucap -- confirm against the project tree. */
#include <bits/asm/cpucap.h>

.global	_memchr
.type	_memchr, @function

/*
** memchr: find the first occurrence of a byte in a memory area.
**
** In:       r4 = start of the area
**           r5 = byte to look for (only the low 8 bits are used)
**           r6 = number of bytes to search
** Out:      r0 = address of the first matching byte, or 0 if there is none
** Clobbers: r0-r3, r5-r7, T; the SH4AL-DSP path also loads RS, RE and RC
**
** Register roles in the body:
**   r1 = read cursor (post-incremented by every load)
**   r2 = search byte replicated into all 4 bytes, for cmp/str
**   r5 = search byte sign-extended, to match what mov.b loads
**   r6 = number of bytes left for the byte-by-byte loop
**   r7 = number of whole longwords to scan
*/
_memchr:
	mov	r4, r1
	exts.b	r5, r5

	/* For small inputs (fewer than 64 bytes), simply check bytes
	   individually; cmp/hi is an unsigned "64 > r6" test */
	mov	#64, r2
	cmp/hi	r6, r2
	bt	.naive

	/* Make a 4-byte version of r5 for cmp/str */
	extu.b	r5, r3
	swap.b	r3, r2
	or	r3, r2
	swap.w	r2, r3
	or	r3, r2

	/* Select the implementation from the runtime CPU capability flags */
	mov.l	.___cpucap, r0
	mov.l	@r0, r0
	tst	#__CPUCAP_SH4ALDSP, r0
	bf	.sh4aldsp

/*
** Fast memchr() method on SH3:
** -> Align to 4 bytes with single-byte reads
** -> Then read 4 bytes at a time, and check for r5 with cmp/str
** -> Use a somewhat tight longword-based loop with dt
*/
.sh3:
	/* First check 3 bytes to ensure we don't skip bytes when aligning */
	mov.b	@r1+, r0
	cmp/eq	r0, r5
	bt	.end

	mov.b	@r1+, r0
	cmp/eq	r0, r5
	bt	.end

	mov.b	@r1+, r0
	cmp/eq	r0, r5
	bt	.end

	/* Align to a 4-byte boundary (round r1 down), then recompute the
	   remaining size as (start + size) - r1. Split it into r7 longwords
	   and r6 trailing bytes. */
	shlr2	r1
	shll2	r1
	add	r4, r6
	sub	r1, r6
	mov	r6, r7
	shlr2	r7
	mov	#3, r3
	and	r3, r6

	/* Read longwords */
1:	mov.l	@r1+, r0
	cmp/str	r0, r2
	bt	.found
	dt	r7
	bf	1b

	/* Finish the last bytes with a naive method */
	bra	.naive
	nop

/*
** Fast memchr() method on SH4AL-DSP:
** -> Align with an unaligned read
** -> Then read 4 bytes at a time, and check for r5 with cmp/str
** -> Use an extremely tight loop with the DSP repeat function
*/
.sh4aldsp:
	/* Check a couple of unaligned bytes first */
	movua.l	@r1+, r0
	cmp/str	r0, r2
	bt	.found

	/* Align to a 4-byte boundary (round r1 down), then recompute the
	   remaining size as (start + size) - r1. Split it into r7 longwords
	   and r6 trailing bytes. */
	shlr2	r1
	shll2	r1
	add	r4, r6
	sub	r1, r6
	mov	r6, r7
	shlr2	r7

	/* Set up the repeat loop: start at 2:, end at 3:, r7 iterations */
	ldrs	2f
	ldre	3f
	ldrc	r7

	mov	#3, r3
	and	r3, r6

	/* Read longwords super efficiently */
2:	mov.l	@r1+, r0
	cmp/str	r0, r2
3:	bt	.found

	/* Finish the last few bytes with the naive method */

.naive:
	/* Don't read if there are no bytes left */
	tst	r6, r6
	bt	.none

	/* Byte-by-byte scan of r6 bytes starting at r1 */
4:	mov.b	@r1+, r0
	cmp/eq	r0, r5
	bt	.end
	dt	r6
	bf	4b

.none:
	rts
	mov	#0, r0		/* delay slot: return value 0, no match */

.found:
	/* Go back to find out which of the last 4 bytes is r5 */
	add	#-4, r1
	bra	4b
	mov	#4, r6		/* delay slot: rescan exactly 4 bytes */

.end:
	/* r1 was post-incremented past the matching byte; return r1 - 1 */
	mov	r1, r0
	rts
	add	#-1, r0		/* delay slot */

	/* Literal pool. The PC-relative mov.l above needs a 4-byte aligned
	   target; enforce it instead of relying on the instruction count. */
	.balign	4
.___cpucap:
	.long	___cpucap

.size	_memchr, .-_memchr