/* fxlibc/src/libc/string/target/sh-generic/memcmp.S
   SuperH (SH3/SH4) assembly — note: the "ArmAsm" tag in the file
   listing is a mislabel; this is SH, not ARM. */
#include <bits/asm/cpucap.h>
.global _memcmp
.text
/* int memcmp(void const *s1, void const *s2, size_t n)
   SuperH calling convention: r4 = s1, r5 = s2, r6 = n; result in r0.
   Uses only caller-saved registers (r0-r3 and the argument regs). */
_memcmp:
/* n == 0: nothing to compare, return 0 through .zero */
tst r6, r6
bt .zero
/* When comparing less than 64 bytes, use the naive method */
mov #64, r0
cmp/ge r6, r0 /* T = (64 >= n), signed compare */
bt _naive_memcmp
/* Loop constants kept live across the word loops:
   r2 = 4 (word size / exit threshold), r3 = 3 (alignment mask) */
mov #4, r2
mov #3, r3
_memcmp_align_rhs:
/* 4-align the right-hand side (r5) by comparing single bytes.
   On a mismatch, branch to .end; the dt in the delay slot still
   executes, keeping r6 (remaining count) consistent. */
mov.b @r4+, r0
mov.b @r5+, r1
cmp/eq r0, r1
bf/s .end
dt r6 /* delay slot: one byte consumed */
tst r3, r5 /* T = ((r5 & 3) == 0): rhs now 4-aligned? */
bf _memcmp_align_rhs
/* If left-hand side is 4-aligned, use mov.l */
tst r3, r4
bt .aligned4
/* If lhs is unaligned but the CPU has the unaligned-load capability
   (movua.l, SH4A family per __CPUCAP_SH4ALDSP), use movua.l */
mov.l .___cpucap, r0
mov.l @r0, r0
tst #__CPUCAP_SH4ALDSP, r0
bf .unaligned4 /* T clear = capability bit set */
/* If left-hand side is 2-aligned, use mov.w and mov.l */
mov r4, r0
tst #1, r0
bt .aligned2
/* Otherwise use a naive byte-by-byte comparison */
bra _naive_memcmp
nop
.aligned4:
/* Both pointers 4-aligned: compare 4 bytes at a time until at most
   4 bytes remain, then finish with the naive loop. On a word
   mismatch, _fail rewinds and re-scans the word bytewise. */
mov.l @r4+, r0
mov.l @r5+, r1
cmp/eq r0, r1
bf/s _fail
add #-4, r6 /* delay slot: account for the word just consumed */
cmp/ge r6, r2 /* T = (4 >= remaining) */
bf .aligned4
bra _naive_memcmp
nop
.unaligned4:
/* rhs (r5) is 4-aligned but lhs (r4) is NOT: read the lhs with the
   unaligned-load instruction movua.l, 4 bytes at a time, until at
   most 4 bytes are left. (The original comment had the sides
   backwards.) */
movua.l @r4+, r0
mov.l @r5+, r1
cmp/eq r0, r1
bf/s _fail
add #-4, r6 /* delay slot: account for the word just consumed */
cmp/ge r6, r2 /* T = (4 >= remaining) */
bf .unaligned4
bra _naive_memcmp
nop
.aligned2:
/* rhs (r5) is 4-aligned, lhs (r4) is 2-aligned: rebuild the lhs word
   from two 16-bit reads (big-endian: first word is the high half).
   Bug fixes vs. the previous version:
   1. The low half was loaded into r2, clobbering the constant 4 that
      the exit test "cmp/ge r6, r2" relies on — with a negative
      (sign-extended) word the loop could run past both buffers. The
      unused argument register r7 is used as the temporary instead.
   2. mov.w sign-extends, so the low half must be zero-extended with
      extu.w before being OR-ed into the shifted high half; otherwise
      set sign bits corrupt the comparison and force spurious _fail
      fallbacks. */
mov.w @r4+, r0
mov.l @r5+, r1
mov.w @r4+, r7
shll16 r0 /* high half in place; shifts out its sign-extension */
extu.w r7, r7 /* zero-extend the low half */
or r7, r0
cmp/eq r0, r1
bf/s _fail
add #-4, r6 /* delay slot: account for the word just consumed */
cmp/ge r6, r2 /* T = (4 >= remaining); r2 still holds 4 */
bf .aligned2
bra _naive_memcmp
nop
_fail:
/* A 4-byte word differed: rewind both pointers and the remaining
   count by 4, then locate the differing byte with the naive loop. */
add #-4, r4
add #-4, r5
add #4, r6
_naive_memcmp:
/* Byte-by-byte comparison. Precondition: r6 >= 1 (r6 == 0 is
   filtered at entry, and the word loops always leave >= 1 byte). */
mov.b @r4+, r0
mov.b @r5+, r1
cmp/eq r0, r1
bf/s .end
dt r6 /* delay slot: decrement executes whether or not we branch */
bf _naive_memcmp /* T from dt: loop while bytes remain */
.end:
/* mov.b sign-extends, so zero-extend both bytes before subtracting
   to return the conventional unsigned-byte difference. */
extu.b r0, r0
extu.b r1, r1
rts
sub r1, r0 /* delay slot: r0 = *lhs - *rhs */
.zero:
rts
mov #0, r0 /* delay slot: n == 0 -> equal -> 0 */
/* Literal pool: address of the global CPU-capability word consulted
   by the dispatch code above (PC-relative mov.l requires 4-byte
   alignment). */
.align 4
.___cpucap:
.long ___cpucap