diff --git a/CMakeLists.txt b/CMakeLists.txt
index a24ad0a..79b511a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,6 +172,7 @@ if(sh-generic IN_LIST TARGET_FOLDERS)
     src/libc/setjmp/target/sh-generic/setjmp.S
     src/libc/setjmp/target/sh-generic/longjmp.S
     src/libc/string/target/sh-generic/memchr.S
+    src/libc/string/target/sh-generic/memcpy.S
     src/libc/string/target/sh-generic/memset.S
     src/libc/string/target/sh-generic/strlen.S
     src/target/sh-generic/cpucap.c)
diff --git a/src/libc/string/memcpy.c b/src/libc/string/memcpy.c
index d7236b7..83ddade 100644
--- a/src/libc/string/memcpy.c
+++ b/src/libc/string/memcpy.c
@@ -1,17 +1,13 @@
 #include
 #include
 
-/*
-** The memcpy() function copies n bytes from memory area src to memory area dest.
-** The memory areas must not overlap. Use memmove(3) if the memory areas do
-** overlap.
-**
-** TODO: use DMA ?
-** TODO: use DSP ?
-*/
+#ifndef __SUPPORT_ARCH_SH
+
 void *memcpy(void *dest, const void *src, size_t count)
 {
 	for (size_t i = 0; i < count; i = i + 1)
 		((uint8_t *) dest)[i] = ((uint8_t *) src)[i];
 	return (dest);
 }
+
+#endif /*__SUPPORT_ARCH_SH*/
diff --git a/src/libc/string/target/sh-generic/memchr.S b/src/libc/string/target/sh-generic/memchr.S
index 483079c..c19ea3a 100644
--- a/src/libc/string/target/sh-generic/memchr.S
+++ b/src/libc/string/target/sh-generic/memchr.S
@@ -119,5 +119,7 @@ _memchr:
 	rts
 	add #-1, r0
 
+.align 4
+
 .___cpucap:
 	.long ___cpucap
diff --git a/src/libc/string/target/sh-generic/memcpy.S b/src/libc/string/target/sh-generic/memcpy.S
new file mode 100644
index 0000000..62dfe76
--- /dev/null
+++ b/src/libc/string/target/sh-generic/memcpy.S
@@ -0,0 +1,128 @@
+#include
+
+.global _memcpy
+.text
+
+_memcpy:
+	tst r6, r6
+	bt .zero
+
+	mov r4, r3
+	mov #3, r2
+
+	/* When copying less than 64 bytes, use the naive method */
+	mov #64, r0
+	cmp/ge r6, r0
+	bt _naive_memcpy
+
+_memcpy_align_dst:
+	/* 4-align the destination */
+	mov.b @r5+, r0
+	mov.b r0, @r4
+	add #1, r4
+	tst r2, r4
+	bf/s _memcpy_align_dst
+	dt r6
+
+	/* If source is 4-aligned, use mov.l */
+	tst r2, r5
+	bt/s .aligned4_32
+	mov #4, r2
+
+	/* If unaligned but SH4, use movua.l */
+	mov.l .___cpucap, r0
+	mov.l @r0, r0
+	tst #__CPUCAP_SH4ALDSP, r0
+	bf .unaligned4
+
+	/* If source is 2-aligned, use mov.w */
+	mov r5, r0
+	tst #1, r0
+	bt .aligned2
+
+	/* Otherwise use a naive copy */
+	bra _naive_memcpy
+	nop
+
+.aligned4_32:
+	mov #36, r2
+
+	/* Copy 32 bytes at a time until at most 32 bytes are left */
+	mov.l @r5+, r0
+	mov.l @r5+, r1
+	mov.l @r5+, r7
+	mov.l r0, @r4
+	mov.l r1, @(4,r4)
+	mov.l r7, @(8,r4)
+	mov.l @r5+, r0
+	mov.l @r5+, r1
+	mov.l @r5+, r7
+	mov.l r0, @(12,r4)
+	mov.l r1, @(16,r4)
+	mov.l r7, @(20,r4)
+	mov.l @r5+, r0
+	mov.l @r5+, r1
+	add #-32, r6
+	mov.l r0, @(24,r4)
+	mov.l r1, @(28,r4)
+	cmp/ge r6, r2
+	bf/s .aligned4_32
+	add #32, r4
+
+.aligned4_4:
+	mov #4, r2
+
+	/* Copy 4 bytes at a time until at most 4 bytes are left */
+	mov.l @r5+, r0
+	mov.l r0, @r4
+	add #-4, r6
+	cmp/ge r6, r2
+	bf/s .aligned4_4
+	add #4, r4
+
+	bra _naive_memcpy
+	nop
+
+.unaligned4:
+	/* Copy 4 bytes but read with movua.l since source is unaligned */
+	movua.l @r5+, r0
+	mov.l r0, @r4
+	add #-4, r6
+	cmp/ge r6, r2
+	bf/s .unaligned4
+	add #4, r4
+
+	bra _naive_memcpy
+	nop
+
+.aligned2:
+	mov.w @r5+, r0
+	mov.w r0, @r4
+	mov.w @r5+, r0
+	mov.w r0, @(2,r4)
+	add #-4, r6
+	cmp/ge r6, r2
+	bf/s .aligned2
+	add #4, r4
+
+	bra _naive_memcpy
+	nop
+
+_naive_memcpy:
+	mov.b @r5+, r0
+	dt r6
+	mov.b r0, @r4
+	bf/s _naive_memcpy
+	add #1, r4
+
+	rts
+	mov r3, r0
+
+.zero:
+	rts
+	mov r4, r0
+
+.align 4
+
+.___cpucap:
+	.long ___cpucap
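
For reference, the C below sketches the dispatch strategy that memcpy.S implements: return early on a zero count, copy small requests byte by byte, otherwise byte-copy until the destination is 4-aligned, then move 32-byte blocks and 32-bit words when the source is 4-aligned too, and finish with a byte-copy tail. This is an illustration only, not part of the patch: the function name memcpy_sh_sketch is made up, the block-size thresholds are simplified, and the movua.l (SH4AL-DSP unaligned load) and mov.w (2-aligned source) paths are approximated by the final byte loop, since portable C has no unaligned 32-bit load.

#include <stddef.h>
#include <stdint.h>

void *memcpy_sh_sketch(void *dest, const void *src, size_t count)
{
	uint8_t *d = dest;
	const uint8_t *s = src;

	/* Small copies: the alignment setup below costs more than it saves. */
	if (count <= 64)
		goto tail;

	/* Copy bytes until the destination is 4-aligned. */
	while ((uintptr_t)d & 3) {
		*d++ = *s++;
		count--;
	}

	if (((uintptr_t)s & 3) == 0) {
		/* Both pointers are now 4-aligned: move 32-byte blocks,
		 * then single 32-bit words (mov.l in the assembly). */
		while (count >= 32) {
			for (int i = 0; i < 8; i++) {
				*(uint32_t *)d = *(const uint32_t *)s;
				d += 4;
				s += 4;
			}
			count -= 32;
		}
		while (count >= 4) {
			*(uint32_t *)d = *(const uint32_t *)s;
			d += 4;
			s += 4;
			count -= 4;
		}
	}
	/* else: the assembly reads ___cpucap and, on SH4AL-DSP, keeps 4-byte
	 * stores by loading with movua.l, or falls back to mov.w when the
	 * source is only 2-aligned; in this sketch the remaining bytes simply
	 * fall through to the byte loop below. */

tail:
	/* Remaining bytes, one at a time (_naive_memcpy in the assembly). */
	while (count--)
		*d++ = *s++;
	return dest;
}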