string: use gint's optimized memcpy (DONE)

Lephenixnoir 2021-05-23 16:22:25 +02:00
parent b69e0fd299
commit a354e38ccf
Signed by: Lephenixnoir
GPG Key ID: 1BBA026E13FC0495
4 changed files with 135 additions and 8 deletions

@@ -172,6 +172,7 @@ if(sh-generic IN_LIST TARGET_FOLDERS)
src/libc/setjmp/target/sh-generic/setjmp.S
src/libc/setjmp/target/sh-generic/longjmp.S
src/libc/string/target/sh-generic/memchr.S
src/libc/string/target/sh-generic/memcpy.S
src/libc/string/target/sh-generic/memset.S
src/libc/string/target/sh-generic/strlen.S
src/target/sh-generic/cpucap.c)

@@ -1,17 +1,13 @@
#include <string.h>
#include <stdint.h>
/*
** The memcpy() function copies n bytes from memory area src to memory area dest.
** The memory areas must not overlap. Use memmove(3) if the memory areas do
** overlap.
**
** TODO: use DMA ?
** TODO: use DSP ?
*/
#ifndef __SUPPORT_ARCH_SH
void *memcpy(void *dest, const void *src, size_t count)
{
for (size_t i = 0; i < count; i = i + 1)
((uint8_t *) dest)[i] = ((const uint8_t *) src)[i];
return (dest);
}
#endif /*__SUPPORT_ARCH_SH*/
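
As a side note on the overlap rule stated in the comment above, here is a small, hypothetical usage example (not part of this commit): copying to a distinct buffer is a job for memcpy(), while shifting a buffer within itself overlaps and must go through memmove(3).

#include <stdio.h>
#include <string.h>

int main(void)
{
    char buf[8] = "abcdefg";
    char copy[8];

    /* Disjoint areas: memcpy() is fine. */
    memcpy(copy, buf, sizeof buf);

    /* Overlapping areas (shift the string left by one byte): memcpy() would
       be undefined behaviour here, memmove() handles the overlap. */
    memmove(buf, buf + 1, 7);

    printf("%s %s\n", copy, buf);   /* prints "abcdefg bcdefg" */
    return 0;
}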

@@ -119,5 +119,7 @@ _memchr:
rts
add #-1, r0
.align 4
.___cpucap:
.long ___cpucap

@@ -0,0 +1,128 @@
#include <bits/asm/cpucap.h>
.global _memcpy
.text
_memcpy:
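/* SH calling convention: r4 = dest, r5 = src, r6 = byte count; the return
   value (the original dest, saved in r3 below) goes in r0 */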
tst r6, r6
bt .zero
mov r4, r3
mov #3, r2
/* When copying 64 bytes or fewer, use the naive byte-by-byte method */
mov #64, r0
cmp/ge r6, r0
bt _naive_memcpy
_memcpy_align_dst:
/* 4-align the destination */
mov.b @r5+, r0
mov.b r0, @r4
add #1, r4
tst r2, r4
bf/s _memcpy_align_dst
dt r6
/* If source is 4-aligned, use mov.l */
tst r2, r5
bt/s .aligned4_32
mov #4, r2
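/* (r2 = 4 is executed in the delay slot whether or not the branch is taken;
   it is the exit threshold of the 4-byte copy loops below) */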
/* If the source is unaligned but the CPU supports movua.l (SH4 class), use it */
mov.l .___cpucap, r0
mov.l @r0, r0
tst #__CPUCAP_SH4ALDSP, r0
bf .unaligned4
/* If source is 2-aligned, use mov.w */
mov r5, r0
tst #1, r0
bt .aligned2
/* Otherwise use a naive copy */
bra _naive_memcpy
nop
.aligned4_32:
mov #36, r2
/* Copy 32 bytes at a time until at most 36 bytes are left, so that the
   4-byte loop below can always run its first iteration */
mov.l @r5+, r0
mov.l @r5+, r1
mov.l @r5+, r7
mov.l r0, @r4
mov.l r1, @(4,r4)
mov.l r7, @(8,r4)
mov.l @r5+, r0
mov.l @r5+, r1
mov.l @r5+, r7
mov.l r0, @(12,r4)
mov.l r1, @(16,r4)
mov.l r7, @(20,r4)
mov.l @r5+, r0
mov.l @r5+, r1
add #-32, r6
mov.l r0, @(24,r4)
mov.l r1, @(28,r4)
cmp/ge r6, r2
bf/s .aligned4_32
add #32, r4
.aligned4_4:
mov #4, r2
/* Copy 4 bytes at a time until at most 4 bytes are left */
mov.l @r5+, r0
mov.l r0, @r4
add #-4, r6
cmp/ge r6, r2
bf/s .aligned4_4
add #4, r4
bra _naive_memcpy
nop
.unaligned4:
/* Copy 4 bytes but read with movua.l since source is unaligned */
movua.l @r5+, r0
mov.l r0, @r4
add #-4, r6
cmp/ge r6, r2
bf/s .unaligned4
add #4, r4
bra _naive_memcpy
nop
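/* Source is 2-aligned: copy two 16-bit words per iteration */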
.aligned2:
mov.w @r5+, r0
mov.w r0, @r4
mov.w @r5+, r0
mov.w r0, @(2,r4)
add #-4, r6
cmp/ge r6, r2
bf/s .aligned2
add #4, r4
bra _naive_memcpy
nop
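/* Byte-by-byte copy of the remaining bytes (r6 >= 1 here), then return the
   saved dest */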
_naive_memcpy:
mov.b @r5+, r0
dt r6
mov.b r0, @r4
bf/s _naive_memcpy
add #1, r4
rts
mov r3, r0
.zero:
rts
mov r4, r0
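/* Literal pool: address of the ___cpucap capability word, loaded
   PC-relative by the "mov.l .___cpucap, r0" above */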
.align 4
.___cpucap:
.long ___cpucap
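
For readers less comfortable with SuperH assembly, the dispatch logic of the routine above roughly corresponds to the following C sketch. This is an illustration only, not the actual implementation: the cpu_has_movua() predicate is hypothetical and stands in for the ___cpucap / __CPUCAP_SH4ALDSP test, the aligned path is shown without the 32-byte unrolling, and the word-sized copies gloss over strict-aliasing concerns.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical predicate standing in for the ___cpucap / __CPUCAP_SH4ALDSP
   test performed in the assembly version. */
extern int cpu_has_movua(void);

void *memcpy_sketch(void *dest, const void *src, size_t n)
{
    uint8_t *d = dest;
    const uint8_t *s = src;

    if (n == 0)
        return dest;

    /* Small copies are not worth the setup cost: plain byte loop. */
    if (n <= 64)
        goto tail;

    /* 4-align the destination. */
    while ((uintptr_t)d & 3) {
        *d++ = *s++;
        n--;
    }

    if (((uintptr_t)s & 3) == 0) {
        /* Both sides 4-aligned: copy 32-bit words (the assembly additionally
           unrolls this into 32-byte blocks while more than 36 bytes remain). */
        for (; n > 4; n -= 4, d += 4, s += 4)
            *(uint32_t *)d = *(const uint32_t *)s;
    } else if (cpu_has_movua()) {
        /* movua.l available: unaligned 32-bit loads, aligned stores.
           (No direct C equivalent; shown as four byte moves per word.) */
        for (; n > 4; n -= 4, d += 4, s += 4) {
            d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
        }
    } else if (((uintptr_t)s & 1) == 0) {
        /* 2-aligned source: two 16-bit moves per 4-byte step. */
        for (; n > 4; n -= 4, d += 4, s += 4) {
            *(uint16_t *)d       = *(const uint16_t *)s;
            *(uint16_t *)(d + 2) = *(const uint16_t *)(s + 2);
        }
    }

tail:
    /* Byte-copy whatever is left (everything, on the small and fully
       unaligned paths). */
    while (n--)
        *d++ = *s++;
    return dest;
}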