string: optimized memcpy, memcmp, memset; decent memmove

This change adds optimized versions of the core memory functions,
relying on 4-alignment, 2-alignment, and the SH4's unaligned move
instruction to (hopefully) attain good performance in all situations.
Lephe 2020-07-04 15:05:28 +02:00
parent 7d63a1b536
commit 9d1187b5b4
GPG-signed by Lephenixnoir (key ID 1BBA026E13FC0495)
7 changed files with 332 additions and 120 deletions
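
The "unaligned move instruction" mentioned in the commit message is movua.l, which loads a 32-bit word from an address that need not be 4-aligned into r0 (the deleted C implementation further down notes it is a 2-cycle SH-4A instruction). For reference, a C-level wrapper looks roughly like the inline assembly used in that deleted file; this is an illustrative sketch, not part of the commit:

#include <stdint.h>

/* Read a 32-bit word through movua.l; the address does not have to be
   4-aligned at the machine level. Sketch modeled on the #if 0 block of
   the deleted C file below; only meaningful on MPUs that support the
   instruction. */
static inline uint32_t read_unaligned_32(const uint32_t *src)
{
    uint32_t longword;
    __asm__(
        "movua.l %1, %0"
        : "=z"(longword)   /* movua.l always writes r0, hence "z" */
        : "m>"(*src)
    );
    return longword;
}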

TODO

@ -1,11 +1,8 @@
For the 2.1.0 release:
* core: the four basic memory functions (with automated tests)
* bopti: remove the deprecated image_t definition
* project: remove the compat branch
* core: remove the boot log
Issues:
* #8 support fx-CG Manager
* #10 support fx-CG 20
Extensions on existing code:


@ -13,6 +13,12 @@ void *memcpy(void * restrict dest, void const * restrict src, size_t n);
/* memset(): Fill a chunk of memory with a single byte */
void *memset(void *dest, int byte, size_t n);
/* memcmp(): Compare two chunks of memory */
int memcmp(void const *s1, void const *s2, size_t n);
/* memmove(): Copy a chunk of memory to a possibly overlapping destination */
void *memmove(void *dest, void const *src, size_t n);
/* strlen(): Length of a NUL-terminated string */
size_t strlen(char const *str);

src/std/memcmp.s (new file, 114 lines)

@ -0,0 +1,114 @@
.global _memcmp
.text
_memcmp:
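/* SH calling convention: r4 = left operand (s1), r5 = right operand (s2),
   r6 = size; the signed byte difference is returned in r0 */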
tst r6, r6
bt .zero
/* When comparing less than 64 bytes, use the naive method */
mov #64, r0
cmp/ge r6, r0
bt _naive_memcmp
mov #4, r2
mov #3, r3
_memcmp_align_rhs:
/* 4-align the right-hand side */
mov.b @r4+, r0
mov.b @r5+, r1
cmp/eq r0, r1
bf/s .end
dt r6
tst r3, r5
bf _memcmp_align_rhs
/* If left-hand side is 4-aligned, use mov.l */
tst r3, r4
bt .aligned4
/* If unaligned but SH4, use movua.l */
mov.l .gint, r0
mov.l @r0, r0
tst #1, r0
bt .unaligned4
/* If left-hand side is 2-aligned, use mov.w and mov.l */
mov r4, r0
tst #1, r0
bt .aligned2
/* Otherwise use a naive comparison */
bra _naive_memcmp
nop
.aligned4:
/* Compare 4 bytes at a time until at most 4 bytes are left */
mov.l @r4+, r0
mov.l @r5+, r1
cmp/eq r0, r1
bf/s _fail
add #-4, r6
cmp/ge r6, r2
bf .aligned4
bra _naive_memcmp
nop
.unaligned4:
/* Compare 4 bytes at a time until at most 4 bytes are left. Since the
left-hand side is unaligned, read it with movua.l */
movua.l @r4+, r0
mov.l @r5+, r1
cmp/eq r0, r1
bf/s _fail
add #-4, r6
cmp/ge r6, r2
bf .unaligned4
bra _naive_memcmp
nop
.aligned2:
/* Read 4 bytes from r4 in two steps */
mov.w @r4+, r0
mov.l @r5+, r1
mov.w @r4+, r2
shll16 r0
or r2, r0
cmp/eq r0, r1
bf/s _fail
add #-4, r6
cmp/ge r6, r2
bf .aligned2
bra _naive_memcmp
nop
_fail:
/* Rewind 4 bytes to compare manually */
add #-4, r4
add #-4, r5
add #4, r6
_naive_memcmp:
mov.b @r4+, r0
mov.b @r5+, r1
cmp/eq r0, r1
bf/s .end
dt r6
bf _naive_memcmp
.end:
extu.b r0, r0
extu.b r1, r1
rts
sub r1, r0
.zero:
rts
mov #0, r0
.align 4
.gint:
.long _gint
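
For readers less used to SH assembly, the comparison strategy above translates roughly to the C below. This is an illustrative sketch, not part of the commit: the real routine also falls back to a pure byte loop for small sizes, 4-aligns the right-hand side first, and picks mov.l, movua.l or mov.w reads depending on the alignment of the left-hand side and the MPU.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the fast path of _memcmp above (illustration only) */
static int memcmp_sketch(const void *s1, const void *s2, size_t n)
{
    const uint8_t *p1 = s1, *p2 = s2;

    /* Compare 4 bytes at a time while more than 4 bytes remain */
    while(n > 4)
    {
        uint32_t w1, w2;
        __builtin_memcpy(&w1, p1, 4);   /* mov.l or movua.l in the assembly */
        __builtin_memcpy(&w2, p2, 4);
        if(w1 != w2) break;             /* mismatch: finish byte by byte */
        p1 += 4, p2 += 4, n -= 4;
    }

    /* Byte loop: handles the tail, small sizes, and locating the first
       differing byte after a word-level mismatch (the _fail rewind) */
    while(n--)
    {
        if(*p1 != *p2) return *p1 - *p2;
        p1++, p2++;
    }
    return 0;
}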

src/std/memcpy.s (new file, 98 lines)

@ -0,0 +1,98 @@
.global _memcpy
.text
_memcpy:
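/* SH calling convention: r4 = dest, r5 = src, r6 = n; dest is returned in r0 */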
tst r6, r6
bt .zero
mov r4, r3
mov #3, r2
/* When copying less than 64 bytes, use the naive method */
mov #64, r0
cmp/ge r6, r0
bt _naive_memcpy
_memcpy_align_dst:
/* 4-align the destination */
mov.b @r5+, r0
mov.b r0, @r4
add #1, r4
tst r2, r4
bf/s _memcpy_align_dst
dt r6
/* If source is 4-aligned, use mov.l */
tst r2, r5
bt/s .aligned4
mov #4, r2
/* If unaligned but SH4, use movua.l */
mov.l .gint, r0
mov.l @r0, r0
tst #1, r0
bt .unaligned4
/* If source is 2-aligned, use mov.w */
mov r5, r0
tst #1, r0
bt .aligned2
/* Otherwise use a naive copy */
bra _naive_memcpy
nop
.aligned4:
/* Copy 4 bytes at a time until at most 4 bytes are left */
mov.l @r5+, r0
mov.l r0, @r4
add #-4, r6
cmp/ge r6, r2
bf/s .aligned4
add #4, r4
bra _naive_memcpy
nop
.unaligned4:
/* Copy 4 bytes but read with movua.l since source is unaligned */
movua.l @r5+, r0
mov.l r0, @r4
add #-4, r6
cmp/ge r6, r2
bf/s .unaligned4
add #4, r4
bra _naive_memcpy
nop
.aligned2:
mov.w @r5+, r0
mov.w r0, @r4
mov.w @r5+, r0
mov.w r0, @(2,r4)
add #-4, r6
cmp/ge r6, r2
bf/s .aligned2
add #4, r4
bra _naive_memcpy
nop
_naive_memcpy:
mov.b @r5+, r0
dt r6
mov.b r0, @r4
bf/s _naive_memcpy
add #1, r4
rts
mov r3, r0
.zero:
rts
mov r4, r0
.align 4
.gint:
.long _gint
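
The dispatch performed by _memcpy above can be summarized by the following C sketch. It is illustrative only and not part of the commit; is_sh4 is a placeholder parameter standing in for the MPU check that the assembly performs through the _gint symbol.

#include <stddef.h>
#include <stdint.h>

/* Outline of the dispatch in _memcpy above (sketch only) */
static void *memcpy_sketch(void *restrict dst, const void *restrict src,
    size_t n, int is_sh4 /* placeholder for the _gint MPU check */)
{
    uint8_t *d = dst;
    const uint8_t *s = src;

    /* For n < 64 the byte loop at the end does all the work */
    if(n >= 64)
    {
        /* 4-align the destination so every store is aligned */
        while((uintptr_t)d & 3) *d++ = *s++, n--;

        if(((uintptr_t)s & 3) == 0 || is_sh4)
        {
            /* .aligned4 / .unaligned4: 4 bytes per iteration; the two
               paths only differ by mov.l vs movua.l for the read */
            while(n > 4)
            {
                uint32_t w;
                __builtin_memcpy(&w, s, 4);
                __builtin_memcpy(d, &w, 4);
                d += 4, s += 4, n -= 4;
            }
        }
        else if(((uintptr_t)s & 1) == 0)
        {
            /* .aligned2: two 16-bit reads and writes per 4 bytes */
            while(n > 4)
            {
                uint16_t w1, w2;
                __builtin_memcpy(&w1, s, 2);
                __builtin_memcpy(&w2, s + 2, 2);
                __builtin_memcpy(d, &w1, 2);
                __builtin_memcpy(d + 2, &w2, 2);
                d += 4, s += 4, n -= 4;
            }
        }
    }

    /* Byte tail, and the whole copy when no other path applied */
    while(n--) *d++ = *s++;
    return dst;
}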

src/std/memmove.s (new file, 60 lines)

@ -0,0 +1,60 @@
.global _memmove
.text
_memmove:
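/* SH calling convention: r4 = dest, r5 = src, r6 = n; dest is returned in r0 */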
tst r6, r6
bt .zero
/* Simple optimization: if regions do not overlap, use memcpy() */
mov r4, r0
add r6, r0
cmp/ge r0, r5
bt _memmove_memcpy
mov r5, r0
add r6, r0
cmp/ge r0, r4
bt _memmove_memcpy
mov r4, r3
cmp/ge r4, r5
bf .backwards
.forwards:
/* If the destination starts before the source, copy forwards */
mov.b @r5+, r0
mov.b r0, @r4
dt r6
bf/s .forwards
add #1, r4
rts
mov r3, r0
.backwards:
/* Otherwise, copy backwards */
add r6, r4
add r6, r5
.backwards_loop:
add #-1, r5
mov.b @r5, r0
dt r6
bf/s .backwards_loop
mov.b r0, @-r4
rts
mov r3, r0
_memmove_memcpy:
mov.l .memcpy, r1
jmp @r1
nop
.zero:
rts
mov r4, r0
.align 4
.memcpy:
.long _memcpy
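
In C, the overlap handling of _memmove above amounts to the following (a sketch for illustration, not part of the commit):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of _memmove above (illustration only) */
static void *memmove_sketch(void *dst, const void *src, size_t n)
{
    uint8_t *d = dst;
    const uint8_t *s = src;
    uintptr_t da = (uintptr_t)dst, sa = (uintptr_t)src;

    /* Regions do not overlap: defer to the optimized memcpy() */
    if(sa >= da + n || da >= sa + n)
        return memcpy(dst, src, n);

    if(da <= sa)
    {
        /* Destination starts before the source: copy forwards */
        while(n--) *d++ = *s++;
    }
    else
    {
        /* Destination starts inside the source: copy backwards */
        d += n, s += n;
        while(n--) *--d = *--s;
    }
    return dst;
}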


@ -1,117 +0,0 @@
#include <gint/defs/attributes.h>
#include <gint/hardware.h>
#include <stddef.h>
#include <stdint.h>

static void memcpy4(uint32_t * restrict d, const void * restrict src, size_t n)
{
int modulo = (uintptr_t)src & 3;
/* Best case: perform 32-bit accesses only */
if(!modulo)
{
const uint32_t *s = src;
for(; n; n-=4) *d++ = *s++;
}
#if 0
/* Here's where SH-3 and SH-4A start working differently. SH-4A has a
2-cycle 'movua' instruction to perform unaligned reads */
else if(isSH4())
{
uint32_t longword;
const uint32_t *s = src;
while(n--)
{
__asm__(
"movua.l %1, %0"
: "=z"(longword)
: "m>"(*s)
);
s++;
*d++ = longword;
}
}
#endif
/* On SH-3, we can only hope that there is 2-alignment */
else if(!(modulo & 1))
{
const uint16_t *s = src;
uint16_t * restrict dst = (void *)d;
for(; n; n-=2)
{
*dst++ = *s++;
*dst++ = *s++;
}
}
/* Or just perform the raw copy */
else
{
const uint8_t *s = src;
uint8_t * restrict dst = (void *)d;
while(n--) *dst++ = *s++;
}
}

void *memcpy(void * restrict dst, const void * restrict src, size_t n)
{
uint8_t *d = dst;
const uint8_t *s = src;
/* Small areas: don't bother with complex methods */
if(n < 32)
{
while(n--) *d++ = *s++;
return dst;
}
/* Find a longword offset to perform word or longword operations */
while((uintptr_t)d & 3) *d++ = *s++, n--;
/* Perform the big, efficient copy */
memcpy4((void *)d, s, n & ~3);
size_t m = n & 3;
d += (n - m);
s += (n - m);
n = m;
/* Copy around the last bytes */
while(n--) *d++ = *s++;
return dst;
}

void *_memmove(GUNUSED void *dst, GUNUSED const void *src, GUNUSED size_t n)
{
// (same as memcpy, but heed for direction if areas overlap)
// copy by increasing addresses if dst < src
// copy by decreasing addresses if dst > src
return dst;
}

int memcmp(GUNUSED const void *s1, GUNUSED const void *s2, GUNUSED size_t n)
{
uint8_t const *p1 = s1;
uint8_t const *p2 = s2;
for(size_t i = 0; i < n; i++)
{
if(p1[i] != p2[i]) return (p1[i] - p2[i]);
}
return 0;
}

void *memset(void *s, int byte, size_t n)
{
/* TODO: Do it efficiently */
char *dst = s;
while(n--) *dst++ = byte;
return s;
}

src/std/memset.s (new file, 54 lines)

@ -0,0 +1,54 @@
.global _memset
.text
_memset:
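/* SH calling convention: r4 = dest, r5 = fill byte, r6 = n; dest is returned in r0 */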
tst r6, r6
bt .zero
/* We'll fill from the end */
mov r4, r3
add r6, r4
/* When setting less than 64 bytes, use the naive method */
mov #64, r0
cmp/ge r6, r0
bt _naive_memset
mov #3, r2
/* Make a 4-byte filler */
mov r5, r0
shll8 r5
or r5, r0
mov r0, r5
shll16 r5
or r5, r0
_memset_align:
/* 4-align the destination */
mov.b r0, @-r4
tst r2, r4
bf/s _memset_align
dt r6
mov #8, r2
.aligned4:
mov.l r0, @-r4
cmp/ge r6, r2
bf/s .aligned4
add #-4, r6
_naive_memset:
/* Tight loop: write one byte at a time */
dt r6
bf/s _naive_memset
mov.b r5, @-r4
.end:
rts
mov r3, r0
.zero:
rts
mov r4, r0
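
Finally, the fill strategy of _memset above in C form (a sketch for illustration, not part of the commit): the byte is replicated into a 32-bit pattern and the buffer is filled backwards from its end, so the tail handling reuses the same pointer.

#include <stddef.h>
#include <stdint.h>

/* Sketch of _memset above (illustration only) */
static void *memset_sketch(void *dst, int byte, size_t n)
{
    uint8_t *end = (uint8_t *)dst + n;
    uint8_t b = (uint8_t)byte;

    if(n >= 64)
    {
        /* Replicate the byte into a 4-byte filler */
        uint32_t fill = b;
        fill |= fill << 8;
        fill |= fill << 16;

        /* 4-align the end pointer with byte stores */
        while((uintptr_t)end & 3) *--end = b, n--;

        /* Store 4 bytes at a time while enough bytes remain (the assembly
           keeps a slightly larger byte tail, up to 8 bytes) */
        while(n >= 4)
        {
            end -= 4;
            __builtin_memcpy(end, &fill, 4);
            n -= 4;
        }
    }

    /* Byte tail, and the whole job for small sizes */
    while(n--) *--end = b;
    return dst;
}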