From 9d1187b5b48aceaad754812485a9637b3401c940 Mon Sep 17 00:00:00 2001
From: Lephe
Date: Sat, 4 Jul 2020 15:05:28 +0200
Subject: [PATCH] string: optimized memcpy, memcmp, memset; decent memmove

This change adds optimized versions of the core memory functions,
relying on 4-alignment, 2-alignment, and the SH4's unaligned move
instruction to (hopefully) attain good performance in all situations.
---
 TODO                      |   3 -
 include/gint/std/string.h |   6 ++
 src/std/memcmp.s          | 114 +++++++++++++++++++++++++++++++++++++
 src/std/memcpy.s          |  98 +++++++++++++++++++++++++++++++
 src/std/memmove.s         |  60 +++++++++++++++++++
 src/std/memory.c          | 117 --------------------------------------
 src/std/memset.s          |  54 ++++++++++++++++++
 7 files changed, 332 insertions(+), 120 deletions(-)
 create mode 100644 src/std/memcmp.s
 create mode 100644 src/std/memcpy.s
 create mode 100644 src/std/memmove.s
 delete mode 100644 src/std/memory.c
 create mode 100644 src/std/memset.s

diff --git a/TODO b/TODO
index 9adb9b0..95da967 100644
--- a/TODO
+++ b/TODO
@@ -1,11 +1,8 @@
 For the 2.1.0 release:
-* core: the four basic memory functions (with automated tests)
 * bopti: remove the deprecated image_t definition
 * project: remove the compat branch
-* core: remove the boot log
 
 Issues:
-* #8 support fx-CG Manager
 * #10 support fx-CG 20
 
 Extensions on existing code:
diff --git a/include/gint/std/string.h b/include/gint/std/string.h
index f16f90c..49c1b51 100644
--- a/include/gint/std/string.h
+++ b/include/gint/std/string.h
@@ -13,6 +13,12 @@ void *memcpy(void * restrict dest, void const * restrict src, size_t n);
 /* memset(): Fill a chunk of memory with a single byte */
 void *memset(void *dest, int byte, size_t n);
 
+/* memcmp(): Compare two chunks of memory */
+int memcmp(void const *s1, void const *s2, size_t n);
+
+/* memmove(): Copy a chunk of memory to a possibly overlapping destination */
+void *memmove(void *dest, void const *src, size_t n);
+
 /* strlen(): Length of a NUL-terminated string */
 size_t strlen(char const *str);
 
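
A note on the semantics being added: memcmp() compares bytes as unsigned
char and returns the difference at the first mismatch, and memmove() must
copy correctly even when the two regions overlap. As a reference point for
the assembly below, a naive C model of both functions (illustrative only,
not part of the patch; the ref_* names are hypothetical) looks like this:

    #include <stddef.h>
    #include <stdint.h>

    int ref_memcmp(void const *s1, void const *s2, size_t n)
    {
        uint8_t const *p1 = s1, *p2 = s2;
        for(size_t i = 0; i < n; i++)
        {
            if(p1[i] != p2[i]) return p1[i] - p2[i];
        }
        return 0;
    }

    void *ref_memmove(void *dest, void const *src, size_t n)
    {
        uint8_t *d = dest;
        uint8_t const *s = src;

        /* Copy forwards when the destination starts first, backwards
           otherwise, so the unread part of src is never overwritten */
        if((uintptr_t)d <= (uintptr_t)s)
        {
            for(size_t i = 0; i < n; i++) d[i] = s[i];
        }
        else
        {
            while(n--) d[n] = s[n];
        }
        return dest;
    }

memmove.s below implements exactly this direction rule, after first
branching to the plain memcpy() when the regions cannot overlap.
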
diff --git a/src/std/memcmp.s b/src/std/memcmp.s
new file mode 100644
index 0000000..2e595ef
--- /dev/null
+++ b/src/std/memcmp.s
@@ -0,0 +1,114 @@
+.global _memcmp
+.text
+
+_memcmp:
+	tst	r6, r6
+	bt	.zero
+
+	/* When comparing 64 bytes or less, use the naive method */
+	mov	#64, r0
+	cmp/ge	r6, r0
+	bt	_naive_memcmp
+
+	mov	#4, r2
+	mov	#3, r3
+
+_memcmp_align_rhs:
+	/* 4-align the right-hand side */
+	mov.b	@r4+, r0
+	mov.b	@r5+, r1
+	cmp/eq	r0, r1
+	bf/s	.end
+	dt	r6
+	tst	r3, r5
+	bf	_memcmp_align_rhs
+
+	/* If left-hand side is 4-aligned, use mov.l */
+	tst	r3, r4
+	bt	.aligned4
+
+	/* If unaligned but SH4, use movua.l */
+	mov.l	.gint, r0
+	mov.l	@r0, r0
+	tst	#1, r0
+	bt	.unaligned4
+
+	/* If left-hand side is 2-aligned, use mov.w and mov.l */
+	mov	r4, r0
+	tst	#1, r0
+	bt	.aligned2
+
+	/* Otherwise use a naive comparison */
+	bra	_naive_memcmp
+	nop
+
+.aligned4:
+	/* Compare 4 bytes at a time until at most 4 bytes are left */
+	mov.l	@r4+, r0
+	mov.l	@r5+, r1
+	cmp/eq	r0, r1
+	bf/s	_fail
+	add	#-4, r6
+	cmp/ge	r6, r2
+	bf	.aligned4
+
+	bra	_naive_memcmp
+	nop
+
+.unaligned4:
+	/* Compare 4 bytes at a time until at most 4 bytes are left. Since
+	   the left-hand side is unaligned, read it with movua.l */
+	movua.l	@r4+, r0
+	mov.l	@r5+, r1
+	cmp/eq	r0, r1
+	bf/s	_fail
+	add	#-4, r6
+	cmp/ge	r6, r2
+	bf	.unaligned4
+
+	bra	_naive_memcmp
+	nop
+
+.aligned2:
+	/* Read 4 bytes from r4 in two steps (r7 as temp; r2 still holds 4) */
+	mov.w	@r4+, r0
+	mov.l	@r5+, r1
+	mov.w	@r4+, r7
+	shll16	r0
+	or	r7, r0
+	cmp/eq	r0, r1
+	bf/s	_fail
+	add	#-4, r6
+	cmp/ge	r6, r2
+	bf	.aligned2
+
+	bra	_naive_memcmp
+	nop
+
+_fail:
+	/* Rewind 4 bytes to compare manually */
+	add	#-4, r4
+	add	#-4, r5
+	add	#4, r6
+
+_naive_memcmp:
+	mov.b	@r4+, r0
+	mov.b	@r5+, r1
+	cmp/eq	r0, r1
+	bf/s	.end
+	dt	r6
+	bf	_naive_memcmp
+
+.end:
+	extu.b	r0, r0
+	extu.b	r1, r1
+	rts
+	sub	r1, r0
+
+.zero:
+	rts
+	mov	#0, r0
+
+.align 4
+.gint:
+	.long	_gint
diff --git a/src/std/memcpy.s b/src/std/memcpy.s
new file mode 100644
index 0000000..e341a71
--- /dev/null
+++ b/src/std/memcpy.s
@@ -0,0 +1,98 @@
+.global _memcpy
+.text
+
+_memcpy:
+	tst	r6, r6
+	bt	.zero
+
+	mov	r4, r3
+	mov	#3, r2
+
+	/* When copying 64 bytes or less, use the naive method */
+	mov	#64, r0
+	cmp/ge	r6, r0
+	bt	_naive_memcpy
+
+_memcpy_align_dst:
+	/* 4-align the destination */
+	mov.b	@r5+, r0
+	mov.b	r0, @r4
+	add	#1, r4
+	tst	r2, r4
+	bf/s	_memcpy_align_dst
+	dt	r6
+
+	/* If source is 4-aligned, use mov.l */
+	tst	r2, r5
+	bt/s	.aligned4
+	mov	#4, r2
+
+	/* If unaligned but SH4, use movua.l */
+	mov.l	.gint, r0
+	mov.l	@r0, r0
+	tst	#1, r0
+	bt	.unaligned4
+
+	/* If source is 2-aligned, use mov.w */
+	mov	r5, r0
+	tst	#1, r0
+	bt	.aligned2
+
+	/* Otherwise use a naive copy */
+	bra	_naive_memcpy
+	nop
+
+.aligned4:
+	/* Copy 4 bytes at a time until at most 4 bytes are left */
+	mov.l	@r5+, r0
+	mov.l	r0, @r4
+	add	#-4, r6
+	cmp/ge	r6, r2
+	bf/s	.aligned4
+	add	#4, r4
+
+	bra	_naive_memcpy
+	nop
+
+.unaligned4:
+	/* Copy 4 bytes but read with movua.l since source is unaligned */
+	movua.l	@r5+, r0
+	mov.l	r0, @r4
+	add	#-4, r6
+	cmp/ge	r6, r2
+	bf/s	.unaligned4
+	add	#4, r4
+
+	bra	_naive_memcpy
+	nop
+
+.aligned2:
+	mov.w	@r5+, r0
+	mov.w	r0, @r4
+	mov.w	@r5+, r0
+	mov.w	r0, @(2,r4)
+	add	#-4, r6
+	cmp/ge	r6, r2
+	bf/s	.aligned2
+	add	#4, r4
+
+	bra	_naive_memcpy
+	nop
+
+_naive_memcpy:
+	mov.b	@r5+, r0
+	dt	r6
+	mov.b	r0, @r4
+	bf/s	_naive_memcpy
+	add	#1, r4
+
+	rts
+	mov	r3, r0
+
+.zero:
+	rts
+	mov	r4, r0
+
+.align 4
+.gint:
+	.long	_gint
diff --git a/src/std/memmove.s b/src/std/memmove.s
new file mode 100644
index 0000000..e612541
--- /dev/null
+++ b/src/std/memmove.s
@@ -0,0 +1,60 @@
+.global _memmove
+.text
+
+_memmove:
+	tst	r6, r6
+	bt	.zero
+
+	/* Simple optimization: if regions do not overlap, use memcpy() */
+	mov	r4, r0
+	add	r6, r0
+	cmp/ge	r0, r5
+	bt	_memmove_memcpy
+	mov	r5, r0
+	add	r6, r0
+	cmp/ge	r0, r4
+	bt	_memmove_memcpy
+
+	mov	r4, r3
+
+	/* If the destination starts before the source, copy forwards */
+	cmp/ge	r4, r5
+	bf	.backwards
+
+.forwards:
+	mov.b	@r5+, r0
+	mov.b	r0, @r4
+	dt	r6
+	bf/s	.forwards
+	add	#1, r4
+
+	rts
+	mov	r3, r0
+
+.backwards:
+	/* Otherwise, copy backwards */
+	add	r6, r4
+	add	r6, r5
+
+.backwards_loop:
+	add	#-1, r5
+	mov.b	@r5, r0
+	dt	r6
+	bf/s	.backwards_loop
+	mov.b	r0, @-r4
+
+	rts
+	mov	r3, r0
+
+_memmove_memcpy:
+	mov.l	.memcpy, r1
+	jmp	@r1
+	nop
+
+.zero:
+	rts
+	mov	r4, r0
+
+.align 4
+.memcpy:
+	.long	_memcpy
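
All three optimized routines above share the same dispatch structure:
align one operand, then pick the widest access the other operand's
alignment allows, with movua.l as the SH4-only escape hatch (selected by
the tst #1 on the first word of _gint). In C terms the strategy reads as
follows; this is an illustrative sketch, not part of the patch, and
is_sh4() is a hypothetical stand-in for that _gint test:

    #include <stddef.h>
    #include <stdint.h>

    extern int is_sh4(void);  /* stand-in for the _gint hardware test */

    void *sketch_memcpy(void *dest, void const *src, size_t n)
    {
        uint8_t *d = dest;
        uint8_t const *s = src;

        if(n > 64)
        {
            /* 4-align the destination first */
            while((uintptr_t)d & 3) *d++ = *s++, n--;

            if(((uintptr_t)s & 3) == 0)
            {
                /* Source 4-aligned: longword copies (mov.l) */
                for(; n > 4; n -= 4, d += 4, s += 4)
                    *(uint32_t *)d = *(uint32_t const *)s;
            }
            else if(is_sh4())
            {
                /* Unaligned longword reads (movua.l), modeled here as
                   four byte reads packed in big-endian order */
                for(; n > 4; n -= 4, d += 4, s += 4)
                    *(uint32_t *)d = (uint32_t)s[0] << 24
                                   | (uint32_t)s[1] << 16
                                   | (uint32_t)s[2] << 8
                                   | (uint32_t)s[3];
            }
            else if(((uintptr_t)s & 1) == 0)
            {
                /* Source 2-aligned: pairs of word copies (mov.w) */
                for(; n > 4; n -= 4, d += 4, s += 4)
                {
                    *(uint16_t *)d = *(uint16_t const *)s;
                    *(uint16_t *)(d + 2) = *(uint16_t const *)(s + 2);
                }
            }
        }

        /* Tail of 1 to 4 bytes, or the whole buffer when n <= 64 */
        while(n--) *d++ = *s++;
        return dest;
    }

memcmp.s follows the same ladder, except that it aligns the right-hand
side and rewinds by 4 bytes on a mismatch so the differing byte can be
found again by the naive loop.
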
diff --git a/src/std/memory.c b/src/std/memory.c
deleted file mode 100644
index be2c2de..0000000
--- a/src/std/memory.c
+++ /dev/null
@@ -1,117 +0,0 @@
-#include <gint/defs/attributes.h>
-#include <gint/defs/types.h>
-#include <gint/std/string.h>
-#include <gint/hardware.h>
-
-static void memcpy4(uint32_t * restrict d, const void * restrict src,
-	size_t n)
-{
-	int modulo = (uintptr_t)src & 3;
-
-	/* Best case: perform 32-bit accesses only */
-	if(!modulo)
-	{
-		const uint32_t *s = src;
-		for(; n; n-=4) *d++ = *s++;
-	}
-
-#if 0
-	/* Here's where SH-3 and SH-4A start working differently. SH-4A has a
-	   2-cycle 'movua' instruction to perform unaligned reads */
-	else if(isSH4())
-	{
-		uint32_t longword;
-		const uint32_t *s = src;
-
-		while(n--)
-		{
-			__asm__(
-				"movua.l %1, %0"
-				: "=z"(longword)
-				: "m>"(*s)
-			);
-			s++;
-			*d++ = longword;
-		}
-	}
-#endif
-
-	/* On SH-3, we can only hope that there is 2-alignment */
-	else if(!(modulo & 1))
-	{
-		const uint16_t *s = src;
-		uint16_t * restrict dst = (void *)d;
-
-		for(; n; n-=2)
-		{
-			*dst++ = *s++;
-			*dst++ = *s++;
-		}
-	}
-
-	/* Or just perform the raw copy */
-	else
-	{
-		const uint8_t *s = src;
-		uint8_t * restrict dst = (void *)d;
-
-		while(n--) *dst++ = *s++;
-	}
-}
-
-void *memcpy(void * restrict dst, const void * restrict src, size_t n)
-{
-	uint8_t *d = dst;
-	const uint8_t *s = src;
-
-	/* Small areas: don't bother with complex methods */
-	if(n < 32)
-	{
-		while(n--) *d++ = *s++;
-		return dst;
-	}
-
-	/* Find a longword offset to perform word or longword operations */
-	while((uintptr_t)d & 3) *d++ = *s++, n--;
-
-	/* Perform the big, efficient copy */
-	memcpy4((void *)d, s, n & ~3);
-
-	size_t m = n & 3;
-	d += (n - m);
-	s += (n - m);
-	n = m;
-
-	/* Copy around the last bytes */
-	while(n--) *d++ = *s++;
-	return dst;
-}
-
-void *_memmove(GUNUSED void *dst, GUNUSED const void *src, GUNUSED size_t n)
-{
-	// (same as memcpy, but heed for direction if areas overlap)
-
-	// copy by increasing addresses if dst < src
-	// copy by decreasing addresses if dst > src
-	return dst;
-}
-
-int memcmp(GUNUSED const void *s1, GUNUSED const void *s2, GUNUSED size_t n)
-{
-	uint8_t const *p1 = s1;
-	uint8_t const *p2 = s2;
-
-	for(size_t i = 0; i < n; i++)
-	{
-		if(p1[i] != p2[i]) return (p1[i] - p2[i]);
-	}
-
-	return 0;
-}
-
-void *memset(void *s, int byte, size_t n)
-{
-	/* TODO: Do it efficiently */
-	char *dst = s;
-	while(n--) *dst++ = byte;
-	return s;
-}
diff --git a/src/std/memset.s b/src/std/memset.s
new file mode 100644
index 0000000..183c64d
--- /dev/null
+++ b/src/std/memset.s
@@ -0,0 +1,54 @@
+.global _memset
+.text
+
+_memset:
+	tst	r6, r6
+	bt	.zero
+
+	/* We'll fill from the end */
+	mov	r4, r3
+	add	r6, r4
+
+	/* When setting 64 bytes or less, use the naive method */
+	mov	#64, r0
+	cmp/ge	r6, r0
+	bt	_naive_memset
+
+	mov	#3, r2
+
+	/* Make a 4-byte filler in r0; keep the byte in r5 for the tail */
+	mov	r5, r0
+	shll8	r0
+	or	r5, r0
+	mov	r0, r1
+	shll16	r1
+	or	r1, r0
+
+_memset_align:
+	/* 4-align the destination */
+	mov.b	r0, @-r4
+	tst	r2, r4
+	bf/s	_memset_align
+	dt	r6
+
+	mov	#8, r2
+
+.aligned4:
+	mov.l	r0, @-r4
+	cmp/ge	r6, r2
+	bf/s	.aligned4
+	add	#-4, r6
+
+_naive_memset:
+	/* Tight loop, one byte per iteration */
+	dt	r6
+	bf/s	_naive_memset
+	mov.b	r5, @-r4
+
+.end:
+	rts
+	mov	r3, r0
+
+.zero:
+	rts
+	mov	r4, r0
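
The filler construction at the top of memset.s is the usual byte
replication trick: build b | b<<8, then OR that value with itself shifted
left by 16, and every byte of the longword equals b, ready for the mov.l
store loop. The same computation in C (illustrative sketch, hypothetical
name):

    #include <stdint.h>

    /* Replicate the low byte of 'byte' into all four bytes of a
       longword, as memset.s does with shll8/shll16 and or */
    static inline uint32_t fill_pattern(int byte)
    {
        uint32_t x = (uint8_t)byte;  /* keep the low 8 bits only */
        x |= x << 8;                 /* 0x000000bb -> 0x0000bbbb */
        x |= x << 16;                /* 0x0000bbbb -> 0xbbbbbbbb */
        return x;
    }

Note that the assembly builds the pattern in r0 while leaving r5 intact,
because the byte-sized tail loop (_naive_memset) still needs the original
byte after the longword loop finishes.
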