From d6f606fa5c2c7b8da019910cc452d76ad2ac769b Mon Sep 17 00:00:00 2001 From: Lephenixnoir Date: Sun, 23 May 2021 14:30:35 +0200 Subject: [PATCH] string: add and test an optimized memchr (DONE) This version works on both SH3 and SH4. --- CMakeLists.txt | 3 +- STATUS | 19 +++++- src/libc/string/target/sh-generic/memchr.S | 68 ++++++++++++++++++++++ 3 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 src/libc/string/target/sh-generic/memchr.S diff --git a/CMakeLists.txt b/CMakeLists.txt index b7ad657..60f3334 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -167,7 +167,8 @@ endif() if(sh-generic IN_LIST TARGET_FOLDERS) list(APPEND SOURCES src/libc/setjmp/target/sh-generic/setjmp.S - src/libc/setjmp/target/sh-generic/longjmp.S) + src/libc/setjmp/target/sh-generic/longjmp.S + src/libc/string/target/sh-generic/memchr.S) endif() if(casiowin-fx IN_LIST TARGET_FOLDERS) diff --git a/STATUS b/STATUS index e106ba4..7a7ebd5 100644 --- a/STATUS +++ b/STATUS @@ -14,6 +14,14 @@ taken from the C99 standard (ISO/IEC 9899:1999), section 7 ("Library"). address of the function can be taken; don't rely on the macro being defined, as the user can remove it except in some special cases +String functions (mainly in <string.h>) can use 4-byte accesses, and in doing +so read up to 3 bytes after the end of the string if it is not padded (which +malloc'd strings and literal strings both are, leaving only stack-allocated and +statically-allocated ones). This allows important speed optimizations. The +extra access cannot trigger memory protection because there is no valid memory +less than 4 bytes before the end of any protection region. The extra access +might trigger the UBC in very specific scenarios, but we don't really care. 
+ # Status In this file, every definition is classified in one of several implementation @@ -102,7 +110,7 @@ DONE: Function/symbol/macro is defined, builds, links, and is tested 7.21 7.21.2.1 memcpy: DONE - 7.21.2.2 memmove: DONE + 7.21.2.2 memmove: DONE (Unoptimized: byte-by-byte) ! 7.21.2.3 strcpy: TODO ! 7.21.2.4 strncpy: TODO ! 7.21.3.1 strcat: TODO @@ -112,7 +120,7 @@ DONE: Function/symbol/macro is defined, builds, links, and is tested ! 7.21.4.3 strcoll: TODO ! 7.21.4.4 strncmp: TODO ! 7.21.4.5 strxfrm: TODO -! 7.21.5.1 memchr: TODO + 7.21.5.1 memchr: DONE ! 7.21.5.2 strchr: TODO ! 7.21.5.3 strcspn: TODO ! 7.21.5.4 strpbrk: TODO @@ -123,6 +131,13 @@ DONE: Function/symbol/macro is defined, builds, links, and is tested 7.21.6.1 memset: DONE ! 7.21.6.2 strerror: TODO ! 7.21.6.3 strlen: TODO + Extensions: + - strnlen: TODO + - strchrnul: TODO + - strcasecmp: TODO + - strncasecmp: TODO + - strdup: TODO + - strndup: TODO 7.22 => GCC diff --git a/src/libc/string/target/sh-generic/memchr.S b/src/libc/string/target/sh-generic/memchr.S new file mode 100644 index 0000000..e9556a7 --- /dev/null +++ b/src/libc/string/target/sh-generic/memchr.S @@ -0,0 +1,68 @@ +.global _memchr +.type _memchr, @function + /* _memchr: search the r6 bytes starting at address r4 for the byte held in the low 8 bits of r5. On return r0 is the address of the first matching byte, or 0 when no byte matches. r0 to r3 and r7 are used as scratch, r5 and r6 may be modified, r4 is left untouched. Presumably this is the C memchr(s, c, n) with s, c, n arriving in r4, r5, r6 - confirm against the target ABI. Inputs shorter than 64 bytes are scanned one byte at a time. Longer inputs are checked byte-wise for the first 3 bytes, then one aligned longword at a time with cmp/str, then byte-wise for the 0 to 3 trailing bytes. */ +_memchr: + mov r4, r0 /* r0 is the read cursor */ + exts.b r5, r5 /* sign-extend the target byte so it compares equal to bytes loaded by mov.b, which are sign-extended as well */ + + /* For small inputs, simply check bytes individually */ + mov #64, r2 + cmp/hi r6, r2 /* unsigned test: T is set when 64 is greater than r6 */ + bt .last /* fewer than 64 bytes: go straight to the byte loop with r6 as the count */ + +.large: /* Make a 4-byte version of r5 for cmp/str */ + extu.b r5, r3 /* r3 = target byte, zero-extended */ + swap.b r3, r2 + or r3, r2 /* low 16 bits of r2 = target byte twice, upper 16 bits still zero */ + swap.w r2, r3 + or r3, r2 /* r2 = target byte repeated in all 4 bytes */ + + /* First check 3 bytes to ensure we don't skip bytes when aligning */ + mov.b @r0+, r1 + cmp/eq r1, r5 + bt .end /* match: .end steps r0 back over the post-increment */ + mov.b @r0+, r1 + cmp/eq r1, r5 + bt .end + mov.b @r0+, r1 + cmp/eq r1, r5 + bt .end + + /* Align to a 4-byte boundary */ + shlr2 r0 + shll2 r0 /* clears the low two bits of r0. As r0 was advanced by 3 first, the result is the start address rounded up to a multiple of 4, and every byte before it was checked above */ + add r4, r6 + sub r0, r6 /* r6 = start address plus count minus r0, that is the bytes left from r0 to the end of the input */ + + mov r6, r7 + shlr2 r7 /* r7 = number of whole longwords left, nonzero because the count is at least 64 on this path */ + mov #3, r3 + and r3, r6 /* r6 = trailing bytes left after those longwords, 0 to 3 */ + + /* Read longwords */ +1: mov.l @r0+, r1 + cmp/str r1, r2 /* T is set when any byte of r1 equals the byte at the same position in r2 */ + bt .found + dt r7 /* decrement r7, T is set when it reaches zero */ + bf 1b + +.last: /* Don't read if there are no bytes left */ + tst r6, r6 + bt .none + +2: 
mov.b @r0+, r1 + cmp/eq r1, r5 + bt .end + dt r6 /* one byte consumed, T is set when none remain */ + bf 2b + +.none: rts + mov #0, r0 /* delay slot of rts: the return value is 0 */ + +.found: /* Go back to find out which of the last 4 bytes is r5 */ + add #-4, r0 + bra 2b + mov #4, r6 /* delay slot of bra: limit the byte loop to the 4 bytes of that longword */ + +.end: rts + add #-1, r0 /* delay slot of rts: step back over the post-increment so r0 points at the matching byte */