diff --git a/CMakeLists.txt b/CMakeLists.txt
index f58f023..bf94e53 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,10 +67,6 @@ set(SOURCES_COMMON
     src/spu/spu.c
     src/std/aprint.c
     src/std/malloc.c
-    src/std/memcmp.s
-    src/std/memcpy.s
-    src/std/memmove.s
-    src/std/memset.s
     src/std/print.c
     src/std/string.c
     src/std/string-ext.c
diff --git a/src/std/memcmp.s b/src/std/memcmp.s
deleted file mode 100644
index 2e595ef..0000000
--- a/src/std/memcmp.s
+++ /dev/null
@@ -1,114 +0,0 @@
-.global _memcmp
-.text
-
-_memcmp:
-    tst r6, r6
-    bt .zero
-
-    /* When comparing less than 64 bytes, use the naive method */
-    mov #64, r0
-    cmp/ge r6, r0
-    bt _naive_memcmp
-
-    mov #4, r2
-    mov #3, r3
-
-_memcmp_align_rhs:
-    /* 4-align the right-hand side */
-    mov.b @r4+, r0
-    mov.b @r5+, r1
-    cmp/eq r0, r1
-    bf/s .end
-    dt r6
-    tst r3, r5
-    bf _memcmp_align_rhs
-
-    /* If left-hand side is 4-aligned, use mov.l */
-    tst r3, r4
-    bt .aligned4
-
-    /* If unaligned but SH4, use movua.l */
-    mov.l .gint, r0
-    mov.l @r0, r0
-    tst #1, r0
-    bt .unaligned4
-
-    /* If left-hand side is 2-aligned, use mov.w and mov.l */
-    mov r4, r0
-    tst #1, r0
-    bt .aligned2
-
-    /* Otherwise use a naive comparison */
-    bra _naive_memcmp
-    nop
-
-.aligned4:
-    /* Compare 4 bytes at a time until at most 4 bytes are left */
-    mov.l @r4+, r0
-    mov.l @r5+, r1
-    cmp/eq r0, r1
-    bf/s _fail
-    add #-4, r6
-    cmp/ge r6, r2
-    bf .aligned4
-
-    bra _naive_memcmp
-    nop
-
-.unaligned4:
-    /* Compare 4 bytes at a time until at most 4 bytes are left. Since
-       left-hand side is aligned, use movua.l */
-    movua.l @r4+, r0
-    mov.l @r5+, r1
-    cmp/eq r0, r1
-    bf/s _fail
-    add #-4, r6
-    cmp/ge r6, r2
-    bf .unaligned4
-
-    bra _naive_memcmp
-    nop
-
-.aligned2:
-    /* Read 4 bytes from r4 in two steps */
-    mov.w @r4+, r0
-    mov.l @r5+, r1
-    mov.w @r4+, r2
-    shll16 r0
-    or r2, r0
-    cmp/eq r0, r1
-    bf/s _fail
-    add #-4, r6
-    cmp/ge r6, r2
-    bf .aligned2
-
-    bra _naive_memcmp
-    nop
-
-_fail:
-    /* Rewind 4 bytes to compare manually */
-    add #-4, r4
-    add #-4, r5
-    add #4, r6
-
-_naive_memcmp:
-    mov.b @r4+, r0
-    mov.b @r5+, r1
-    cmp/eq r0, r1
-    bf/s .end
-    dt r6
-    bf _naive_memcmp
-
-.end:
-    extu.b r0, r0
-    extu.b r1, r1
-    rts
-    sub r1, r0
-
-.zero:
-    rts
-    mov #0, r0
-
-.align 4
-.gint:
-    .long _gint
diff --git a/src/std/memcpy.s b/src/std/memcpy.s
deleted file mode 100644
index c706ac4..0000000
--- a/src/std/memcpy.s
+++ /dev/null
@@ -1,125 +0,0 @@
-.global _memcpy
-.text
-
-_memcpy:
-    tst r6, r6
-    bt .zero
-
-    mov r4, r3
-    mov #3, r2
-
-    /* When copying less than 64 bytes, use the naive method */
-    mov #64, r0
-    cmp/ge r6, r0
-    bt _naive_memcpy
-
-_memcpy_align_dst:
-    /* 4-align the destination */
-    mov.b @r5+, r0
-    mov.b r0, @r4
-    add #1, r4
-    tst r2, r4
-    bf/s _memcpy_align_dst
-    dt r6
-
-    /* If source is 4-aligned, use mov.l */
-    tst r2, r5
-    bt/s .aligned4_32
-    mov #4, r2
-
-    /* If unaligned but SH4, use movua.l */
-    mov.l .gint, r0
-    mov.l @r0, r0
-    tst #1, r0
-    bt .unaligned4
-
-    /* If source is 2-aligned, use mov.w */
-    mov r5, r0
-    tst #1, r0
-    bt .aligned2
-
-    /* Otherwise use a naive copy */
-    bra _naive_memcpy
-    nop
-
-.aligned4_32:
-    mov #36, r2
-
-    /* Copy 32 bytes at a time until at most 32 bytes are left */
-    mov.l @r5+, r0
-    mov.l @r5+, r1
-    mov.l @r5+, r7
-    mov.l r0, @r4
-    mov.l r1, @(4,r4)
-    mov.l r7, @(8,r4)
-    mov.l @r5+, r0
-    mov.l @r5+, r1
-    mov.l @r5+, r7
-    mov.l r0, @(12,r4)
-    mov.l r1, @(16,r4)
-    mov.l r7, @(20,r4)
-    mov.l @r5+, r0
-    mov.l @r5+, r1
-    add #-32, r6
-    mov.l r0, @(24,r4)
-    mov.l r1, @(28,r4)
-    cmp/ge r6, r2
-    bf/s .aligned4_32
-    add #32, r4
-
-.aligned4_4:
-    mov #4, r2
-
-    /* Copy 4 bytes at a time until at most 4 bytes are left */
-    mov.l @r5+, r0
-    mov.l r0, @r4
-    add #-4, r6
-    cmp/ge r6, r2
-    bf/s .aligned4_4
-    add #4, r4
-
-    bra _naive_memcpy
-    nop
-
-.unaligned4:
-    /* Copy 4 bytes but read with movua.l since source is unaligned */
-    movua.l @r5+, r0
-    mov.l r0, @r4
-    add #-4, r6
-    cmp/ge r6, r2
-    bf/s .unaligned4
-    add #4, r4
-
-    bra _naive_memcpy
-    nop
-
-.aligned2:
-    mov.w @r5+, r0
-    mov.w r0, @r4
-    mov.w @r5+, r0
-    mov.w r0, @(2,r4)
-    add #-4, r6
-    cmp/ge r6, r2
-    bf/s .aligned2
-    add #4, r4
-
-    bra _naive_memcpy
-    nop
-
-_naive_memcpy:
-    mov.b @r5+, r0
-    dt r6
-    mov.b r0, @r4
-    bf/s _naive_memcpy
-    add #1, r4
-
-    rts
-    mov r3, r0
-
-.zero:
-    rts
-    mov r4, r0
-
-.align 4
-.gint:
-    .long _gint
diff --git a/src/std/memmove.s b/src/std/memmove.s
deleted file mode 100644
index e612541..0000000
--- a/src/std/memmove.s
+++ /dev/null
@@ -1,60 +0,0 @@
-.global _memmove
-.text
-
-_memmove:
-    tst r6, r6
-    bt .zero
-
-    /* Simple optimization: if regions do not overlap, use memcpy() */
-    mov r4, r0
-    add r6, r0
-    cmp/ge r0, r5
-    bt _memmove_memcpy
-    mov r5, r0
-    add r6, r0
-    cmp/ge r0, r4
-    bt _memmove_memcpy
-
-    mov r4, r3
-
-    cmp/ge r4, r5
-    bf .backwards
-
-.forwards:
-    /* If the destination starts before the source, copy forwards */
-    mov.b @r5+, r0
-    mov.b r0, @r4
-    dt r6
-    bf/s .forwards
-    add #1, r4
-
-    rts
-    mov r3, r0
-
-.backwards:
-    /* Otherwise, copy backwards */
-    add r6, r4
-    add r6, r5
-
-.backwards_loop:
-    add #-1, r5
-    mov.b @r5, r0
-    dt r6
-    bf/s .backwards_loop
-    mov.b r0, @-r4
-
-    rts
-    mov r3, r0
-
-_memmove_memcpy:
-    mov.l .memcpy, r1
-    jmp @r1
-    nop
-
-.zero:
-    rts
-    mov r4, r0
-
-.align 4
-.memcpy:
-    .long _memcpy
diff --git a/src/std/memset.s b/src/std/memset.s
deleted file mode 100644
index 4aa6a8a..0000000
--- a/src/std/memset.s
+++ /dev/null
@@ -1,70 +0,0 @@
-.global _memset
-.text
-
-_memset:
-    tst r6, r6
-    bt .zero
-
-    /* We'll fill from the end */
-    mov r4, r3
-    add r6, r4
-
-    /* When setting less than 64 bytes, use the naive method */
-    mov #64, r0
-    cmp/ge r6, r0
-    bt _naive_memset
-
-    mov #3, r2
-
-    /* Make a 4-byte filler */
-    mov r5, r0
-    shll8 r5
-    or r5, r0
-    mov r0, r5
-    shll16 r5
-    or r5, r0
-
-_memset_align:
-    /* 4-align the destination */
-    mov.b r0, @-r4
-    tst r2, r4
-    bf/s _memset_align
-    dt r6
-
-    mov #40, r2
-
-.aligned4_32:
-    add #-32, r4
-    add #-32, r6
-    mov.l r0, @(28,r4)
-    mov.l r0, @(24,r4)
-    mov.l r0, @(20,r4)
-    mov.l r0, @(16,r4)
-    mov.l r0, @(12,r4)
-    mov.l r0, @(8,r4)
-    mov.l r0, @(4,r4)
-    cmp/ge r6, r2
-    bf/s .aligned4_32
-    mov.l r0, @r4
-
-    mov #8, r2
-
-.aligned4_4:
-    mov.l r0, @-r4
-    cmp/ge r6, r2
-    bf/s .aligned4_4
-    add #-4, r6
-
-_naive_memset:
-    /* Tight loop copy one byte */
-    dt r6
-    bf/s _naive_memset
-    mov.b r5, @-r4
-
-.end:
-    rts
-    mov r3, r0
-
-.zero:
-    rts
-    mov r4, r0
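
Note (not part of the patch above): the deleted files are hand-written SuperH implementations of four <string.h> routines. Each one falls back to a byte-at-a-time loop for small or misaligned buffers, and otherwise 4-aligns the destination before moving words (with movua.l covering misaligned sources on SH4). The sketch below is only an illustration of that strategy in portable C, under the assumption that the symbols are now provided by another library; it is not code from this repository, and memcpy_sketch is a hypothetical name.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch of the strategy used by the deleted memcpy.s:
   byte-copy until the destination is 4-aligned, move 4-byte words while
   the source alignment allows it, then finish with the naive byte loop.
   Assumption: not part of this repository. */
void *memcpy_sketch(void *dst, const void *src, size_t n)
{
    uint8_t *d = dst;
    const uint8_t *s = src;

    if(n > 64) {
        /* 4-align the destination (at most 3 bytes) */
        while((uintptr_t)d & 3) {
            *d++ = *s++;
            n--;
        }
        /* Word copies when the source is also 4-aligned; the assembly
           additionally handled misaligned sources with movua.l on SH4 */
        if(((uintptr_t)s & 3) == 0) {
            while(n >= 4) {
                *(uint32_t *)(void *)d = *(const uint32_t *)(const void *)s;
                d += 4;
                s += 4;
                n -= 4;
            }
        }
    }
    /* Naive tail copy */
    while(n--) *d++ = *s++;
    return dst;
}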