diff --git a/newlib/ChangeLog b/newlib/ChangeLog index 0bfad1c01..c389318f9 100644 --- a/newlib/ChangeLog +++ b/newlib/ChangeLog @@ -1,3 +1,10 @@ +2013-04-12 Will Newton + + * libc/machine/arm/memcpy-stub.c: Use generic memcpy if unaligned + access is not enabled. + * libc/machine/arm/memcpy.S: Faster memcpy implementation for + Cortex A15 cores using NEON and VFP if available. + 2013-04-12 Bin Cheng * acconfig.h (_WCHAR_ORIENT): Undef diff --git a/newlib/libc/machine/arm/memcpy-stub.c b/newlib/libc/machine/arm/memcpy-stub.c index 536b869cc..513631a9f 100644 --- a/newlib/libc/machine/arm/memcpy-stub.c +++ b/newlib/libc/machine/arm/memcpy-stub.c @@ -29,7 +29,7 @@ /* The sole purpose of this file is to include the plain memcpy provided in newlib. An optimized version of memcpy is provided in the assembly file memcpy.S in this directory. */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \ - (!(defined (__ARM_ARCH_7A__)))) + (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)))) #include "../../string/memcpy.c" diff --git a/newlib/libc/machine/arm/memcpy.S b/newlib/libc/machine/arm/memcpy.S index e408ed0e0..bc54bb3f5 100644 --- a/newlib/libc/machine/arm/memcpy.S +++ b/newlib/libc/machine/arm/memcpy.S @@ -1,423 +1,625 @@ -/* - * Copyright (c) 2011 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* Copyright (c) 2013, Linaro Limited + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Linaro Limited nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This memcpy routine is optimised for Cortex-A15 cores and takes advantage + of VFP or NEON when built with the appropriate flags. + + Assumptions: + + ARMv6 (ARMv7-a if using Neon) + ARM state + Unaligned accesses + LDRD/STRD support unaligned word accesses + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \ - (!(defined (__ARM_ARCH_7A__)))) + (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)))) /* Do nothing here. See memcpy-stub.c in the same directory. */ #else - /* Prototype: void *memcpy (void *dst, const void *src, size_t count). */ - /* Use the version of memcpy implemented using LDRD and STRD. - This version is tuned for Cortex-A15. - This might not be the best for other ARMv7-A CPUs, - but there is no predefine to distinguish between - different CPUs in the same architecture, - and this version is better than the plain memcpy provided in newlib. + .syntax unified + /* This implementation requires ARM state. */ + .arm - Therefore, we use this version for all ARMv7-A CPUS. */ +#ifdef __ARM_NEON__ - /* To make the same code compile for both ARM and Thumb instruction - sets, switch to unified syntax at the beginning of this function. - However, by using the same code, we may be missing optimization - opportunities. For instance, in LDRD/STRD instructions, the first - destination register must be even and the second consecutive in - ARM state, but not in Thumb state. */ + .fpu neon + .arch armv7-a +# define FRAME_SIZE 4 +# define USE_VFP +# define USE_NEON - .syntax unified +#elif !defined (__SOFTFP__) + + .arch armv6 + .fpu vfpv2 +# define FRAME_SIZE 32 +# define USE_VFP + +#else + .arch armv6 +# define FRAME_SIZE 32 -#if defined (__thumb__) - .thumb - .thumb_func #endif - .global memcpy - .type memcpy, %function -memcpy: +/* Old versions of GAS incorrectly implement the NEON align semantics. */ +#ifdef BROKEN_ASM_NEON_ALIGN +#define ALIGN(addr, align) addr,:align +#else +#define ALIGN(addr, align) addr:align +#endif - /* Assumes that n >= 0, and dst, src are valid pointers. - If there is at least 8 bytes to copy, use LDRD/STRD. - If src and dst are misaligned with different offsets, - first copy byte by byte until dst is aligned, - and then copy using LDRD/STRD and shift if needed. - When less than 8 left, copy a word and then byte by byte. */ +#define PC_OFFSET 8 /* PC pipeline compensation. */ +#define INSN_SIZE 4 - /* Save registers (r0 holds the return value): - optimized push {r0, r4, r5, lr}. - To try and improve performance, stack layout changed, - i.e., not keeping the stack looking like users expect - (highest numbered register at highest address). */ - push {r0, lr} - strd r4, r5, [sp, #-8]! +/* Call parameters. 
*/ +#define dstin r0 +#define src r1 +#define count r2 - /* TODO: Add debug frame directives. - We don't need exception unwind directives, because the code below - does not throw any exceptions and does not call any other functions. - Generally, newlib functions like this lack debug information for - assembler source. */ +/* Locals. */ +#define tmp1 r3 +#define dst ip +#define tmp2 r10 - /* Get copying of tiny blocks out of the way first. */ - /* Is there at least 4 bytes to copy? */ - subs r2, r2, #4 - blt copy_less_than_4 /* If n < 4. */ +#ifndef USE_NEON +/* For bulk copies using GP registers. */ +#define A_l r2 /* Call-clobbered. */ +#define A_h r3 /* Call-clobbered. */ +#define B_l r4 +#define B_h r5 +#define C_l r6 +#define C_h r7 +#define D_l r8 +#define D_h r9 +#endif - /* Check word alignment. */ - ands ip, r0, #3 /* ip = last 2 bits of dst. */ - bne dst_not_word_aligned /* If dst is not word-aligned. */ +/* Number of lines ahead to pre-fetch data. If you change this the code + below will need adjustment to compensate. */ - /* Get here if dst is word-aligned. */ - ands ip, r1, #3 /* ip = last 2 bits of src. */ - bne src_not_word_aligned /* If src is not word-aligned. */ -word_aligned: - /* Get here if source and dst both are word-aligned. - The number of bytes remaining to copy is r2+4. */ +#define prefetch_lines 5 - /* Is there is at least 64 bytes to copy? */ - subs r2, r2, #60 - blt copy_less_than_64 /* If r2 + 4 < 64. */ +#ifdef USE_VFP + .macro cpy_line_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm - /* First, align the destination buffer to 8-bytes, - to make sure double loads and stores don't cross cache line boundary, - as they are then more expensive even if the data is in the cache - (require two load/store issue cycles instead of one). - If only one of the buffers is not 8-bytes aligned, - then it's more important to align dst than src, - because there is more penalty for stores - than loads that cross cacheline boundary. - This check and realignment are only worth doing - if there is a lot to copy. */ + .macro cpy_tail_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm +#endif - /* Get here if dst is word aligned, - i.e., the 2 least significant bits are 0. - If dst is not 2w aligned (i.e., the 3rd bit is not set in dst), - then copy 1 word (4 bytes). */ - ands r3, r0, #4 - beq 11f /* If dst already two-word aligned. */ - ldr r3, [r1], #4 - str r3, [r0], #4 - subs r2, r2, #4 - blt copy_less_than_64 + .macro def_fn f p2align=0 + .text + .p2align \p2align + .global \f + .type \f, %function +\f: + .endm -11: - /* TODO: Align to cacheline (useful for PLD optimization). 
*/ +def_fn memcpy p2align=6 + + mov dst, dstin /* Preserve dstin, we need to return it. */ + cmp count, #64 + bge .Lcpy_not_short + /* Deal with small copies quickly by dropping straight into the + exit block. */ + +.Ltail63unaligned: +#ifdef USE_NEON + and tmp1, count, #0x38 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + vld1.8 {d0}, [src]! /* 14 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 12 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 10 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 8 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 6 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 4 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 2 words to go. */ + vst1.8 {d0}, [dst]! + + tst count, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 +#else + /* Copy up to 15 full words of data. May not be aligned. */ + /* Cannot use VFP for unaligned data. */ + and tmp1, count, #0x3c + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) + /* Jump directly into the sequence below at the correct offset. */ + add pc, pc, tmp1, lsl #1 + + ldr tmp1, [src, #-60] /* 15 words to go. */ + str tmp1, [dst, #-60] + + ldr tmp1, [src, #-56] /* 14 words to go. */ + str tmp1, [dst, #-56] + ldr tmp1, [src, #-52] + str tmp1, [dst, #-52] + + ldr tmp1, [src, #-48] /* 12 words to go. */ + str tmp1, [dst, #-48] + ldr tmp1, [src, #-44] + str tmp1, [dst, #-44] + + ldr tmp1, [src, #-40] /* 10 words to go. */ + str tmp1, [dst, #-40] + ldr tmp1, [src, #-36] + str tmp1, [dst, #-36] + + ldr tmp1, [src, #-32] /* 8 words to go. */ + str tmp1, [dst, #-32] + ldr tmp1, [src, #-28] + str tmp1, [dst, #-28] + + ldr tmp1, [src, #-24] /* 6 words to go. */ + str tmp1, [dst, #-24] + ldr tmp1, [src, #-20] + str tmp1, [dst, #-20] + + ldr tmp1, [src, #-16] /* 4 words to go. */ + str tmp1, [dst, #-16] + ldr tmp1, [src, #-12] + str tmp1, [dst, #-12] + + ldr tmp1, [src, #-8] /* 2 words to go. */ + str tmp1, [dst, #-8] + ldr tmp1, [src, #-4] + str tmp1, [dst, #-4] +#endif + + lsls count, count, #31 + ldrhcs tmp1, [src], #2 + ldrbne src, [src] /* Src is dead, use as a scratch. */ + strhcs tmp1, [dst], #2 + strbne src, [dst] + bx lr + +.Lcpy_not_short: + /* At least 64 bytes to copy, but don't know the alignment yet. */ + str tmp2, [sp, #-FRAME_SIZE]! + and tmp2, src, #3 + and tmp1, dst, #3 + cmp tmp1, tmp2 + bne .Lcpy_notaligned + +#ifdef USE_VFP + /* Magic dust alert! Force VFP on Cortex-A9. Experiments show + that the FP pipeline is much better at streaming loads and + stores. This is outside the critical loop. */ + vmov.f32 s0, s0 +#endif + + /* SRC and DST have the same mutual 32-bit alignment, but we may + still need to pre-copy some bytes to get to natural alignment. + We bring DST into full 64-bit alignment. */ + lsls tmp2, dst, #29 + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src], #1 + strhcs tmp1, [dst], #2 + strbne tmp2, [dst], #1 - /* Every loop iteration copies 64 bytes. */ 1: - .irp offset, #0, #8, #16, #24, #32, #40, #48, #56 - ldrd r4, r5, [r1, \offset] - strd r4, r5, [r0, \offset] - .endr + subs tmp2, count, #64 /* Use tmp2 for count. */ + blt .Ltail63aligned - add r0, r0, #64 - add r1, r1, #64 - subs r2, r2, #64 - bge 1b /* If there is more to copy. 
*/ + cmp tmp2, #512 + bge .Lcpy_body_long -copy_less_than_64: +.Lcpy_body_medium: /* Count in tmp2. */ +#ifdef USE_VFP +1: + vldr d0, [src, #0] + subs tmp2, tmp2, #64 + vldr d1, [src, #8] + vstr d0, [dst, #0] + vldr d0, [src, #16] + vstr d1, [dst, #8] + vldr d1, [src, #24] + vstr d0, [dst, #16] + vldr d0, [src, #32] + vstr d1, [dst, #24] + vldr d1, [src, #40] + vstr d0, [dst, #32] + vldr d0, [src, #48] + vstr d1, [dst, #40] + vldr d1, [src, #56] + vstr d0, [dst, #48] + add src, src, #64 + vstr d1, [dst, #56] + add dst, dst, #64 + bge 1b + tst tmp2, #0x3f + beq .Ldone - /* Get here if less than 64 bytes to copy, -64 <= r2 < 0. - Restore the count if there is more than 7 bytes to copy. */ - adds r2, r2, #56 - blt copy_less_than_8 +.Ltail63aligned: /* Count in tmp2. */ + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + + vldr d0, [src, #-56] /* 14 words to go. */ + vstr d0, [dst, #-56] + vldr d0, [src, #-48] /* 12 words to go. */ + vstr d0, [dst, #-48] + vldr d0, [src, #-40] /* 10 words to go. */ + vstr d0, [dst, #-40] + vldr d0, [src, #-32] /* 8 words to go. */ + vstr d0, [dst, #-32] + vldr d0, [src, #-24] /* 6 words to go. */ + vstr d0, [dst, #-24] + vldr d0, [src, #-16] /* 4 words to go. */ + vstr d0, [dst, #-16] + vldr d0, [src, #-8] /* 2 words to go. */ + vstr d0, [dst, #-8] +#else + sub src, src, #8 + sub dst, dst, #8 +1: + ldrd A_l, A_h, [src, #8] + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #16] + strd A_l, A_h, [dst, #16] + ldrd A_l, A_h, [src, #24] + strd A_l, A_h, [dst, #24] + ldrd A_l, A_h, [src, #32] + strd A_l, A_h, [dst, #32] + ldrd A_l, A_h, [src, #40] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #48] + strd A_l, A_h, [dst, #48] + ldrd A_l, A_h, [src, #56] + strd A_l, A_h, [dst, #56] + ldrd A_l, A_h, [src, #64]! + strd A_l, A_h, [dst, #64]! + subs tmp2, tmp2, #64 + bge 1b + tst tmp2, #0x3f + bne 1f + ldr tmp2,[sp], #FRAME_SIZE + bx lr +1: + add src, src, #8 + add dst, dst, #8 + +.Ltail63aligned: /* Count in tmp2. */ + /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but + we know that the src and dest are 32-bit aligned so we can use + LDRD/STRD to improve efficiency. */ + /* TMP2 is now negative, but we don't care about that. The bottom + six bits still tell us how many bytes are left to copy. */ + + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ + strd A_l, A_h, [dst, #-56] + ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ + strd A_l, A_h, [dst, #-48] + ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ + strd A_l, A_h, [dst, #-40] + ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ + strd A_l, A_h, [dst, #-32] + ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ + strd A_l, A_h, [dst, #-24] + ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ + strd A_l, A_h, [dst, #-16] + ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ + strd A_l, A_h, [dst, #-8] + +#endif + tst tmp2, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 + lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src] + strhcs tmp1, [dst], #2 + strbne tmp2, [dst] + +.Ldone: + ldr tmp2, [sp], #FRAME_SIZE + bx lr + +.Lcpy_body_long: /* Count in tmp2. */ + + /* Long copy. We know that there's at least (prefetch_lines * 64) + bytes to go. */ +#ifdef USE_VFP + /* Don't use PLD. 
Instead, read some data in advance of the current + copy position into a register. This should act like a PLD + operation but we won't have to repeat the transfer. */ + + vldr d3, [src, #0] + vldr d4, [src, #64] + vldr d5, [src, #128] + vldr d6, [src, #192] + vldr d7, [src, #256] + + vldr d0, [src, #8] + vldr d1, [src, #16] + vldr d2, [src, #24] + add src, src, #32 + + subs tmp2, tmp2, #prefetch_lines * 64 * 2 + blt 2f +1: + cpy_line_vfp d3, 0 + cpy_line_vfp d4, 64 + cpy_line_vfp d5, 128 + add dst, dst, #3 * 64 + add src, src, #3 * 64 + cpy_line_vfp d6, 0 + cpy_line_vfp d7, 64 + add dst, dst, #2 * 64 + add src, src, #2 * 64 + subs tmp2, tmp2, #prefetch_lines * 64 + bge 1b - /* Copy 8 bytes at a time. */ 2: - ldrd r4, r5, [r1], #8 - strd r4, r5, [r0], #8 - subs r2, r2, #8 - bge 2b /* If there is more to copy. */ + cpy_tail_vfp d3, 0 + cpy_tail_vfp d4, 64 + cpy_tail_vfp d5, 128 + add src, src, #3 * 64 + add dst, dst, #3 * 64 + cpy_tail_vfp d6, 0 + vstr d7, [dst, #64] + vldr d7, [src, #64] + vstr d0, [dst, #64 + 8] + vldr d0, [src, #64 + 8] + vstr d1, [dst, #64 + 16] + vldr d1, [src, #64 + 16] + vstr d2, [dst, #64 + 24] + vldr d2, [src, #64 + 24] + vstr d7, [dst, #64 + 32] + add src, src, #96 + vstr d0, [dst, #64 + 40] + vstr d1, [dst, #64 + 48] + vstr d2, [dst, #64 + 56] + add dst, dst, #128 + add tmp2, tmp2, #prefetch_lines * 64 + b .Lcpy_body_medium +#else + /* Long copy. Use an SMS style loop to maximize the I/O + bandwidth of the core. We don't have enough spare registers + to synthesise prefetching, so use PLD operations. */ + /* Pre-bias src and dst. */ + sub src, src, #8 + sub dst, dst, #8 + pld [src, #8] + pld [src, #72] + subs tmp2, tmp2, #64 + pld [src, #136] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [sp, #24] + pld [src, #200] + ldrd D_l, D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #232] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldrd D_l, D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldrd D_l, D_h, [src, #32] + bcs 2b + /* Save the remaining bytes and restore the callee-saved regs. */ + strd A_l, A_h, [dst, #40] + add src, src, #40 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + tst tmp2, #0x3f + bne .Ltail63aligned + ldr tmp2, [sp], #FRAME_SIZE + bx lr +#endif -copy_less_than_8: +.Lcpy_notaligned: + pld [src] + pld [src, #64] + /* There's at least 64 bytes to copy, but there is no mutual + alignment. */ + /* Bring DST to 64-bit alignment. */ + lsls tmp2, dst, #29 + pld [src, #(2 * 64)] + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrbne tmp1, [src], #1 + ldrhcs tmp2, [src], #2 + strbne tmp1, [dst], #1 + strhcs tmp2, [dst], #2 +1: + pld [src, #(3 * 64)] + subs count, count, #64 + ldrmi tmp2, [sp], #FRAME_SIZE + bmi .Ltail63unaligned + pld [src, #(4 * 64)] - /* Get here if less than 8 bytes to copy, -8 <= r2 < 0. - Check if there is more to copy. */ - cmn r2, #8 - beq return /* If r2 + 8 == 0. 
*/ +#ifdef USE_NEON + vld1.8 {d0-d3}, [src]! + vld1.8 {d4-d7}, [src]! + subs count, count, #64 + bmi 2f +1: + pld [src, #(4 * 64)] + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! + vld1.8 {d0-d3}, [src]! + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! + vld1.8 {d4-d7}, [src]! + subs count, count, #64 + bpl 1b +2: + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! + ands count, count, #0x3f +#else + /* Use an SMS style loop to maximize the I/O bandwidth. */ + sub src, src, #4 + sub dst, dst, #8 + subs tmp2, count, #64 /* Use tmp2 for count. */ + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [sp, #24] + ldr D_l, [src, #28] + ldr D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #(5 * 64) - (32 - 4)] + strd A_l, A_h, [dst, #40] + ldr A_l, [src, #36] + ldr A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldr B_l, [src, #44] + ldr B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldr C_l, [src, #52] + ldr C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldr D_l, [src, #60] + ldr D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldr D_l, [src, #28] + ldr D_h, [src, #32] + bcs 2b - /* Restore the count if there is more than 3 bytes to copy. */ - adds r2, r2, #4 - blt copy_less_than_4 + /* Save the remaining bytes and restore the callee-saved regs. */ + strd A_l, A_h, [dst, #40] + add src, src, #36 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + ands count, tmp2, #0x3f +#endif + ldr tmp2, [sp], #FRAME_SIZE + bne .Ltail63unaligned + bx lr - /* Copy 4 bytes. */ - ldr r3, [r1], #4 - str r3, [r0], #4 - -copy_less_than_4: - /* Get here if less than 4 bytes to copy, -4 <= r2 < 0. */ - - /* Restore the count, check if there is more to copy. */ - adds r2, r2, #4 - beq return /* If r2 == 0. */ - - /* Get here with r2 is in {1,2,3}={01,10,11}. */ - /* Logical shift left r2, insert 0s, update flags. */ - lsls r2, r2, #31 - - /* Copy byte by byte. - Condition ne means the last bit of r2 is 0. - Condition cs means the second to last bit of r2 is set, - i.e., r2 is 1 or 3. */ - itt ne - ldrbne r3, [r1], #1 - strbne r3, [r0], #1 - - itttt cs - ldrbcs r4, [r1], #1 - ldrbcs r5, [r1] - strbcs r4, [r0], #1 - strbcs r5, [r0] - -return: - /* Restore registers: optimized pop {r0, r4, r5, pc} */ - ldrd r4, r5, [sp], #8 - pop {r0, pc} /* This is the only return point of memcpy. */ - -#ifndef __ARM_FEATURE_UNALIGNED - - /* The following assembly macro implements misaligned copy in software. - Assumes that dst is word aligned, src is at offset "pull" bits from - word, push = 32 - pull, and the number of bytes that remain to copy - is r2 + 4, r2 >= 0. */ - - /* In the code below, r2 is the number of bytes that remain to be - written. The number of bytes read is always larger, because we have - partial words in the shift queue. */ - - .macro miscopy pull push shiftleft shiftright - - /* Align src to the previous word boundary. */ - bic r1, r1, #3 - - /* Initialize the shift queue. */ - ldr r5, [r1], #4 /* Load a word from source. */ - - subs r2, r2, #4 - blt 6f /* Go to misaligned copy of less than 8 bytes. 
*/ - - /* Get here if there is more than 8 bytes to copy. - The number of bytes to copy is r2+8, r2 >= 0. */ - - /* Save registers: push { r6, r7 }. - We need additional registers for LDRD and STRD, because in ARM state - the first destination register must be even and the second - consecutive. */ - strd r6, r7, [sp, #-8]! - - subs r2, r2, #56 - blt 4f /* Go to misaligned copy of less than 64 bytes. */ - -3: - /* Get here if there is more than 64 bytes to copy. - The number of bytes to copy is r2+64, r2 >= 0. */ - - /* Copy 64 bytes in every iteration. - Use a partial word from the shift queue. */ - .irp offset, #0, #8, #16, #24, #32, #40, #48, #56 - mov r6, r5, \shiftleft #\pull - ldrd r4, r5, [r1, \offset] - orr r6, r6, r4, \shiftright #\push - mov r7, r4, \shiftleft #\pull - orr r7, r7, r5, \shiftright #\push - strd r6, r7, [r0, \offset] - .endr - - add r1, r1, #64 - add r0, r0, #64 - subs r2, r2, #64 - bge 3b - -4: - /* Get here if there is less than 64 bytes to copy (-64 <= r2 < 0) - and they are misaligned. */ - - /* Restore the count if there is more than 7 bytes to copy. */ - adds r2, r2, #56 - - /* If less than 8 bytes to copy, - restore registers saved for this loop: optimized poplt { r6, r7 }. */ - itt lt - ldrdlt r6, r7, [sp], #8 - blt 6f /* Go to misaligned copy of less than 8 bytes. */ - -5: - /* Copy 8 bytes at a time. - Use a partial word from the shift queue. */ - mov r6, r5, \shiftleft #\pull - ldrd r4, r5, [r1], #8 - orr r6, r6, r4, \shiftright #\push - mov r7, r4, \shiftleft #\pull - orr r7, r7, r5, \shiftright #\push - strd r6, r7, [r0], #8 - - subs r2, r2, #8 - bge 5b /* If there is more to copy. */ - - /* Restore registers saved for this loop: optimized pop { r6, r7 }. */ - ldrd r6, r7, [sp], #8 - -6: - /* Get here if there less than 8 bytes to copy (-8 <= r2 < 0) - and they are misaligned. */ - - /* Check if there is more to copy. */ - cmn r2, #8 - beq return - - /* Check if there is less than 4 bytes to copy. */ - cmn r2, #4 - - itt lt - /* Restore src offset from word-align. */ - sublt r1, r1, #(\push / 8) - blt copy_less_than_4 - - /* Use a partial word from the shift queue. */ - mov r3, r5, \shiftleft #\pull - /* Load a word from src, but without writeback - (this word is not fully written to dst). */ - ldr r5, [r1] - - /* Restore src offset from word-align. */ - add r1, r1, #(\pull / 8) - - /* Shift bytes to create one dst word and store it. */ - orr r3, r3, r5, \shiftright #\push - str r3, [r0], #4 - - /* Use single byte copying of the remaining bytes. */ - b copy_less_than_4 - - .endm - -#endif /* not __ARM_FEATURE_UNALIGNED */ - -dst_not_word_aligned: - - /* Get here when dst is not aligned and ip has the last 2 bits of dst, - i.e., ip is the offset of dst from word. - The number of bytes that remains to copy is r2 + 4, - i.e., there are at least 4 bytes to copy. - Write a partial word (0 to 3 bytes), such that dst becomes - word-aligned. */ - - /* If dst is at ip bytes offset from a word (with 0 < ip < 4), - then there are (4 - ip) bytes to fill up to align dst to the next - word. */ - rsb ip, ip, #4 /* ip = #4 - ip. */ - cmp ip, #2 - - /* Copy byte by byte with conditionals. */ - itt gt - ldrbgt r3, [r1], #1 - strbgt r3, [r0], #1 - - itt ge - ldrbge r4, [r1], #1 - strbge r4, [r0], #1 - - ldrb lr, [r1], #1 - strb lr, [r0], #1 - - /* Update the count. - ip holds the number of bytes we have just copied. */ - subs r2, r2, ip /* r2 = r2 - ip. */ - blt copy_less_than_4 /* If r2 < ip. */ - - /* Get here if there are more than 4 bytes to copy. 
- Check if src is aligned. If beforehand src and dst were not word - aligned but congruent (same offset), then now they are both - word-aligned, and we can copy the rest efficiently (without - shifting). */ - ands ip, r1, #3 /* ip = last 2 bits of src. */ - beq word_aligned /* If r1 is word-aligned. */ - -src_not_word_aligned: - /* Get here when src is not word-aligned, but dst is word-aligned. - The number of bytes that remains to copy is r2+4. */ - -#ifdef __ARM_FEATURE_UNALIGNED - /* Copy word by word using LDR when alignment can be done in hardware, - i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. */ - subs r2, r2, #60 - blt 8f - -7: - /* Copy 64 bytes in every loop iteration. */ - .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60 - ldr r3, [r1, \offset] - str r3, [r0, \offset] - .endr - - add r0, r0, #64 - add r1, r1, #64 - subs r2, r2, #64 - bge 7b - -8: - /* Get here if less than 64 bytes to copy, -64 <= r2 < 0. - Check if there is more than 3 bytes to copy. */ - adds r2, r2, #60 - blt copy_less_than_4 - -9: - /* Get here if there is less than 64 but at least 4 bytes to copy, - where the number of bytes to copy is r2+4. */ - ldr r3, [r1], #4 - str r3, [r0], #4 - subs r2, r2, #4 - bge 9b - - b copy_less_than_4 - -#else /* not __ARM_FEATURE_UNALIGNED */ - - /* ip has last 2 bits of src, - i.e., ip is the offset of src from word, and ip > 0. - Compute shifts needed to copy from src to dst. */ - cmp ip, #2 - beq miscopy_16_16 /* If ip == 2. */ - bge miscopy_24_8 /* If ip == 3. */ - - /* Get here if ip == 1. */ - - /* Endian independent macros for shifting bytes within registers. */ - -#ifndef __ARMEB__ -miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl -miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl -miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl -#else /* not __ARMEB__ */ -miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr -miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr -miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr -#endif /* not __ARMEB__ */ - -#endif /* not __ARM_FEATURE_UNALIGNED */ + .size memcpy, . - memcpy #endif /* memcpy */
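Notes (not part of the patch):

The .Ltail63unaligned and .Ltail63aligned blocks dispatch into the middle of an
unrolled copy sequence with "add pc, pc, tmp1", where tmp1 is the remaining word
count scaled to an instruction offset and adjusted for the ARM PC read-ahead
(PC_OFFSET) and the size of the add instruction itself (INSN_SIZE).  A rough C
analogue of that dispatch idea, illustrative only and not code from the patch,
is a fall-through switch over the remaining word count:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only: a C rendering of the "add pc, pc, tmp1" dispatch used by
   the tail code.  Exactly count/4 word copies execute by jumping into the
   middle of an unrolled sequence (here, falling through a switch), then the
   last 0-3 bytes are handled the way the lsls/ldrh/ldrb epilogue does.
   memcpy of 4 bytes stands in for the unaligned LDR/STR. */
static void copy_tail63 (unsigned char *dst, const unsigned char *src,
                         size_t count)          /* count < 64 */
{
  size_t words = count >> 2;
  uint32_t w;

  /* Step to the end of the word region so every unrolled copy uses a fixed
     negative offset, as the assembly does with [src, #-60] ... [src, #-4].  */
  src += words * 4;
  dst += words * 4;

  switch (words)                /* all fall-throughs intentional */
    {
    case 15: memcpy (&w, src - 60, 4); memcpy (dst - 60, &w, 4);
    case 14: memcpy (&w, src - 56, 4); memcpy (dst - 56, &w, 4);
    case 13: memcpy (&w, src - 52, 4); memcpy (dst - 52, &w, 4);
    case 12: memcpy (&w, src - 48, 4); memcpy (dst - 48, &w, 4);
    case 11: memcpy (&w, src - 44, 4); memcpy (dst - 44, &w, 4);
    case 10: memcpy (&w, src - 40, 4); memcpy (dst - 40, &w, 4);
    case 9:  memcpy (&w, src - 36, 4); memcpy (dst - 36, &w, 4);
    case 8:  memcpy (&w, src - 32, 4); memcpy (dst - 32, &w, 4);
    case 7:  memcpy (&w, src - 28, 4); memcpy (dst - 28, &w, 4);
    case 6:  memcpy (&w, src - 24, 4); memcpy (dst - 24, &w, 4);
    case 5:  memcpy (&w, src - 20, 4); memcpy (dst - 20, &w, 4);
    case 4:  memcpy (&w, src - 16, 4); memcpy (dst - 16, &w, 4);
    case 3:  memcpy (&w, src - 12, 4); memcpy (dst - 12, &w, 4);
    case 2:  memcpy (&w, src - 8,  4); memcpy (dst - 8,  &w, 4);
    case 1:  memcpy (&w, src - 4,  4); memcpy (dst - 4,  &w, 4);
    case 0:  break;
    }

  /* Final 0-3 bytes: halfword then byte, matching the assembly epilogue.  */
  if (count & 2) { memcpy (dst, src, 2); src += 2; dst += 2; }
  if (count & 1) *dst = *src;
}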
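The non-VFP bulk loops are software-pipelined ("SMS style"): each iteration
stores the data loaded on the previous iteration while issuing the next loads,
so the load and store streams overlap rather than serialise.  A simplified
scalar sketch of that schedule, assuming 8-byte-aligned buffers and a length
that is a non-zero multiple of 16 (the assembly handles the general case with
its alignment preamble and tail code, and keeps four register pairs A-D in
flight to move 64 bytes per iteration), might look like:

#include <stddef.h>
#include <stdint.h>

/* Software-pipelined copy sketch: the store of chunk N is issued in the same
   iteration as the load of chunk N+1, mirroring the assembly's ldrd/strd
   schedule.  Assumes len is a non-zero multiple of 16 and both pointers are
   8-byte aligned.  Illustrative only, not code from the patch. */
static void copy_pipelined (uint64_t *dst, const uint64_t *src, size_t len)
{
  size_t chunks = len / 16;             /* two 64-bit words per chunk */
  uint64_t a = src[0], b = src[1];
  size_t i;

  for (i = 1; i < chunks; i++)
    {
      uint64_t next_a = src[2 * i];     /* load ahead ...           */
      uint64_t next_b = src[2 * i + 1];
      dst[2 * (i - 1)] = a;             /* ... while storing behind */
      dst[2 * (i - 1) + 1] = b;
      a = next_a;
      b = next_b;
    }
  dst[2 * (chunks - 1)] = a;            /* drain the pipeline */
  dst[2 * (chunks - 1) + 1] = b;
}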
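Finally, since the new implementation has several distinct paths (the
sub-64-byte tails, the mutually aligned medium and long copies, and the
unaligned NEON/LDR paths), it is worth exercising memcpy at every
source/destination alignment and around the 64- and 512-byte thresholds.  A
minimal host-side check along these lines (buffer sizes, offsets, and fill
patterns are arbitrary choices, not part of the patch) is:

#include <stdio.h>
#include <string.h>

#define BUF_SIZE 4096

static unsigned char src_buf[BUF_SIZE + 64];
static unsigned char dst_buf[BUF_SIZE + 64];

/* Copy len bytes at the given offsets and verify the result byte-for-byte. */
static int check_copy (size_t src_off, size_t dst_off, size_t len)
{
  unsigned char *s = src_buf + src_off;
  unsigned char *d = dst_buf + dst_off;
  size_t i;

  for (i = 0; i < len; i++)
    s[i] = (unsigned char) (i * 131 + src_off * 7 + dst_off);
  memset (d, 0xAA, len);

  memcpy (d, s, len);

  return memcmp (d, s, len) == 0;
}

int main (void)
{
  /* Sizes straddling the tail (< 64), medium, and long (>= 512) paths.  */
  static const size_t lens[] = { 0, 1, 3, 7, 15, 63, 64, 65, 511, 512, 2048 };
  size_t so, doff, li;
  int failures = 0;

  for (so = 0; so < 8; so++)
    for (doff = 0; doff < 8; doff++)
      for (li = 0; li < sizeof lens / sizeof lens[0]; li++)
        if (!check_copy (so, doff, lens[li]))
          {
            printf ("FAIL: src+%u dst+%u len %u\n",
                    (unsigned) so, (unsigned) doff, (unsigned) lens[li]);
            failures++;
          }

  if (failures)
    printf ("%d failure(s)\n", failures);
  else
    printf ("all copies OK\n");
  return failures != 0;
}

Building the same harness with and without -march=armv7-a (or with
-mno-unaligned-access) toggles which implementation the guard at the top of
memcpy-stub.c and memcpy.S selects, so both the generic and the optimised
routine can be checked with one source file.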