diff --git a/newlib/ChangeLog b/newlib/ChangeLog index 0bfad1c01..c389318f9 100644 --- a/newlib/ChangeLog +++ b/newlib/ChangeLog @@ -1,3 +1,10 @@ +2013-04-12 Will Newton + + * libc/machine/arm/memcpy-stub.c: Use generic memcpy if unaligned + access is not enabled. + * libc/machine/arm/memcpy.S: Faster memcpy implementation for + Cortex A15 cores using NEON and VFP if available. + 2013-04-12 Bin Cheng * acconfig.h (_WCHAR_ORIENT): Undef diff --git a/newlib/libc/machine/arm/memcpy-stub.c b/newlib/libc/machine/arm/memcpy-stub.c index 536b869cc..513631a9f 100644 --- a/newlib/libc/machine/arm/memcpy-stub.c +++ b/newlib/libc/machine/arm/memcpy-stub.c @@ -29,7 +29,7 @@ /* The sole purpose of this file is to include the plain memcpy provided in newlib. An optimized version of memcpy is provided in the assembly file memcpy.S in this directory. */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \ - (!(defined (__ARM_ARCH_7A__)))) + (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)))) #include "../../string/memcpy.c" diff --git a/newlib/libc/machine/arm/memcpy.S b/newlib/libc/machine/arm/memcpy.S index e408ed0e0..bc54bb3f5 100644 --- a/newlib/libc/machine/arm/memcpy.S +++ b/newlib/libc/machine/arm/memcpy.S @@ -1,423 +1,625 @@ -/* - * Copyright (c) 2011 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/* Copyright (c) 2013, Linaro Limited + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Linaro Limited nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This memcpy routine is optimised for Cortex-A15 cores and takes advantage + of VFP or NEON when built with the appropriate flags. + + Assumptions: + + ARMv6 (ARMv7-a if using Neon) + ARM state + Unaligned accesses + LDRD/STRD support unaligned word accesses + */ #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \ - (!(defined (__ARM_ARCH_7A__)))) + (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)))) /* Do nothing here. See memcpy-stub.c in the same directory. */ #else - /* Prototype: void *memcpy (void *dst, const void *src, size_t count). */ - /* Use the version of memcpy implemented using LDRD and STRD. - This version is tuned for Cortex-A15. - This might not be the best for other ARMv7-A CPUs, - but there is no predefine to distinguish between - different CPUs in the same architecture, - and this version is better than the plain memcpy provided in newlib. + .syntax unified + /* This implementation requires ARM state. */ + .arm - Therefore, we use this version for all ARMv7-A CPUS. */ +#ifdef __ARM_NEON__ - /* To make the same code compile for both ARM and Thumb instruction - sets, switch to unified syntax at the beginning of this function. - However, by using the same code, we may be missing optimization - opportunities. For instance, in LDRD/STRD instructions, the first - destination register must be even and the second consecutive in - ARM state, but not in Thumb state. */ + .fpu neon + .arch armv7-a +# define FRAME_SIZE 4 +# define USE_VFP +# define USE_NEON - .syntax unified +#elif !defined (__SOFTFP__) + + .arch armv6 + .fpu vfpv2 +# define FRAME_SIZE 32 +# define USE_VFP + +#else + .arch armv6 +# define FRAME_SIZE 32 -#if defined (__thumb__) - .thumb - .thumb_func #endif - .global memcpy - .type memcpy, %function -memcpy: +/* Old versions of GAS incorrectly implement the NEON align semantics. */ +#ifdef BROKEN_ASM_NEON_ALIGN +#define ALIGN(addr, align) addr,:align +#else +#define ALIGN(addr, align) addr:align +#endif - /* Assumes that n >= 0, and dst, src are valid pointers. - If there is at least 8 bytes to copy, use LDRD/STRD. - If src and dst are misaligned with different offsets, - first copy byte by byte until dst is aligned, - and then copy using LDRD/STRD and shift if needed. - When less than 8 left, copy a word and then byte by byte. */ +#define PC_OFFSET 8 /* PC pipeline compensation. */ +#define INSN_SIZE 4 - /* Save registers (r0 holds the return value): - optimized push {r0, r4, r5, lr}. - To try and improve performance, stack layout changed, - i.e., not keeping the stack looking like users expect - (highest numbered register at highest address). */ - push {r0, lr} - strd r4, r5, [sp, #-8]! +/* Call parameters. 
*/ +#define dstin r0 +#define src r1 +#define count r2 - /* TODO: Add debug frame directives. - We don't need exception unwind directives, because the code below - does not throw any exceptions and does not call any other functions. - Generally, newlib functions like this lack debug information for - assembler source. */ +/* Locals. */ +#define tmp1 r3 +#define dst ip +#define tmp2 r10 - /* Get copying of tiny blocks out of the way first. */ - /* Is there at least 4 bytes to copy? */ - subs r2, r2, #4 - blt copy_less_than_4 /* If n < 4. */ +#ifndef USE_NEON +/* For bulk copies using GP registers. */ +#define A_l r2 /* Call-clobbered. */ +#define A_h r3 /* Call-clobbered. */ +#define B_l r4 +#define B_h r5 +#define C_l r6 +#define C_h r7 +#define D_l r8 +#define D_h r9 +#endif - /* Check word alignment. */ - ands ip, r0, #3 /* ip = last 2 bits of dst. */ - bne dst_not_word_aligned /* If dst is not word-aligned. */ +/* Number of lines ahead to pre-fetch data. If you change this the code + below will need adjustment to compensate. */ - /* Get here if dst is word-aligned. */ - ands ip, r1, #3 /* ip = last 2 bits of src. */ - bne src_not_word_aligned /* If src is not word-aligned. */ -word_aligned: - /* Get here if source and dst both are word-aligned. - The number of bytes remaining to copy is r2+4. */ +#define prefetch_lines 5 - /* Is there is at least 64 bytes to copy? */ - subs r2, r2, #60 - blt copy_less_than_64 /* If r2 + 4 < 64. */ +#ifdef USE_VFP + .macro cpy_line_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm - /* First, align the destination buffer to 8-bytes, - to make sure double loads and stores don't cross cache line boundary, - as they are then more expensive even if the data is in the cache - (require two load/store issue cycles instead of one). - If only one of the buffers is not 8-bytes aligned, - then it's more important to align dst than src, - because there is more penalty for stores - than loads that cross cacheline boundary. - This check and realignment are only worth doing - if there is a lot to copy. */ + .macro cpy_tail_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm +#endif - /* Get here if dst is word aligned, - i.e., the 2 least significant bits are 0. - If dst is not 2w aligned (i.e., the 3rd bit is not set in dst), - then copy 1 word (4 bytes). */ - ands r3, r0, #4 - beq 11f /* If dst already two-word aligned. */ - ldr r3, [r1], #4 - str r3, [r0], #4 - subs r2, r2, #4 - blt copy_less_than_64 + .macro def_fn f p2align=0 + .text + .p2align \p2align + .global \f + .type \f, %function +\f: + .endm -11: - /* TODO: Align to cacheline (useful for PLD optimization). 
*/ +def_fn memcpy p2align=6 + + mov dst, dstin /* Preserve dstin, we need to return it. */ + cmp count, #64 + bge .Lcpy_not_short + /* Deal with small copies quickly by dropping straight into the + exit block. */ + +.Ltail63unaligned: +#ifdef USE_NEON + and tmp1, count, #0x38 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + vld1.8 {d0}, [src]! /* 14 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 12 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 10 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 8 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 6 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 4 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 2 words to go. */ + vst1.8 {d0}, [dst]! + + tst count, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 +#else + /* Copy up to 15 full words of data. May not be aligned. */ + /* Cannot use VFP for unaligned data. */ + and tmp1, count, #0x3c + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) + /* Jump directly into the sequence below at the correct offset. */ + add pc, pc, tmp1, lsl #1 + + ldr tmp1, [src, #-60] /* 15 words to go. */ + str tmp1, [dst, #-60] + + ldr tmp1, [src, #-56] /* 14 words to go. */ + str tmp1, [dst, #-56] + ldr tmp1, [src, #-52] + str tmp1, [dst, #-52] + + ldr tmp1, [src, #-48] /* 12 words to go. */ + str tmp1, [dst, #-48] + ldr tmp1, [src, #-44] + str tmp1, [dst, #-44] + + ldr tmp1, [src, #-40] /* 10 words to go. */ + str tmp1, [dst, #-40] + ldr tmp1, [src, #-36] + str tmp1, [dst, #-36] + + ldr tmp1, [src, #-32] /* 8 words to go. */ + str tmp1, [dst, #-32] + ldr tmp1, [src, #-28] + str tmp1, [dst, #-28] + + ldr tmp1, [src, #-24] /* 6 words to go. */ + str tmp1, [dst, #-24] + ldr tmp1, [src, #-20] + str tmp1, [dst, #-20] + + ldr tmp1, [src, #-16] /* 4 words to go. */ + str tmp1, [dst, #-16] + ldr tmp1, [src, #-12] + str tmp1, [dst, #-12] + + ldr tmp1, [src, #-8] /* 2 words to go. */ + str tmp1, [dst, #-8] + ldr tmp1, [src, #-4] + str tmp1, [dst, #-4] +#endif + + lsls count, count, #31 + ldrhcs tmp1, [src], #2 + ldrbne src, [src] /* Src is dead, use as a scratch. */ + strhcs tmp1, [dst], #2 + strbne src, [dst] + bx lr + +.Lcpy_not_short: + /* At least 64 bytes to copy, but don't know the alignment yet. */ + str tmp2, [sp, #-FRAME_SIZE]! + and tmp2, src, #3 + and tmp1, dst, #3 + cmp tmp1, tmp2 + bne .Lcpy_notaligned + +#ifdef USE_VFP + /* Magic dust alert! Force VFP on Cortex-A9. Experiments show + that the FP pipeline is much better at streaming loads and + stores. This is outside the critical loop. */ + vmov.f32 s0, s0 +#endif + + /* SRC and DST have the same mutual 32-bit alignment, but we may + still need to pre-copy some bytes to get to natural alignment. + We bring DST into full 64-bit alignment. */ + lsls tmp2, dst, #29 + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src], #1 + strhcs tmp1, [dst], #2 + strbne tmp2, [dst], #1 - /* Every loop iteration copies 64 bytes. */ 1: - .irp offset, #0, #8, #16, #24, #32, #40, #48, #56 - ldrd r4, r5, [r1, \offset] - strd r4, r5, [r0, \offset] - .endr + subs tmp2, count, #64 /* Use tmp2 for count. */ + blt .Ltail63aligned - add r0, r0, #64 - add r1, r1, #64 - subs r2, r2, #64 - bge 1b /* If there is more to copy. 
*/ + cmp tmp2, #512 + bge .Lcpy_body_long -copy_less_than_64: +.Lcpy_body_medium: /* Count in tmp2. */ +#ifdef USE_VFP +1: + vldr d0, [src, #0] + subs tmp2, tmp2, #64 + vldr d1, [src, #8] + vstr d0, [dst, #0] + vldr d0, [src, #16] + vstr d1, [dst, #8] + vldr d1, [src, #24] + vstr d0, [dst, #16] + vldr d0, [src, #32] + vstr d1, [dst, #24] + vldr d1, [src, #40] + vstr d0, [dst, #32] + vldr d0, [src, #48] + vstr d1, [dst, #40] + vldr d1, [src, #56] + vstr d0, [dst, #48] + add src, src, #64 + vstr d1, [dst, #56] + add dst, dst, #64 + bge 1b + tst tmp2, #0x3f + beq .Ldone - /* Get here if less than 64 bytes to copy, -64 <= r2 < 0. - Restore the count if there is more than 7 bytes to copy. */ - adds r2, r2, #56 - blt copy_less_than_8 +.Ltail63aligned: /* Count in tmp2. */ + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + + vldr d0, [src, #-56] /* 14 words to go. */ + vstr d0, [dst, #-56] + vldr d0, [src, #-48] /* 12 words to go. */ + vstr d0, [dst, #-48] + vldr d0, [src, #-40] /* 10 words to go. */ + vstr d0, [dst, #-40] + vldr d0, [src, #-32] /* 8 words to go. */ + vstr d0, [dst, #-32] + vldr d0, [src, #-24] /* 6 words to go. */ + vstr d0, [dst, #-24] + vldr d0, [src, #-16] /* 4 words to go. */ + vstr d0, [dst, #-16] + vldr d0, [src, #-8] /* 2 words to go. */ + vstr d0, [dst, #-8] +#else + sub src, src, #8 + sub dst, dst, #8 +1: + ldrd A_l, A_h, [src, #8] + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #16] + strd A_l, A_h, [dst, #16] + ldrd A_l, A_h, [src, #24] + strd A_l, A_h, [dst, #24] + ldrd A_l, A_h, [src, #32] + strd A_l, A_h, [dst, #32] + ldrd A_l, A_h, [src, #40] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #48] + strd A_l, A_h, [dst, #48] + ldrd A_l, A_h, [src, #56] + strd A_l, A_h, [dst, #56] + ldrd A_l, A_h, [src, #64]! + strd A_l, A_h, [dst, #64]! + subs tmp2, tmp2, #64 + bge 1b + tst tmp2, #0x3f + bne 1f + ldr tmp2,[sp], #FRAME_SIZE + bx lr +1: + add src, src, #8 + add dst, dst, #8 + +.Ltail63aligned: /* Count in tmp2. */ + /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but + we know that the src and dest are 32-bit aligned so we can use + LDRD/STRD to improve efficiency. */ + /* TMP2 is now negative, but we don't care about that. The bottom + six bits still tell us how many bytes are left to copy. */ + + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ + strd A_l, A_h, [dst, #-56] + ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ + strd A_l, A_h, [dst, #-48] + ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ + strd A_l, A_h, [dst, #-40] + ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ + strd A_l, A_h, [dst, #-32] + ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ + strd A_l, A_h, [dst, #-24] + ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ + strd A_l, A_h, [dst, #-16] + ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ + strd A_l, A_h, [dst, #-8] + +#endif + tst tmp2, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 + lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src] + strhcs tmp1, [dst], #2 + strbne tmp2, [dst] + +.Ldone: + ldr tmp2, [sp], #FRAME_SIZE + bx lr + +.Lcpy_body_long: /* Count in tmp2. */ + + /* Long copy. We know that there's at least (prefetch_lines * 64) + bytes to go. */ +#ifdef USE_VFP + /* Don't use PLD. 
Instead, read some data in advance of the current + copy position into a register. This should act like a PLD + operation but we won't have to repeat the transfer. */ + + vldr d3, [src, #0] + vldr d4, [src, #64] + vldr d5, [src, #128] + vldr d6, [src, #192] + vldr d7, [src, #256] + + vldr d0, [src, #8] + vldr d1, [src, #16] + vldr d2, [src, #24] + add src, src, #32 + + subs tmp2, tmp2, #prefetch_lines * 64 * 2 + blt 2f +1: + cpy_line_vfp d3, 0 + cpy_line_vfp d4, 64 + cpy_line_vfp d5, 128 + add dst, dst, #3 * 64 + add src, src, #3 * 64 + cpy_line_vfp d6, 0 + cpy_line_vfp d7, 64 + add dst, dst, #2 * 64 + add src, src, #2 * 64 + subs tmp2, tmp2, #prefetch_lines * 64 + bge 1b - /* Copy 8 bytes at a time. */ 2: - ldrd r4, r5, [r1], #8 - strd r4, r5, [r0], #8 - subs r2, r2, #8 - bge 2b /* If there is more to copy. */ + cpy_tail_vfp d3, 0 + cpy_tail_vfp d4, 64 + cpy_tail_vfp d5, 128 + add src, src, #3 * 64 + add dst, dst, #3 * 64 + cpy_tail_vfp d6, 0 + vstr d7, [dst, #64] + vldr d7, [src, #64] + vstr d0, [dst, #64 + 8] + vldr d0, [src, #64 + 8] + vstr d1, [dst, #64 + 16] + vldr d1, [src, #64 + 16] + vstr d2, [dst, #64 + 24] + vldr d2, [src, #64 + 24] + vstr d7, [dst, #64 + 32] + add src, src, #96 + vstr d0, [dst, #64 + 40] + vstr d1, [dst, #64 + 48] + vstr d2, [dst, #64 + 56] + add dst, dst, #128 + add tmp2, tmp2, #prefetch_lines * 64 + b .Lcpy_body_medium +#else + /* Long copy. Use an SMS style loop to maximize the I/O + bandwidth of the core. We don't have enough spare registers + to synthesise prefetching, so use PLD operations. */ + /* Pre-bias src and dst. */ + sub src, src, #8 + sub dst, dst, #8 + pld [src, #8] + pld [src, #72] + subs tmp2, tmp2, #64 + pld [src, #136] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [sp, #24] + pld [src, #200] + ldrd D_l, D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #232] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldrd D_l, D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldrd D_l, D_h, [src, #32] + bcs 2b + /* Save the remaining bytes and restore the callee-saved regs. */ + strd A_l, A_h, [dst, #40] + add src, src, #40 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + tst tmp2, #0x3f + bne .Ltail63aligned + ldr tmp2, [sp], #FRAME_SIZE + bx lr +#endif -copy_less_than_8: +.Lcpy_notaligned: + pld [src] + pld [src, #64] + /* There's at least 64 bytes to copy, but there is no mutual + alignment. */ + /* Bring DST to 64-bit alignment. */ + lsls tmp2, dst, #29 + pld [src, #(2 * 64)] + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrbne tmp1, [src], #1 + ldrhcs tmp2, [src], #2 + strbne tmp1, [dst], #1 + strhcs tmp2, [dst], #2 +1: + pld [src, #(3 * 64)] + subs count, count, #64 + ldrmi tmp2, [sp], #FRAME_SIZE + bmi .Ltail63unaligned + pld [src, #(4 * 64)] - /* Get here if less than 8 bytes to copy, -8 <= r2 < 0. - Check if there is more to copy. */ - cmn r2, #8 - beq return /* If r2 + 8 == 0. 
*/ +#ifdef USE_NEON + vld1.8 {d0-d3}, [src]! + vld1.8 {d4-d7}, [src]! + subs count, count, #64 + bmi 2f +1: + pld [src, #(4 * 64)] + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! + vld1.8 {d0-d3}, [src]! + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! + vld1.8 {d4-d7}, [src]! + subs count, count, #64 + bpl 1b +2: + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! + ands count, count, #0x3f +#else + /* Use an SMS style loop to maximize the I/O bandwidth. */ + sub src, src, #4 + sub dst, dst, #8 + subs tmp2, count, #64 /* Use tmp2 for count. */ + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [sp, #24] + ldr D_l, [src, #28] + ldr D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #(5 * 64) - (32 - 4)] + strd A_l, A_h, [dst, #40] + ldr A_l, [src, #36] + ldr A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldr B_l, [src, #44] + ldr B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldr C_l, [src, #52] + ldr C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldr D_l, [src, #60] + ldr D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldr D_l, [src, #28] + ldr D_h, [src, #32] + bcs 2b - /* Restore the count if there is more than 3 bytes to copy. */ - adds r2, r2, #4 - blt copy_less_than_4 + /* Save the remaining bytes and restore the callee-saved regs. */ + strd A_l, A_h, [dst, #40] + add src, src, #36 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + ands count, tmp2, #0x3f +#endif + ldr tmp2, [sp], #FRAME_SIZE + bne .Ltail63unaligned + bx lr - /* Copy 4 bytes. */ - ldr r3, [r1], #4 - str r3, [r0], #4 - -copy_less_than_4: - /* Get here if less than 4 bytes to copy, -4 <= r2 < 0. */ - - /* Restore the count, check if there is more to copy. */ - adds r2, r2, #4 - beq return /* If r2 == 0. */ - - /* Get here with r2 is in {1,2,3}={01,10,11}. */ - /* Logical shift left r2, insert 0s, update flags. */ - lsls r2, r2, #31 - - /* Copy byte by byte. - Condition ne means the last bit of r2 is 0. - Condition cs means the second to last bit of r2 is set, - i.e., r2 is 1 or 3. */ - itt ne - ldrbne r3, [r1], #1 - strbne r3, [r0], #1 - - itttt cs - ldrbcs r4, [r1], #1 - ldrbcs r5, [r1] - strbcs r4, [r0], #1 - strbcs r5, [r0] - -return: - /* Restore registers: optimized pop {r0, r4, r5, pc} */ - ldrd r4, r5, [sp], #8 - pop {r0, pc} /* This is the only return point of memcpy. */ - -#ifndef __ARM_FEATURE_UNALIGNED - - /* The following assembly macro implements misaligned copy in software. - Assumes that dst is word aligned, src is at offset "pull" bits from - word, push = 32 - pull, and the number of bytes that remain to copy - is r2 + 4, r2 >= 0. */ - - /* In the code below, r2 is the number of bytes that remain to be - written. The number of bytes read is always larger, because we have - partial words in the shift queue. */ - - .macro miscopy pull push shiftleft shiftright - - /* Align src to the previous word boundary. */ - bic r1, r1, #3 - - /* Initialize the shift queue. */ - ldr r5, [r1], #4 /* Load a word from source. */ - - subs r2, r2, #4 - blt 6f /* Go to misaligned copy of less than 8 bytes. 
*/ - - /* Get here if there is more than 8 bytes to copy. - The number of bytes to copy is r2+8, r2 >= 0. */ - - /* Save registers: push { r6, r7 }. - We need additional registers for LDRD and STRD, because in ARM state - the first destination register must be even and the second - consecutive. */ - strd r6, r7, [sp, #-8]! - - subs r2, r2, #56 - blt 4f /* Go to misaligned copy of less than 64 bytes. */ - -3: - /* Get here if there is more than 64 bytes to copy. - The number of bytes to copy is r2+64, r2 >= 0. */ - - /* Copy 64 bytes in every iteration. - Use a partial word from the shift queue. */ - .irp offset, #0, #8, #16, #24, #32, #40, #48, #56 - mov r6, r5, \shiftleft #\pull - ldrd r4, r5, [r1, \offset] - orr r6, r6, r4, \shiftright #\push - mov r7, r4, \shiftleft #\pull - orr r7, r7, r5, \shiftright #\push - strd r6, r7, [r0, \offset] - .endr - - add r1, r1, #64 - add r0, r0, #64 - subs r2, r2, #64 - bge 3b - -4: - /* Get here if there is less than 64 bytes to copy (-64 <= r2 < 0) - and they are misaligned. */ - - /* Restore the count if there is more than 7 bytes to copy. */ - adds r2, r2, #56 - - /* If less than 8 bytes to copy, - restore registers saved for this loop: optimized poplt { r6, r7 }. */ - itt lt - ldrdlt r6, r7, [sp], #8 - blt 6f /* Go to misaligned copy of less than 8 bytes. */ - -5: - /* Copy 8 bytes at a time. - Use a partial word from the shift queue. */ - mov r6, r5, \shiftleft #\pull - ldrd r4, r5, [r1], #8 - orr r6, r6, r4, \shiftright #\push - mov r7, r4, \shiftleft #\pull - orr r7, r7, r5, \shiftright #\push - strd r6, r7, [r0], #8 - - subs r2, r2, #8 - bge 5b /* If there is more to copy. */ - - /* Restore registers saved for this loop: optimized pop { r6, r7 }. */ - ldrd r6, r7, [sp], #8 - -6: - /* Get here if there less than 8 bytes to copy (-8 <= r2 < 0) - and they are misaligned. */ - - /* Check if there is more to copy. */ - cmn r2, #8 - beq return - - /* Check if there is less than 4 bytes to copy. */ - cmn r2, #4 - - itt lt - /* Restore src offset from word-align. */ - sublt r1, r1, #(\push / 8) - blt copy_less_than_4 - - /* Use a partial word from the shift queue. */ - mov r3, r5, \shiftleft #\pull - /* Load a word from src, but without writeback - (this word is not fully written to dst). */ - ldr r5, [r1] - - /* Restore src offset from word-align. */ - add r1, r1, #(\pull / 8) - - /* Shift bytes to create one dst word and store it. */ - orr r3, r3, r5, \shiftright #\push - str r3, [r0], #4 - - /* Use single byte copying of the remaining bytes. */ - b copy_less_than_4 - - .endm - -#endif /* not __ARM_FEATURE_UNALIGNED */ - -dst_not_word_aligned: - - /* Get here when dst is not aligned and ip has the last 2 bits of dst, - i.e., ip is the offset of dst from word. - The number of bytes that remains to copy is r2 + 4, - i.e., there are at least 4 bytes to copy. - Write a partial word (0 to 3 bytes), such that dst becomes - word-aligned. */ - - /* If dst is at ip bytes offset from a word (with 0 < ip < 4), - then there are (4 - ip) bytes to fill up to align dst to the next - word. */ - rsb ip, ip, #4 /* ip = #4 - ip. */ - cmp ip, #2 - - /* Copy byte by byte with conditionals. */ - itt gt - ldrbgt r3, [r1], #1 - strbgt r3, [r0], #1 - - itt ge - ldrbge r4, [r1], #1 - strbge r4, [r0], #1 - - ldrb lr, [r1], #1 - strb lr, [r0], #1 - - /* Update the count. - ip holds the number of bytes we have just copied. */ - subs r2, r2, ip /* r2 = r2 - ip. */ - blt copy_less_than_4 /* If r2 < ip. */ - - /* Get here if there are more than 4 bytes to copy. 
- Check if src is aligned. If beforehand src and dst were not word - aligned but congruent (same offset), then now they are both - word-aligned, and we can copy the rest efficiently (without - shifting). */ - ands ip, r1, #3 /* ip = last 2 bits of src. */ - beq word_aligned /* If r1 is word-aligned. */ - -src_not_word_aligned: - /* Get here when src is not word-aligned, but dst is word-aligned. - The number of bytes that remains to copy is r2+4. */ - -#ifdef __ARM_FEATURE_UNALIGNED - /* Copy word by word using LDR when alignment can be done in hardware, - i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. */ - subs r2, r2, #60 - blt 8f - -7: - /* Copy 64 bytes in every loop iteration. */ - .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60 - ldr r3, [r1, \offset] - str r3, [r0, \offset] - .endr - - add r0, r0, #64 - add r1, r1, #64 - subs r2, r2, #64 - bge 7b - -8: - /* Get here if less than 64 bytes to copy, -64 <= r2 < 0. - Check if there is more than 3 bytes to copy. */ - adds r2, r2, #60 - blt copy_less_than_4 - -9: - /* Get here if there is less than 64 but at least 4 bytes to copy, - where the number of bytes to copy is r2+4. */ - ldr r3, [r1], #4 - str r3, [r0], #4 - subs r2, r2, #4 - bge 9b - - b copy_less_than_4 - -#else /* not __ARM_FEATURE_UNALIGNED */ - - /* ip has last 2 bits of src, - i.e., ip is the offset of src from word, and ip > 0. - Compute shifts needed to copy from src to dst. */ - cmp ip, #2 - beq miscopy_16_16 /* If ip == 2. */ - bge miscopy_24_8 /* If ip == 3. */ - - /* Get here if ip == 1. */ - - /* Endian independent macros for shifting bytes within registers. */ - -#ifndef __ARMEB__ -miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl -miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl -miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl -#else /* not __ARMEB__ */ -miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr -miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr -miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr -#endif /* not __ARMEB__ */ - -#endif /* not __ARM_FEATURE_UNALIGNED */ + .size memcpy, . - memcpy #endif /* memcpy */
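Notes (not part of the patch):

The .Ltail63unaligned and .Ltail63aligned blocks dispatch into the middle of an
unrolled copy sequence with "add pc, pc, tmp1", where tmp1 is the remaining word
count scaled to an instruction offset and adjusted for the ARM PC read-ahead
(PC_OFFSET) and the size of the add instruction itself (INSN_SIZE).  A rough C
analogue of that dispatch idea, illustrative only and not code from the patch,
is a fall-through switch over the remaining word count:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only: a C rendering of the "add pc, pc, tmp1" dispatch used by
   the tail code.  Exactly count/4 word copies execute by jumping into the
   middle of an unrolled sequence (here, falling through a switch), then the
   last 0-3 bytes are handled the way the lsls/ldrh/ldrb epilogue does.
   memcpy of 4 bytes stands in for the unaligned LDR/STR. */
static void copy_tail63 (unsigned char *dst, const unsigned char *src,
                         size_t count)          /* count < 64 */
{
  size_t words = count >> 2;
  uint32_t w;

  /* Step to the end of the word region so every unrolled copy uses a fixed
     negative offset, as the assembly does with [src, #-60] ... [src, #-4].  */
  src += words * 4;
  dst += words * 4;

  switch (words)                /* all fall-throughs intentional */
    {
    case 15: memcpy (&w, src - 60, 4); memcpy (dst - 60, &w, 4);
    case 14: memcpy (&w, src - 56, 4); memcpy (dst - 56, &w, 4);
    case 13: memcpy (&w, src - 52, 4); memcpy (dst - 52, &w, 4);
    case 12: memcpy (&w, src - 48, 4); memcpy (dst - 48, &w, 4);
    case 11: memcpy (&w, src - 44, 4); memcpy (dst - 44, &w, 4);
    case 10: memcpy (&w, src - 40, 4); memcpy (dst - 40, &w, 4);
    case 9:  memcpy (&w, src - 36, 4); memcpy (dst - 36, &w, 4);
    case 8:  memcpy (&w, src - 32, 4); memcpy (dst - 32, &w, 4);
    case 7:  memcpy (&w, src - 28, 4); memcpy (dst - 28, &w, 4);
    case 6:  memcpy (&w, src - 24, 4); memcpy (dst - 24, &w, 4);
    case 5:  memcpy (&w, src - 20, 4); memcpy (dst - 20, &w, 4);
    case 4:  memcpy (&w, src - 16, 4); memcpy (dst - 16, &w, 4);
    case 3:  memcpy (&w, src - 12, 4); memcpy (dst - 12, &w, 4);
    case 2:  memcpy (&w, src - 8,  4); memcpy (dst - 8,  &w, 4);
    case 1:  memcpy (&w, src - 4,  4); memcpy (dst - 4,  &w, 4);
    case 0:  break;
    }

  /* Final 0-3 bytes: halfword then byte, matching the assembly epilogue.  */
  if (count & 2) { memcpy (dst, src, 2); src += 2; dst += 2; }
  if (count & 1) *dst = *src;
}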
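The non-VFP bulk loops are software-pipelined ("SMS style"): each iteration
stores the data loaded on the previous iteration while issuing the next loads,
so the load and store streams overlap rather than serialise.  A simplified
scalar sketch of that schedule, assuming 8-byte-aligned buffers and a length
that is a non-zero multiple of 16 (the assembly handles the general case with
its alignment preamble and tail code, and keeps four register pairs A-D in
flight to move 64 bytes per iteration), might look like:

#include <stddef.h>
#include <stdint.h>

/* Software-pipelined copy sketch: the store of chunk N is issued in the same
   iteration as the load of chunk N+1, mirroring the assembly's ldrd/strd
   schedule.  Assumes len is a non-zero multiple of 16 and both pointers are
   8-byte aligned.  Illustrative only, not code from the patch. */
static void copy_pipelined (uint64_t *dst, const uint64_t *src, size_t len)
{
  size_t chunks = len / 16;             /* two 64-bit words per chunk */
  uint64_t a = src[0], b = src[1];
  size_t i;

  for (i = 1; i < chunks; i++)
    {
      uint64_t next_a = src[2 * i];     /* load ahead ...           */
      uint64_t next_b = src[2 * i + 1];
      dst[2 * (i - 1)] = a;             /* ... while storing behind */
      dst[2 * (i - 1) + 1] = b;
      a = next_a;
      b = next_b;
    }
  dst[2 * (chunks - 1)] = a;            /* drain the pipeline */
  dst[2 * (chunks - 1) + 1] = b;
}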
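Finally, since the new implementation has several distinct paths (the
sub-64-byte tails, the mutually aligned medium and long copies, and the
unaligned NEON/LDR paths), it is worth exercising memcpy at every
source/destination alignment and around the 64- and 512-byte thresholds.  A
minimal host-side check along these lines (buffer sizes, offsets, and fill
patterns are arbitrary choices, not part of the patch) is:

#include <stdio.h>
#include <string.h>

#define BUF_SIZE 4096

static unsigned char src_buf[BUF_SIZE + 64];
static unsigned char dst_buf[BUF_SIZE + 64];

/* Copy len bytes at the given offsets and verify the result byte-for-byte. */
static int check_copy (size_t src_off, size_t dst_off, size_t len)
{
  unsigned char *s = src_buf + src_off;
  unsigned char *d = dst_buf + dst_off;
  size_t i;

  for (i = 0; i < len; i++)
    s[i] = (unsigned char) (i * 131 + src_off * 7 + dst_off);
  memset (d, 0xAA, len);

  memcpy (d, s, len);

  return memcmp (d, s, len) == 0;
}

int main (void)
{
  /* Sizes straddling the tail (< 64), medium, and long (>= 512) paths.  */
  static const size_t lens[] = { 0, 1, 3, 7, 15, 63, 64, 65, 511, 512, 2048 };
  size_t so, doff, li;
  int failures = 0;

  for (so = 0; so < 8; so++)
    for (doff = 0; doff < 8; doff++)
      for (li = 0; li < sizeof lens / sizeof lens[0]; li++)
        if (!check_copy (so, doff, lens[li]))
          {
            printf ("FAIL: src+%u dst+%u len %u\n",
                    (unsigned) so, (unsigned) doff, (unsigned) lens[li]);
            failures++;
          }

  if (failures)
    printf ("%d failure(s)\n", failures);
  else
    printf ("all copies OK\n");
  return failures != 0;
}

Building the same harness with and without -march=armv7-a (or with
-mno-unaligned-access) toggles which implementation the guard at the top of
memcpy-stub.c and memcpy.S selects, so both the generic and the optimised
routine can be checked with one source file.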