From c8af05790785011377b5cbc57397f3908581b628 Mon Sep 17 00:00:00 2001
From: Will Newton <will.newton@linaro.org>
Date: Fri, 21 Jun 2013 09:10:37 +0000
Subject: [PATCH] strlen-armv7.S: Import latest strlen cortex-strings code.

Import the latest version of strlen from the Linaro cortex-strings
package. This version is faster across a variety of block sizes and
alignments on ARMv7.

newlib/ChangeLog:

2013-06-21  Will Newton  <will.newton@linaro.org>

	* libc/machine/arm/strlen-armv7.S: Import latest strlen
	code from Linaro cortex-strings.
---
 newlib/ChangeLog                       |   5 +
 newlib/libc/machine/arm/strlen-armv7.S | 184 ++++++++++++++-----------
 2 files changed, 112 insertions(+), 77 deletions(-)

diff --git a/newlib/ChangeLog b/newlib/ChangeLog
index 35fd4a080..e4ac48eae 100644
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@@ -1,3 +1,8 @@
+2013-06-21  Will Newton  <will.newton@linaro.org>
+
+	* libc/machine/arm/strlen-armv7.S: Import latest strlen
+	code from Linaro cortex-strings.
+
 2013-06-21  Will Newton  <will.newton@linaro.org>
 
 	* MAINTAINERS: Add Will Newton to Write After Approval.
diff --git a/newlib/libc/machine/arm/strlen-armv7.S b/newlib/libc/machine/arm/strlen-armv7.S
index d6e2831ff..1aa51c9fb 100644
--- a/newlib/libc/machine/arm/strlen-armv7.S
+++ b/newlib/libc/machine/arm/strlen-armv7.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2011, Linaro Limited
+/* Copyright (c) 2010-2011,2013 Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -28,100 +28,130 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-   Written by Dave Gilbert <david.gilbert@linaro.org>
-
-   This strlen routine is optimised on a Cortex-A9 and should work on
-   all ARMv7 processors.   This routine is reasonably fast for short
-   strings, but is probably slower than a simple implementation if all
-   your strings are very short */
-
-@ 2011-02-08 david.gilbert@linaro.org
-@    Extracted from local git 6848613a
-@ 2011-10-13 david.gilbert@linaro.org
-@    Extracted from cortex-strings bzr rev 63
-@      Integrate to newlib, flip to ldrd
-@      Pull in Endian macro from my memchr
+   Assumes:
+   ARMv6T2, AArch32
+ */
 
 #include "arm_asm.h"
 
-@ NOTE: This ifdef MUST match the ones in arm/strlen.c
-@ We fallback to the one in arm/strlen.c for size optimised or
-@ for older arch's
+/* NOTE: This ifdef MUST match the ones in arm/strlen.c.
+   We fall back to the one in arm/strlen.c for size-optimised builds
+   or for older architectures. */
 #if defined(_ISA_ARM_7) || defined(__ARM_ARCH_6T2__) && \
     !(defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
       (defined (__thumb__) && !defined (__thumb2__)))
 
-@ this lets us check a flag in a 00/ff byte easily in either endianness
+	.macro def_fn f p2align=0	/* Open global function f in .text.  */
+	.text
+	.p2align \p2align		/* Align the entry to 2^p2align bytes.  */
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
 #ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
+#define S2LO		lsl	/* Shift towards lower-addressed bytes.  */
+#define S2HI		lsr	/* Shift towards higher-addressed bytes.  */
 #else
-#define CHARTSTMASK(c) 1<<(c*8)
+#define S2LO		lsr	/* Shift towards lower-addressed bytes.  */
+#define S2HI		lsl	/* Shift towards higher-addressed bytes.  */
 #endif
 
-@------------------------------------------------------------------------------
+	/* This code requires Thumb.  */
+	.thumb
 	.syntax unified
-	.arch armv7-a
 
-	.thumb_func
-	.align 2
-	.p2align 4,,15
-	.global strlen
-	.type strlen,%function
-strlen:
-	@ r0 = string
-	@ returns count of bytes in string not including terminator
-	mov	r1, r0
-	push	{ r4,r6 }
-	mvns	r6, #0		@ all F
-	movs	r4, #0
-	tst	r0, #7
-	beq	2f
+/* Parameters and result.  */
+#define srcin		r0		/* Pointer to the string, as passed in.  */
+#define result		r0		/* Length accumulator; same register as srcin.  */
 
-1:
-	ldrb	r2, [r1], #1
-	tst	r1, #7		@ Hit alignment yet?
-	cbz	r2, 10f		@ Exit if we found the 0
-	bne	1b
+/* Internal variables.  */
+#define src		r1		/* 8-byte-aligned load pointer.  */
+#define data1a		r2		/* First loaded word, then its NUL mask.  */
+#define data1b		r3		/* Second loaded word, then the combined NUL mask.  */
+#define const_m1	r12		/* Holds 0xffffffff.  */
+#define const_0		r4		/* Holds zero.  */
+#define tmp1		r4		/* Overlaps const_0  */
+#define tmp2		r5		/* Scratch for the misaligned-start mask.  */
 
-	@ So we're now aligned
-2:
-	ldrd    r2,r3,[r1],#8
-	uadd8	r2, r2, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r2, r4, r6	@ bytes are 00 for none-00 bytes,
-				@ or ff for 00 bytes - NOTE INVERSION
-	uadd8	r3, r3, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r3, r2, r6	@ chained...bytes are 00 for none-00 bytes,
-				@ or ff for 00 bytes - NOTE INVERSION
-	cmp	r3, #0
-	beq	2b
+def_fn	strlen p2align=6		/* r0 = string in; r0 = length out.  */
+	pld	[srcin, #0]			/* Prefetch the start of the string.  */
+	strd	r4, r5, [sp, #-8]!		/* Save r4/r5; they back tmp1/const_0 and tmp2.  */
+	bic	src, srcin, #7			/* src = srcin rounded down to 8 bytes.  */
+	mvn	const_m1, #0			/* const_m1 = 0xffffffff.  */
+	ands	tmp1, srcin, #7		/* tmp1 = srcin & 7, i.e. (8 - bytes) to alignment.  */
+	pld	[src, #32]
+	bne.w	.Lmisaligned8			/* Unaligned start: mask the leading bytes first.  */
+	mov	const_0, #0			/* tmp1 is dead here, so r4 can hold zero.  */
+	mov	result, #-8			/* Pre-bias; the loop adds 8 before each test.  */
+.Lloop_aligned:
+	/* Bytes 0-7.  */
+	ldrd	data1a, data1b, [src]
+	pld	[src, #64]
+	add	result, result, #8
+.Lstart_realigned:
+	uadd8	data1a, data1a, const_m1	/* GE<0:3> set per non-zero byte.  */
+	sel	data1a, const_0, const_m1	/* 0x00 where GE set, 0xff at NUL bytes.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Combined mask; positions only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found		/* Non-zero: a NUL is in one of the two words.  */
 
-strlenendtmp:
-	@ One (or more) of the bytes we loaded was 0 - but which one?
-	@ r2 has the mask corresponding to the first loaded word
-	@ r3 has a combined mask of the two words - but if r2 was all-non 0 
-	@ then it's just the 2nd words
-	cmp	r2, #0
-	itte	eq
-	moveq	r2, r3		@ the end is in the 2nd word
-	subeq	r1,r1,#3
-	subne	r1,r1,#7
+	/* Bytes 8-15.  */
+	ldrd	data1a, data1b, [src, #8]
+	uadd8	data1a, data1a, const_m1	/* GE<0:3> set per non-zero byte.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
 
-	@ r1 currently points to the 2nd byte of the word containing the 0
-	tst	r2, # CHARTSTMASK(0)	@ 1st character
-	bne	10f
-	adds	r1,r1,#1
-	tst	r2, # CHARTSTMASK(1)	@ 2nd character
-	ittt	eq
-	addeq	r1,r1,#1
-	tsteq	r2, # (3<<15)	@ 2nd & 3rd character
-	@ If not the 3rd must be the last one
-	addeq	r1,r1,#1
+	/* Bytes 16-23.  */
+	ldrd	data1a, data1b, [src, #16]
+	uadd8	data1a, data1a, const_m1	/* GE<0:3> set per non-zero byte.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
 
-10:
-	@ r0 is still at the beginning, r1 is pointing 1 byte after the nul
-	sub	r0, r1, r0
-	subs	r0, r0, #1
-	pop	{ r4, r6 }
+	/* Bytes 24-31.  */
+	ldrd	data1a, data1b, [src, #24]
+	add	src, src, #32			/* Advance to the next 32-byte block.  */
+	uadd8	data1a, data1a, const_m1	/* GE<0:3> set per non-zero byte.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cmp	data1b, #0
+	beq	.Lloop_aligned			/* No NUL in these 32 bytes: keep scanning.  */
+
+.Lnull_found:
+	cmp	data1a, #0			/* Is the NUL in the first word?  */
+	itt	eq
+	addeq	result, result, #4		/* No: count the whole first word...  */
+	moveq	data1a, data1b			/* ...and locate it in the second.  */
+#ifndef __ARMEB__
+	rev	data1a, data1a			/* Lowest-addressed byte to the top for clz.  */
+#endif
+	clz	data1a, data1a			/* Bit offset of the first 0xff mask byte.  */
+	ldrd	r4, r5, [sp], #8		/* Restore r4/r5 and pop the frame.  */
+	add	result, result, data1a, lsr #3	/* Bits -> Bytes.  */
 	bx	lr
 
+.Lmisaligned8:
+	ldrd	data1a, data1b, [src]		/* Aligned doubleword holding the start.  */
+	and	tmp2, tmp1, #3			/* Byte offset within its word.  */
+	rsb	result, tmp1, #0		/* result = -tmp1: discount bytes before srcin.  */
+	lsl	tmp2, tmp2, #3			/* Bytes -> bits.  */
+	tst	tmp1, #4			/* NE if the string starts in the second word.  */
+	pld	[src, #64]
+	S2HI	tmp2, const_m1, tmp2		/* Zero bits over the bytes before the start.  */
+	orn	data1a, data1a, tmp2		/* Force those bytes to 0xff (non-NUL).  */
+	itt	ne
+	ornne	data1b, data1b, tmp2		/* Start in second word: mask there too...  */
+	movne	data1a, const_m1		/* ...and make the whole first word non-NUL.  */
+	mov	const_0, #0			/* tmp1 is done; r4 becomes const_0 again.  */
+	b	.Lstart_realigned		/* Join the loop after its load and add.  */
+	.size	strlen, . - strlen
+
 #endif