strlen-armv7.S: Import latest strlen cortex-strings code.

Import the latest version of strlen from the Linaro cortex-strings
package. This version is faster across a variety of block sizes and
alignments on ARMv7.

newlib/ChangeLog:

2013-06-21  Will Newton  <will.newton@linaro.org>

	* libc/machine/arm/strlen-armv7.S: Import latest strlen
	code from Linaro cortex-strings.
Author: Will Newton <will.newton@linaro.org>
Date:   2013-06-21 09:10:37 +00:00
parent  a1a7a74e6b
commit  c8af057907

2 changed files with 112 additions and 77 deletions
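For orientation before the diff: the imported code finds the terminator a doubleword at a time. uadd8 adds 0xff to each byte lane in parallel and sets the per-lane GE flags exactly where a byte was non-zero (the per-byte add carries out), then sel assembles a marker word that is 0x00 in non-zero lanes and 0xff where a NUL sits. Below is a rough C model of that detection step and of the aligned loop, for little-endian, aligned, padded input only; the names zero_byte_mask and strlen_model are invented for this sketch and are not part of the commit.

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Model of the uadd8/sel pair: returns 0xff in every byte lane of
       WORD that is zero and 0x00 elsewhere.  uadd8 with 0xffffffff sets
       GE<n> exactly when lane n is non-zero, and sel then picks 0x00 for
       flagged lanes and 0xff for the rest - the inversion the old code's
       comments call out.  */
    static uint32_t zero_byte_mask(uint32_t word)
    {
        uint32_t mask = 0;
        for (int i = 0; i < 4; i++)
            if (((word >> (8 * i)) & 0xffu) == 0)
                mask |= 0xffu << (8 * i);
        return mask;
    }

    /* Word-at-a-time strlen in the style of the new aligned loop,
       4 bytes per step.  Like the assembly, it may read a few bytes past
       the terminator, so it assumes an aligned, padded buffer.  */
    static size_t strlen_model(const char *s)
    {
        size_t len = 0;
        uint32_t word, mask;

        for (;;) {
            memcpy(&word, s + len, sizeof word);  /* stands in for ldrd */
            mask = zero_byte_mask(word);
            if (mask != 0)
                break;
            len += 4;
        }
        while ((mask & 0xffu) == 0) {             /* find first 0xff lane */
            mask >>= 8;
            len++;
        }
        return len;
    }

The real routine reads 8 bytes per ldrd, unrolls to 32 bytes per pass, and replaces the final byte scan with rev+clz so the tail costs only a couple of instructions.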

newlib/ChangeLog:

@@ -1,3 +1,8 @@
+2013-06-21  Will Newton  <will.newton@linaro.org>
+
+	* libc/machine/arm/strlen-armv7.S: Import latest strlen
+	code from Linaro cortex-strings.
+
 2013-06-21  Will Newton  <will.newton@linaro.org>
 
 	* MAINTAINERS: Add Will Newton to Write After Approval.

newlib/libc/machine/arm/strlen-armv7.S:

@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2011, Linaro Limited
+/* Copyright (c) 2010-2011,2013 Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -28,100 +28,130 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-   Written by Dave Gilbert <david.gilbert@linaro.org>
-
-   This strlen routine is optimised on a Cortex-A9 and should work on
-   all ARMv7 processors. This routine is reasonably fast for short
-   strings, but is probably slower than a simple implementation if all
-   your strings are very short */
-
-@ 2011-02-08 david.gilbert@linaro.org
-@    Extracted from local git 6848613a
-
-@ 2011-10-13 david.gilbert@linaro.org
-@    Extracted from cortex-strings bzr rev 63
-@    Integrate to newlib, flip to ldrd
-@    Pull in Endian macro from my memchr
+   Assumes:
+   ARMv6T2, AArch32
+ */
 
 #include "arm_asm.h"
 
-@ NOTE: This ifdef MUST match the ones in arm/strlen.c
-@ We fallback to the one in arm/strlen.c for size optimised or
-@ for older arch's
+/* NOTE: This ifdef MUST match the ones in arm/strlen.c
+   We fallback to the one in arm/strlen.c for size optimised or
+   for older architectures.  */
 #if defined(_ISA_ARM_7) || defined(__ARM_ARCH_6T2__) && \
 	!(defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
 	  (defined (__thumb__) && !defined (__thumb2__)))
 
-@ this lets us check a flag in a 00/ff byte easily in either endianness
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
 #ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
+#define S2LO		lsl
+#define S2HI		lsr
 #else
-#define CHARTSTMASK(c) 1<<(c*8)
+#define S2LO		lsr
+#define S2HI		lsl
 #endif
 
-@------------------------------------------------------------------------------
+	/* This code requires Thumb.  */
+	.thumb
 	.syntax unified
-	.arch armv7-a
-
-	.thumb_func
-	.align 2
-	.p2align 4,,15
-	.global strlen
-	.type strlen,%function
-strlen:
-	@ r0 = string
-	@ returns count of bytes in string not including terminator
-	mov	r1, r0
-	push	{ r4,r6 }
-	mvns	r6, #0		@ all F
-	movs	r4, #0
-	tst	r0, #7
-	beq	2f
+
+/* Parameters and result.  */
+#define srcin		r0
+#define result		r0
 
-1:
-	ldrb	r2, [r1], #1
-	tst	r1, #7		@ Hit alignment yet?
-	cbz	r2, 10f		@ Exit if we found the 0
-	bne	1b
+/* Internal variables.  */
+#define src		r1
+#define data1a		r2
+#define data1b		r3
+#define const_m1	r12
+#define const_0		r4
+#define tmp1		r4	/* Overlaps const_0  */
+#define tmp2		r5
 
-	@ So we're now aligned
-2:
-	ldrd	r2,r3,[r1],#8
-	uadd8	r2, r2, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r2, r4, r6	@ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
-	uadd8	r3, r3, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r3, r2, r6	@ chained...bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
-	cmp	r3, #0
-	beq	2b
+def_fn	strlen p2align=6
+	pld	[srcin, #0]
+	strd	r4, r5, [sp, #-8]!
+	bic	src, srcin, #7
+	mvn	const_m1, #0
+	ands	tmp1, srcin, #7		/* (8 - bytes) to alignment.  */
+	pld	[src, #32]
+	bne.w	.Lmisaligned8
+	mov	const_0, #0
+	mov	result, #-8
+.Lloop_aligned:
+	/* Bytes 0-7.  */
+	ldrd	data1a, data1b, [src]
+	pld	[src, #64]
+	add	result, result, #8
+.Lstart_realigned:
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
 
-strlenendtmp:
-	@ One (or more) of the bytes we loaded was 0 - but which one?
-	@ r2 has the mask corresponding to the first loaded word
-	@ r3 has a combined mask of the two words - but if r2 was all-non 0
-	@ then it's just the 2nd words
-	cmp	r2, #0
-	itte	eq
-	moveq	r2, r3		@ the end is in the 2nd word
-	subeq	r1,r1,#3
-	subne	r1,r1,#7
+	/* Bytes 8-15.  */
+	ldrd	data1a, data1b, [src, #8]
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
 
-	@ r1 currently points to the 2nd byte of the word containing the 0
-	tst	r2, # CHARTSTMASK(0)	@ 1st character
-	bne	10f
-	adds	r1,r1,#1
-	tst	r2, # CHARTSTMASK(1)	@ 2nd character
-	ittt	eq
-	addeq	r1,r1,#1
-	tsteq	r2, # (3<<15)	@ 2nd & 3rd character
-	@ If not the 3rd must be the last one
-	addeq	r1,r1,#1
+	/* Bytes 16-23.  */
+	ldrd	data1a, data1b, [src, #16]
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
 
-10:
-	@ r0 is still at the beginning, r1 is pointing 1 byte after the nul
-	sub	r0, r1, r0
-	subs	r0, r0, #1
-	pop	{ r4, r6 }
+	/* Bytes 24-31.  */
+	ldrd	data1a, data1b, [src, #24]
+	add	src, src, #32
+	uadd8	data1a, data1a, const_m1	/* Saturating GE<0:3> set.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cmp	data1b, #0
+	beq	.Lloop_aligned
+
+.Lnull_found:
+	cmp	data1a, #0
+	itt	eq
+	addeq	result, result, #4
+	moveq	data1a, data1b
+#ifndef __ARMEB__
+	rev	data1a, data1a
+#endif
+	clz	data1a, data1a
+	ldrd	r4, r5, [sp], #8
+	add	result, result, data1a, lsr #3	/* Bits -> Bytes.  */
 	bx	lr
+
+.Lmisaligned8:
+	ldrd	data1a, data1b, [src]
+	and	tmp2, tmp1, #3
+	rsb	result, tmp1, #0
+	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
+	tst	tmp1, #4
+	pld	[src, #64]
+	S2HI	tmp2, const_m1, tmp2
+	orn	data1a, data1a, tmp2
+	itt	ne
+	ornne	data1b, data1b, tmp2
+	movne	data1a, const_m1
+	mov	const_0, #0
+	b	.Lstart_realigned
+	.size	strlen, . - strlen
 #endif
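One note on the part of the import that is easiest to misread, .Lmisaligned8: rather than stepping byte-by-byte to alignment as the old prologue did, the new code rounds src down to an 8-byte boundary with bic, loads the full doubleword, and ORs 0xff over every byte that precedes the true start of the string (S2HI builds the shifted all-ones mask, orn applies its complement), so those bytes can never look like a terminator. The rsb biases result to minus the misalignment, which makes the final count come out relative to the original srcin. A little-endian C sketch of the masking step, with the invented name mask_leading_bytes and offset limited to 1..3 as on the low path of the tst #4 test:

    #include <stdint.h>

    /* Sketch of the S2HI/orn fixup: OFFSET is srcin & 3 (1..3).  The low
       OFFSET byte lanes of the first aligned word are forced to 0xff so
       the NUL scan cannot match before the string starts.  The assembly
       builds ~(0xffffffff << (8 * offset)) via S2HI (lsl on little-endian)
       and applies it with orn.  */
    static uint32_t mask_leading_bytes(uint32_t word, unsigned offset)
    {
        uint32_t keep = 0xffffffffu << (8 * offset);  /* lanes holding real string bytes */
        return word | ~keep;                          /* force the rest to 0xff */
    }

For misalignments of 4 to 7 the assembly instead sets the whole first word to 0xff (movne data1a, const_m1) and applies the mask to the second word (ornne data1b).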