From c8af05790785011377b5cbc57397f3908581b628 Mon Sep 17 00:00:00 2001
From: Will Newton
Date: Fri, 21 Jun 2013 09:10:37 +0000
Subject: [PATCH] strlen-armv7.S: Import latest strlen cortex-strings code.

Import the latest version of strlen from the Linaro cortex-strings
package. This version is faster across a variety of block size and
alignments on ARMv7.

newlib/ChangeLog:

2013-06-21  Will Newton

	* libc/machine/arm/strlen-armv7.S: Import latest strlen
	code from Linaro cortex-strings.
---
 newlib/ChangeLog                       |   5 +
 newlib/libc/machine/arm/strlen-armv7.S | 184 ++++++++++++++-----------
 2 files changed, 112 insertions(+), 77 deletions(-)

diff --git a/newlib/ChangeLog b/newlib/ChangeLog
index 35fd4a080..e4ac48eae 100644
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@@ -1,3 +1,8 @@
+2013-06-21  Will Newton
+
+	* libc/machine/arm/strlen-armv7.S: Import latest strlen
+	code from Linaro cortex-strings.
+
 2013-06-21  Will Newton
 
 	* MAINTAINERS: Add Will Newton to Write After Approval.
diff --git a/newlib/libc/machine/arm/strlen-armv7.S b/newlib/libc/machine/arm/strlen-armv7.S
index d6e2831ff..1aa51c9fb 100644
--- a/newlib/libc/machine/arm/strlen-armv7.S
+++ b/newlib/libc/machine/arm/strlen-armv7.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2011, Linaro Limited
+/* Copyright (c) 2010-2011,2013 Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -28,100 +28,130 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-   Written by Dave Gilbert
-
-   This strlen routine is optimised on a Cortex-A9 and should work on
-   all ARMv7 processors.   This routine is reasonably fast for short
-   strings, but is probably slower than a simple implementation if all
-   your strings are very short */
-
-@ 2011-02-08 david.gilbert@linaro.org
-@     Extracted from local git 6848613a
-@ 2011-10-13 david.gilbert@linaro.org
-@     Extracted from cortex-strings bzr rev 63
-@     Integrate to newlib, flip to ldrd
-@     Pull in Endian macro from my memchr
+   Assumes:
+   ARMv6T2, AArch32
+ */
 
 #include "arm_asm.h"
 
-@ NOTE: This ifdef MUST match the ones in arm/strlen.c
-@ We fallback to the one in arm/strlen.c for size optimised or
-@ for older arch's
+/* NOTE: This ifdef MUST match the ones in arm/strlen.c
+   We fallback to the one in arm/strlen.c for size optimised or
+   for older architectures.  */
 #if defined(_ISA_ARM_7) || defined(__ARM_ARCH_6T2__) && \
 	!(defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
 	  (defined (__thumb__) && !defined (__thumb2__)))
 
-@ this lets us check a flag in a 00/ff byte easily in either endianness
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
+
 #ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
+#define S2LO		lsl
+#define S2HI		lsr
 #else
-#define CHARTSTMASK(c) 1<<(c*8)
+#define S2LO		lsr
+#define S2HI		lsl
 #endif
 
-@------------------------------------------------------------------------------
+	/* This code requires Thumb.  */
+	.thumb
 	.syntax unified
-	.arch armv7-a
 
-	.thumb_func
-	.align 2
-	.p2align 4,,15
-	.global strlen
-	.type strlen,%function
-strlen:
-	@ r0 = string
-	@ returns count of bytes in string not including terminator
-	mov	r1, r0
-	push	{ r4,r6 }
-	mvns	r6, #0		@ all F
-	movs	r4, #0
-	tst	r0, #7
-	beq	2f
+/* Parameters and result: both live in r0.  */
+#define srcin		r0
+#define result		r0
 
-1:
-	ldrb	r2, [r1], #1
-	tst	r1, #7		@ Hit alignment yet?
-	cbz	r2, 10f		@ Exit if we found the 0
-	bne	1b
+/* Internal variables.  r4 and r5 are saved on entry and restored on exit.  */
+#define src		r1
+#define data1a		r2
+#define data1b		r3
+#define const_m1	r12
+#define const_0		r4
+#define tmp1		r4		/* Overlaps const_0  */
+#define tmp2		r5
 
-	@ So we're now aligned
-2:
-	ldrd	r2,r3,[r1],#8
-	uadd8	r2, r2, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r2, r4, r6	@ bytes are 00 for none-00 bytes,
-				@ or ff for 00 bytes - NOTE INVERSION
-	uadd8	r3, r3, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r3, r2, r6	@ chained...bytes are 00 for none-00 bytes,
-				@ or ff for 00 bytes - NOTE INVERSION
-	cmp	r3, #0
-	beq	2b
+def_fn	strlen p2align=6
+	pld	[srcin, #0]
+	strd	r4, r5, [sp, #-8]!		/* Save r4/r5; restored at .Lnull_found.  */
+	bic	src, srcin, #7			/* src = srcin rounded down to 8 bytes.  */
+	mvn	const_m1, #0			/* const_m1 = 0xffffffff.  */
+	ands	tmp1, srcin, #7		/* tmp1 = srcin & 7; (8 - tmp1) bytes to alignment.  */
+	pld	[src, #32]
+	bne.w	.Lmisaligned8			/* tmp1 != 0: srcin is not 8-byte aligned.  */
+	mov	const_0, #0
+	mov	result, #-8			/* Pre-biased: 8 is added before each check.  */
+.Lloop_aligned:
+	/* Bytes 0-7.  */
+	ldrd	data1a, data1b, [src]
+	pld	[src, #64]
+	add	result, result, #8
+.Lstart_realigned:
+	uadd8	data1a, data1a, const_m1	/* Add 0xff per byte: GE<n> set where byte n != 0.  */
+	sel	data1a, const_0, const_m1	/* 0x00 where GE set, 0xff where the byte was NUL.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
 
-strlenendtmp:
-	@ One (or more) of the bytes we loaded was 0 - but which one?
-	@ r2 has the mask corresponding to the first loaded word
-	@ r3 has a combined mask of the two words - but if r2 was all-non 0
-	@ then it's just the 2nd words
-	cmp	r2, #0
-	itte	eq
-	moveq	r2, r3		@ the end is in the 2nd word
-	subeq	r1,r1,#3
-	subne	r1,r1,#7
+	/* Bytes 8-15.  */
+	ldrd	data1a, data1b, [src, #8]
+	uadd8	data1a, data1a, const_m1	/* Add 0xff per byte: GE<n> set where byte n != 0.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* 0x00 where GE set, 0xff where the byte was NUL.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
 
-	@ r1 currently points to the 2nd byte of the word containing the 0
-	tst	r2, # CHARTSTMASK(0)	@ 1st character
-	bne	10f
-	adds	r1,r1,#1
-	tst	r2, # CHARTSTMASK(1)	@ 2nd character
-	ittt	eq
-	addeq	r1,r1,#1
-	tsteq	r2, # (3<<15)	@ 2nd & 3rd character
-	@ If not the 3rd must be the last one
-	addeq	r1,r1,#1
+	/* Bytes 16-23.  */
+	ldrd	data1a, data1b, [src, #16]
+	uadd8	data1a, data1a, const_m1	/* Add 0xff per byte: GE<n> set where byte n != 0.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* 0x00 where GE set, 0xff where the byte was NUL.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
 
-10:
-	@ r0 is still at the beginning, r1 is pointing 1 byte after the nul
-	sub	r0, r1, r0
-	subs	r0, r0, #1
-	pop	{ r4, r6 }
+	/* Bytes 24-31.  */
+	ldrd	data1a, data1b, [src, #24]
+	add	src, src, #32			/* Advance to the next 32-byte block.  */
+	uadd8	data1a, data1a, const_m1	/* Add 0xff per byte: GE<n> set where byte n != 0.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* 0x00 where GE set, 0xff where the byte was NUL.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cmp	data1b, #0
+	beq	.Lloop_aligned			/* No NUL in these 32 bytes: keep scanning.  */
+
+.Lnull_found:
+	cmp	data1a, #0
+	itt	eq
+	addeq	result, result, #4		/* First word has no NUL: it is in the second.  */
+	moveq	data1a, data1b
+#ifndef __ARMEB__
+	rev	data1a, data1a			/* Little-endian: move the first string byte to the top bits.  */
+#endif
+	clz	data1a, data1a			/* Leading zero bits before the first 0xff marker.  */
+	ldrd	r4, r5, [sp], #8		/* Restore r4/r5.  */
+	add	result, result, data1a, lsr #3	/* Bits -> Bytes.  */
 	bx	lr
 
+.Lmisaligned8:
+	ldrd	data1a, data1b, [src]
+	and	tmp2, tmp1, #3			/* Byte offset of srcin within its 4-byte word.  */
+	rsb	result, tmp1, #0		/* result = -tmp1, i.e. src - srcin.  */
+	lsl	tmp2, tmp2, #3			/* Bytes -> bits.  */
+	tst	tmp1, #4			/* NE if srcin lies in the second word.  */
+	pld	[src, #64]
+	S2HI	tmp2, const_m1, tmp2		/* Mask with zero bits over the bytes before srcin.  */
+	orn	data1a, data1a, tmp2		/* Force those bytes to 0xff so they never look like NUL.  */
+	itt	ne
+	ornne	data1b, data1b, tmp2
+	movne	data1a, const_m1		/* Whole first word precedes srcin.  */
+	mov	const_0, #0			/* tmp1 is no longer needed; r4 becomes const_0.  */
+	b	.Lstart_realigned
+	.size	strlen, . - strlen
+
 #endif