* libc/machine/aarch64/strcpy.S: Improve handling of short strings.

This commit is contained in:
Richard Earnshaw 2014-12-16 15:48:58 +00:00
parent 32c96ddd14
commit 52edca9f86
2 changed files with 211 additions and 135 deletions

View File

@ -1,3 +1,7 @@
2014-12-16 Richard Earnshaw <rearnsha@arm.com>
* libc/machine/aarch64/strcpy.S: Improve handling of short strings.
2014-12-16 Jon Beniston <jon@beniston.com>
* libc/include/stdlib.h (__itoa): Declare prototype.

View File

@ -1,8 +1,8 @@
/*
strcpy - copy a string.
Copyright (c) 2013, 2014, ARM Limited
All rights Reserved.
Copyright (c) 2013, 2014 ARM Ltd.
All Rights Reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
@ -33,25 +33,36 @@
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses
* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
/* To test the page crossing code path more thoroughly, compile with
-DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
entry path. This option is not intended for production use. */
/* Arguments and results. */
#define dstin x0
#define src x1
#define srcin x1
/* Locals and temporaries. */
#define dst x2
#define data1 x3
#define data1w w3
#define data2 x4
#define has_nul1 x5
#define has_nul2 x6
#define tmp1 x7
#define tmp2 x8
#define tmp3 x9
#define tmp4 x10
#define zeroones x11
#define src x2
#define dst x3
#define data1 x4
#define data1w w4
#define data2 x5
#define data2w w5
#define has_nul1 x6
#define has_nul2 x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define tmp4 x11
#define zeroones x12
#define data1a x13
#define data2a x14
#define pos x15
#define len x16
#define to_align x17
.macro def_fn f p2align=0
.text
@ -61,27 +72,123 @@
\f:
.endm
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
/* Start of critical section -- keep to one 64Byte cache line. */
/* AArch64 systems have a minimum page size of 4k. We can do a quick
page size check for crossing this boundary on entry and if we
do not, then we can short-circuit much of the entry code. We
expect early page-crossing strings to be rare (probability of
16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
predictable, even with random strings.
We don't bother checking for larger page sizes; the cost of setting
up the correct page size is just not worth the extra gain from
a small reduction in the cases taking the slow path. Note that
we only care about whether the first fetch, which may be
misaligned, crosses a page boundary - after that we move to aligned
fetches for the remainder of the string. */
#define MIN_PAGE_P2 12
#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
def_fn strcpy p2align=6
/* For moderately short strings, the fastest way to do the copy is to
calculate the length of the string in the same way as strlen, then
essentially do a memcpy of the result. This avoids the need for
multiple byte copies and further means that by the time we
reach the bulk copy loop we know we can always use DWord
accesses. We expect strcpy to rarely be called repeatedly
with the same source string, so branch prediction is likely to
always be difficult - we mitigate against this by preferring
conditional select operations over branches whenever this is
feasible. */
add tmp2, srcin, #15
mov zeroones, #REP8_01
and to_align, srcin, #15
eor tmp2, tmp2, srcin
mov dst, dstin
ands tmp1, src, #15
b.ne .Lmisaligned
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
neg tmp1, to_align
#ifdef STRCPY_TEST_PAGE_CROSS
b .Lpage_cross
#else
/* The first fetch will straddle a (possible) page boundary iff
srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
aligned string will never fail the page align check, so will
always take the fast path. */
tbnz tmp2, #MIN_PAGE_P2, .Lpage_cross
#endif
ldp data1, data2, [srcin]
add src, srcin, #16
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
b.ne .Learly_end_found
stp data1, data2, [dst], #16
sub src, src, to_align
sub dst, dst, to_align
b .Lentry_no_page_cross
.Lpage_cross:
bic src, srcin, #15
/* Start by loading two words at [srcin & ~15], then forcing the
bytes that precede srcin to 0xff. This means they never look
like termination bytes. */
ldp data1, data2, [src], #16
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
tst to_align, #7
csetm tmp2, ne
#ifdef __AARCH64EB__
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#else
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
#endif
orr data1, data1, tmp2
orr data2a, data2, tmp2
cmp to_align, #8
csinv data1, data1, xzr, lt
csel data2, data2, data2a, lt
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
b.ne .Learly_end_found
ldp data1, data2, [src], #16
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
b.ne .Learly_end_found
/* We've now checked between 16 and 32 bytes, but not found a NUL,
so we can safely start bulk copying. Start by refetching the
first 16 bytes of the real string; we know this can't trap now. */
ldp data1a, data2a, [srcin]
stp data1a, data2a, [dst], #16
sub dst, dst, to_align
/* Everything is now set up, so we can just fall into the bulk
copy loop. */
/* The inner loop deals with two Dwords at a time. This has a
slightly higher start-up cost, but we should win quite quickly,
especially on cores with a high number of issue slots per
cycle, as we get much better parallelism out of the operations. */
b .Lfirst_pass
.Lmain_loop:
stp data1, data2, [dst], #16
.Lstartloop_fast:
.Lentry_no_page_cross:
ldp data1, data2, [src], #16
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
@ -91,134 +198,99 @@ def_fn strcpy p2align=6
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
b.eq .Lmain_loop
/* End of critical section -- keep to one 64Byte cache line. */
cbnz has_nul1, .Lnul_in_data1_fast
.Lnul_in_data2_fast:
str data1, [dst], #8
.Lnul_in_data2_fast_after_d1:
/* For a NUL in data2, we always know that we've moved at least 8
bytes, so no need for a slow path. */
/* Since we know we are copying at least 16 bytes, the fastest way
to deal with the tail is to determine the location of the
trailing NUL, then (re)copy the 16 bytes leading up to that. */
cmp has_nul1, #0
#ifdef __AARCH64EB__
/* For big-endian only, carry propagation means we can't trust
the MSB of the syndrome value calculated above (the byte
sequence 01 00 will generate a syndrome of 80 80 rather than
00 80). We get around this by byte-swapping the data and
re-calculating. */
rev data2, data2
sub tmp1, data2, zeroones
orr tmp2, data2, #REP8_7f
bic has_nul2, tmp1, tmp2
#endif
rev has_nul2, has_nul2
sub src, src, #(8+7)
clz has_nul2, has_nul2
lsr has_nul2, has_nul2, #3 /* Bits to bytes. */
sub dst, dst, #7
ldr data2, [src, has_nul2]
str data2, [dst, has_nul2]
ret
.Lnul_in_data1_fast:
/* Since we know we've already copied at least 8 bytes, we can
safely handle the tail with one misaligned dword move. To do this
we calculate the location of the trailing NUL byte and go seven
bytes back from that. */
#ifdef __AARCH64EB__
/* For big-endian only, carry propagation means we can't trust
the MSB of the syndrome value calculated above (the byte
sequence 01 00 will generate a syndrome of 80 80 rather than
00 80). We get around this by byte-swapping the data and
re-calculating. */
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul directly. The
easiest way to get the correct byte is to byte-swap the data
and calculate the syndrome a second time. */
csel data1, data1, data2, ne
rev data1, data1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
bic has_nul1, tmp1, tmp2
#else
csel has_nul1, has_nul1, has_nul2, ne
#endif
rev has_nul1, has_nul1
sub src, src, #(16+7)
clz has_nul1, has_nul1
lsr has_nul1, has_nul1, #3 /* Bits to bytes. */
sub dst, dst, #7
ldr data1, [src, has_nul1]
str data1, [dst, has_nul1]
clz pos, has_nul1
add tmp1, pos, #72
add pos, pos, #8
csel pos, pos, tmp1, ne
add src, src, pos, lsr #3
add dst, dst, pos, lsr #3
ldp data1, data2, [src, #-32]
stp data1, data2, [dst, #-16]
ret
.Lfirst_pass:
ldp data1, data2, [src], #16
/* The string is short (<32 bytes). We don't know exactly how
short though, yet. Work out the exact length so that we can
quickly select the optimal copy strategy. */
.Learly_end_found:
cmp has_nul1, #0
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul directly. The
easiest way to get the correct byte is to byte-swap the data
and calculate the syndrome a second time. */
csel data1, data1, data2, ne
rev data1, data1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
b.eq .Lmain_loop
cbz has_nul1, .Lnul_in_data2_fast
.Lnul_in_data1:
/* Slow path. We can't be sure we've moved at least 8 bytes, so
fall back to a slow byte-by-byte store of the bits already
loaded.
The worst case when coming through this path is that we've had
to copy seven individual bytes to get to alignment and we then
have to copy another seven (eight for big-endian) again here.
We could try to detect that case (and any case where more than
eight bytes have to be copied), but it really doesn't seem
worth it. */
#ifdef __AARCH64EB__
rev data1, data1
#else
/* On little-endian, we can easily check if the NUL byte was
in the last byte of the Dword. For big-endian we'd have to
recalculate the syndrome, which is unlikely to be worth it. */
lsl has_nul1, has_nul1, #8
cbnz has_nul1, 1f
str data1, [dst]
ret
csel has_nul1, has_nul1, has_nul2, ne
#endif
1:
strb data1w, [dst], #1
tst data1, #0xff
lsr data1, data1, #8
b.ne 1b
.Ldone:
rev has_nul1, has_nul1
sub tmp1, src, #7
sub src, src, #15
clz pos, has_nul1
csel src, src, tmp1, ne
sub dst, dstin, srcin
add src, src, pos, lsr #3 /* Bits to bytes. */
add dst, dst, src
sub len, src, srcin
cmp len, #8
b.lt .Llt8
cmp len, #16
b.lt .Llt16
/* 16->32 bytes to copy. */
ldp data1, data2, [srcin]
ldp data1a, data2a, [src, #-16]
stp data1, data2, [dstin]
stp data1a, data2a, [dst, #-16]
ret
.Llt16:
/* 8->15 bytes to copy. */
ldr data1, [srcin]
ldr data2, [src, #-8]
str data1, [dstin]
str data2, [dst, #-8]
ret
.Llt8:
cmp len, #4
b.lt .Llt4
/* 4->7 bytes to copy. */
ldr data1w, [srcin]
ldr data2w, [src, #-4]
str data1w, [dstin]
str data2w, [dst, #-4]
ret
.Llt4:
cmp len, #2
b.lt .Llt2
/* 2->3 bytes to copy. */
ldrh data1w, [srcin]
strh data1w, [dstin]
/* Fall-through, one byte (max) to go. */
.Llt2:
/* Null-terminated string. Last character must be zero! */
strb wzr, [dst, #-1]
ret
.Lmisaligned:
cmp tmp1, #8
b.ge 2f
/* There's at least one Dword before we reach alignment, so we can
deal with that efficiently. */
ldr data1, [src]
bic src, src, #15
sub tmp3, data1, zeroones
orr tmp4, data1, #REP8_7f
bics has_nul1, tmp3, tmp4
b.ne .Lnul_in_data1
str data1, [dst], #8
ldr data2, [src, #8]
add src, src, #16
sub dst, dst, tmp1
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bics has_nul2, tmp3, tmp4
b.ne .Lnul_in_data2_fast_after_d1
str data2, [dst], #8
/* We can by-pass the first-pass version of the loop in this case
since we know that at least 8 bytes have already been copied. */
b .Lstartloop_fast
2:
sub tmp1, tmp1, #16
3:
ldrb data1w, [src], #1
strb data1w, [dst], #1
cbz data1w, .Ldone
add tmp1, tmp1, #1
cbnz tmp1, 3b
b .Lfirst_pass
.size strcpy, . - strcpy
#endif