2008-05-26 Eric Blake <ebb9@byu.net>

Optimize the generic and x86 memchr.
        * libc/string/memchr.c (memchr) [!__OPTIMIZE_SIZE__]:
        Pre-align pointer so unaligned searches aren't penalized.
        * libc/machine/i386/memchr.S (memchr) [!__OPTIMIZE_SIZE__]: Word
        operations are faster than repnz byte searches.
commit 70bff2d503
parent a6bd72a278
Jeff Johnston, 2008-05-26 23:31:08 +00:00
3 changed files with 124 additions and 53 deletions
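The two techniques named in the entry above fit together as follows: the routine scans byte-by-byte only until the pointer reaches a word boundary, then tests one aligned word per iteration for the search character, and finishes the tail byte-by-byte. A minimal, self-contained sketch of that shape (fixed 32-bit words and illustrative names; the patch itself uses newlib's UNALIGNED, TOO_SMALL and DETECTCHAR macros):

#include <stddef.h>
#include <stdint.h>

/* Sketch only: fixed 32-bit words, names not taken from the patch.  */
static void *
memchr_sketch (const void *s, int c, size_t n)
{
  const unsigned char *p = s;
  unsigned char d = (unsigned char) c;
  uint32_t mask = (uint32_t) d * UINT32_C (0x01010101);  /* d in all four bytes */

  /* 1. Pre-align: byte loop until p sits on a 4-byte boundary.  */
  while (((uintptr_t) p & 3) != 0)
    {
      if (!n--)
        return NULL;
      if (*p == d)
        return (void *) p;
      p++;
    }

  /* 2. Word loop: stop at the first word containing the byte.  */
  while (n >= 4)
    {
      uint32_t v = *(const uint32_t *) p ^ mask;   /* matching bytes become 0x00 */
      if (((v - UINT32_C (0x01010101)) & ~v & UINT32_C (0x80808080)) != 0)
        break;
      p += 4;
      n -= 4;
    }

  /* 3. Tail: byte loop over the remaining bytes (including the word
     that triggered the test, if any).  */
  while (n--)
    {
      if (*p == d)
        return (void *) p;
      p++;
    }
  return NULL;
}

Only the word that triggers the test, plus any sub-word tail, is re-scanned byte-wise, which is what makes the word loop profitable on longer buffers.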

ChangeLog

@@ -1,3 +1,11 @@
2008-05-26 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memchr.
* libc/string/memchr.c (memchr) [!__OPTIMIZE_SIZE__]:
Pre-align pointer so unaligned searches aren't penalized.
* libc/machine/i386/memchr.S (memchr) [!__OPTIMIZE_SIZE__]: Word
operations are faster than repnz byte searches.
2008-05-26 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memset.

libc/machine/i386/memchr.S

@@ -1,6 +1,6 @@
/*
* ====================================================
* Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
* Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
@@ -9,21 +9,23 @@
*/
#include "i386mach.h"
.global SYM (memchr)
SOTYPE_FUNCTION(memchr)
SYM (memchr):
pushl ebp
movl esp,ebp
pushl edi
movl 12(ebp),eax
movl 16(ebp),ecx
movl 8(ebp),edi
pushl edi
movzbl 12(ebp),eax
movl 16(ebp),ecx
movl 8(ebp),edi
xorl edx,edx
testl ecx,ecx
jz L1
jz L20
#ifdef __OPTIMIZE_SIZE__
cld
repnz
@@ -31,9 +33,79 @@ SYM (memchr):
setnz dl
decl edi
#else /* !__OPTIMIZE_SIZE__ */
/* Do byte-wise checks until string is aligned. */
testl $3,edi
je L5
cmpb (edi),al
je L15
incl edi
decl ecx
je L20
testl $3,edi
je L5
cmpb (edi),al
je L15
incl edi
decl ecx
je L20
testl $3,edi
je L5
cmpb (edi),al
je L15
incl edi
decl ecx
je L20
/* Create a mask, then check a word at a time. */
L5:
movb al,ah
movl eax,edx
sall $16,edx
orl edx,eax
pushl ebx
.p2align 4,,7
L8:
subl $4,ecx
jc L9
movl (edi),edx
addl $4,edi
xorl eax,edx
leal -16843009(edx),ebx
notl edx
andl edx,ebx
testl $-2139062144,ebx
je L8
subl $4,edi
L9:
popl ebx
xorl edx,edx
addl $4,ecx
je L20
/* Final byte-wise checks. */
.p2align 4,,7
L10:
cmpb (edi),al
je L15
incl edi
decl ecx
jne L10
xorl edi,edi
#endif /* !__OPTIMIZE_SIZE__ */
L15:
decl edx
andl edi,edx
L1:
L20:
movl edx,eax
leal -4(ebp),esp
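In the word loop above, the two immediates are the usual byte-search constants written as signed decimals: -16843009 is -0x01010101 and -2139062144 is the bit pattern 0x80808080, so the leal/notl/andl/testl sequence computes (v - 0x01010101) & ~v & 0x80808080 on v = loaded word XOR eax, where eax holds the search byte replicated into all four bytes. That is the same zero-byte test the C file performs via DETECTCHAR. A quick check of that reading (illustrative only; assumes 32-bit int, as on i386):

#include <assert.h>

int
main (void)
{
  assert (16843009 == 0x01010101);                     /* leal -16843009(edx),ebx */
  assert ((unsigned int) -2139062144 == 0x80808080u);  /* testl $-2139062144,ebx  */
  return 0;
}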

libc/string/memchr.c

@@ -20,7 +20,7 @@ DESCRIPTION
This function searches memory starting at <<*<[src]>>> for the
character <[c]>. The search only ends with the first
occurrence of <[c]>, or after <[length]> characters; in
particular, <<NULL>> does not terminate the search.
particular, <<NUL>> does not terminate the search.
RETURNS
If the character <[c]> is found within <[length]> characters
@@ -64,6 +64,9 @@ QUICKREF
#error long int is not a 32bit or 64bit byte
#endif
/* DETECTCHAR returns nonzero if (long)X contains the byte used
to fill (long)MASK. */
#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
_PTR
_DEFUN (memchr, (src_void, c, length),
@@ -71,73 +74,61 @@ _DEFUN (memchr, (src_void, c, length),
int c _AND
size_t length)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
_CONST unsigned char *src = (_CONST unsigned char *) src_void;
unsigned char d = c;
c &= 0xff;
while (length--)
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
unsigned long *asrc;
unsigned long mask;
int i;
while (UNALIGNED (src))
{
if (*src == c)
return (char *) src;
if (!length--)
return NULL;
if (*src == d)
return (void *) src;
src++;
}
return NULL;
#else
_CONST unsigned char *src = (_CONST unsigned char *) src_void;
unsigned long *asrc;
unsigned long buffer;
unsigned long mask;
int i, j;
c &= 0xff;
/* If the size is small, or src is unaligned, then
use the bytewise loop. We can hope this is rare. */
if (!TOO_SMALL (length) && !UNALIGNED (src))
if (!TOO_SMALL (length))
{
/* The fast code reads the ASCII one word at a time and only
/* If we get this far, we know that length is large and src is
word-aligned. */
/* The fast code reads the source one word at a time and only
performs the bytewise search on word-sized segments if they
contain the search character, which is detected by XORing
contain the search character, which is detected by XORing
the word-sized segment with a word-sized block of the search
character and then detecting for the presence of NULL in the
character and then detecting for the presence of NUL in the
result. */
asrc = (unsigned long*) src;
mask = 0;
for (i = 0; i < LBLOCKSIZE; i++)
mask = (mask << 8) + c;
asrc = (unsigned long *) src;
mask = d << 8 | d;
mask = mask << 16 | mask;
for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
mask = (mask << i) | mask;
while (length >= LBLOCKSIZE)
{
buffer = *asrc;
buffer ^= mask;
if (DETECTNULL (buffer))
{
src = (unsigned char*) asrc;
for ( j = 0; j < LBLOCKSIZE; j++ )
{
if (*src == c)
return (char*) src;
src++;
}
}
if (DETECTCHAR (*asrc, mask))
break;
length -= LBLOCKSIZE;
asrc++;
}
/* If there are fewer than LBLOCKSIZE characters left,
then we resort to the bytewise loop. */
src = (unsigned char*) asrc;
src = (unsigned char *) asrc;
}
#endif /* not PREFER_SIZE_OVER_SPEED */
while (length--)
{
if (*src == c)
return (char*) src;
{
if (*src == d)
return (void *) src;
src++;
}
}
return NULL;
#endif /* not PREFER_SIZE_OVER_SPEED */
}
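A short usage note for the documentation fix above (NULL -> NUL): the search is bounded only by the length argument, so an embedded NUL byte is treated like any other byte, and a starting pointer that is not word-aligned simply takes the new byte-wise pre-alignment path before the word loop. For example (illustrative test, not part of the patch):

#include <assert.h>
#include <string.h>

int
main (void)
{
  const char buf[] = "ab\0cdef";           /* 8 bytes, including both NULs */

  assert (memchr (buf, 'd', sizeof buf) == buf + 4);   /* search runs past the NUL */
  assert (memchr (buf + 1, 'e', 6) == buf + 5);        /* start mid-buffer */
  assert (memchr (buf, 'z', sizeof buf) == NULL);      /* 'z' is not in the buffer */
  return 0;
}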