2008-05-26 Eric Blake <ebb9@byu.net>

Optimize the generic and x86 memset.
        * libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
        Pre-align pointer so unaligned stores aren't penalized.
        * libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]:
        Pre-align pointer so unaligned stores aren't penalized.  Prefer
        8-byte over 4-byte alignment.  Reduce register pressure.
This commit is contained in:
Jeff Johnston 2008-05-26 23:23:15 +00:00
parent cae28869c1
commit a6bd72a278
3 changed files with 85 additions and 43 deletions

View File

@ -1,3 +1,12 @@
2008-05-26 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memset.
* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
Pre-align pointer so unaligned stores aren't penalized.
* libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]:
Pre-align pointer so unaligned stores aren't penalized. Prefer
8-byte over 4-byte alignment. Reduce register pressure.
2008-05-26 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 strlen.

View File

@ -1,6 +1,6 @@
/*
* ====================================================
* Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
* Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
@ -18,43 +18,83 @@ SYM (memset):
pushl ebp
movl esp,ebp
pushl edi
pushl ebx
movl 8(ebp),edi
movl 12(ebp),eax
movl 16(ebp),ecx
cld
#ifndef __OPTIMIZE_SIZE__
andl $255,eax
movl ecx,ebx
testl $3,edi
jne .L19
/* 16 bytes or fewer won't benefit from the 'rep stosl' loop. */
cmpl $16,ecx
jbe .L19
cbw
testl $7,edi
je .L10
movl eax,edx
sall $8,eax
orl edx,eax
/* It turns out that 8-byte aligned 'rep stosl' outperforms
4-byte aligned on some x86 platforms. */
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
testl $7,edi
je .L10
movb al,(edi)
incl edi
decl ecx
/* At this point, ecx>8 and edi%8==0. */
.L10:
movb al,ah
movl eax,edx
sall $16,edx
orl edx,eax
movl ecx,edx
shrl $2,ecx
andl $3,ebx
andl $3,edx
rep
stosl
movl ebx,ecx
movl edx,ecx
#endif /* not __OPTIMIZE_SIZE__ */
.L19:
rep
stosb
movl 8(ebp),eax
leal -8(ebp),esp
popl ebx
leal -4(ebp),esp
popl edi
leave
ret

View File

@ -22,7 +22,7 @@ DESCRIPTION
pointed to by <[dst]> to the value.
RETURNS
<<memset>> returns the value of <[m]>.
<<memset>> returns the value of <[dst]>.
PORTABILITY
<<memset>> is ANSI C.
@ -39,48 +39,42 @@ QUICKREF
#define UNALIGNED(X) ((long)X & (LBLOCKSIZE - 1))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
_PTR
_PTR
_DEFUN (memset, (m, c, n),
_PTR m _AND
int c _AND
size_t n)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
char *s = (char *) m;
while (n-- != 0)
{
*s++ = (char) c;
}
return m;
#else
char *s = (char *) m;
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
int i;
unsigned long buffer;
unsigned long *aligned_addr;
unsigned int d = c & 0xff; /* To avoid sign extension, copy C to an
unsigned variable. */
if (!TOO_SMALL (n) && !UNALIGNED (m))
while (UNALIGNED (s))
{
/* If we get this far, we know that n is large and m is word-aligned. */
aligned_addr = (unsigned long*)m;
if (n--)
*s++ = (char) c;
else
return m;
}
if (!TOO_SMALL (n))
{
/* If we get this far, we know that n is large and s is word-aligned. */
aligned_addr = (unsigned long *) s;
/* Store D into each char sized location in BUFFER so that
we can set large blocks quickly. */
if (LBLOCKSIZE == 4)
{
buffer = (d << 8) | d;
buffer |= (buffer << 16);
}
else
{
buffer = 0;
for (i = 0; i < LBLOCKSIZE; i++)
buffer = (buffer << 8) | d;
}
buffer = (d << 8) | d;
buffer |= (buffer << 16);
for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
buffer = (buffer << i) | buffer;
/* Unroll the loop. */
while (n >= LBLOCKSIZE*4)
{
*aligned_addr++ = buffer;
@ -99,11 +93,10 @@ _DEFUN (memset, (m, c, n),
s = (char*)aligned_addr;
}
#endif /* not PREFER_SIZE_OVER_SPEED */
while (n--)
{
*s++ = (char)d;
}
*s++ = (char) c;
return m;
#endif /* not PREFER_SIZE_OVER_SPEED */
}