2008-05-26 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memset.
        * libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
        Pre-align pointer so unaligned stores aren't penalized.
        * libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]:
        Pre-align pointer so unaligned stores aren't penalized.  Prefer
        8-byte over 4-byte alignment.  Reduce register pressure.
			
			
This commit is contained in:
		| @@ -1,3 +1,12 @@ | ||||
| 2008-05-26  Eric Blake  <ebb9@byu.net> | ||||
|  | ||||
| 	Optimize the generic and x86 memset. | ||||
| 	* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]: | ||||
| 	Pre-align pointer so unaligned stores aren't penalized. | ||||
| 	* libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]: | ||||
| 	Pre-align pointer so unaligned stores aren't penalized.  Prefer | ||||
| 	8-byte over 4-byte alignment.  Reduce register pressure. | ||||
| 						 | ||||
| 2008-05-26  Eric Blake  <ebb9@byu.net> | ||||
|  | ||||
| 	Optimize the generic and x86 strlen. | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| /* | ||||
|  * ==================================================== | ||||
|  * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved. | ||||
|  * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved. | ||||
|  * | ||||
|  * Permission to use, copy, modify, and distribute this | ||||
|  * software is freely granted, provided that this notice | ||||
| @@ -18,43 +18,83 @@ SYM (memset): | ||||
| 	pushl ebp | ||||
| 	movl esp,ebp | ||||
| 	pushl edi | ||||
| 	pushl ebx | ||||
| 	movl 8(ebp),edi | ||||
| 	movl 12(ebp),eax | ||||
| 	movl 16(ebp),ecx | ||||
| 	cld | ||||
|  | ||||
| #ifndef __OPTIMIZE_SIZE__ | ||||
| 	andl $255,eax | ||||
| 	movl ecx,ebx | ||||
| 	testl $3,edi | ||||
| 	jne .L19 | ||||
| /* 16 bytes or fewer won't benefit from the 'rep stosl' loop.  */ | ||||
| 	cmpl $16,ecx | ||||
| 	jbe .L19 | ||||
| 	cbw | ||||
| 	testl $7,edi | ||||
| 	je .L10 | ||||
|  | ||||
| 	movl eax,edx | ||||
| 	sall $8,eax | ||||
| 	orl edx,eax | ||||
| /* It turns out that 8-byte aligned 'rep stosl' outperforms | ||||
|    4-byte aligned on some x86 platforms.  */ | ||||
| 	movb al,(edi) | ||||
| 	incl edi | ||||
| 	decl ecx | ||||
| 	testl $7,edi | ||||
| 	je .L10 | ||||
|  | ||||
| 	movb al,(edi) | ||||
| 	incl edi | ||||
| 	decl ecx | ||||
| 	testl $7,edi | ||||
| 	je .L10 | ||||
|  | ||||
| 	movb al,(edi) | ||||
| 	incl edi | ||||
| 	decl ecx | ||||
| 	testl $7,edi | ||||
| 	je .L10 | ||||
|  | ||||
| 	movb al,(edi) | ||||
| 	incl edi | ||||
| 	decl ecx | ||||
| 	testl $7,edi | ||||
| 	je .L10 | ||||
|  | ||||
| 	movb al,(edi) | ||||
| 	incl edi | ||||
| 	decl ecx | ||||
| 	testl $7,edi | ||||
| 	je .L10 | ||||
|  | ||||
| 	movb al,(edi) | ||||
| 	incl edi | ||||
| 	decl ecx | ||||
| 	testl $7,edi | ||||
| 	je .L10 | ||||
|  | ||||
| 	movb al,(edi) | ||||
| 	incl edi | ||||
| 	decl ecx | ||||
|  | ||||
| /* At this point, ecx>8 and edi%8==0.  */ | ||||
| .L10: | ||||
| 	movb al,ah | ||||
| 	movl eax,edx | ||||
| 	sall $16,edx | ||||
| 	orl edx,eax | ||||
|  | ||||
| 	movl ecx,edx | ||||
| 	shrl $2,ecx | ||||
| 	andl $3,ebx | ||||
| 	andl $3,edx | ||||
| 	rep | ||||
| 	stosl | ||||
| 	movl ebx,ecx | ||||
| 	movl edx,ecx | ||||
| #endif /* not __OPTIMIZE_SIZE__ */ | ||||
| 	 | ||||
|  | ||||
| .L19: | ||||
| 	rep | ||||
| 	stosb | ||||
|  | ||||
| 	movl 8(ebp),eax | ||||
|  | ||||
| 	leal -8(ebp),esp | ||||
| 	popl ebx | ||||
| 	leal -4(ebp),esp | ||||
| 	popl edi | ||||
| 	leave | ||||
| 	ret | ||||
|   | ||||
| @@ -22,7 +22,7 @@ DESCRIPTION | ||||
| 	pointed to by <[dst]> to the value. | ||||
|  | ||||
| RETURNS | ||||
| 	<<memset>> returns the value of <[m]>. | ||||
| 	<<memset>> returns the value of <[dst]>. | ||||
|  | ||||
| PORTABILITY | ||||
| <<memset>> is ANSI C. | ||||
| @@ -39,48 +39,42 @@ QUICKREF | ||||
| #define UNALIGNED(X)   ((long)X & (LBLOCKSIZE - 1)) | ||||
| #define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE) | ||||
|  | ||||
| _PTR  | ||||
| _PTR | ||||
| _DEFUN (memset, (m, c, n), | ||||
| 	_PTR m _AND | ||||
| 	int c _AND | ||||
| 	size_t n) | ||||
| { | ||||
| #if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) | ||||
|   char *s = (char *) m; | ||||
|  | ||||
|   while (n-- != 0) | ||||
|     { | ||||
|       *s++ = (char) c; | ||||
|     } | ||||
|  | ||||
|   return m; | ||||
| #else | ||||
|   char *s = (char *) m; | ||||
| #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__) | ||||
|   int i; | ||||
|   unsigned long buffer; | ||||
|   unsigned long *aligned_addr; | ||||
|   unsigned int d = c & 0xff;	/* To avoid sign extension, copy C to an | ||||
| 				   unsigned variable.  */ | ||||
|  | ||||
|   if (!TOO_SMALL (n) && !UNALIGNED (m)) | ||||
|   while (UNALIGNED (s)) | ||||
|     { | ||||
|       /* If we get this far, we know that n is large and m is word-aligned. */ | ||||
|       aligned_addr = (unsigned long*)m; | ||||
|       if (n--) | ||||
|         *s++ = (char) c; | ||||
|       else | ||||
|         return m; | ||||
|     } | ||||
|  | ||||
|   if (!TOO_SMALL (n)) | ||||
|     { | ||||
|       /* If we get this far, we know that n is large and s is word-aligned. */ | ||||
|       aligned_addr = (unsigned long *) s; | ||||
|  | ||||
|       /* Store D into each char sized location in BUFFER so that | ||||
|          we can set large blocks quickly.  */ | ||||
|       if (LBLOCKSIZE == 4) | ||||
|         { | ||||
|           buffer = (d << 8) | d; | ||||
|           buffer |= (buffer << 16); | ||||
|         } | ||||
|       else | ||||
|         { | ||||
|           buffer = 0; | ||||
|           for (i = 0; i < LBLOCKSIZE; i++) | ||||
| 	    buffer = (buffer << 8) | d; | ||||
|         } | ||||
|       buffer = (d << 8) | d; | ||||
|       buffer |= (buffer << 16); | ||||
|       for (i = 32; i < LBLOCKSIZE * 8; i <<= 1) | ||||
|         buffer = (buffer << i) | buffer; | ||||
|  | ||||
|       /* Unroll the loop.  */ | ||||
|       while (n >= LBLOCKSIZE*4) | ||||
|         { | ||||
|           *aligned_addr++ = buffer; | ||||
| @@ -99,11 +93,10 @@ _DEFUN (memset, (m, c, n), | ||||
|       s = (char*)aligned_addr; | ||||
|     } | ||||
|  | ||||
| #endif /* not PREFER_SIZE_OVER_SPEED */ | ||||
|  | ||||
|   while (n--) | ||||
|     { | ||||
|       *s++ = (char)d; | ||||
|     } | ||||
|     *s++ = (char) c; | ||||
|  | ||||
|   return m; | ||||
| #endif /* not PREFER_SIZE_OVER_SPEED */ | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user