This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


faster memchr


Yet another speedup.

2008-05-22 Eric Blake <ebb9@byu.net>

	Optimize the generic and x86 memchr.
	* libc/string/memchr.c (memchr) [!__OPTIMIZE_SIZE__]: Pre-align
	pointer so unaligned searches aren't penalized.
	* libc/machine/i386/memchr.S (memchr) [!__OPTIMIZE_SIZE__]: Word
	operations are faster than repnz byte searches.


And the test program:

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define UNALIGNED(X) ((long)X & (sizeof (long) - 1))
#define LBLOCKSIZE (sizeof (long))
#define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
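/* DETECTNULL is the classic "word contains a zero byte" test: if no
   byte of X is zero, subtracting 0x01 from every byte can set bit 7
   only in bytes whose top bit was already set, which the ~(X) factor
   strips; a zero byte underflows to 0xFF and survives, so the result
   is nonzero exactly when X holds a 0x00 byte.  DETECTCHAR reduces
   "X contains byte MASK" to the same test by XORing, which zeroes the
   matching bytes.  (These copies assume a 32-bit long; the library
   source also handles 64-bit.)  */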
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
#define _CONST const

#undef memchr

void *
memchr1 (const void *src_void, int c, size_t length)
{
  _CONST unsigned char *src = (_CONST unsigned char *) src_void;
  unsigned char d = c;

#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  unsigned long *asrc;
  unsigned long  mask;
  int i;

  while (UNALIGNED (src))
    {
      if (!length--)
        return NULL;
      if (*src == d)
        return (void *) src;
      src++;
    }

  if (!TOO_SMALL (length))
    {
      /* If we get this far, we know that length is large and src is
         word-aligned. */
      /* The fast code reads the source one word at a time and only
         performs the bytewise search on word-sized segments if they
         contain the search character, which is detected by XORing
         the word-sized segment with a word-sized block of the search
         character and then detecting for the presence of NUL in the
         result.  */
      asrc = (unsigned long *) src;
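      /* Replicate the search byte into every byte of the word: two
         copies in the low 16 bits, four in 32 bits, then keep doubling
         in case long is wider than 32 bits.  */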
      mask = d << 8 | d;
      mask = mask << 16 | mask;
      for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
        mask = (mask << i) | mask;

      while (length >= LBLOCKSIZE)
        {
          if (DETECTCHAR (*asrc, mask))
            break;
          length -= LBLOCKSIZE;
          asrc++;
        }

      /* If there are fewer than LBLOCKSIZE characters left,
         then we resort to the bytewise loop.  */

      src = (unsigned char *) asrc;
    }

#endif /* not PREFER_SIZE_OVER_SPEED */

  while (length--)
    {
      if (*src == d)
        return (void *) src;
      src++;
    }

  return NULL;
}

void *memchr0 (const void *m, int c, size_t n);

int main (int argc, char **argv)
{
  if (argc < 5)
    {
      printf ("usage: %s size repeat goal offset [func]\n", argv[0]);
      return 1;
    }
  int size = strtol (argv[1], NULL, 0);
  int repeat = strtol (argv[2], NULL, 0);
  int goal = strtol (argv[3], NULL, 0);
  int offset = strtol (argv[4], NULL, 0);
  void *(*func)(const void *, int, size_t);
  func = argc > 5 ? (atoi (argv[5]) ? memchr1 : memchr0) : memchr;

  unsigned char *buf = malloc (size);
  if (size)
    {
      __builtin_memset (buf, 1, size - 1);
      buf[size - 1] = 2;
    }

  unsigned char *expected = (goal == 1 && offset < size - 1 ? buf + offset
                             : goal == 2 && offset < size ? buf + size - 1 : NULL);
  buf += offset;
  size -= offset;
  while (repeat--)
    assert (func (buf, goal, size) == expected);
  buf -= offset;
  free (buf);
  return 0;
}

And more interesting numbers:

for i in `seq 0 25` ; do
 echo $i
 for j in `seq 0 $i` ; do
  for k in 0 1 2 ; do
   ./foo $i 1 $k $j 0
  done
 done
done

Proof that my patched assembly can handle any starting alignment, and
handles the byte being found at the beginning, at any alignment at the
end, or not at all.

$ time ./foo 1000000 1000 0 0

real	0m2.859s
user	0m2.890s
sys	0m0.000s

$ time ./foo 1000000 1000 0 1

real	0m2.859s
user	0m2.858s
sys	0m0.015s

# Unpatched assembly uses 'repnz scasb' - dirt slow, but no alignment issues

$ time ./foo 1000000 1000 0 0 1

real	0m0.453s
user	0m0.484s
sys	0m0.000s

$ time ./foo 1000000 1000 0 1 1

real	0m0.453s
user	0m0.468s
sys	0m0.015s

# Wow.  Patched C is 6x faster, with no alignment problems.

$ time ./foo 1000000 1000 0 0 0

real	0m0.391s
user	0m0.405s
sys	0m0.015s

$ time ./foo 1000000 1000 0 1 0

real	0m0.390s
user	0m0.405s
sys	0m0.015s

# and patched assembly is 15% faster than patched C, on any alignment
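
For anyone cross-checking the new assembly against the C macros: the
magic immediates are the DETECTCHAR constants in disguise (-16843009 is
-0x01010101, and -2139062144 is 0x80808080 viewed as a signed 32-bit
immediate).  A minimal C sketch of what one iteration of the patched
word loop computes, illustrative only and not part of the patch:

static int
word_has_char (unsigned long word, unsigned long mask)
{
  /* word is the 4 bytes loaded from (edi); mask is eax, the search
     byte replicated into every byte (32-bit long, as on i386).
     Nonzero means this word holds a matching byte somewhere.  */
  unsigned long x = word ^ mask;        /* xorl  eax,edx             */
  return ((x - 0x01010101UL) & ~x       /* leal  -16843009(edx),ebx  */
                                        /* notl  edx; andl edx,ebx   */
          & 0x80808080UL) != 0;         /* testl $-2139062144,ebx    */
}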

Without further ado, here's my 7x speedup for x86 memchr:


Index: libc/string/memchr.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/string/memchr.c,v
retrieving revision 1.2
diff -u -p -r1.2 memchr.c
--- libc/string/memchr.c	28 Oct 2005 21:21:07 -0000	1.2
+++ libc/string/memchr.c	22 May 2008 22:42:32 -0000
@@ -20,7 +20,7 @@ DESCRIPTION
 	This function searches memory starting at <<*<[src]>>> for the
 	character <[c]>.  The search only ends with the first
 	occurrence of <[c]>, or after <[length]> characters; in
-	particular, <<NULL>> does not terminate the search.
+	particular, <<NUL>> does not terminate the search.
 
 RETURNS
 	If the character <[c]> is found within <[length]> characters
@@ -64,6 +64,9 @@ QUICKREF
 #error long int is not a 32bit or 64bit byte
 #endif
 
+/* DETECTCHAR returns nonzero if (long)X contains the byte used
+   to fill (long)MASK. */
+#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
 
 _PTR
 _DEFUN (memchr, (src_void, c, length),
@@ -71,73 +74,61 @@ _DEFUN (memchr, (src_void, c, length),
 	int c _AND
 	size_t length)
 {
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
   _CONST unsigned char *src = (_CONST unsigned char *) src_void;
+  unsigned char d = c;
 
-  c &= 0xff;
-  
-  while (length--)
+#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
+  unsigned long *asrc;
+  unsigned long  mask;
+  int i;
+
+  while (UNALIGNED (src))
     {
-      if (*src == c)
-	return (char *) src;
+      if (!length--)
+        return NULL;
+      if (*src == d)
+        return (void *) src;
       src++;
     }
-  return NULL;
-#else
-  _CONST unsigned char *src = (_CONST unsigned char *) src_void;
-  unsigned long *asrc;
-  unsigned long  buffer;
-  unsigned long  mask;
-  int i, j;
 
-  c &= 0xff;
-  
-  /* If the size is small, or src is unaligned, then 
-     use the bytewise loop.  We can hope this is rare.  */
-  if (!TOO_SMALL (length) && !UNALIGNED (src)) 
+  if (!TOO_SMALL (length))
     {
-      /* The fast code reads the ASCII one word at a time and only 
+      /* If we get this far, we know that length is large and src is
+         word-aligned. */
+      /* The fast code reads the source one word at a time and only
          performs the bytewise search on word-sized segments if they
-         contain the search character, which is detected by XORing 
+         contain the search character, which is detected by XORing
          the word-sized segment with a word-sized block of the search
-         character and then detecting for the presence of NULL in the
+         character and then detecting for the presence of NUL in the
          result.  */
-      asrc = (unsigned long*) src;
-      mask = 0;
-      for (i = 0; i < LBLOCKSIZE; i++)
-        mask = (mask << 8) + c;
+      asrc = (unsigned long *) src;
+      mask = d << 8 | d;
+      mask = mask << 16 | mask;
+      for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
+        mask = (mask << i) | mask;
 
       while (length >= LBLOCKSIZE)
         {
-          buffer = *asrc;
-          buffer ^=  mask;
-          if (DETECTNULL (buffer))
-	    {
-              src = (unsigned char*) asrc;
-  	      for ( j = 0; j < LBLOCKSIZE; j++ )
-	        {
-                  if (*src == c)
-                    return (char*) src;
-                  src++;
-	        }
-   	    }
+          if (DETECTCHAR (*asrc, mask))
+            break;
           length -= LBLOCKSIZE;
           asrc++;
         }
-  
+
       /* If there are fewer than LBLOCKSIZE characters left,
          then we resort to the bytewise loop.  */
 
-      src = (unsigned char*) asrc;
+      src = (unsigned char *) asrc;
     }
 
+#endif /* not PREFER_SIZE_OVER_SPEED */
+
   while (length--)
-    { 
-      if (*src == c)
-        return (char*) src;
+    {
+      if (*src == d)
+        return (void *) src;
       src++;
-    } 
+    }
 
   return NULL;
-#endif /* not PREFER_SIZE_OVER_SPEED */
 }
Index: libc/machine/i386/memchr.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/i386/memchr.S,v
retrieving revision 1.3
diff -u -p -r1.3 memchr.S
--- libc/machine/i386/memchr.S	20 Dec 2002 21:31:19 -0000	1.3
+++ libc/machine/i386/memchr.S	22 May 2008 22:42:32 -0000
@@ -1,6 +1,6 @@
 /*
  * ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
  *
  * Permission to use, copy, modify, and distribute this
  * software is freely granted, provided that this notice
@@ -9,21 +9,23 @@
  */
 
 	#include "i386mach.h"
-	
+
 	.global	SYM (memchr)
        SOTYPE_FUNCTION(memchr)
 
 SYM (memchr):
 	pushl	ebp
 	movl	esp,ebp
-	pushl 	edi
-	movl 	12(ebp),eax
-	movl 	16(ebp),ecx
-	movl 	8(ebp),edi
+	pushl	edi
+	movzbl	12(ebp),eax
+	movl	16(ebp),ecx
+	movl	8(ebp),edi
 
 	xorl	edx,edx
 	testl	ecx,ecx
-	jz	L1
+	jz	L20
+
+#ifdef __OPTIMIZE_SIZE__
 
 	cld
 	repnz
@@ -31,9 +33,79 @@ SYM (memchr):
 
 	setnz	dl
 	decl	edi
+
+#else /* !__OPTIMIZE_SIZE__ */
+/* Do byte-wise checks until string is aligned.  */
+	testl	$3,edi
+	je	L5
+	cmpb	(edi),al
+	je	L15
+	incl	edi
+	decl	ecx
+	je	L20
+
+	testl	$3,edi
+	je	L5
+	cmpb	(edi),al
+	je	L15
+	incl	edi
+	decl	ecx
+	je	L20
+
+	testl	$3,edi
+	je	L5
+	cmpb	(edi),al
+	je	L15
+	incl	edi
+	decl	ecx
+	je	L20
+
+/* Create a mask, then check a word at a time.  */
+L5:
+	movb	al,ah
+	movl	eax,edx
+	sall	$16,edx
+	orl	edx,eax
+	pushl	ebx
+
+	.p2align 4,,7
+L8:
+	subl	$4,ecx
+	jc	L9
+	movl	(edi),edx
+	addl	$4,edi
+	xorl	eax,edx
+	leal	-16843009(edx),ebx
+	notl	edx
+	andl	edx,ebx
+	testl	$-2139062144,ebx
+	je	L8
+
+	subl	$4,edi
+
+L9:
+	popl	ebx
+	xorl	edx,edx
+	addl	$4,ecx
+	je	L20
+
+/* Final byte-wise checks.  */
+	.p2align 4,,7
+L10:
+	cmpb	(edi),al
+	je	L15
+	incl	edi
+	decl	ecx
+	jne	L10
+
+	xorl	edi,edi
+
+#endif /* !__OPTIMIZE_SIZE__ */
+
+L15:
 	decl	edx
 	andl	edi,edx
-L1:
+L20:
 	movl	edx,eax
 
 	leal	-4(ebp),esp


