This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.



Re: rawmemchr


Eric Blake wrote:
Eric Blake <ebb9@byu.net> writes:

Maybe you could show a few timing measurements from realistic test cases at
this point in the discussion? I haven't felt comfortable inferring real-time
clock cycles directly from instruction counts at any point in the past two
decades; there are so many confounding factors these days that I'm no longer
confident of being able to reason validly about it.
Here's my test program:
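
In outline it is a harness along these lines (a sketch with an invented
argument layout; the real foo takes the six arguments visible in the runs
below):

/* Sketch of a strchr timing harness.  Invented argument layout
   (not the original foo): ./foo ITERS LEN ALIGN CH  */
#include <stdlib.h>
#include <string.h>

int
main (int argc, char **argv)
{
  long iters = argc > 1 ? atol (argv[1]) : 1000000;
  size_t len = argc > 2 ? (size_t) atol (argv[2]) : 1000;
  size_t align = argc > 3 ? (size_t) atol (argv[3]) : 0; /* 0..3 */
  int ch = argc > 4 ? atoi (argv[4]) : 0; /* 0 exercises the NUL case */
  char *buf = malloc (len + align + 1);
  if (!buf)
    return 1;
  char *str = buf + align;	/* force the chosen misalignment */
  memset (str, 'a', len);	/* pick CH absent from the fill for a full scan */
  str[len] = '\0';
  char * volatile result;	/* keep the calls from being optimized away */
  for (long i = 0; i < iters; i++)
    result = strchr (str, ch);
  (void) result;
  free (buf);
  return 0;
}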

I modified my test program to also collect strlen numbers (in mystrchr, replace the !c branch with "return (char *) str + strlen (str);"). With strlen's current assembly implementation, whose inner loop is 'repnz scasb', modern x86 machines have HIDEOUS performance, presumably because the microcoded string instructions examine only one byte per iteration:


$ time ./foo 1000000 1 0 1000 0 1

real	0m2.859s
user	0m2.874s
sys	0m0.015s

$ time ./foo 1000000 1 1 1000 0 1

real	0m2.875s
user	0m2.874s
sys	0m0.030s

3x worse than the current strchr on unaligned data, but at least alignment didn't matter.

Finally, I implemented my assembly changes (renamed as strchr1 in my testing, so I could still compare to the original strchr), and got much more promising results:


$ time ./foo 1000000 1 0 1000 2 2

real	0m0.578s
user	0m0.593s
sys	0m0.015s

$ time ./foo 1000000 1 1 1000 2 2

real	0m0.562s
user	0m0.577s
sys	0m0.015s

All right - aligned searches are not noticeably worse than the original implementation, and unaligned searches are now comparable in time to aligned searches, rather than 40% slower.

$ time ./foo 1000000 1 0 1000 0 0

real	0m0.329s
user	0m0.327s
sys	0m0.015s

$ time ./foo 1000000 1 1 1000 0 0

real	0m0.328s
user	0m0.343s
sys	0m0.015s

And the special case of searching for NUL is 40% faster!
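
For reference, that speedup comes from the word-at-a-time zero test in the
new inner loop: the -16843009 displacement subtracts 0x01010101, and
-2139062144 is 0x80808080 sign-extended to 32 bits. A C sketch of the same
test (illustrative, not the committed code):

#include <stdint.h>

/* Nonzero iff some byte of v is zero; mirrors the patch's
   leal -16843009(ecx),edx ; notl ecx ; andl ecx,edx ;
   testl $-2139062144,edx sequence.  */
static int
has_zero_byte (uint32_t v)
{
  return ((v - 0x01010101u) & ~v & 0x80808080u) != 0;
}

A byte can keep its 0x80 bit through the final mask only if it was zero
before the subtraction, so the test is exact, and the loop examines four
bytes per iteration instead of the one byte per iteration of 'repnz scasb'.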

Code size difference:

original strchr: 111 bytes
my strchr: 250 bytes

So, OK to apply this patch?

Yes, go ahead. I assume you have verified that the function still works correctly.

-- Jeff J.
2008-05-21 Eric Blake <ebb9@byu.net>

	Optimize strchr for x86.
	* libc/machine/i386/strchr.S (strchr): Pre-align so unaligned
	searches aren't penalized.  Special-case searching for 0.


Index: libc/machine/i386/strchr.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/i386/strchr.S,v
retrieving revision 1.3
diff -u -p -r1.3 strchr.S
--- libc/machine/i386/strchr.S 20 Dec 2002 21:31:19 -0000 1.3
+++ libc/machine/i386/strchr.S 21 May 2008 17:46:22 -0000
@@ -1,6 +1,6 @@
 /*
  * ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
  *
  * Permission to use, copy, modify, and distribute this
  * software is freely granted, provided that this notice
@@ -9,7 +9,7 @@
  */
 #include "i386mach.h"
-
+
 	.global SYM (strchr)
 	SOTYPE_FUNCTION(strchr)
@@ -21,14 +21,45 @@ SYM (strchr):
 	pushl ebx
 	xorl ebx,ebx
 	movl 8(ebp),edi
-	movb 12(ebp),bl
+	addb 12(ebp),bl
+
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0). */
+	je L25
 
-#ifndef __OPTIMIZE_SIZE__
-/* check if string is aligned, if not do check one byte at a time */
+/* Do byte-wise checks until string is aligned. */
 	test $3,edi
-	jne L9
+	je L5
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L14
+	cmpb bl,cl
+	je L19
+	incl edi
+
+	test $3,edi
+	je L5
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L14
+	cmpb bl,cl
+	je L19
+	incl edi
+
+	test $3,edi
+	je L5
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L14
+	cmpb bl,cl
+	je L19
+	incl edi
 /* create 4 byte mask which is just the desired byte repeated 4 times */
+L5:
 	movl ebx,ecx
 	sall $8,ebx
 	subl $4,edi
@@ -49,15 +80,14 @@ L10:
 	testl $-2139062144,edx
 	jne L9
-	movl ebx,eax
-	xorl ecx,eax
-	leal -16843009(eax),edx
-	notl eax
-	andl eax,edx
+	xorl ebx,ecx
+	leal -16843009(ecx),edx
+	notl ecx
+	andl ecx,edx
 	testl $-2139062144,edx
 	je L10
 #endif /* not __OPTIMIZE_SIZE__ */
-
+
 /* loop while (*s && *s++ != c) */
 L9:
 	leal -1(edi),eax
@@ -69,7 +99,7 @@ L15:
 	je L14
 	cmpb bl,dl
 	jne L15
-
+
 L14:
 /* if (*s == c) return address otherwise return NULL */
 	cmpb bl,(eax)
@@ -83,3 +113,60 @@ L19:
 	leave
 	ret
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0). */
+#if 0
+	/* Hideous performance on modern machines. */
+L25:
+	cld
+	movl $-1,ecx
+	xor eax,eax
+	repnz
+	scasb
+	leal -1(edi),eax
+	jmp L19
+#endif
+L25:
+/* Do byte-wise checks until string is aligned. */
+	test $3,edi
+	je L26
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L19
+	incl edi
+
+	test $3,edi
+	je L26
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L19
+	incl edi
+
+	test $3,edi
+	je L26
+	movl edi,eax
+	movb (eax),cl
+	testb cl,cl
+	je L19
+	incl edi
+
+L26:
+	subl $4,edi
+
+/* loop performing 4 byte mask checking for desired 0 byte */
+	.p2align 4,,7
+L27:
+	addl $4,edi
+	movl (edi),ecx
+	leal -16843009(ecx),edx
+	movl ecx,eax
+	notl eax
+	andl eax,edx
+	testl $-2139062144,edx
+	je L27
+
+	jmp L9
+
+#endif /* !__OPTIMIZE_SIZE__ */
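
For anyone who would rather not trace the i386 assembly, the patched control
flow in C looks roughly like this (an illustrative model, not the committed
code): pre-align byte-wise, scan word-wise, then pinpoint the hit byte-wise.

#include <stddef.h>
#include <stdint.h>

/* C model of the patched strchr's three phases.  The aligned 4-byte
   loads mirror the assembly; a strictly conforming C version would
   read the words via memcpy.  */
char *
strchr_model (const char *s, int c_in)
{
  unsigned char c = (unsigned char) c_in;

  /* Phase 1: byte-wise until s is 4-byte aligned (the unrolled
     prologue; at most 3 iterations).  */
  while ((uintptr_t) s & 3)
    {
      if (*s == (char) c)
	return (char *) s;
      if (*s == '\0')
	return NULL;
      s++;
    }

  /* Phase 2: word-wise; stop at the first word holding NUL or c.  */
  uint32_t mask = c * 0x01010101u;	/* c repeated in all four bytes */
  const uint32_t *w = (const uint32_t *) s;
  for (;;)
    {
      uint32_t v = *w;
      if ((v - 0x01010101u) & ~v & 0x80808080u)
	break;			/* word contains a NUL byte */
      uint32_t x = v ^ mask;	/* zero byte wherever v matches c */
      if ((x - 0x01010101u) & ~x & 0x80808080u)
	break;			/* word contains c */
      w++;
    }

  /* Phase 3: byte-wise tail pinpoints the hit (the L9/L15 loop).  */
  s = (const char *) w;
  while (*s != '\0' && *s != (char) c)
    s++;
  return *s == (char) c ? (char *) s : NULL;
}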




