This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
Eric Blake <ebb9 <at> byu.net> writes:
Yes, go ahead. I assume you have verified that the function still works correctly.
Maybe you could show a few timing measurements from realistic testcases at
this point in the discussion? I haven't felt comfortable trying to directly
infer from instruction counts to real-time clock cycles any time in the past
two decades myself; there are so many confounding factors these days that
I'm not even confident of being able to validly reason about it any more.
Here's my test program:
I modified my test program to also check strlen numbers (in mystrchr, replace the !c branch with "return (char *) str + strlen (str);"). With strlen's current assembly implementation of 'repnz scasb' as the inner loop, modern x86 machines have HIDEOUS performance:
$ time ./foo 1000000 1 0 1000 0 1
real 0m2.859s user 0m2.874s sys 0m0.015s
$ time ./foo 1000000 1 1 1000 0 1
real 0m2.875s user 0m2.874s sys 0m0.030s
3x worse than current strchr on unaligned data, but at least alignment didn't matter.
Finally, I implemented my assembly changes (renamed as strchr1 in my testing, so I could still compare to the original strchr), and got much more promising results:
$ time ./foo 1000000 1 0 1000 2 2
real 0m0.578s user 0m0.593s sys 0m0.015s
$ time ./foo 1000000 1 1 1000 2 2
real 0m0.562s user 0m0.577s sys 0m0.015s
All right - aligned searches are not noticeably worse than the original implementation, and unaligned searches are now comparable in time to aligned searches, rather than 40% slower.
$ time ./foo 1000000 1 0 1000 0 0
real 0m0.329s user 0m0.327s sys 0m0.015s
$ time ./foo 1000000 1 1 1000 0 0
real 0m0.328s user 0m0.343s sys 0m0.015s
And the special case of searching for NUL is 40% faster!
Code size difference:
original strchr: 111 bytes; my strchr: 250 bytes
So, OK to apply this patch?
2008-05-21 Eric Blake <ebb9@byu.net>
	Optimize strchr for x86.
	* libc/machine/i386/strchr.S (strchr): Pre-align so unaligned
	searches aren't penalized.  Special-case searching for 0.
Index: libc/machine/i386/strchr.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/i386/strchr.S,v
retrieving revision 1.3
diff -u -p -r1.3 strchr.S
--- libc/machine/i386/strchr.S 20 Dec 2002 21:31:19 -0000 1.3
+++ libc/machine/i386/strchr.S 21 May 2008 17:46:22 -0000
@@ -1,6 +1,6 @@
/*
* ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
@@ -9,7 +9,7 @@
*/
#include "i386mach.h"
-
+
.global SYM (strchr)
SOTYPE_FUNCTION(strchr)
@@ -21,14 +21,45 @@ SYM (strchr):
pushl ebx
xorl ebx,ebx
movl 8(ebp),edi
- movb 12(ebp),bl
+ addb 12(ebp),bl
+
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0). */
+ je L25
-#ifndef __OPTIMIZE_SIZE__
-/* check if string is aligned, if not do check one byte at a time */
+/* Do byte-wise checks until string is aligned. */
test $3,edi
- jne L9
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
/* create 4 byte mask which is just the desired byte repeated 4 times */
+L5:
movl ebx,ecx
sall $8,ebx
subl $4,edi
@@ -49,15 +80,14 @@ L10:
testl $-2139062144,edx
jne L9
- movl ebx,eax
- xorl ecx,eax
- leal -16843009(eax),edx
- notl eax
- andl eax,edx
+ xorl ebx,ecx
+ leal -16843009(ecx),edx
+ notl ecx
+ andl ecx,edx
testl $-2139062144,edx
je L10
#endif /* not __OPTIMIZE_SIZE__ */
-
+
/* loop while (*s && *s++ != c) */
L9:
leal -1(edi),eax
@@ -69,7 +99,7 @@ L15:
je L14
cmpb bl,dl
jne L15
-
+
L14:
/* if (*s == c) return address otherwise return NULL */
cmpb bl,(eax)
@@ -83,3 +113,60 @@ L19:
leave
ret
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0). */
+#if 0
+ /* Hideous performance on modern machines. */
+L25:
+ cld
+ movl $-1,ecx
+ xor eax,eax
+ repnz
+ scasb
+ leal -1(edi),eax
+ jmp L19
+#endif
+L25:
+/* Do byte-wise checks until string is aligned. */
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+L26:
+ subl $4,edi
+
+/* loop performing 4 byte mask checking for desired 0 byte */
+ .p2align 4,,7
+L27:
+ addl $4,edi
+ movl (edi),ecx
+ leal -16843009(ecx),edx
+ movl ecx,eax
+ notl eax
+ andl eax,edx
+ testl $-2139062144,edx
+ je L27
+
+ jmp L9
+
+#endif /* !__OPTIMIZE_SIZE__ */
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |