This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch, master, updated. glibc-2.14-349-gf9e1232


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  f9e123204e9a96d421c9f9ce011d567ce9f8e217 (commit)
       via  951fbcec70e65c49705fcdbf4630bee5ce2a5691 (commit)
      from  0ac5ae2335292908f39031b1ea9fe8edce433c0f (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f9e123204e9a96d421c9f9ce011d567ce9f8e217

commit f9e123204e9a96d421c9f9ce011d567ce9f8e217
Author: Ulrich Drepper <drepper@gmail.com>
Date:   Wed Oct 12 11:42:57 2011 -0400

    Fix whitespaces

diff --git a/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
index 1c2a867..355d498 100644
--- a/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
+++ b/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -133,7 +133,7 @@ L(loop_prolog):
 	mov	%ecx, %eax
 	and	$63, %eax
 	test	%eax, %eax
-	jz	L(align64_loop)    
+	jz	L(align64_loop)
 
 	add	$64, %ecx
 	add	$64, %edx
@@ -333,7 +333,7 @@ L(length_less16):
 	pshufd	$0, %xmm1, %xmm1
 	and	$15, %ecx
 	jz	L(length_less16_offset0)
-	
+
 	PUSH	(%edi)
 	mov	%cl, %dh
 	add	%dl, %dh
diff --git a/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/sysdeps/i386/i686/multiarch/memrchr-sse2.S
index 68f4bc7..86a0cf9 100644
--- a/sysdeps/i386/i686/multiarch/memrchr-sse2.S
+++ b/sysdeps/i386/i686/multiarch/memrchr-sse2.S
@@ -56,7 +56,7 @@ ENTRY (__memrchr_sse2)
 
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
-	jnz	L(exit_dispatch)  
+	jnz	L(exit_dispatch)
 
 	sub	$64, %ecx
 	mov	%ecx, %eax
@@ -129,7 +129,7 @@ L(loop_prolog):
 	mov	%ecx, %eax
 	and	$63, %eax
 	test	%eax, %eax
-	jz	L(align64_loop)    
+	jz	L(align64_loop)
 
 	lea	64(%ecx), %ecx
 	lea	64(%edx), %edx
@@ -244,7 +244,7 @@ L(exit_loop_32):
 
 	.p2align 4
 L(matches16):
-	lea	16(%ecx), %ecx	
+	lea	16(%ecx), %ecx
 	test	%ah, %ah
 	jnz	L(exit_dispatch_high)
 	mov	%al, %dl
@@ -261,7 +261,7 @@ L(matches16):
 
 	.p2align 4
 L(matches32):
-	lea	32(%ecx), %ecx	
+	lea	32(%ecx), %ecx
 	test	%ah, %ah
 	jnz	L(exit_dispatch_high)
 	mov	%al, %dl
@@ -297,7 +297,7 @@ L(exit_dispatch):
 	ret
 
 	.p2align 4
-L(exit_dispatch_8):        
+L(exit_dispatch_8):
 	test	$0x80, %al
 	jnz	L(exit_8)
 	test	$0x40, %al
@@ -437,7 +437,7 @@ L(matches16_1):
 L(matches32_1):
 	lea	-32(%edx), %edx
 	lea	32(%ecx), %ecx
-	
+
 	test	%ah, %ah
 	jnz	L(exit_dispatch_1_high)
 	mov	%al, %ah
@@ -478,7 +478,7 @@ L(exit_dispatch_1):
 	ret
 
 	.p2align 4
-L(exit_dispatch_1_8):        
+L(exit_dispatch_1_8):
 	test	$0x80, %al
 	jnz	L(exit_1_8)
 	test	$0x40, %al
@@ -639,7 +639,7 @@ L(length_less16):
 
 	and	$15, %ecx
 	jz	L(length_less16_offset0)
-	
+
 	PUSH	(%edi)
 
 	mov	%cl, %dh

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=951fbcec70e65c49705fcdbf4630bee5ce2a5691

commit 951fbcec70e65c49705fcdbf4630bee5ce2a5691
Author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
Date:   Wed Oct 12 11:42:04 2011 -0400

    Optimized memchr, memrchr, rawmemchr for x86-32

diff --git a/ChangeLog b/ChangeLog
index 5dd550c..1b7499c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2011-10-12  Liubov Dmitrieva  <liubov.dmitrieva@gmail.com>
+
+	* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
+	memchr-sse2 memchr-sse2-bsf memrchr-sse2 memrchr-sse2-bsf memrchr-c
+	rawmemchr-sse2 rawmemchr-sse2-bsf.
+	* sysdeps/i386/i686/multiarch/memchr.S: New file.
+	* sysdeps/i386/i686/multiarch/memchr-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S: New file.
+	* sysdeps/i386/i686/multiarch/memrchr.S: New file.
+	* sysdeps/i386/i686/multiarch/memrchr-c.c: New file.
+	* sysdeps/i386/i686/multiarch/memrchr-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S: New file.
+	* sysdeps/i386/i686/multiarch/rawmemchr.S: New file.
+	* sysdeps/i386/i686/multiarch/rawmemchr-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S: New file.
+	* string/memrchr.c (MEMRCHR): New macro.
+
 2011-10-12  Ulrich Drepper  <drepper@gmail.com>
 
 	Add integration with gcc's -ffinite-math-only and optimize wrapper
diff --git a/NEWS b/NEWS
index 9940957..e10182b 100644
--- a/NEWS
+++ b/NEWS
@@ -33,7 +33,7 @@ Version 2.15
 * Optimized strchr and strrchr for SSE on x86-32.
   Contributed by Liubov Dmitrieva.
 
-* Optimized memchr, memrchr, rawmemchr for x86-64.
+* Optimized memchr, memrchr, rawmemchr for x86-64 and x86-32.
   Contributed by Liubov Dmitrieva.
 
 * New interfaces: scandirat, scandirat64
diff --git a/string/memrchr.c b/string/memrchr.c
index 21662b1..498a437 100644
--- a/string/memrchr.c
+++ b/string/memrchr.c
@@ -63,7 +63,12 @@
 
 /* Search no more than N bytes of S for C.  */
 __ptr_t
-__memrchr (s, c_in, n)
+#ifndef MEMRCHR
+__memrchr
+#else
+MEMRCHR
+#endif
+     (s, c_in, n)
      const __ptr_t s;
      int c_in;
      size_t n;
@@ -205,6 +210,8 @@ __memrchr (s, c_in, n)
 
   return 0;
 }
-#ifdef weak_alias
+#ifndef MEMRCHR
+# ifdef weak_alias
 weak_alias (__memrchr, memrchr)
+# endif
 #endif
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index c89ae92..8a4c219 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -15,7 +15,9 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \
 		   strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \
 		   strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
-		   wcscmp-sse2 wcscmp-c
+		   wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \
+		   memrchr-sse2 memrchr-sse2-bsf memrchr-c \
+		   rawmemchr-sse2 rawmemchr-sse2-bsf
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
new file mode 100644
index 0000000..115a219
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
@@ -0,0 +1,497 @@
+/* Optimized memchr with sse2
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef  NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2_bsf
+# endif
+
+	.text
+ENTRY (MEMCHR)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx
+	test	%edx, %edx
+	jz	L(return_null_1)
+# endif
+	mov	%ecx, %eax
+
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+	movdqu	(%eax), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	je	L(unaligned_no_match_1)
+/* Check which byte is a match.  */
+	bsf	%ecx, %ecx
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%ecx, %edx
+	jbe	L(return_null_1)
+# endif
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(unaligned_no_match_1):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$16, %edx
+	jbe	L(return_null_1)
+	PUSH	(%edi)
+	lea	16(%eax), %edi
+	and	$15, %eax
+	and	$-16, %edi
+	add	%eax, %edx
+# else
+	lea	16(%eax), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+# ifndef USE_AS_RAWMEMCHR
+	CFI_POP	(%edi)	/* Drop the unwind state left by the PUSH above.  */
+# endif
+
+	.p2align 4
+L(return_null_1):	/* Only reached before %edi has been pushed.  */
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	PUSH	(%edi)
+	mov	%eax, %edi
+	and	$15, %ecx
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+# else
+	mov	%eax, %edx
+	and	$15, %ecx
+	and	$-16, %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	sar	%cl, %eax
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+/* Check which byte is a match.  */
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	%eax, %edx
+	jbe	L(return_null)
+	add	%edi, %eax
+	add	%ecx, %eax
+	RETURN
+# else
+	add	%edx, %eax
+	add	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(unaligned_no_match):
+# ifndef USE_AS_RAWMEMCHR
+	/* No match in the first aligned block.  %edx is the length, %ecx
+	   the offset of the start inside that block, and %eax is free.
+	   The bytes left after the block are %edx + %ecx - 16.  The length
+	   is unsigned, so use saturating unsigned math: a length of 2GB or
+	   more must not look negative, and the addition must not wrap:
+	   edx = edx + ecx
+	   edx |= -(carry)  */
+	add	%ecx, %edx
+	sbb	%eax, %eax
+	or	%eax, %edx
+	sub	$16, %edx
+	jbe	L(return_null)
+	add	$16, %edi
+# else
+	add	$16, %edx
+# endif
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	test	$0x3f, %edi
+# else
+	test	$0x3f, %edx
+# endif
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm3
+# else
+	movdqa	48(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	test	%eax, %eax
+	jnz	L(matches0)
+
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+	pmovmskb %xmm4, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+
+	pcmpeqb	%xmm1, %xmm3
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	pmovmskb %xmm1, %eax
+	bsf	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	48(%edi, %eax), %eax
+	RETURN
+# else
+	lea	48(%edx, %eax), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	16(%edi), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)
+	xor	%eax, %eax
+	RETURN
+# endif
+	.p2align 4
+L(matches0):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	-16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	-16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	add	%edi, %eax
+	RETURN
+# else
+	add	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches16):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	16(%eax, %edi), %eax
+	RETURN
+# else
+	lea	16(%eax, %edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(matches32):
+	bsf	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	lea	32(%eax, %edi), %eax
+	RETURN
+# else
+	lea	32(%eax, %edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(matches_1):
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	add	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(matches16_1):
+	sub	$16, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	16(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches32_1):
+	sub	$32, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	32(%edi, %eax), %eax
+	RETURN
+
+	.p2align 4
+L(matches48_1):
+	sub	$48, %edx
+	bsf	%eax, %eax
+	sub	%eax, %edx
+	jbe	L(return_null)
+
+	lea	48(%edi, %eax), %eax
+	RETURN
+# endif
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memchr-sse2.S b/sysdeps/i386/i686/multiarch/memchr-sse2.S
new file mode 100644
index 0000000..63d1d5d
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memchr-sse2.S
@@ -0,0 +1,706 @@
+/* Optimized memchr with sse2 without bsf
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef  NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef USE_AS_RAWMEMCHR
+#  define ENTRANCE PUSH(%edi);
+#  define PARMS  8
+#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
+# else
+#  define ENTRANCE
+#  define PARMS  4
+# endif
+
+# define STR1  PARMS
+# define STR2  STR1+4
+
+# ifndef USE_AS_RAWMEMCHR
+#  define LEN   STR2+4
+# endif
+
+# ifndef MEMCHR
+#  define MEMCHR __memchr_sse2
+# endif
+
+	atom_text_section
+ENTRY (MEMCHR)
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+# ifndef USE_AS_RAWMEMCHR
+	mov	LEN(%esp), %edx
+	test	%edx, %edx
+	jz	L(return_null)
+# endif
+
+	punpcklbw %xmm1, %xmm1
+# ifndef USE_AS_RAWMEMCHR
+	mov	%ecx, %edi
+# else
+	mov	%ecx, %edx
+# endif
+	punpcklbw %xmm1, %xmm1
+
+	and	$63, %ecx
+	pshufd	$0, %xmm1, %xmm1
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqu	(%edi), %xmm0
+# else
+	movdqu	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog)
+
+	sub	$16, %edx
+	jbe	L(return_null)
+	lea	16(%edi), %edi
+	and	$15, %ecx
+	and	$-16, %edi
+	add	%ecx, %edx
+# else
+	jnz	L(match_case1_prolog)
+	lea	16(%edx), %edx
+	and	$-16, %edx
+# endif
+	jmp	L(loop_prolog)
+
+	.p2align 4
+L(crosscache):
+	/* The start is more than 48 bytes into its 64-byte line.  Load the
+	   16-byte aligned block that contains it and shift the match mask
+	   right by %ecx so bytes before the start are ignored.  */
+	and	$15, %ecx
+# ifndef USE_AS_RAWMEMCHR
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+# else
+	and	$-16, %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	sar	%cl, %eax
+	test	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(match_case2_prolog1)
+	/* No match, so %eax is zero and free.  The bytes left after this
+	   block are %edx + %ecx - 16.  The length is unsigned, so use
+	   saturating unsigned math: a length of 2GB or more must not look
+	   negative, and the addition must not wrap:
+	   edx = edx + ecx
+	   edx |= -(carry)  */
+	add	%ecx, %edx
+	sbb	%eax, %eax
+	or	%eax, %edx
+	sub	$16, %edx
+	jbe	L(return_null)
+	lea	16(%edi), %edi
+# else
+	jnz	L(match_case1_prolog1)
+	lea	16(%edx), %edx
+# endif
+
+	.p2align 4
+L(loop_prolog):
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+# else
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%edi), %xmm0
+# else
+	lea	64(%edx), %edx
+	movdqa	(%edx), %xmm0
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	16(%edi), %xmm2
+# else
+	movdqa	16(%edx), %xmm2
+# endif
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm4
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	lea	64(%edi), %edi
+	mov	%edi, %ecx
+	and	$-64, %edi
+	and	$63, %ecx
+	add	%ecx, %edx
+# else
+	lea	64(%edx), %edx
+	and	$-64, %edx
+# endif
+
+	.p2align 4
+L(align64_loop):
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edx
+	jbe	L(exit_loop)
+	movdqa	(%edi), %xmm0
+	movdqa	16(%edi), %xmm2
+	movdqa	32(%edi), %xmm3
+	movdqa	48(%edi), %xmm4
+# else
+	movdqa	(%edx), %xmm0
+	movdqa	16(%edx), %xmm2
+	movdqa	32(%edx), %xmm3
+	movdqa	48(%edx), %xmm4
+# endif
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm0, %xmm3
+	pmaxub	%xmm2, %xmm4
+	pmaxub	%xmm3, %xmm4
+# ifndef USE_AS_RAWMEMCHR
+	add	$64, %edi
+# else
+	add	$64, %edx
+# endif
+	pmovmskb %xmm4, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+# ifndef USE_AS_RAWMEMCHR
+	sub	$64, %edi
+# else
+	sub	$64, %edx
+# endif
+
+	pmovmskb %xmm0, %eax
+	xor	%ecx, %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+	pmovmskb %xmm2, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	movdqa	32(%edi), %xmm3
+# else
+	movdqa	32(%edx), %xmm3
+# endif
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	lea	16(%ecx), %ecx
+	test	%eax, %eax
+	jnz	L(match_case1)
+
+# ifndef USE_AS_RAWMEMCHR
+	pcmpeqb	48(%edi), %xmm1
+# else
+	pcmpeqb	48(%edx), %xmm1
+# endif
+	pmovmskb %xmm1, %eax
+	lea	16(%ecx), %ecx
+
+	.p2align 4
+L(match_case1):
+# ifndef USE_AS_RAWMEMCHR
+	add	%ecx, %edi
+# else
+L(match_case1_prolog1):
+	add	%ecx, %edx
+L(match_case1_prolog):
+# endif
+	test	%al, %al
+	jz	L(match_case1_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case1_8)
+	test	$0x01, %al
+	jnz	L(ExitCase1_1)
+	test	$0x02, %al
+	jnz	L(ExitCase1_2)
+	test	$0x04, %al
+	jnz	L(ExitCase1_3)
+# ifndef USE_AS_RAWMEMCHR
+	lea	3(%edi), %eax
+	RETURN
+# else
+	lea	3(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_8):
+	test	$0x10, %al
+	jnz	L(ExitCase1_5)
+	test	$0x20, %al
+	jnz	L(ExitCase1_6)
+	test	$0x40, %al
+	jnz	L(ExitCase1_7)
+# ifndef USE_AS_RAWMEMCHR
+	lea	7(%edi), %eax
+	RETURN
+# else
+	lea	7(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case1_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase1_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase1_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase1_11)
+# ifndef USE_AS_RAWMEMCHR
+	lea	11(%edi), %eax
+	RETURN
+# else
+	lea	11(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(match_case1_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase1_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase1_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase1_15)
+# ifndef USE_AS_RAWMEMCHR
+	lea	15(%edi), %eax
+	RETURN
+# else
+	lea	15(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	xor	%ecx, %ecx
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	movdqa	16(%edi), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$32, %edx
+	jbe	L(return_null)
+
+	movdqa	32(%edi), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+	cmp	$48, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	48(%edi), %xmm1
+	lea	16(%ecx), %ecx
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(match_case2)
+
+	xor	%eax, %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(ExitCase1_1):
+# ifndef USE_AS_RAWMEMCHR
+	mov	%edi, %eax
+	RETURN
+# else
+	mov	%edx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_2):
+# ifndef USE_AS_RAWMEMCHR
+	lea	1(%edi), %eax
+	RETURN
+# else
+	lea	1(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_3):
+# ifndef USE_AS_RAWMEMCHR
+	lea	2(%edi), %eax
+	RETURN
+# else
+	lea	2(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_5):
+# ifndef USE_AS_RAWMEMCHR
+	lea	4(%edi), %eax
+	RETURN
+# else
+	lea	4(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_6):
+# ifndef USE_AS_RAWMEMCHR
+	lea	5(%edi), %eax
+	RETURN
+# else
+	lea	5(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_7):
+# ifndef USE_AS_RAWMEMCHR
+	lea	6(%edi), %eax
+	RETURN
+# else
+	lea	6(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_9):
+# ifndef USE_AS_RAWMEMCHR
+	lea	8(%edi), %eax
+	RETURN
+# else
+	lea	8(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_10):
+# ifndef USE_AS_RAWMEMCHR
+	lea	9(%edi), %eax
+	RETURN
+# else
+	lea	9(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_11):
+# ifndef USE_AS_RAWMEMCHR
+	lea	10(%edi), %eax
+	RETURN
+# else
+	lea	10(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_13):
+# ifndef USE_AS_RAWMEMCHR
+	lea	12(%edi), %eax
+	RETURN
+# else
+	lea	12(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_14):
+# ifndef USE_AS_RAWMEMCHR
+	lea	13(%edi), %eax
+	RETURN
+# else
+	lea	13(%edx), %eax
+	ret
+# endif
+
+	.p2align 4
+L(ExitCase1_15):
+# ifndef USE_AS_RAWMEMCHR
+	lea	14(%edi), %eax
+	RETURN
+# else
+	lea	14(%edx), %eax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+	.p2align 4
+L(match_case2):
+	sub	%ecx, %edx
+L(match_case2_prolog1):
+	add	%ecx, %edi
+L(match_case2_prolog):
+	test	%al, %al
+	jz	L(match_case2_high)
+	mov	%al, %cl
+	and	$15, %cl
+	jz	L(match_case2_8)
+	test	$0x01, %al
+	jnz	L(ExitCase2_1)
+	test	$0x02, %al
+	jnz	L(ExitCase2_2)
+	test	$0x04, %al
+	jnz	L(ExitCase2_3)
+	sub	$4, %edx
+	jb	L(return_null)
+	lea	3(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_8):
+	test	$0x10, %al
+	jnz	L(ExitCase2_5)
+	test	$0x20, %al
+	jnz	L(ExitCase2_6)
+	test	$0x40, %al
+	jnz	L(ExitCase2_7)
+	sub	$8, %edx
+	jb	L(return_null)
+	lea	7(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high):
+	mov	%ah, %ch
+	and	$15, %ch
+	jz	L(match_case2_high_8)
+	test	$0x01, %ah
+	jnz	L(ExitCase2_9)
+	test	$0x02, %ah
+	jnz	L(ExitCase2_10)
+	test	$0x04, %ah
+	jnz	L(ExitCase2_11)
+	sub	$12, %edx
+	jb	L(return_null)
+	lea	11(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_case2_high_8):
+	test	$0x10, %ah
+	jnz	L(ExitCase2_13)
+	test	$0x20, %ah
+	jnz	L(ExitCase2_14)
+	test	$0x40, %ah
+	jnz	L(ExitCase2_15)
+	sub	$16, %edx
+	jb	L(return_null)
+	lea	15(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_1):
+	mov	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_2):
+	sub	$2, %edx
+	jb	L(return_null)
+	lea	1(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_3):
+	sub	$3, %edx
+	jb	L(return_null)
+	lea	2(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_5):
+	sub	$5, %edx
+	jb	L(return_null)
+	lea	4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_6):
+	sub	$6, %edx
+	jb	L(return_null)
+	lea	5(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_7):
+	sub	$7, %edx
+	jb	L(return_null)
+	lea	6(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_9):
+	sub	$9, %edx
+	jb	L(return_null)
+	lea	8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_10):
+	sub	$10, %edx
+	jb	L(return_null)
+	lea	9(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_11):
+	sub	$11, %edx
+	jb	L(return_null)
+	lea	10(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_13):
+	sub	$13, %edx
+	jb	L(return_null)
+	lea	12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_14):
+	sub	$14, %edx
+	jb	L(return_null)
+	lea	13(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(ExitCase2_15):
+	sub	$15, %edx
+	jb	L(return_null)
+	lea	14(%edi), %eax
+	RETURN
+# endif
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	RETURN
+# else
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memchr.S b/sysdeps/i386/i686/multiarch/memchr.S
new file mode 100644
index 0000000..163a83e
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memchr.S
@@ -0,0 +1,99 @@
+/* Multiple versions of memchr
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef  NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+# define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+# define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+	.text
+ENTRY(__memchr)
+	.type	__memchr, @gnu_indirect_function	/* Resolver: leaves the address of the chosen memchr in %eax.  */
+	pushl	%ebx
+	CFI_PUSH (%ebx)
+	call	__i686.get_pc_thunk.bx	/* %ebx = return address, i.e. the current PC.  */
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx	/* %ebx = GOT base for the @GOTOFF references below.  */
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f	/* Kind field non-zero: skip the init call.  */
+	call	__init_cpu_features
+
+1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f	/* SSE2 bit clear: use __memchr_ia32.  */
+	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	jz	3f	/* Slow_BSF bit clear: use __memchr_sse2_bsf.  */
+
+	leal	__memchr_sse2@GOTOFF(%ebx), %eax	/* SSE2 and Slow_BSF both set.  */
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)	/* Label 2 is entered with %ebx still pushed.  */
+
+2:	leal	__memchr_ia32@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)	/* Label 3 is entered with %ebx still pushed.  */
+
+3:	leal	__memchr_sse2_bsf@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+END(__memchr)
+
+weak_alias(__memchr, memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memchr_ia32, @function; \
+	.globl __memchr_ia32; \
+	.p2align 4; \
+	__memchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memchr_ia32, .-__memchr_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+   they will be called without setting up EBX needed for PLT which is
+   used by IFUNC.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memchr; __GI_memchr = __memchr_ia32
+
+#endif
+#include "../../memchr.S"
diff --git a/sysdeps/i386/i686/multiarch/memrchr-c.c b/sysdeps/i386/i686/multiarch/memrchr-c.c
new file mode 100644
index 0000000..44ec1a6
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memrchr-c.c
@@ -0,0 +1,7 @@
+#ifndef NOT_IN_libc
+# define MEMRCHR  __memrchr_ia32	/* string/memrchr.c defines its function under this name and skips the memrchr weak alias.  */
+# include <string.h>
+extern void *__memrchr_ia32 (const void *, int, size_t);	/* Prototype for the renamed function.  */
+#endif
+
+#include "string/memrchr.c"
diff --git a/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
new file mode 100644
index 0000000..1c2a867
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
@@ -0,0 +1,418 @@
+/* Optimized memrchr with sse2
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef  NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN   STR2+4
+
+# define MEMCHR __memrchr_sse2_bsf
+
+	.text
+ENTRY (MEMCHR)
+	mov	STR1(%esp), %ecx	/* ecx = first stack argument (buffer pointer).  */
+	movd	STR2(%esp), %xmm1	/* xmm1 = second stack argument (byte to find).  */
+	mov	LEN(%esp), %edx	/* edx = third stack argument (length).  */
+
+	sub	$16, %edx
+	jbe	L(length_less16)	/* Length <= 16 is handled separately.  */
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx	/* ecx = pointer + length - 16.  */
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0	/* Unaligned load of the last 16 bytes.  */
+	pshufd	$0, %xmm1, %xmm1	/* Search byte now replicated into all 16 lanes.  */
+	pcmpeqb	%xmm1, %xmm0
+
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax	/* eax = ecx modulo 16.  */
+	jz	L(loop_prolog)
+
+	add	$16, %ecx	/* Round ecx up to the next 16-byte boundary ...  */
+	add	$16, %edx	/* ... and move edx by the same amount.  */
+	sub	%eax, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+/* Loop starts on the aligned string: 64 bytes per pass, checked as four 16-byte chunks from the highest address down.  */
+L(loop_prolog):
+	sub	$64, %edx
+	jbe	L(exit_loop)	/* 64 bytes or fewer left: finish with length checks.  */
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	sub	$64, %ecx	/* Second unrolled pass over the next lower 64 bytes.  */
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches0)
+
+	mov	%ecx, %eax
+	and	$63, %eax	/* eax = ecx modulo 64.  */
+	test	%eax, %eax	/* The and above already set ZF; this test repeats it.  */
+	jz	L(align64_loop)    /* Already 64-byte aligned.  */
+
+	add	$64, %ecx	/* Round ecx up to a 64-byte boundary; the loop subtracts 64 first.  */
+	add	$64, %edx	/* Keep edx in step with ecx.  */
+	sub	%eax, %ecx
+	sub	%eax, %edx
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)	/* The remaining bytes no longer cover a full 64-byte block.  */
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0	/* Fold the four compare results into xmm2 ...  */
+	pmaxub	%xmm4, %xmm2	/* ... (this overwrites xmm0 and xmm2).  */
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)	/* No match anywhere in these 64 bytes.  */
+
+	pmovmskb %xmm4, %eax	/* xmm4 and xmm3 still hold their own compare results.  */
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2	/* Reload: xmm2 was overwritten by the fold above.  */
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1	/* Overwrites the pattern in xmm1; every path from here returns.  */
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax	/* Only chunk 0 is left, so this mask is non-zero.  */
+	bsr	%eax, %eax	/* Highest set bit = last matching byte in the chunk.  */
+
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx	/* Undo the subtraction made just before the jump here.  */
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)	/* At most 32 bytes left: only the two upper chunks matter.  */
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)	/* More than 32 left, so chunks 48 and 32 need no bounds check.  */
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)	/* _1 variants also check the match against the buffer start.  */
+	cmp	$48, %edx
+	jbe	L(return_null)	/* Nothing of the buffer lies in chunk 0.  */
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax	/* Not found: return NULL.  */
+	ret
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)	/* Nothing of the buffer lies below chunk 48.  */
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax	/* Not found: return NULL.  */
+	ret
+
+	.p2align 4
+L(matches0):
+	bsr	%eax, %eax	/* Index of the highest set mask bit = last match in the chunk.  */
+	add	%ecx, %eax	/* Return ecx + index.  */
+	ret
+
+	.p2align 4
+L(matches16):
+	bsr	%eax, %eax
+	lea	16(%eax, %ecx), %eax	/* Return ecx + 16 + index.  */
+	ret
+
+	.p2align 4
+L(matches32):
+	bsr	%eax, %eax
+	lea	32(%eax, %ecx), %eax	/* Return ecx + 32 + index.  */
+	ret
+
+	.p2align 4
+L(matches48):
+	bsr	%eax, %eax
+	lea	48(%eax, %ecx), %eax	/* Return ecx + 48 + index.  */
+	ret
+
+	.p2align 4
+L(matches0_1):
+	bsr	%eax, %eax
+	sub	$64, %edx
+	add	%eax, %edx	/* edx = remaining - 64 + index.  */
+	jl	L(return_null)	/* Negative: the match lies before the buffer start.  */
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	bsr	%eax, %eax
+	sub	$48, %edx
+	add	%eax, %edx	/* edx = remaining - 48 + index.  */
+	jl	L(return_null)
+	lea	16(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	bsr	%eax, %eax
+	sub	$32, %edx
+	add	%eax, %edx	/* edx = remaining - 32 + index.  */
+	jl	L(return_null)
+	lea	32(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	bsr	%eax, %eax
+	sub	$16, %edx
+	add	%eax, %edx	/* edx = remaining - 16 + index.  */
+	jl	L(return_null)
+	lea	48(%ecx, %eax), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax	/* Return NULL.  */
+	ret
+
+	.p2align 4
+L(length_less16_offset0):
+	mov	%dl, %cl	/* cl = length; reached only when the pointer is 16-byte aligned.  */
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* edx = (1 << length) - 1.  */
+	mov	%edx, %ecx
+
+	pmovmskb %xmm1, %edx
+
+	and	%ecx, %edx	/* Drop matches at or beyond the end of the buffer.  */
+	test	%edx, %edx
+	jz	L(return_null)
+
+	bsr	%edx, %ecx
+	add	%ecx, %eax	/* Return pointer + index of the last match.  */
+	ret
+
+	.p2align 4
+L(length_less16):
+	punpcklbw %xmm1, %xmm1
+	mov	%ecx, %eax	/* eax = buffer pointer.  */
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx	/* Undo the entry subtraction: edx = length.  */
+	jz	L(return_null)	/* Length 0.  */
+
+	pshufd	$0, %xmm1, %xmm1
+	and	$15, %ecx	/* ecx = pointer offset within its 16-byte block.  */
+	jz	L(length_less16_offset0)
+	
+	PUSH	(%edi)	/* edi is used as scratch below and restored by POP.  */
+	mov	%cl, %dh
+	add	%dl, %dh	/* dh = offset + length.  */
+	and	$-16, %eax	/* eax = pointer rounded down to 16.  */
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)	/* offset + length > 16: buffer runs into the next block.  */
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi	/* Shift out the bytes before the pointer.  */
+	add	%ecx, %eax	/* eax = original pointer again.  */
+	mov	%dl, %cl	/* cl = length.  */
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* edx = (1 << length) - 1.  */
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* CFI only: the next label is reached with edi still pushed.  */
+
+	.p2align 4
+L(length_less16_part2):
+	movdqa	16(%eax), %xmm2	/* Upper block first, since the last match is wanted.  */
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch	/* Keep the offset in ch while cl is a shift count.  */
+
+	mov	%dh, %cl	/* cl = offset + length - 16 = bytes used in the upper block.  */
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl	/* cl = offset again.  */
+	sar	%cl, %edi	/* Shift out the bytes before the pointer.  */
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch	/* ecx = offset only.  */
+	add	%ecx, %eax	/* Return aligned base + offset + index.  */
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* CFI only: the next label is reached with edi still pushed.  */
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax	/* Return aligned base + 16 + index.  */
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* CFI only: the next label is reached with edi still pushed.  */
+
+	.p2align 4
+L(ret_null):
+	xor	%eax, %eax	/* Return NULL, restoring edi first.  */
+	POP	(%edi)
+	ret
+
+END (MEMCHR)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/sysdeps/i386/i686/multiarch/memrchr-sse2.S
new file mode 100644
index 0000000..68f4bc7
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memrchr-sse2.S
@@ -0,0 +1,725 @@
+/* Optimized memrchr with sse2 without bsf
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef  NOT_IN_libc
+
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS  4
+# define STR1  PARMS
+# define STR2  STR1+4
+# define LEN   STR2+4
+
+	atom_text_section
+ENTRY (__memrchr_sse2)
+	mov	STR1(%esp), %ecx	/* ecx = first stack argument (buffer pointer).  */
+	movd	STR2(%esp), %xmm1	/* xmm1 = second stack argument (byte to find).  */
+	mov	LEN(%esp), %edx	/* edx = third stack argument (length).  */
+
+	sub	$16, %edx
+	jbe	L(length_less16)	/* Length <= 16 is handled separately.  */
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx	/* ecx = pointer + length - 16.  */
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0	/* Unaligned load of the last 16 bytes.  */
+	pshufd	$0, %xmm1, %xmm1	/* Search byte now replicated into all 16 lanes.  */
+	pcmpeqb	%xmm1, %xmm0
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)  /* Match in the last 16 bytes; ecx is the chunk base.  */
+
+	sub	$64, %ecx
+	mov	%ecx, %eax
+	and	$15, %eax	/* eax = ecx modulo 16.  */
+	jz	L(loop_prolog)
+
+	lea	16(%ecx), %ecx	/* With the and below: round ecx up to 16.  */
+	lea	16(%edx), %edx
+	sub	%eax, %edx	/* Move edx by the same amount as ecx.  */
+	and	$-16, %ecx
+
+	.p2align 4
+/* Loop starts on the aligned string: 64 bytes per pass, checked as four 16-byte chunks from the highest address down.  */
+L(loop_prolog):
+	sub	$64, %edx
+	jbe	L(exit_loop)	/* 64 bytes or fewer left: finish with length checks.  */
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)	/* Chunk 0: ecx is already the chunk base.  */
+
+	sub	$64, %ecx	/* Second unrolled pass over the next lower 64 bytes.  */
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	mov	%ecx, %eax
+	and	$63, %eax	/* eax = ecx modulo 64.  */
+	test	%eax, %eax	/* The and above already set ZF; this test repeats it.  */
+	jz	L(align64_loop)    /* Already 64-byte aligned.  */
+
+	lea	64(%ecx), %ecx	/* With the and below: round ecx up to 64; the loop subtracts 64 first.  */
+	lea	64(%edx), %edx
+	and	$-64, %ecx
+	sub	%eax, %edx	/* Keep edx in step with ecx.  */
+
+	.p2align 4
+L(align64_loop):
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)	/* The remaining bytes no longer cover a full 64-byte block.  */
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0	/* Fold the four compare results into xmm2 ...  */
+	pmaxub	%xmm4, %xmm2	/* ... (this overwrites xmm0 and xmm2).  */
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)	/* No match anywhere in these 64 bytes.  */
+
+	pmovmskb %xmm4, %eax	/* xmm4 and xmm3 still hold their own compare results.  */
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2	/* Reload: xmm2 was overwritten by the fold above.  */
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1	/* Overwrites the pattern in xmm1; every path from here returns.  */
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax	/* Only chunk 0 is left, so this mask is non-zero.  */
+	test	%ah, %ah	/* Branch tree below finds the highest set bit of the 16-bit mask.  */
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl	/* dl is scratch; edx is not needed again on this path.  */
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax	/* Only bit 0 is set: match at the chunk base.  */
+	ret
+
+	.p2align 4
+L(exit_loop):
+	add	$64, %edx	/* Undo the subtraction made just before the jump here.  */
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)	/* At most 32 bytes left: only the two upper chunks matter.  */
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)	/* More than 32 left, so chunks 48 and 32 need no bounds check.  */
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)	/* _1 variants also check the match against the buffer start.  */
+	cmp	$48, %edx
+	jbe	L(return_null)	/* Nothing of the buffer lies in chunk 0.  */
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax	/* Not found: return NULL.  */
+	ret
+
+	.p2align 4
+L(exit_loop_32):
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)	/* Nothing of the buffer lies below chunk 48.  */
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax	/* Not found: return NULL.  */
+	ret
+
+	.p2align 4
+L(matches16):
+	lea	16(%ecx), %ecx	/* ecx = base of the matching chunk.  */
+	test	%ah, %ah	/* Same highest-set-bit branch tree as L(exit_dispatch).  */
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32):
+	lea	32(%ecx), %ecx	/* ecx = base of the matching chunk.  */
+	test	%ah, %ah	/* Same highest-set-bit branch tree as L(exit_dispatch).  */
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48):
+	lea	48(%ecx), %ecx	/* ecx = base of the matching chunk; falls through.  */
+
+	.p2align 4
+L(exit_dispatch):
+	test	%ah, %ah	/* In: eax = non-zero 16-bit mask, ecx = chunk base.  */
+	jnz	L(exit_dispatch_high)	/* A bit in 8..15 is set.  */
+	mov	%al, %dl	/* dl is scratch; edx is not used again on these paths.  */
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)	/* A bit in 4..7 is set.  */
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax	/* Only bit 0 is set.  */
+	ret
+
+	.p2align 4
+L(exit_dispatch_8):        /* Highest set bit is in 4..7.  */
+	test	$0x80, %al
+	jnz	L(exit_8)
+	test	$0x40, %al
+	jnz	L(exit_7)
+	test	$0x20, %al
+	jnz	L(exit_6)
+	lea	4(%ecx), %eax	/* Bit 4.  */
+	ret
+
+	.p2align 4
+L(exit_dispatch_high):
+	mov	%ah, %dh	/* dh is scratch; highest set bit is in 8..15.  */
+	and	$15 << 4, %dh
+	jnz	L(exit_dispatch_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_12)
+	test	$0x04, %ah
+	jnz	L(exit_11)
+	test	$0x02, %ah
+	jnz	L(exit_10)
+	lea	8(%ecx), %eax	/* Bit 8.  */
+	ret
+
+	.p2align 4
+L(exit_dispatch_high_8):
+	test	$0x80, %ah
+	jnz	L(exit_16)
+	test	$0x40, %ah
+	jnz	L(exit_15)
+	test	$0x20, %ah
+	jnz	L(exit_14)
+	lea	12(%ecx), %eax	/* Bit 12.  */
+	ret
+
+	.p2align 4
+L(exit_2):
+	lea	1(%ecx), %eax	/* L(exit_N) returns chunk base + N - 1.  */
+	ret
+
+	.p2align 4
+L(exit_3):
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_4):
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_6):
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_7):
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_8):
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_10):
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_11):
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_12):
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_14):
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_15):
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_16):
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):
+	lea	-64(%edx), %edx	/* edx = remaining - 64; edx + index < 0 means before the buffer start.  */
+
+	test	%ah, %ah	/* Same branch tree as L(exit_dispatch_1).  */
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah	/* ah is zero here, so it is free as scratch.  */
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx	/* Sets the flags from edx for the signed check.  */
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):
+	lea	-48(%edx), %edx	/* edx = remaining - 48.  */
+	lea	16(%ecx), %ecx	/* ecx = base of the matching chunk.  */
+
+	test	%ah, %ah	/* Same branch tree as L(exit_dispatch_1).  */
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32_1):
+	lea	-32(%edx), %edx	/* edx = remaining - 32.  */
+	lea	32(%ecx), %ecx	/* ecx = base of the matching chunk.  */
+	
+	test	%ah, %ah	/* Same branch tree as L(exit_dispatch_1).  */
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48_1):
+	lea	-16(%edx), %edx	/* edx = remaining - 16.  */
+	lea	48(%ecx), %ecx	/* ecx = base of the matching chunk; falls through.  */
+
+	.p2align 4
+L(exit_dispatch_1):
+	test	%ah, %ah	/* In: eax = non-zero 16-bit mask, ecx = chunk base, edx = signed bound.  */
+	jnz	L(exit_dispatch_1_high)	/* A bit in 8..15 is set.  */
+	mov	%al, %ah	/* ah is zero here, so it is free as scratch.  */
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)	/* A bit in 4..7 is set.  */
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx	/* Only bit 0 is set: check index 0 against the bound.  */
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_8):        /* Highest set bit is in 4..7.  */
+	test	$0x80, %al
+	jnz	L(exit_1_8)
+	test	$0x40, %al
+	jnz	L(exit_1_7)
+	test	$0x20, %al
+	jnz	L(exit_1_6)
+	add	$4, %edx	/* Bit 4: edx + 4 < 0 means the match is before the buffer start.  */
+	jl	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high):
+	mov	%ah, %al	/* al is scratch; only ah is tested from here on.  */
+	and	$15 << 4, %al
+	jnz	L(exit_dispatch_1_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_1_12)
+	test	$0x04, %ah
+	jnz	L(exit_1_11)
+	test	$0x02, %ah
+	jnz	L(exit_1_10)
+	add	$8, %edx	/* Bit 8.  */
+	jl	L(return_null)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high_8):
+	test	$0x80, %ah
+	jnz	L(exit_1_16)
+	test	$0x40, %ah
+	jnz	L(exit_1_15)
+	test	$0x20, %ah
+	jnz	L(exit_1_14)
+	add	$12, %edx	/* Bit 12.  */
+	jl	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_2):
+	add	$1, %edx	/* L(exit_1_N): add index N - 1 to the bound in edx ...  */
+	jl	L(return_null)	/* ... negative means the match is before the buffer start.  */
+	lea	1(%ecx), %eax	/* Otherwise return chunk base + N - 1.  */
+	ret
+
+	.p2align 4
+L(exit_1_3):
+	add	$2, %edx
+	jl	L(return_null)
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_4):
+	add	$3, %edx
+	jl	L(return_null)
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_6):
+	add	$5, %edx
+	jl	L(return_null)
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_7):
+	add	$6, %edx
+	jl	L(return_null)
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_8):
+	add	$7, %edx
+	jl	L(return_null)
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_10):
+	add	$9, %edx
+	jl	L(return_null)
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_11):
+	add	$10, %edx
+	jl	L(return_null)
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_12):
+	add	$11, %edx
+	jl	L(return_null)
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_14):
+	add	$13, %edx
+	jl	L(return_null)
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_15):
+	add	$14, %edx
+	jl	L(return_null)
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_16):
+	add	$15, %edx
+	jl	L(return_null)
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax	/* Return NULL.  */
+	ret
+
+	.p2align 4
+L(length_less16_offset0):
+	mov	%dl, %cl	/* cl = length; reached only when the pointer is 16-byte aligned.  */
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* edx = (1 << length) - 1.  */
+
+	mov	%eax, %ecx	/* ecx = chunk base, as L(exit_dispatch) expects.  */
+	pmovmskb %xmm1, %eax
+
+	and	%edx, %eax	/* Drop matches at or beyond the end of the buffer.  */
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	xor	%eax, %eax	/* Not found: return NULL.  */
+	ret
+
+	.p2align 4
+L(length_less16):
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx	/* Undo the entry subtraction: edx = length.  */
+	je	L(return_null)	/* Length 0.  */
+	punpcklbw %xmm1, %xmm1
+
+	mov	%ecx, %eax	/* eax = buffer pointer.  */
+	pshufd	$0, %xmm1, %xmm1
+
+	and	$15, %ecx	/* ecx = pointer offset within its 16-byte block.  */
+	jz	L(length_less16_offset0)
+	
+	PUSH	(%edi)	/* edi is used as scratch below and restored by POP.  */
+
+	mov	%cl, %dh
+	add	%dl, %dh	/* dh = offset + length.  */
+	and	$-16, %eax	/* eax = pointer rounded down to 16.  */
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)	/* offset + length > 16: buffer runs into the next block.  */
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi	/* Shift out the bytes before the pointer.  */
+	add	%ecx, %eax	/* eax = original pointer again.  */
+	mov	%dl, %cl	/* cl = length.  */
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* edx = (1 << length) - 1.  */
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi	/* NOTE(review): the short-length paths of this variant use bsr.  */
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* CFI only: the next label is reached with edi still pushed.  */
+
+	.p2align 4
+L(length_less16_part2):
+	movdqa	16(%eax), %xmm2	/* Upper block first, since the last match is wanted.  */
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch	/* Keep the offset in ch while cl is a shift count.  */
+
+	mov	%dh, %cl	/* cl = offset + length - 16 = bytes used in the upper block.  */
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl	/* cl = offset again.  */
+	sar	%cl, %edi	/* Shift out the bytes before the pointer.  */
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch	/* ecx = offset only.  */
+	add	%ecx, %eax	/* Return aligned base + offset + index.  */
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* CFI only: the next label is reached with edi still pushed.  */
+
+	.p2align 4
+L(length_less16_part2_return):
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax	/* Return aligned base + 16 + index.  */
+	POP	(%edi)
+	ret
+
+	CFI_PUSH     (%edi)	/* CFI only: the next label is reached with edi still pushed.  */
+
+	.p2align 4
+L(ret_null):
+	xor	%eax, %eax	/* Return NULL, restoring edi first.  */
+	POP	(%edi)
+	ret
+
+END (__memrchr_sse2)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memrchr.S b/sysdeps/i386/i686/multiarch/memrchr.S
new file mode 100644
index 0000000..8e5b2c5
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memrchr.S
@@ -0,0 +1,79 @@
+/* Multiple versions of memrchr
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef  NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx	/* EBX = return address, i.e. the caller's next instruction.  */
+	ret
+
+# define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+# define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+	.text
+ENTRY(__memrchr)
+	.type	__memrchr, @gnu_indirect_function	/* IFUNC: this code returns the implementation address in EAX.  */
+	pushl	%ebx
+	CFI_PUSH (%ebx)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx	/* EBX = base for the @GOTOFF references below.  */
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f	/* Field is non-zero: skip the init call.  */
+	call	__init_cpu_features
+
+1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f	/* SSE2 bit clear: pick __memrchr_ia32.  */
+	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	jz	3f	/* Slow_BSF bit clear: pick __memrchr_sse2_bsf.  */
+
+	leal	__memrchr_sse2@GOTOFF(%ebx), %eax	/* SSE2 and Slow_BSF both set.  */
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)	/* CFI only: label 2 is reached with EBX still on the stack.  */
+
+2:	leal	__memrchr_ia32@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)	/* CFI only: label 3 is reached with EBX still on the stack.  */
+
+3:	leal	__memrchr_sse2_bsf@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+END(__memrchr)
+
+weak_alias(__memrchr, memrchr)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
new file mode 100644
index 0000000..88c0e57
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR	/* Presumably selects the rawmemchr code paths in the included file -- confirm there.  */
+#define MEMCHR __rawmemchr_sse2_bsf	/* Presumably the entry-point name used by the included file -- confirm there.  */
+#include "memchr-sse2-bsf.S"
diff --git a/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
new file mode 100644
index 0000000..038c748
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_RAWMEMCHR	/* Presumably selects the rawmemchr code paths in the included file -- confirm there.  */
+#define MEMCHR __rawmemchr_sse2	/* Presumably the entry-point name used by the included file -- confirm there.  */
+#include "memchr-sse2.S"
diff --git a/sysdeps/i386/i686/multiarch/rawmemchr.S b/sysdeps/i386/i686/multiarch/rawmemchr.S
new file mode 100644
index 0000000..111f0dc
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -0,0 +1,99 @@
+/* Multiple versions of rawmemchr
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef  NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx	/* EBX = return address, i.e. the caller's next instruction.  */
+	ret
+
+# define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+# define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+	.text
+ENTRY(__rawmemchr)
+	.type	__rawmemchr, @gnu_indirect_function	/* IFUNC: this code returns the implementation address in EAX.  */
+	pushl	%ebx
+	CFI_PUSH (%ebx)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx	/* EBX = base for the @GOTOFF references below.  */
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f	/* Field is non-zero: skip the init call.  */
+	call	__init_cpu_features
+
+1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f	/* SSE2 bit clear: pick __rawmemchr_ia32.  */
+	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	jz	3f	/* Slow_BSF bit clear: pick __rawmemchr_sse2_bsf.  */
+
+	leal	__rawmemchr_sse2@GOTOFF(%ebx), %eax	/* SSE2 and Slow_BSF both set.  */
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)	/* CFI only: label 2 is reached with EBX still on the stack.  */
+
+2:	leal	__rawmemchr_ia32@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+
+	CFI_PUSH (%ebx)	/* CFI only: label 3 is reached with EBX still on the stack.  */
+
+3:	leal	__rawmemchr_sse2_bsf@GOTOFF(%ebx), %eax
+	popl	%ebx
+	CFI_POP	(%ebx)
+	ret
+END(__rawmemchr)
+
+weak_alias(__rawmemchr, rawmemchr)
+
+# undef ENTRY	/* Redefined: ignores its argument and always emits __rawmemchr_ia32.  */
+# define ENTRY(name) \
+	.type __rawmemchr_ia32, @function; \
+	.globl __rawmemchr_ia32; \
+	.p2align 4; \
+	__rawmemchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END	/* Redefined to close and size __rawmemchr_ia32 regardless of its argument.  */
+# define END(name) \
+	cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32
+
+# undef libc_hidden_def
+/* IFUNC doesn't work with hidden functions in a shared library since
+   they are called without setting up EBX, which the PLT used by IFUNC
+   needs; so point the __GI_ alias straight at the ia32 version.  */
+# define libc_hidden_def(name) \
+	.globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32
+
+#endif
+#include "../../rawmemchr.S"

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                        |   17 +
 NEWS                                             |    2 +-
 string/memrchr.c                                 |   11 +-
 sysdeps/i386/i686/multiarch/Makefile             |    4 +-
 sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S    |  497 +++++++++++++++
 sysdeps/i386/i686/multiarch/memchr-sse2.S        |  706 +++++++++++++++++++++
 sysdeps/i386/i686/multiarch/memchr.S             |   99 +++
 sysdeps/i386/i686/multiarch/memrchr-c.c          |    7 +
 sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S   |  418 +++++++++++++
 sysdeps/i386/i686/multiarch/memrchr-sse2.S       |  725 ++++++++++++++++++++++
 sysdeps/i386/i686/multiarch/memrchr.S            |   79 +++
 sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S |    3 +
 sysdeps/i386/i686/multiarch/rawmemchr-sse2.S     |    3 +
 sysdeps/i386/i686/multiarch/rawmemchr.S          |   99 +++
 14 files changed, 2666 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S
 create mode 100644 sysdeps/i386/i686/multiarch/memchr-sse2.S
 create mode 100644 sysdeps/i386/i686/multiarch/memchr.S
 create mode 100644 sysdeps/i386/i686/multiarch/memrchr-c.c
 create mode 100644 sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
 create mode 100644 sysdeps/i386/i686/multiarch/memrchr-sse2.S
 create mode 100644 sysdeps/i386/i686/multiarch/memrchr.S
 create mode 100644 sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S
 create mode 100644 sysdeps/i386/i686/multiarch/rawmemchr-sse2.S
 create mode 100644 sysdeps/i386/i686/multiarch/rawmemchr.S


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]