This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



[RFC] strchr using unaligned loads


Hello, 

I played with unaligned loads and now I have an implementation that looks
faster than the current one even for small inputs.

http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_nehalem/strchr/html/test.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_sandy_bridge/strchr/html/test.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/fx10/strchr/html/test.html

This is asymptotically twice as fast for large strings, but for strchr it
becomes a bottleneck when you call it often and only get small shifts.

I would like to extend this to other processors, but my attempts were
unsuccessful (see for example the strchr_new variant at
http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strchr/html/test.html).

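For reference, here is a rough scalar C sketch of the trick the patch
vectorizes with pminub (the function name is mine, not part of the patch):
applying min (x, x ^ c) to each byte yields zero exactly where the original
string has a NUL or c, so the search reduces to finding the first zero byte,
as in strlen.

/* Scalar sketch only; the patch does this 64 bytes at a time with
   pminub/pcmpeqb and locates the first zero bit with pmovmskb + bsf.  */
#include <stddef.h>

char *
strchr_sketch (const char *s, int c_in)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char c = (unsigned char) c_in;   /* handle negative chars */

  for (;; p++)
    {
      unsigned char x = *p;
      unsigned char m = x ^ c;
      if ((x < m ? x : m) == 0)   /* min (x, x ^ c) == 0 iff x is 0 or c */
        return x == c ? (char *) p : NULL;
    }
}

Doing this 16 bytes at a time and folding four chunks together with pminub
lets the main loop test 64 bytes per branch.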

	* sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S: New file,
	  a pminub-based strchr implementation.
	* sysdeps/x86_64/multiarch/strchr.S (strchr): Select
	  __strchr_sse2_unaligned when Fast_Unaligned_Load is set.
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	  strchr-sse2-unaligned.
---
 sysdeps/x86_64/multiarch/Makefile                |    4 +-
 sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S |  179 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/strchr.S                |    5 +-
 3 files changed, 185 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..8d2175a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,8 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
-		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
+		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub strnlen-sse2-no-bsf \
+		   strrchr-sse2-no-bsf strchr-sse2-no-bsf strchr-sse2-unaligned \
 		   memcmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S
new file mode 100644
index 0000000..46e0c50
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S
@@ -0,0 +1,179 @@
+/* strchr based on pminub with unaligned start.
+   Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef  NOT_IN_libc
+
+# include <sysdep.h>
+
+
+/*  The main idea is that if we first apply the function min (x, x ^ c)
+    to each character x, the resulting string has a zero exactly where
+    the original string had a zero or c.  Then we find the first zero
+    as in strlen.
+*/
+.text
+ENTRY (__strchr_sse2_unaligned)
+  movq  %rdi, %rax
+  /* Handle negative char arguments: use only the low byte of c.  */
+  andl  $255,  %esi
+  movd  %esi, %xmm1
+  punpcklbw %xmm1, %xmm1
+  punpcklwd %xmm1, %xmm1
+  pxor  %xmm9, %xmm9  /* %xmm9 = 0, used to detect the terminating NUL.  */
+  movq  %rdi, %rcx
+  andq $4095, %rcx  /* Offset of the start within its 4096-byte page.  */
+  pshufd  $0, %xmm1, %xmm1  /* Broadcast c to all 16 bytes of %xmm1.  */
+  cmpq $4031, %rcx
+  ja .next  /* A 64-byte unaligned read could cross a page; take the aligned path.  */
+
+  movdqu  (%rax), %xmm0  /* Check the first 64 bytes with unaligned loads.  */
+  movdqu  %xmm0, %xmm4
+  pcmpeqb %xmm1, %xmm4
+  pcmpeqb %xmm9, %xmm0
+  por     %xmm0, %xmm4
+  movdqu  16(%rax), %xmm0
+  movdqu  %xmm0, %xmm3
+  pmovmskb  %xmm4, %r9d
+  pcmpeqb %xmm1, %xmm3
+  pcmpeqb %xmm9, %xmm0
+  por     %xmm0, %xmm3
+  movdqu  32(%rax), %xmm0
+  movdqu  %xmm0, %xmm2
+  pmovmskb  %xmm3, %edx
+  pcmpeqb %xmm1, %xmm2
+  salq  $16, %rdx
+  orq %r9, %rdx
+  pcmpeqb %xmm9, %xmm0
+  por     %xmm0, %xmm2
+  movdqu  48(%rax), %xmm0
+  movdqu  %xmm0, %xmm5
+  pmovmskb  %xmm2, %r8d
+  pcmpeqb %xmm1, %xmm5
+  pcmpeqb %xmm9, %xmm0
+  por     %xmm0, %xmm5
+  pmovmskb  %xmm5, %ecx
+  andq  $-64, %rax  /* Round down to the containing 64-byte block for the loop below.  */
+  salq  $16, %rcx
+  orq %r8, %rcx
+  salq  $32, %rcx
+  orq %rcx, %rdx
+  testq %rdx,%rdx
+  je .L9  /* No 0 or c in the first 64 bytes; continue with the aligned loop.  */
+
+  movq %rdi, %rax  /* Found a 0 or c; restore the original pointer.  */
+ .L2:
+  bsfq  %rdx, %rdx  /* Index of the first byte that is 0 or c.  */
+  addq  %rdx, %rax
+  movzbl  (%rax), %edx
+  cmpl  %esi, %edx  /* Did we find c, or the terminating NUL?  */
+  movl  $0, %edx
+  cmovne  %rdx, %rax  /* Return NULL if it was the NUL.  */
+  ret
+  .p2align 4,,10
+  .p2align 3
+  .next:
+  andq  $-64, %rax  /* Aligned path: start from the enclosing 64-byte block.  */
+  movdqa  (%rax), %xmm0
+  movdqa  %xmm0, %xmm4
+  pcmpeqb %xmm1, %xmm4
+  pcmpeqb %xmm9, %xmm0
+  por     %xmm0, %xmm4
+  movdqa  16(%rax), %xmm0
+  movdqa  %xmm0, %xmm3
+  pmovmskb  %xmm4, %r9d
+  pcmpeqb %xmm1, %xmm3
+  pcmpeqb %xmm9, %xmm0
+  por     %xmm0, %xmm3
+  movdqa  32(%rax), %xmm0
+  movdqa  %xmm0, %xmm2
+  pmovmskb  %xmm3, %edx
+  pcmpeqb %xmm1, %xmm2
+  salq  $16, %rdx
+  orq %r9, %rdx
+  pcmpeqb %xmm9, %xmm0
+  por     %xmm0, %xmm2
+  movdqa  48(%rax), %xmm0
+  movdqa  %xmm0, %xmm5
+  pmovmskb  %xmm2, %r8d
+  pcmpeqb %xmm1, %xmm5
+  pcmpeqb %xmm9, %xmm0
+  por     %xmm0, %xmm5
+  pmovmskb  %xmm5, %ecx
+  salq  $16, %rcx
+  orq %r8, %rcx
+  movq  $-1, %r8
+  salq  $32, %rcx
+  orq %rcx, %rdx
+  movl  %edi, %ecx
+  andl  $63, %ecx
+  salq  %cl, %r8  /* %r8 = -1 << (start offset within the block).  */
+  andq  %r8, %rdx  /* Ignore matches before the start of the string.  */
+  je  .L9
+  jmp .L2
+
+  .p2align 4,,10
+  .p2align 3
+.L9:
+  addq  $64, %rax  /* Main loop: check 64 bytes per iteration.  */
+  movdqa  (%rax), %xmm5
+  #prefetcht0 512(%rax)
+  movdqa  16(%rax), %xmm2
+  movdqa  32(%rax), %xmm3
+  pxor  %xmm1, %xmm5
+  movdqa  48(%rax), %xmm4
+  pxor  %xmm1, %xmm2
+  pxor  %xmm1, %xmm3
+  pminub  (%rax), %xmm5  /* min (x, x ^ c) is 0 iff x is 0 or c.  */
+  pxor  %xmm1, %xmm4
+  pminub  16(%rax), %xmm2
+  pminub  32(%rax), %xmm3
+  pminub  %xmm2, %xmm5
+  pminub  48(%rax), %xmm4
+  pminub  %xmm3, %xmm5
+  pminub  %xmm4, %xmm5
+  pcmpeqb %xmm9, %xmm5
+  pmovmskb  %xmm5, %edx
+  testl %edx, %edx
+  je  .L9  /* No 0 or c in this block; keep looping.  */
+
+  movdqa  (%rax), %xmm5  /* A 0 or c is in this block; recompute per-chunk masks to locate it.  */
+  pxor  %xmm1, %xmm5
+  pminub  (%rax), %xmm5
+  pcmpeqb %xmm9, %xmm4
+  pcmpeqb %xmm9, %xmm2
+  pcmpeqb %xmm9, %xmm5
+  pmovmskb  %xmm4, %edx
+  pcmpeqb %xmm9, %xmm3
+  pmovmskb  %xmm2, %r8d
+  pmovmskb  %xmm5, %edi
+  pmovmskb  %xmm3, %ecx
+  salq  $16, %rdx
+  salq  $16, %r8
+  orq %rcx, %rdx
+  orq %rdi, %r8
+  salq  $32, %rdx
+  orq %r8, %rdx
+  jmp .L2
+
+END (__strchr_sse2_unaligned)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index b9f88e4..dcf8e03 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -28,7 +28,10 @@ ENTRY(strchr)
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	__strchr_sse2(%rip), %rax
+1:	leaq	__strchr_sse2_unaligned(%rip), %rax
+	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jnz	3f
+	leaq	__strchr_sse2(%rip), %rax
 	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
 	jz	2f
 	leaq	__strchr_sse42(%rip), %rax
-- 
1.7.4.4

