[RFC] strchr using unaligned loads
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Wed, 10 Oct 2012 15:21:02 +0200
- Subject: [RFC] strchr using unaligned loads
Hello,
I experimented with unaligned loads and now have an implementation that
looks faster than the current one even for small inputs.
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_nehalem/strchr/html/test.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_sandy_bridge/strchr/html/test.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/fx10/strchr/html/test.html
This is asymptotically twice as fast for large strings, but the bottleneck
for strchr is frequent calls on short inputs, where the match is found
after only a small shift.
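The trick (also noted in a comment in the patch) is that min (x, x ^ c)
is zero exactly when x is zero or x equals c, so pminub reduces the
search for c to the search for a zero byte, as in strlen.  A standalone
scalar check of that property (illustration only, not code from the
patch):

  #include <stdio.h>

  /* If x == c then x ^ c == 0; if x == 0 the minimum is 0;
     otherwise both operands are nonzero.  */
  static unsigned char
  min_x_xor_c (unsigned char x, unsigned char c)
  {
    unsigned char y = x ^ c;
    return x < y ? x : y;
  }

  int
  main (void)
  {
    unsigned char c = 'a';
    for (int x = 0; x < 256; x++)
      if ((min_x_xor_c ((unsigned char) x, c) == 0) != (x == 0 || x == c))
        printf ("property fails for x = %d\n", x);
    return 0;
  }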
I would like to extend this to other processors, but my attempts so far
were unsuccessful (see, for example, the strchr_new variant at
http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strchr/html/test.html).
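For reference, the 64-byte main loop has roughly the following
structure, here written with SSE2 intrinsics (a sketch only; the
helper name find_zero_or_c and its contract are mine, not part of
the patch):

  #include <emmintrin.h>
  #include <stddef.h>

  /* Find the first byte equal to 0 or C starting at P, which must
     be 64-byte aligned.  Since the string is zero-terminated,
     reading whole 64-byte blocks cannot fault.  */
  static const char *
  find_zero_or_c (const char *p, unsigned char c)
  {
    const __m128i vc = _mm_set1_epi8 ((char) c);
    const __m128i zero = _mm_setzero_si128 ();
    for (;; p += 64)
      {
        __m128i x0 = _mm_load_si128 ((const __m128i *) p);
        __m128i x1 = _mm_load_si128 ((const __m128i *) (p + 16));
        __m128i x2 = _mm_load_si128 ((const __m128i *) (p + 32));
        __m128i x3 = _mm_load_si128 ((const __m128i *) (p + 48));
        /* min (x, x ^ c) is zero iff x is 0 or c.  */
        __m128i m0 = _mm_min_epu8 (x0, _mm_xor_si128 (x0, vc));
        __m128i m1 = _mm_min_epu8 (x1, _mm_xor_si128 (x1, vc));
        __m128i m2 = _mm_min_epu8 (x2, _mm_xor_si128 (x2, vc));
        __m128i m3 = _mm_min_epu8 (x3, _mm_xor_si128 (x3, vc));
        /* Fold the four vectors so one test covers 64 bytes.  */
        __m128i m = _mm_min_epu8 (_mm_min_epu8 (m0, m1),
                                  _mm_min_epu8 (m2, m3));
        if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (m, zero)) != 0)
          break;
      }
    /* Rescan the matching block for the exact offset; the assembly
       instead rebuilds a 64-bit mask and uses bsf.  */
    for (size_t i = 0; i < 64; i++)
      if (p[i] == 0 || p[i] == (char) c)
        return p + i;
    return NULL;  /* Not reached.  */
  }

Only one conditional branch is taken per 64 bytes in the common case,
which is where the speedup for large strings comes from.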
* sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S:
New pminub-based strchr implementation.
* sysdeps/x86_64/multiarch/strchr.S (strchr):
Select the __strchr_sse2_unaligned variant when
Fast_Unaligned_Load is set (i7, fx10).
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
Add strchr-sse2-unaligned.
---
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S | 179 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/strchr.S | 5 +-
3 files changed, 185 insertions(+), 3 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..8d2175a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,8 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
- strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
+ strcat-ssse3 strncat-ssse3 strlen-sse2-pminub strnlen-sse2-no-bsf \
+ strrchr-sse2-no-bsf strchr-sse2-no-bsf strchr-sse2-unaligned \
memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S
new file mode 100644
index 0000000..46e0c50
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-sse2-unaligned.S
@@ -0,0 +1,179 @@
+/* strchr based on pminub with unaligned start.
+ Copyright (C) 2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+
+/* The main idea: if we first apply the function min (x, x ^ c) to
+   each character x, then the resulting string has a zero wherever
+   the original string had a zero or c.  We then find the first
+   zero, as in strlen.  */
+
+.text
+ENTRY (__strchr_sse2_unaligned)
+ movq %rdi, %rax
+ /* handle negative characters */
+ andl $255, %esi
+ movd %esi, %xmm1
+ punpcklbw %xmm1, %xmm1
+ punpcklwd %xmm1, %xmm1
+ pxor %xmm9, %xmm9 /* xmm9 is all zeros.  */
+ movq %rdi, %rcx
+ andq $4095, %rcx /* Offset of the start within its page.  */
+ pshufd $0, %xmm1, %xmm1 /* Broadcast c to all 16 bytes.  */
+ cmpq $4031, %rcx
+ ja .next /* A 64-byte unaligned read could cross the page.  */
+
+ movdqu (%rax), %xmm0
+ movdqu %xmm0, %xmm4
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm9, %xmm0
+ por %xmm0, %xmm4
+ movdqu 16(%rax), %xmm0
+ movdqu %xmm0, %xmm3
+ pmovmskb %xmm4, %r9d
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm9, %xmm0
+ por %xmm0, %xmm3
+ movdqu 32(%rax), %xmm0
+ movdqu %xmm0, %xmm2
+ pmovmskb %xmm3, %edx
+ pcmpeqb %xmm1, %xmm2
+ salq $16, %rdx
+ orq %r9, %rdx
+ pcmpeqb %xmm9, %xmm0
+ por %xmm0, %xmm2
+ movdqu 48(%rax), %xmm0
+ movdqu %xmm0, %xmm5
+ pmovmskb %xmm2, %r8d
+ pcmpeqb %xmm1, %xmm5
+ pcmpeqb %xmm9, %xmm0
+ por %xmm0, %xmm5
+ pmovmskb %xmm5, %ecx
+ andq $-64, %rax /* Round down to the containing 64-byte block.  */
+ salq $16, %rcx
+ orq %r8, %rcx
+ salq $32, %rcx
+ orq %rcx, %rdx
+ testq %rdx, %rdx
+ je .L9
+
+ movq %rdi, %rax
+ .L2:
+ bsfq %rdx, %rdx /* Index of the first zero or c byte.  */
+ addq %rdx, %rax
+ movzbl (%rax), %edx
+ cmpl %esi, %edx /* Did we hit c, or the terminating zero?  */
+ movl $0, %edx
+ cmovne %rdx, %rax /* Return NULL when we hit the terminator.  */
+ ret
+ .p2align 4,,10
+ .p2align 3
+ .next: /* Start is near a page end: use aligned loads.  */
+ andq $-64, %rax
+ movdqa (%rax), %xmm0
+ movdqa %xmm0, %xmm4
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm9, %xmm0
+ por %xmm0, %xmm4
+ movdqa 16(%rax), %xmm0
+ movdqa %xmm0, %xmm3
+ pmovmskb %xmm4, %r9d
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm9, %xmm0
+ por %xmm0, %xmm3
+ movdqa 32(%rax), %xmm0
+ movdqa %xmm0, %xmm2
+ pmovmskb %xmm3, %edx
+ pcmpeqb %xmm1, %xmm2
+ salq $16, %rdx
+ orq %r9, %rdx
+ pcmpeqb %xmm9, %xmm0
+ por %xmm0, %xmm2
+ movdqa 48(%rax), %xmm0
+ movdqa %xmm0, %xmm5
+ pmovmskb %xmm2, %r8d
+ pcmpeqb %xmm1, %xmm5
+ pcmpeqb %xmm9, %xmm0
+ por %xmm0, %xmm5
+ pmovmskb %xmm5, %ecx
+ salq $16, %rcx
+ orq %r8, %rcx
+ movq $-1, %r8
+ salq $32, %rcx
+ orq %rcx, %rdx
+ movl %edi, %ecx
+ andl $63, %ecx
+ salq %cl, %r8 /* Build a mask of bytes at or after the start.  */
+ andq %r8, %rdx /* Ignore matches before the start of the string.  */
+ je .L9
+ jmp .L2
+
+ .p2align 4,,10
+ .p2align 3
+.L9: /* Main loop: process one 64-byte block per iteration.  */
+ addq $64, %rax
+ movdqa (%rax), %xmm5
+ #prefetcht0 512(%rax)
+ movdqa 16(%rax), %xmm2
+ movdqa 32(%rax), %xmm3
+ pxor %xmm1, %xmm5
+ movdqa 48(%rax), %xmm4
+ pxor %xmm1, %xmm2
+ pxor %xmm1, %xmm3
+ pminub (%rax), %xmm5
+ pxor %xmm1, %xmm4
+ pminub 16(%rax), %xmm2
+ pminub 32(%rax), %xmm3
+ pminub %xmm2, %xmm5
+ pminub 48(%rax), %xmm4
+ pminub %xmm3, %xmm5
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm9, %xmm5
+ pmovmskb %xmm5, %edx
+ testl %edx, %edx
+ je .L9
+
+ movdqa (%rax), %xmm5 /* Rebuild the masks to locate the match.  */
+ pxor %xmm1, %xmm5
+ pminub (%rax), %xmm5
+ pcmpeqb %xmm9, %xmm4
+ pcmpeqb %xmm9, %xmm2
+ pcmpeqb %xmm9, %xmm5
+ pmovmskb %xmm4, %edx
+ pcmpeqb %xmm9, %xmm3
+ pmovmskb %xmm2, %r8d
+ pmovmskb %xmm5, %edi
+ pmovmskb %xmm3, %ecx
+ salq $16, %rdx
+ salq $16, %r8
+ orq %rcx, %rdx
+ orq %rdi, %r8
+ salq $32, %rdx
+ orq %r8, %rdx
+ jmp .L2
+
+END (__strchr_sse2_unaligned)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index b9f88e4..dcf8e03 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -28,7 +28,10 @@ ENTRY(strchr)
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
-1: leaq __strchr_sse2(%rip), %rax
+1: leaq __strchr_sse2_unaligned(%rip), %rax
+ testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jnz 3f
+ leaq __strchr_sse2(%rip), %rax
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
jz 2f
leaq __strchr_sse42(%rip), %rax
--
1.7.4.4