This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] vectorized string functions
On Wed, Jul 11, 2012 at 06:34:11PM +0400, Dmitrieva Liubov wrote:
> Ondrej,
>
> >> +sysdep_routines += strnlen strnlen_sse2 strnlen_ssse3 strnlen_sse4_1
> >> + CFLAGS-strnlen_ssse3.c += -mssse3
> >> + CFLAGS-strnlen_sse4_1.c += -msse4
>
> It seems to me that you sometimes produce too many versions.
OK, I revised the generation to write only the versions that currently make
a difference.
>
> Strnlen example:
> Objdump shows strnlen_sse2 and strnlen_ssse3 are exactly the same (GCC
> does not generate any SSSE3 instructions).
> strnlen_sse4_1 differs from the others only in using ptest instead of the
> pmovmskb + testl pair, but it's known that this has almost no effect on
> performance, while we still pay the IFUNC wrapper overhead.
I would expect the bigger impact to come from
ptest being one byte shorter than pmovmskb + testl.
>
> >> delete mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
>
> And we should check for regressions on an Atom machine before removing
> the Atom-specific no_bsf version.
I could add an Atom-specific bsf implementation. Only the first_bit function
needs to be changed. Do you know a fast implementation?
My current idea is to first compute t^(t-1), which yields a mask of ones
covering the lowest set bit and all trailing zeros, and then compute the
index by multiplication and array lookup.
diff --git a/sysdeps/x86_64/sse.h b/sysdeps/x86_64/sse.h
index 8db6d81..d77e56a 100644
--- a/sysdeps/x86_64/sse.h
+++ b/sysdeps/x86_64/sse.h
@@ -24,7 +24,14 @@ typedef unsigned long tp_mask;
SI tp_mask get_mask(tp_vector x){ return (tp_mask)((unsigned int)_mm_movemask_epi8(x)); }
SI unsigned int NONZERO_MASK(tp_vector x){ return _HAS_SSE4_1(!_mm_testz_si128(x,x),get_mask(x)); }
-SI tp_mask first_bit(tp_mask t,int y){ return __builtin_ctzl(t);}
+SI tp_mask first_bit(tp_mask t,int y){
+#ifdef USE_SSE2_NO_BSF
+ tp_mask ones=t^(t-1);
+ return tab[(c*ones)>>56];
+#else
+ return __builtin_ctzl(t);
+#endif
+}
SI tp_mask bit_i(int i){ return ((tp_mask) 1)<<i;}
SI tp_mask shift_down(tp_mask x,int y){ return x>>y;}
diff --git a/sysdeps/x86_64/multiarch/gen_stub b/sysdeps/x86_64/multiarch/gen_stub
index da7cdf3..86324c0 100755
--- a/sysdeps/x86_64/multiarch/gen_stub
+++ b/sysdeps/x86_64/multiarch/gen_stub
@@ -4,9 +4,9 @@ TP=$2
ARG=$3
ARGN=$4
BASE=$5
-EXT=$6
-
-for I in sse2 ssse3 sse4_1; do
+TYPES=$6
+EXT=$7
+for I in $TYPES; do
F="${J}_${I}.c"
IU=`echo $I | tr '[a-z]' '[A-Z]'`
JU=`echo $J | tr '[a-z]' '[A-Z]'`
@@ -45,7 +45,7 @@ echo "
" >> "${J}.c"
-for I in sse2 ssse3 sse4_1; do
+for I in $TYPES; do
echo "extern ${TP} __${J}_${I}(${ARG}) attribute_hidden;" >> "${J}.c"
done
echo " ${TP} ${FN}(${ARG});
@@ -67,28 +67,29 @@ $TP ${FN}(${ARG}){
${ALIASED}
" >> "${J}.c"
- echo "sysdep_routines += ${J} ${J}_sse2 ${J}_ssse3 ${J}_sse4_1
- CFLAGS-${J}_ssse3.c += -mssse3
- CFLAGS-${J}_sse4_1.c += -msse4"
-
+ echo "sysdep_routines += \\"
+for I in $TYPES; do
+ echo "${J}_${I}\\"
+done
+ echo "${J}"
}
-#fn strlen "size_t" "const char* n" "n" strlen
-fn strnlen "size_t" "const char* n,size_t ns" "n,ns" strlen
+#fn strlen "size_t" "const char* n" "n" strlen "sse2_no_bsf sse2"
+fn strnlen "size_t" "const char* n,size_t ns" "n,ns" strlen "sse2_no_bsf sse2"
-fn strstr "char *" "const char* s,const char *n" "s,n" strstr
-fn strcasestr "char *" "const char* s,const char *n" "s,n" strstr ext
-fn memmem "void *" "const void* s,size_t ss,const void *n, size_t ns" "s,ss,n,ns" strstr
+fn strstr "char *" "const char* s,const char *n" "s,n" strstr "sse2_no_bsf sse2 ssse3"
+fn strcasestr "char *" "const char* s,const char *n" "s,n" strstr "sse2_no_bsf sse2 ssse3" ext
+fn memmem "void *" "const void* s,size_t ss,const void *n, size_t ns" "s,ss,n,ns" strstr "sse2_no_bsf sse2 ssse3"
-#fn strchr "char *" "const char* s,int c" "s,c" strchr
+#fn strchr "char *" "const char* s,int c" "s,c" strchr "sse2_no_bsf sse2"
# fails because strch expands to builtin
-fn strrchr "char *" "const char* s,int c" "s,c" strchr
-fn strchrnul "char *" "const char* s,int c" "s,c" strchr ext
+fn strrchr "char *" "const char* s,int c" "s,c" strchr "sse2_no_bsf sse2"
+fn strchrnul "char *" "const char* s,int c" "s,c" strchr "sse2_no_bsf sse2" ext
-fn memchr "void *" "const void* s,int c,size_t ss" "s,c,ss" strchr
-fn rawmemchr "void *" "const void* s,int c" "s,c" strchr ext
-fn memrchr "void *" "const void* s,int c,size_t ss" "s,c,ss" strchr ext
+fn memchr "void *" "const void* s,int c,size_t ss" "s,c,ss" strchr "sse2_no_bsf sse2"
+fn rawmemchr "void *" "const void* s,int c" "s,c" strchr "sse2_no_bsf sse2" ext
+fn memrchr "void *" "const void* s,int c,size_t ss" "s,c,ss" strchr "sse2_no_bsf sse2" ext
echo "
#ifndef NO_ALIAS