This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] vectorized string functions


On Wed, Jul 11, 2012 at 06:34:11PM +0400, Dmitrieva Liubov wrote:
> Ondrej,
> 
> >> +sysdep_routines += strnlen strnlen_sse2 strnlen_ssse3 strnlen_sse4_1
> >> +  CFLAGS-strnlen_ssse3.c  += -mssse3
> >> +  CFLAGS-strnlen_sse4_1.c  += -msse4
> 
> It seems to me that sometimes you produce too many versions.

OK, I revised the generation to write only the versions that currently make
a difference.


> 
> Strnlen example:
> Objdump shows strnlen_sse2 and strnlen_ssse3 are exactly the same. (The
> GCC compiler does not generate any SSSE3 instructions.)
> strnlen_sse4_1 differs from the others only in using ptest instead of the
> pmovmskb + testl pair, but it's known that this has almost no effect on
> performance, while we still get the IFUNC wrapper overhead.

I would expect the bigger impact to come from
ptest being one byte shorter than the pmovmskb + testl pair.

> 
> >> delete mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
> 
> And we should check for regressions on an Atom machine before removing the
> Atom-specific no_bsf version.

I could add an Atom-specific bsf implementation. Only the first_bit function
needs to be changed. Do you know a fast implementation?

My current idea is to first compute t^(t-1), which gives a mask with ones at
exactly the lowest set bit and the trailing zeros below it, and then compute
the index by a multiplication and an array lookup.

diff --git a/sysdeps/x86_64/sse.h b/sysdeps/x86_64/sse.h
index 8db6d81..d77e56a 100644
--- a/sysdeps/x86_64/sse.h
+++ b/sysdeps/x86_64/sse.h
@@ -24,7 +24,14 @@ typedef unsigned long tp_mask;
 SI tp_mask get_mask(tp_vector x){  return  (tp_mask)((unsigned int)_mm_movemask_epi8(x)); }
 SI unsigned int NONZERO_MASK(tp_vector x){ return _HAS_SSE4_1(!_mm_testz_si128(x,x),get_mask(x));         }
 
-SI tp_mask first_bit(tp_mask t,int y){ return __builtin_ctzl(t);}
+SI tp_mask first_bit(tp_mask t,int y){
+#ifdef USE_SSE2_NO_BSF
+ tp_mask ones=t^(t-1); 
+ return tab[(c*ones)>>56];
+#else
+ return __builtin_ctzl(t);
+#endif
+}
 
 SI tp_mask bit_i(int i){            return ((tp_mask) 1)<<i;}
 SI tp_mask shift_down(tp_mask x,int y){ return x>>y;}
diff --git a/sysdeps/x86_64/multiarch/gen_stub b/sysdeps/x86_64/multiarch/gen_stub
index da7cdf3..86324c0 100755
--- a/sysdeps/x86_64/multiarch/gen_stub
+++ b/sysdeps/x86_64/multiarch/gen_stub
@@ -4,9 +4,9 @@ TP=$2
 ARG=$3
 ARGN=$4
 BASE=$5
-EXT=$6
-
-for I in sse2 ssse3 sse4_1; do
+TYPES=$6
+EXT=$7
+for I in $TYPES; do
 F="${J}_${I}.c"
 IU=`echo $I | tr '[a-z]' '[A-Z]'`
 JU=`echo $J | tr '[a-z]' '[A-Z]'`
@@ -45,7 +45,7 @@ echo "
 
 " >> "${J}.c"
 
-for I in sse2 ssse3 sse4_1; do
+for I in $TYPES; do
   echo "extern ${TP} __${J}_${I}(${ARG}) attribute_hidden;" >> "${J}.c"
 done
 echo " ${TP} ${FN}(${ARG});
@@ -67,28 +67,29 @@ $TP ${FN}(${ARG}){
 ${ALIASED}
 " >> "${J}.c"
 
-  echo "sysdep_routines += ${J} ${J}_sse2 ${J}_ssse3 ${J}_sse4_1
-  CFLAGS-${J}_ssse3.c  += -mssse3
-  CFLAGS-${J}_sse4_1.c  += -msse4"
-
+  echo "sysdep_routines += \\"
+for I in $TYPES; do
+  echo "${J}_${I}\\"
+done
+  echo "${J}"
 }
-#fn strlen "size_t" "const char* n" "n"               strlen
-fn strnlen "size_t" "const char* n,size_t ns" "n,ns" strlen 
+#fn strlen "size_t" "const char* n" "n"              strlen "sse2_no_bsf sse2"
+fn strnlen "size_t" "const char* n,size_t ns" "n,ns" strlen "sse2_no_bsf sse2"
 
 
-fn strstr  "char *" "const char* s,const char *n"                      "s,n"       strstr
-fn strcasestr  "char *" "const char* s,const char *n"                  "s,n"       strstr ext
-fn memmem  "void *" "const void* s,size_t ss,const void *n, size_t ns" "s,ss,n,ns" strstr
+fn strstr  "char *" "const char* s,const char *n"                      "s,n"       strstr "sse2_no_bsf sse2 ssse3"
+fn strcasestr  "char *" "const char* s,const char *n"                  "s,n"       strstr "sse2_no_bsf sse2 ssse3" ext
+fn memmem  "void *" "const void* s,size_t ss,const void *n, size_t ns" "s,ss,n,ns" strstr "sse2_no_bsf sse2 ssse3"
 
-#fn strchr   "char *" "const char* s,int c" "s,c"  strchr
+#fn strchr   "char *" "const char* s,int c" "s,c"  strchr "sse2_no_bsf sse2"
 # fails because strch expands to builtin
 
-fn strrchr   "char *" "const char* s,int c" "s,c"  strchr 
-fn strchrnul "char *" "const char* s,int c" "s,c"  strchr ext
+fn strrchr   "char *" "const char* s,int c" "s,c"  strchr "sse2_no_bsf sse2"
+fn strchrnul "char *" "const char* s,int c" "s,c"  strchr "sse2_no_bsf sse2" ext
 
-fn memchr    "void *" "const void* s,int c,size_t ss" "s,c,ss"  strchr  
-fn rawmemchr "void *" "const void* s,int c" "s,c"  strchr ext
-fn memrchr   "void *" "const void* s,int c,size_t ss" "s,c,ss"  strchr  ext
+fn memchr    "void *" "const void* s,int c,size_t ss" "s,c,ss"  strchr  "sse2_no_bsf sse2" 
+fn rawmemchr "void *" "const void* s,int c" "s,c"               strchr  "sse2_no_bsf sse2" ext
+fn memrchr   "void *" "const void* s,int c,size_t ss" "s,c,ss"  strchr  "sse2_no_bsf sse2" ext
 
 echo "
 #ifndef NO_ALIAS


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]