This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

PATCH: Add SSE4 strcspn/strpbrk/strspn


Hi,

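This patch adds SSE4.2 versions of strcspn/strpbrk/strspn, written with
SSE4 intrinsics and built only if gcc supports -msse4.  They are up to 60x
faster on Intel Core i7 when the accept/reject string is at most 16 bytes
long; longer accept/reject strings fall back to the SSE2 versions.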

Thanks.


H.J.
---
2009-07-02  H.J. Lu  <hongjiu.lu@intel.com>

	* config.h.in (HAVE_SSE4_SUPPORT): New.

	* config.make.in (config-cflags-sse4): New.

	* configure.in: Substitute libc_cv_cc_sse4.
	* configure: Regenerated.

	* sysdeps/i386/configure.in: Set libc_cv_cc_sse4 and
	HAVE_SSE4_SUPPORT.
	* sysdeps/i386/configure: Regenerated.

	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	strcspn-c strpbrk-c strspn-c for string if gcc supports SSE4.

	* sysdeps/x86_64/multiarch/strcspn-c.c: New.
	* sysdeps/x86_64/multiarch/strcspn.S: Likewise.
	* sysdeps/x86_64/multiarch/strpbrk-c.c: Likewise.
	* sysdeps/x86_64/multiarch/strpbrk.S: Likewise.
	* sysdeps/x86_64/multiarch/strspn-c.c: Likewise.
	* sysdeps/x86_64/multiarch/strspn.S: Likewise.

diff --git a/config.h.in b/config.h.in
index 8dbc224..4ddab7d 100644
--- a/config.h.in
+++ b/config.h.in
@@ -129,6 +129,9 @@
 /* Define if binutils support TLS handling.  */
 #undef	HAVE_TLS_SUPPORT
 
+/* Define if gcc supports SSE4.  */
+#undef	HAVE_SSE4_SUPPORT
+
 /* Define if the compiler's exception support is based on libunwind.  */
 #undef	HAVE_CC_WITH_LIBUNWIND
 
diff --git a/config.make.in b/config.make.in
index e48ea26..5fb5c81 100644
--- a/config.make.in
+++ b/config.make.in
@@ -34,6 +34,8 @@ config-sysdirs = @sysnames@
 cflags-cpu = @libc_cv_cc_submachine@
 asflags-cpu = @libc_cv_cc_submachine@
 
+config-cflags-sse4 = @libc_cv_cc_sse4@
+
 defines = @DEFINES@
 sysincludes = @SYSINCLUDES@
 c++-sysincludes = @CXX_SYSINCLUDES@
diff --git a/configure b/configure
index 88cf4fd..e30778f 100755
--- a/configure
+++ b/configure
@@ -657,6 +657,7 @@ xcoff
 elf
 ldd_rewrite_script
 use_ldconfig
+libc_cv_cc_sse4
 libc_cv_cpp_asm_debuginfo
 libc_cv_forced_unwind
 libc_cv_rootsbindir
@@ -8744,6 +8745,7 @@ fi
 
 
 
+
 if test $elf = yes; then
   cat >>confdefs.h <<\_ACEOF
 #define HAVE_ELF 1
diff --git a/configure.in b/configure.in
index 6a92bd8..216cdc9 100644
--- a/configure.in
+++ b/configure.in
@@ -2259,6 +2259,7 @@ AC_SUBST(libc_cv_forced_unwind)
 
 dnl sysdeps/CPU/configure.in checks set this via arch-specific asm tests
 AC_SUBST(libc_cv_cpp_asm_debuginfo)
+AC_SUBST(libc_cv_cc_sse4)
 
 AC_SUBST(use_ldconfig)
 AC_SUBST(ldd_rewrite_script)
diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure
index d1d4dc1..cbc8cd9 100755
--- a/sysdeps/i386/configure
+++ b/sysdeps/i386/configure
@@ -1,10 +1,42 @@
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+  as_echo='printf %s\n'
+  as_echo_n='printf %s'
+else
+  if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+    as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+    as_echo_n='/usr/ucb/echo -n'
+  else
+    as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+    as_echo_n_body='eval
+      arg=$1;
+      case $arg in
+      *"$as_nl"*)
+	expr "X$arg" : "X\\(.*\\)$as_nl";
+	arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+      esac;
+      expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+    '
+    export as_echo_n_body
+    as_echo_n='sh -c $as_echo_n_body as_echo'
+  fi
+  export as_echo_body
+  as_echo='sh -c $as_echo_body as_echo'
+fi
+
 # This file is generated from configure.in by Autoconf.  DO NOT EDIT!
  # Local configure fragment for sysdeps/i386.
 
-echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
-echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6
+{ $as_echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
+$as_echo_n "checking if -g produces usable source locations for assembler-with-cpp... " >&6; }
 if test "${libc_cv_cpp_asm_debuginfo+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
+  $as_echo_n "(cached) " >&6
 else
   cat > conftest.S <<EOF
 #include "confdefs.h"
@@ -27,7 +59,7 @@ if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5'
   { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
   (eval $ac_try) 2>&5
   ac_status=$?
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
   (exit $ac_status); }; } && {
    ac_pattern='conftest\.S'
    { ac_try='readelf --debug-dump=line conftest.o |
@@ -35,7 +67,7 @@ if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5'
   { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
   (eval $ac_try) 2>&5
   ac_status=$?
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
   (exit $ac_status); }; }
   }; then
   libc_cv_cpp_asm_debuginfo=yes
@@ -44,11 +76,36 @@ else
 fi
 rm -f conftest*
 fi
-echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5
-echo "${ECHO_T}$libc_cv_cpp_asm_debuginfo" >&6
+{ $as_echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5
+$as_echo "$libc_cv_cpp_asm_debuginfo" >&6; }
 if test $libc_cv_cpp_asm_debuginfo = yes; then
   cat >>confdefs.h <<\_ACEOF
 #define HAVE_CPP_ASM_DEBUGINFO 1
 _ACEOF
 
 fi
+
+{ $as_echo "$as_me:$LINENO: checking for SSE4 support" >&5
+$as_echo_n "checking for SSE4 support... " >&6; }
+if test "${libc_cv_cc_sse4+set}" = set; then
+  $as_echo_n "(cached) " >&6
+else
+  if { ac_try='${CC-cc} -msse4 -xc /dev/null -S -o /dev/null'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  libc_cv_cc_sse4=yes
+else
+  libc_cv_cc_sse4=no
+fi
+fi
+{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_sse4" >&5
+$as_echo "$libc_cv_cc_sse4" >&6; }
+if test $libc_cv_cc_sse4 = yes; then
+  cat >>confdefs.h <<\_ACEOF
+#define HAVE_SSE4_SUPPORT 1
+_ACEOF
+
+fi
diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in
index 028e1ae..44f53a5 100644
--- a/sysdeps/i386/configure.in
+++ b/sysdeps/i386/configure.in
@@ -33,3 +33,14 @@ rm -f conftest*])AC_SUBST(libc_cv_cpp_asm_debuginfo)
 if test $libc_cv_cpp_asm_debuginfo = yes; then
   AC_DEFINE(HAVE_CPP_ASM_DEBUGINFO)
 fi
+
+dnl Check if -msse4 works.
+AC_CACHE_CHECK(for SSE4 support, libc_cv_cc_sse4, [dnl
+if AC_TRY_COMMAND([${CC-cc} -msse4 -xc /dev/null -S -o /dev/null]); then
+  libc_cv_cc_sse4=yes
+else
+  libc_cv_cc_sse4=no
+fi])
+if test $libc_cv_cc_sse4 = yes; then
+  AC_DEFINE(HAVE_SSE4_SUPPORT)
+fi
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 127592a..71e85f0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -5,4 +5,10 @@ endif
 
 ifeq ($(subdir),string)
 sysdep_routines += stpncpy-c strncpy-c strncmp-c
+ifeq (yes,$(config-cflags-sse4))
+sysdep_routines += strcspn-c strpbrk-c strspn-c
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+endif
 endif
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
new file mode 100644
index 0000000..735ee8e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -0,0 +1,331 @@
+/* strcspn with SSE4 intrinsics
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <nmmintrin.h>
+#include <string.h>
+
+/* We use 0x2:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_POSITIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+   
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   A A A A A A A A A A A A A A A A
+
+   to find out if the first 16-byte data element has any byte of A and,
+   if so, the offset of the first such byte.  There are 3 cases:
+
+   1. The first 16byte data element has the byte A at the offset X.
+   2. The first 16byte data element has EOS and doesn't have the byte A.
+   3. The first 16byte data element is valid and doesn't have the byte A.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
+   case		ECX	CFlag	ZFlag	SFlag
+    1		 X	  1	 0/1	  0
+    2		16	  0	  1	  0
+    3		16	  0	  0	  0
+
+   We exit from the loop for cases 1 and 2 with jbe which branches
+   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
+   X for case 1.  */
+
+#ifndef STRCSPN_SSE2
+#define STRCSPN_SSE2 __strcspn_sse2
+#define STRCSPN_SSE42 __strcspn_sse42
+#endif
+
+extern
+#ifdef USE_AS_STRPBRK
+char * 
+#else
+size_t
+#endif
+STRCSPN_SSE2 (const char *, const char *);
+
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+__attribute__ ((section (".text.sse4.2")))
+STRCSPN_SSE42 (const char *s, const char *a)
+{
+  int offset;
+  const char *aligned;
+  __m128i mask, mask0, mask1;
+  __m128i value;
+  int index, length;
+  int cflag, zflag;
+
+  if (*a == 0)
+#ifdef USE_AS_STRPBRK
+    return NULL;
+#else
+    return strlen (s);
+#endif
+
+  offset = (int) ((size_t) a & 15);
+  if (offset != 0)
+    {
+      /* Load masks.  */
+      aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+      mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+      switch (offset)
+	{
+	case 1:
+	  mask = _mm_srli_si128 (mask0, 1);
+	  break;
+	case 2:
+	  mask = _mm_srli_si128 (mask0, 2);
+	  break;
+	case 3:
+	  mask = _mm_srli_si128 (mask0, 3);
+	  break;
+	case 4:
+	  mask = _mm_srli_si128 (mask0, 4);
+	  break;
+	case 5:
+	  mask = _mm_srli_si128 (mask0, 5);
+	  break;
+	case 6:
+	  mask = _mm_srli_si128 (mask0, 6);
+	  break;
+	case 7:
+	  mask = _mm_srli_si128 (mask0, 7);
+	  break;
+	case 8:
+	  mask = _mm_srli_si128 (mask0, 8);
+	  break;
+	case 9:
+	  mask = _mm_srli_si128 (mask0, 9);
+	  break;
+	case 10:
+	  mask = _mm_srli_si128 (mask0, 10);
+	  break;
+	case 11:
+	  mask = _mm_srli_si128 (mask0, 11);
+	  break;
+	case 12:
+	  mask = _mm_srli_si128 (mask0, 12);
+	  break;
+	case 13:
+	  mask = _mm_srli_si128 (mask0, 13);
+	  break;
+	case 14:
+	  mask = _mm_srli_si128 (mask0, 14);
+	  break;
+	case 15:
+	  mask = _mm_srli_si128 (mask0, 15);
+	  break;
+	}
+
+      /* Find where the NULL terminator is.  */
+      length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16 - offset)
+	{
+	  /* There is no NULL terminator.  */
+	  mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+	  index = _mm_cmpistri (mask1, mask1, 0x3a);
+	  length += index;
+
+	  /* Don't use SSE4 if the length of A > 16.  */
+	  if (length > 16)
+	    return STRCSPN_SSE2 (s, a);
+
+	  if (index != 0)
+	    {
+	      /* Combine mask0 and mask1.  */
+	      switch (offset)
+		{
+		case 1:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 1);
+		  break;
+		case 2:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 2);
+		  break;
+		case 3:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 3);
+		  break;
+		case 4:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 4);
+		  break;
+		case 5:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 5);
+		  break;
+		case 6:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 6);
+		  break;
+		case 7:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 7);
+		  break;
+		case 8:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 8);
+		  break;
+		case 9:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 9);
+		  break;
+		case 10:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 10);
+		  break;
+		case 11:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 11);
+		  break;
+		case 12:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 12);
+		  break;
+		case 13:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 13);
+		  break;
+		case 14:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 14);
+		  break;
+		case 15:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 15);
+		  break;
+		}
+	    }
+	}
+    }
+  else
+    {
+      /* A is aligned.  */
+      mask = _mm_load_si128 ((__m128i *) a);
+
+      /* Find where the NULL terminator is.  */
+      length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16)
+	{
+	  /* There is no NULL terminator.  Don't use SSE4 if the length
+	     of A > 16.  */
+	  if (a[16] != 0)
+	    return STRCSPN_SSE2 (s, a);
+	}
+    }
+
+  offset = (int) ((size_t) s & 15);
+  if (offset != 0)
+    {
+      /* Check partial string.  */
+      aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+      value = _mm_load_si128 ((__m128i *) aligned);
+
+      switch (offset)
+	{
+	case 1:
+	  value = _mm_srli_si128 (value, 1);
+	  break;
+	case 2:
+	  value = _mm_srli_si128 (value, 2);
+	  break;
+	case 3:
+	  value = _mm_srli_si128 (value, 3);
+	  break;
+	case 4:
+	  value = _mm_srli_si128 (value, 4);
+	  break;
+	case 5:
+	  value = _mm_srli_si128 (value, 5);
+	  break;
+	case 6:
+	  value = _mm_srli_si128 (value, 6);
+	  break;
+	case 7:
+	  value = _mm_srli_si128 (value, 7);
+	  break;
+	case 8:
+	  value = _mm_srli_si128 (value, 8);
+	  break;
+	case 9:
+	  value = _mm_srli_si128 (value, 9);
+	  break;
+	case 10:
+	  value = _mm_srli_si128 (value, 10);
+	  break;
+	case 11:
+	  value = _mm_srli_si128 (value, 11);
+	  break;
+	case 12:
+	  value = _mm_srli_si128 (value, 12);
+	  break;
+	case 13:
+	  value = _mm_srli_si128 (value, 13);
+	  break;
+	case 14:
+	  value = _mm_srli_si128 (value, 14);
+	  break;
+	case 15:
+	  value = _mm_srli_si128 (value, 15);
+	  break;
+	}
+
+      length = _mm_cmpistri (mask, value, 0x2);
+      /* No need to check ZFlag since ZFlag is always 1.  */
+      cflag = _mm_cmpistrc (mask, value, 0x2);
+      if (cflag)
+#ifdef USE_AS_STRPBRK
+	return (char *) (s + length);
+#else
+	return length;
+#endif
+      /* Find where the NULL terminator is.  */
+      index = _mm_cmpistri (value, value, 0x3a);
+      if (index < 16 - offset)
+#ifdef USE_AS_STRPBRK
+	return NULL;
+#else
+	return index;
+#endif
+      aligned += 16;
+    }
+  else
+    aligned = s;
+
+loop:
+  value = _mm_load_si128 ((__m128i *) aligned);
+  index = _mm_cmpistri (mask, value, 0x2);
+  cflag = _mm_cmpistrc (mask, value, 0x2);
+  zflag = _mm_cmpistrz (mask, value, 0x2);
+  if (cflag)
+#ifdef USE_AS_STRPBRK
+    return (char *) (aligned + index);
+#else
+    return (size_t) (aligned + index - s);
+#endif
+  if (zflag)
+#ifdef USE_AS_STRPBRK
+    return NULL;
+#else
+    {
+      /* Find where the NULL terminator is.  */
+      index = _mm_cmpistri (value, value, 0x3a);
+      return (size_t) (aligned + index - s);
+    }
+#endif
+  aligned += 16;
+  goto loop;
+}
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S
new file mode 100644
index 0000000..2e0a4e6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcspn.S
@@ -0,0 +1,81 @@
+/* Multiple versions of strcspn
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42	__strpbrk_sse42
+#define STRCSPN_SSE2	__strpbrk_sse2
+#define __GI_STRCSPN	__GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN		strcspn
+#define STRCSPN_SSE42	__strcspn_sse42
+#define STRCSPN_SSE2	__strcspn_sse2
+#define __GI_STRCSPN	__GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc.  Don't
+   define multiple versions for strpbrk in the static library since
+   strpbrk is needed before the initialization has happened.  */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc
+	.text
+ENTRY(STRCSPN)
+	.type	STRCSPN, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	STRCSPN_SSE2(%rip), %rax
+	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jz	2f
+	leaq	STRCSPN_SSE42(%rip), %rax
+2:	ret
+END(STRCSPN)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCSPN_SSE2, @function; \
+	.globl STRCSPN_SSE2; \
+	STRCSPN_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcspn calls through a PLT.
+   The speedup we get from using SSE4.2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#ifdef USE_AS_STRPBRK
+#include "../strpbrk.S"
+#else
+#include "../strcspn.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c
new file mode 100644
index 0000000..c58dcb5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strpbrk-c.c
@@ -0,0 +1,4 @@
+#define USE_AS_STRPBRK
+#define STRCSPN_SSE2 __strpbrk_sse2
+#define STRCSPN_SSE42 __strpbrk_sse42
+#include "strcspn-c.c"
diff --git a/sysdeps/x86_64/multiarch/strpbrk.S b/sysdeps/x86_64/multiarch/strpbrk.S
new file mode 100644
index 0000000..ed5bca6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strpbrk.S
@@ -0,0 +1,3 @@
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
new file mode 100644
index 0000000..75a048e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -0,0 +1,287 @@
+/* strspn with SSE4 intrinsics
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <nmmintrin.h>
+#include <string.h>
+
+/* We use 0x12:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_NEGATIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+   
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   A A A A A A A A A A A A A A A A
+
+   to find out if the first 16byte data element has any non-A byte and
+   the offset of the first byte.  There are 2 cases:
+
+   1. The first 16byte data element has the non-A byte, including
+      EOS, at the offset X.
+   2. The first 16byte data element is valid and doesn't have the non-A
+      byte.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
+
+   case		ECX	CFlag	ZFlag	SFlag
+    1		 X	  1	 0/1	  0
+    2		16	  0	  0	  0
+
+   We exit from the loop for case 1.  */
+
+extern size_t __strspn_sse2 (const char *, const char *);
+
+size_t
+__attribute__ ((section (".text.sse4.2")))
+__strspn_sse42 (const char *s, const char *a)
+{
+  int offset;
+  const char *aligned;
+  __m128i mask, mask0, mask1;
+  __m128i value;
+  int index, length;
+  int cflag;
+
+  if (*a == 0)
+    return 0;
+
+  offset = (int) ((size_t) a & 15);
+  if (offset != 0)
+    {
+      /* Load masks.  */
+      aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+      mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+      switch (offset)
+	{
+	case 1:
+	  mask = _mm_srli_si128 (mask0, 1);
+	  break;
+	case 2:
+	  mask = _mm_srli_si128 (mask0, 2);
+	  break;
+	case 3:
+	  mask = _mm_srli_si128 (mask0, 3);
+	  break;
+	case 4:
+	  mask = _mm_srli_si128 (mask0, 4);
+	  break;
+	case 5:
+	  mask = _mm_srli_si128 (mask0, 5);
+	  break;
+	case 6:
+	  mask = _mm_srli_si128 (mask0, 6);
+	  break;
+	case 7:
+	  mask = _mm_srli_si128 (mask0, 7);
+	  break;
+	case 8:
+	  mask = _mm_srli_si128 (mask0, 8);
+	  break;
+	case 9:
+	  mask = _mm_srli_si128 (mask0, 9);
+	  break;
+	case 10:
+	  mask = _mm_srli_si128 (mask0, 10);
+	  break;
+	case 11:
+	  mask = _mm_srli_si128 (mask0, 11);
+	  break;
+	case 12:
+	  mask = _mm_srli_si128 (mask0, 12);
+	  break;
+	case 13:
+	  mask = _mm_srli_si128 (mask0, 13);
+	  break;
+	case 14:
+	  mask = _mm_srli_si128 (mask0, 14);
+	  break;
+	case 15:
+	  mask = _mm_srli_si128 (mask0, 15);
+	  break;
+	}
+
+      /* Find where the NULL terminator is.  */
+      length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16 - offset)
+	{
+	  /* There is no NULL terminator.  */
+	  mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+	  index = _mm_cmpistri (mask1, mask1, 0x3a);
+	  length += index;
+
+	  /* Don't use SSE4 if the length of A > 16.  */
+	  if (length > 16)
+	    return __strspn_sse2 (s, a);
+
+	  if (index != 0)
+	    {
+	      /* Combine mask0 and mask1.  */
+	      switch (offset)
+		{
+		case 1:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 1);
+		  break;
+		case 2:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 2);
+		  break;
+		case 3:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 3);
+		  break;
+		case 4:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 4);
+		  break;
+		case 5:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 5);
+		  break;
+		case 6:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 6);
+		  break;
+		case 7:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 7);
+		  break;
+		case 8:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 8);
+		  break;
+		case 9:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 9);
+		  break;
+		case 10:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 10);
+		  break;
+		case 11:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 11);
+		  break;
+		case 12:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 12);
+		  break;
+		case 13:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 13);
+		  break;
+		case 14:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 14);
+		  break;
+		case 15:
+		  mask = _mm_alignr_epi8 (mask1, mask0, 15);
+		  break;
+		}
+	    }
+	}
+    }
+  else
+    {
+      /* A is aligned.  */
+      mask = _mm_load_si128 ((__m128i *) a);
+
+      /* Find where the NULL terminator is.  */
+      length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16)
+	{
+	  /* There is no NULL terminator.  Don't use SSE4 if the length
+	     of A > 16.  */
+	  if (a[16] != 0)
+	    return __strspn_sse2 (s, a);
+	}
+    }
+
+  offset = (int) ((size_t) s & 15);
+  if (offset != 0)
+    {
+      /* Check partial string.  */
+      aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+      value = _mm_load_si128 ((__m128i *) aligned);
+
+      switch (offset)
+	{
+	case 1:
+	  value = _mm_srli_si128 (value, 1);
+	  break;
+	case 2:
+	  value = _mm_srli_si128 (value, 2);
+	  break;
+	case 3:
+	  value = _mm_srli_si128 (value, 3);
+	  break;
+	case 4:
+	  value = _mm_srli_si128 (value, 4);
+	  break;
+	case 5:
+	  value = _mm_srli_si128 (value, 5);
+	  break;
+	case 6:
+	  value = _mm_srli_si128 (value, 6);
+	  break;
+	case 7:
+	  value = _mm_srli_si128 (value, 7);
+	  break;
+	case 8:
+	  value = _mm_srli_si128 (value, 8);
+	  break;
+	case 9:
+	  value = _mm_srli_si128 (value, 9);
+	  break;
+	case 10:
+	  value = _mm_srli_si128 (value, 10);
+	  break;
+	case 11:
+	  value = _mm_srli_si128 (value, 11);
+	  break;
+	case 12:
+	  value = _mm_srli_si128 (value, 12);
+	  break;
+	case 13:
+	  value = _mm_srli_si128 (value, 13);
+	  break;
+	case 14:
+	  value = _mm_srli_si128 (value, 14);
+	  break;
+	case 15:
+	  value = _mm_srli_si128 (value, 15);
+	  break;
+	}
+
+      length = _mm_cmpistri (mask, value, 0x12);
+      /* No need to check CFlag since it is always 1.  */
+      if (length < 16 - offset)
+	return length;
+      /* Find where the NULL terminator is.  */
+      index = _mm_cmpistri (value, value, 0x3a);
+      if (index < 16 - offset)
+	return length;
+      aligned += 16;
+    }
+  else
+    aligned = s;
+
+loop:
+  value = _mm_load_si128 ((__m128i *) aligned);
+  index = _mm_cmpistri (mask, value, 0x12);
+  cflag = _mm_cmpistrc (mask, value, 0x12);
+  if (cflag)
+    return (size_t) (aligned + index - s);
+  aligned += 16;
+  goto loop;
+}
diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S
new file mode 100644
index 0000000..6ac5167
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strspn.S
@@ -0,0 +1,62 @@
+/* Multiple versions of strspn
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(strspn)
+	.type	strspn, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__strspn_sse2(%rip), %rax
+	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jz	2f
+	leaq	__strspn_sse42(%rip), %rax
+2:	ret
+END(strspn)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strspn_sse2, @function; \
+	.globl __strspn_sse2; \
+	__strspn_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strspn_sse2, .-__strspn_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strspn calls through a PLT.
+   The speedup we get from using SSE4.2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strspn; __GI_strspn = __strspn_sse2
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#include "../strspn.S"


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]