This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch, master, updated. glibc-2.10-134-gaf263b8


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  af263b81541d1f4a10fc0862d0f3e3b9464534c1 (commit)
       via  ab6a873fe07b8ded403bc5a5ca73be5d04820d61 (commit)
      from  6cbbaa50aac809ad6e0692247876c82d58e466bf (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=af263b81541d1f4a10fc0862d0f3e3b9464534c1

commit af263b81541d1f4a10fc0862d0f3e3b9464534c1
Author: Ulrich Drepper <drepper@redhat.com>
Date:   Thu Jul 2 03:43:05 2009 -0700

    Whitespace fixes in last patch.

diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index bbc9979..9920b0e 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -178,7 +178,7 @@ STRCPY_SSSE3:
 LABEL(ashr_0):
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_aligned)
+	jbe	LABEL(strncpy_truncation_aligned)
 #endif
 	movdqa  (%rsi), %xmm1	   /* fetch first 16 bytes from rsi */
 	movdqa  %xmm1, (%rdi)	   /* store first 16 bytes into rdi */
@@ -266,7 +266,7 @@ LABEL(ashr_15_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $15, (%rsi, %rcx), %xmm3
@@ -285,7 +285,7 @@ LABEL(ashr_15_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $15, (%rsi, %rcx), %xmm3
@@ -322,7 +322,7 @@ LABEL(ashr_14_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $14, (%rsi, %rcx), %xmm3
@@ -341,7 +341,7 @@ LABEL(ashr_14_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $14, (%rsi, %rcx), %xmm3
@@ -378,7 +378,7 @@ LABEL(ashr_13_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $13, (%rsi, %rcx), %xmm3
@@ -397,7 +397,7 @@ LABEL(ashr_13_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $13, (%rsi, %rcx), %xmm3
@@ -434,7 +434,7 @@ LABEL(ashr_12_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $12, (%rsi, %rcx), %xmm3
@@ -453,7 +453,7 @@ LABEL(ashr_12_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $12, (%rsi, %rcx), %xmm3
@@ -490,7 +490,7 @@ LABEL(ashr_11_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $11, (%rsi, %rcx), %xmm3
@@ -509,7 +509,7 @@ LABEL(ashr_11_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $11, (%rsi, %rcx), %xmm3
@@ -546,7 +546,7 @@ LABEL(ashr_10_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $10, (%rsi, %rcx), %xmm3
@@ -565,7 +565,7 @@ LABEL(ashr_10_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $10, (%rsi, %rcx), %xmm3
@@ -602,7 +602,7 @@ LABEL(ashr_9_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $9, (%rsi, %rcx), %xmm3
@@ -621,7 +621,7 @@ LABEL(ashr_9_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $9, (%rsi, %rcx), %xmm3
@@ -658,7 +658,7 @@ LABEL(ashr_8_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $8, (%rsi, %rcx), %xmm3
@@ -677,7 +677,7 @@ LABEL(ashr_8_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $8, (%rsi, %rcx), %xmm3
@@ -714,7 +714,7 @@ LABEL(ashr_7_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $7, (%rsi, %rcx), %xmm3
@@ -733,7 +733,7 @@ LABEL(ashr_7_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $7, (%rsi, %rcx), %xmm3
@@ -770,7 +770,7 @@ LABEL(ashr_6_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $6, (%rsi, %rcx), %xmm3
@@ -789,7 +789,7 @@ LABEL(ashr_6_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $6, (%rsi, %rcx), %xmm3
@@ -826,7 +826,7 @@ LABEL(ashr_5_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $5, (%rsi, %rcx), %xmm3
@@ -845,7 +845,7 @@ LABEL(ashr_5_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $5, (%rsi, %rcx), %xmm3
@@ -883,7 +883,7 @@ LABEL(ashr_4_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $4, (%rsi, %rcx), %xmm3
@@ -902,7 +902,7 @@ LABEL(ashr_4_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $4, (%rsi, %rcx), %xmm3
@@ -940,7 +940,7 @@ LABEL(ashr_3_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $3, (%rsi, %rcx), %xmm3
@@ -959,7 +959,7 @@ LABEL(ashr_3_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $3, (%rsi, %rcx), %xmm3
@@ -997,7 +997,7 @@ LABEL(ashr_2_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $2, (%rsi, %rcx), %xmm3
@@ -1016,7 +1016,7 @@ LABEL(ashr_2_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $2, (%rsi, %rcx), %xmm3
@@ -1054,7 +1054,7 @@ LABEL(ashr_1_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
 	palignr $1, (%rsi, %rcx), %xmm3
@@ -1072,7 +1072,7 @@ LABEL(ashr_1_use_ssse3):
 	jnz	LABEL(unaligned_exit)
 #ifdef USE_AS_STRNCPY
 	sub	$16, %r8
- 	jbe	LABEL(strncpy_truncation_unaligned)
+	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 	palignr $1, (%rsi, %rcx), %xmm3
 	movdqa  %xmm3, (%rdi, %rcx)

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ab6a873fe07b8ded403bc5a5ca73be5d04820d61

commit ab6a873fe07b8ded403bc5a5ca73be5d04820d61
Author: H.J. Lu <hongjiu.lu@intel.com>
Date:   Thu Jul 2 03:39:03 2009 -0700

    SSSE3 strcpy/stpcpy for x86-64
    
    This patch adds SSSE3 strcpy/stpcpy. I got up to 4X speed up on Core 2
    and Core i7.  I disabled it on Atom since SSSE3 version is slower for
    shorter (<64byte) data.

diff --git a/ChangeLog b/ChangeLog
index 4700e7d..b3c403d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2009-06-30  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* string/stpncpy.c (STPNCPY): New.  Defined if not defined.
+	(__stpncpy): Renamed to ...
+	(STPNCPY): This.
+	(stpncpy): Create alias only if STPNCPY is not defined.
+	* string/strncpy.c (STRNCPY): New.  Defined to strncpy if not
+	defined.
+	(strncpy): Renamed to ...
+	(STRNCPY): This.
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	 stpncpy-c strncpy-c for string.
+	* sysdeps/x86_64/multiarch/stpcpy.S: New file.
+	* sysdeps/x86_64/multiarch/stpncpy-c.c: New file.
+	* sysdeps/x86_64/multiarch/stpncpy.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy.S: New file.
+	* sysdeps/x86_64/multiarch/strncpy-c.c: New file.
+	* sysdeps/x86_64/multiarch/strncpy.S: New file.
+
 2009-07-02  Ulrich Drepper  <drepper@redhat.com>
 
 	* malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Add full barrier when
diff --git a/string/stpncpy.c b/string/stpncpy.c
index 164d0f1..2ebab33 100644
--- a/string/stpncpy.c
+++ b/string/stpncpy.c
@@ -28,17 +28,19 @@
 # include <sys/types.h>
 #endif
 
-#ifndef weak_alias
-# define __stpncpy stpncpy
+#ifndef STPNCPY
+# ifdef weak_alias
+#  define STPNCPY	__stpncpy
+weak_alias (__stpncpy, stpncpy)
+# else
+#  define STPNCPY	stpncpy
+# endif
 #endif
 
 /* Copy no more than N characters of SRC to DEST, returning the address of
    the terminating '\0' in DEST, if any, or else DEST + N.  */
 char *
-__stpncpy (dest, src, n)
-     char *dest;
-     const char *src;
-     size_t n;
+STPNCPY (char *dest, const char *src, size_t n)
 {
   char c;
   char *s = dest;
@@ -96,5 +98,4 @@ __stpncpy (dest, src, n)
 }
 #ifdef weak_alias
 libc_hidden_def (__stpncpy)
-weak_alias (__stpncpy, stpncpy)
 #endif
diff --git a/string/strncpy.c b/string/strncpy.c
index f32612e..2274d7d 100644
--- a/string/strncpy.c
+++ b/string/strncpy.c
@@ -21,11 +21,12 @@
 
 #undef strncpy
 
+#ifndef STRNCPY
+#define STRNCPY strncpy
+#endif
+
 char *
-strncpy (s1, s2, n)
-     char *s1;
-     const char *s2;
-     size_t n;
+STRNCPY (char *s1, const char *s2, size_t n)
 {
   reg_char c;
   char *s = s1;
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 1c35e1f..127592a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,5 +4,5 @@ gen-as-const-headers += ifunc-defines.sym
 endif
 
 ifeq ($(subdir),string)
-sysdep_routines += strncmp-c
+sysdep_routines += stpncpy-c strncpy-c strncmp-c
 endif
diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S
new file mode 100644
index 0000000..b63d308
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy.S
@@ -0,0 +1,7 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c
new file mode 100644
index 0000000..2fde77d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-c.c
@@ -0,0 +1,8 @@
+#define STPNCPY __stpncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2);
+#endif
+
+#include "stpncpy.c"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S
new file mode 100644
index 0000000..ff89a89
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy.S
@@ -0,0 +1,6 @@
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
new file mode 100644
index 0000000..bbc9979
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -0,0 +1,1917 @@
+/* strcpy with SSSE3
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+#  define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3	__stpncpy_ssse3
+#  define STRCPY_SSE2	__stpncpy_sse2
+#  define __GI_STRCPY	__GI_stpncpy
+# else
+#  define STRCPY_SSSE3	__stpcpy_ssse3
+#  define STRCPY_SSE2	__stpcpy_sse2
+#  define __GI_STRCPY	__GI_stpcpy
+#  define __GI___STRCPY	__GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+#  define STRCPY_SSSE3	__strncpy_ssse3
+#  define STRCPY_SSE2	__strncpy_sse2
+#  define __GI_STRCPY	__GI_strncpy
+# else
+#  define STRCPY_SSSE3	__strcpy_ssse3
+#  define STRCPY_SSE2	__strcpy_sse2
+#  define __GI_STRCPY	__GI_strcpy
+# endif
+#endif
+
+#ifndef LABEL
+#define LABEL(l) L(l)
+#endif
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(STRCPY)
+	.type	STRCPY, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	STRCPY_SSE2(%rip), %rax
+	testl	$(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jz	3f
+/* Avoid SSSE3 strcpy on Atom since it is slow.  */
+	cmpl	$1, __cpu_features+KIND_OFFSET(%rip)
+	jne	2f
+	cmpl	$6, __cpu_features+FAMILY_OFFSET(%rip)
+	jne	2f
+	cmpl	$28, __cpu_features+MODEL_OFFSET(%rip)
+	jz	3f
+2:	leaq	STRCPY_SSSE3(%rip), %rax
+3:	ret
+END(STRCPY)
+
+	.section .text.ssse3,"ax",@progbits
+STRCPY_SSSE3:
+	cfi_startproc
+	CALL_MCOUNT
+
+/*
+ * This implementation uses SSE to copy up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRNCPY
+	test    %rdx, %rdx
+	jz      LABEL(strncpy_exitz)
+	mov     %rdx, %r8
+#else
+	xor	%edx, %edx
+#endif
+	mov	%esi, %ecx
+	and	$0xfffffffffffffff0, %rsi	/*force rsi 16 byte align*/
+	and	$15, %ecx
+	mov	%rdi, %rax			/*store return parameter*/
+
+
+	pxor	%xmm0, %xmm0			/* clear %xmm0 */
+	pcmpeqb	(%rsi), %xmm0			/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
+	pmovmskb %xmm0, %edx			/* move each byte mask of %xmm0 to edx*/
+	shr	%cl, %edx			/* get real bits left in edx*/
+	test	%edx, %edx			/* edx must be 0 if there is no null char from rsi+%rcx */
+	jnz	LABEL(less16bytes)
+
+#ifdef USE_AS_STRNCPY
+	lea	-16(%r8,%rcx), %r11
+	cmp	$0, %r11
+	jle	LABEL(less16bytes)		/* if r8 + rcx <= 16, branch to less16bytes.  */
+#endif
+
+	mov	%rcx, %r9
+	or	%edi, %ecx
+	and	$15, %ecx
+	lea	-16(%r9), %r10
+	jz	LABEL(ashr_0)			/* ecx must be 0 if offset of rsi and rdi is 16 byte align*/
+
+	neg	%r10				/* store the rest in rsi aligned 16 bytes for unaligned_exit*/
+
+	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation*/
+	pcmpeqb	16(%rsi), %xmm0			/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(less32bytes)
+	/*
+	* at least 16 byte available to fill destination rdi
+	*/
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(less32bytes_strncpy_truncation)
+#endif
+	mov	(%rsi, %r9), %rdx
+	mov	%rdx, (%rdi)
+	mov	8(%rsi, %r9), %rdx
+	mov	%rdx, 8(%rdi)
+
+	/*
+	* so far destatination rdi may be aligned by 16, re-calculate rsi to jump
+	* crossponding case
+	* rcx is offset of rsi
+	* rax is offset of rdi
+	*/
+
+	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
+	mov	%rax, %rdx			/* rax store orignal rdi */
+	xor	%rdi, %rdx			/* equal to and $15, %rdx */
+#ifdef USE_AS_STRNCPY
+	add     %rdx, %r8
+#endif
+
+	add	$16, %rdi			/* next 16 bytes for rdi */
+	sub	%rdx, %r9
+
+	lea	16(%r9, %rsi), %rsi		/*re-calculate rsi by (16 - rdx)+ rcx */
+	mov	%esi, %ecx			/*store offset of rsi */
+	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
+
+	and	$15, %ecx			/* ecx must be 0 if rdx is equal to rcx*/
+	jz	LABEL(ashr_0)
+
+	lea	-16(%rcx), %r10
+	mov	%rcx, %r9
+	neg	%r10
+	lea	LABEL(unaligned_table)(%rip), %r11
+	movslq  (%r11, %rcx,4), %rcx
+	lea	(%r11, %rcx), %rcx
+	jmp	*%rcx
+
+ /*
+ * The following cases will be handled by ashr_0 & ashr_0_start
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
+ *	0		    0		  0		 ashr_0
+ *	n(1~15)	     n(1~15)	   0		 ashr_0_start
+ *
+ */
+	.p2align 5
+LABEL(ashr_0):
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi), %xmm1	   /* fetch first 16 bytes from rsi */
+	movdqa  %xmm1, (%rdi)	   /* store first 16 bytes into rdi */
+	add     $16, %rsi
+	add     $16, %rdi
+	pcmpeqb  (%rsi), %xmm0		   /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
+	pmovmskb  %xmm0, %edx		   /* move each byte mask of %xmm0 to edx*/
+
+	test    %edx, %edx		  /* edx must be 0 if there is no null char in rsi*/
+	jnz	LABEL(aligned_16bytes)
+
+LABEL(ashr_0_loop):
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jz	LABEL(ashr_0_loop)
+
+	jmp	LABEL(aligned_exit)
+        .p2align 4
+
+/*
+ * The following cases will be handled by ashr_15
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(15)		n - 15		15((16 - (n -15) + n)%16	 ashr_15
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_15):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_15_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $15, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $15, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_15_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_14
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(14~15)		n - 14		14((16 - (n -14) + n)%16	 ashr_14
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_14):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_14_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $14, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $14, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_14_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_13
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(13~15)		n - 13		13((16 - (n -13) + n)%16	 ashr_13
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_13):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_13_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $13, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $13, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_13_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_12
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(12~15)		n - 12		12((16 - (n -12) + n)%16	 ashr_12
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_12):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_12_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $12, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $12, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_12_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_11
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(11~15)		n - 11		11((16 - (n -11) + n)%16	 ashr_11
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_11):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_11_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $11, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $11, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_11_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_10
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(10~15)		n - 10		10((16 - (n -10) + n)%16	 ashr_10
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_10):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_10_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $10, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $10, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_10_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_9
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(9~15)		n - 9		9((16 - (n -9) + n)%16	 ashr_9
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_9):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_9_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $9, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $9, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_9_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_8
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(8~15)		n - 8		8((16 - (n -8) + n)%16	 ashr_8
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_8):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_8_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $8, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $8, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_8_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_7
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(7~15)		n - 7		7((16 - (n -7) + n)%16	 ashr_7
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_7):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	.p2align 4
+
+LABEL(ashr_7_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $7, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $7, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_7_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_6
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(6~15)		n - 6		6((16 - (n -6) + n)%16	 ashr_6
+ *
+ * Based on above operation , start from  (%r9 + rsi) to the left of this cache bank, there is no null byte
+ */
+	.p2align 4
+LABEL(ashr_6):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_6_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $6, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $6, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_6_use_ssse3)
+
+/*
+ * The following cases will be handled by ashr_5
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(5~15)		n - 5		5 ((16 - (n - 5) + n) % 16)	 ashr_5
+ *
+ * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte.
+ */
+	.p2align 4
+LABEL(ashr_5):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_5_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $5, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $5, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_5_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_4
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(4~15)		n - 4		4 ((16 - (n - 4) + n) % 16)	 ashr_4
+ *
+ * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte.
+ */
+	.p2align 4
+LABEL(ashr_4):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_4_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $4, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $4, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_4_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_3
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(3~15)		n - 3		3 ((16 - (n - 3) + n) % 16)	 ashr_3
+ *
+ * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte.
+ */
+	.p2align 4
+LABEL(ashr_3):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_3_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $3, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $3, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_3_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_2
+ *  rcx(offset of rsi)  rax(offset of rdi)	relative offset  	  corresponding case
+ *      n(2~15)		n - 2		2 ((16 - (n - 2) + n) % 16)	 ashr_2
+ *
+ * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte.
+ */
+	.p2align 4
+LABEL(ashr_2):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_2_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $2, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $2, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_2_use_ssse3)
+
+/*
+ *
+ * The following cases will be handled by ashr_1
+ *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  		corresponding case
+ *	n(1~15)		n - 1	   	1 ((16 - (n - 1) + n) % 16)	 ashr_1
+ *
+ * Based on the above operation, starting from (%r9 + rsi) to the left of this cache bank, there is no null byte.
+ */
+	.p2align 4
+LABEL(ashr_1):
+	xor	%ecx, %ecx				/*clear ecx */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	.p2align 4
+LABEL(ashr_1_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	palignr $1, (%rsi, %rcx), %xmm3
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+
+	movdqa  16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+	palignr $1, (%rsi, %rcx), %xmm3
+	movdqa  %xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_1_use_ssse3)
+
+	.p2align 4
+LABEL(less32bytes):
+	xor	%ecx, %ecx
+LABEL(unaligned_exit):
+	add	%r9, %rsi		/* r9 stores original offset of rsi*/
+	mov	%rcx, %r9
+	mov	%r10, %rcx
+	shl	%cl, %edx		/* after shl, calculate the exact number to be filled*/
+	mov	%r9, %rcx
+	.p2align 4
+LABEL(aligned_exit):
+	add	%rcx, %rdi		/*locate exact address for rdi */
+LABEL(less16bytes):
+	add	%rcx, %rsi		/*locate exact address for rsi */
+LABEL(aligned_16bytes):
+#ifdef USE_AS_STRNCPY
+	mov     $1, %r9d
+	lea     -1(%r8), %rcx
+	shl     %cl, %r9d
+	cmp     $32, %r8
+	ja      LABEL(strncpy_tail)
+	or      %r9d, %edx
+LABEL(strncpy_tail):
+#endif
+	bsf	%rdx, %rcx		/*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/
+	lea	LABEL(tail_table)(%rip), %r11
+	movslq	(%r11, %rcx,4), %rcx
+	lea	(%r11, %rcx), %rcx
+	jmp	*%rcx
+
+#ifdef USE_AS_STRNCPY
+	.p2align 4
+LABEL(less32bytes_strncpy_truncation):
+	xor     %ecx, %ecx
+LABEL(strncpy_truncation_unaligned):
+	add      %r9, %rsi
+LABEL(strncpy_truncation_aligned):
+	add      %rcx, %rdi
+	add      %rcx, %rsi
+	add     $16, %r8
+	lea     -1(%r8), %rcx
+	lea     LABEL(tail_table)(%rip), %r11
+	movslq  (%r11, %rcx,4), %rcx
+	lea     (%r11, %rcx), %rcx
+	jmp     *%rcx
+	.p2align 4
+LABEL(strncpy_exitz):
+	mov     %rdi, %rax
+	ret
+#endif
+
+#ifdef USE_AS_STRNCPY
+	.p2align 4
+LABEL(strncpy_fill_tail):
+	mov	%rax, %rdx
+	movzx	%cl, %rax
+	mov	%r8, %rcx
+	add	%rax, %rdi
+	xor	%eax, %eax
+	shr	$3, %ecx
+	jz	LABEL(strncpy_fill_less_8)
+
+	rep	stosq
+LABEL(strncpy_fill_less_8):
+	mov	%r8, %rcx
+	and	$7, %ecx
+	jz	LABEL(strncpy_fill_return)
+LABEL(strncpy_fill_less_7):
+	sub	$1, %ecx
+	mov	%al, (%rdi, %rcx)
+	jnz	LABEL(strncpy_fill_less_7)
+LABEL(strncpy_fill_return):
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rdx)
+	sbb	$-1, %rdx
+#endif
+	mov	%rdx, %rax
+	ret
+#endif
+	.p2align 4
+LABEL(tail_0):
+	mov	(%rsi), %cl
+	mov	%cl, (%rdi)
+#ifdef USE_AS_STPCPY
+	mov	%rdi, %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$1, %cl
+	sub	$1, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_1):
+	mov	(%rsi), %cx
+	mov	%cx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$2, %cl
+	sub	$2, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_2):
+	mov	(%rsi), %cx
+	mov	%cx, (%rdi)
+	mov	1(%rsi), %cx
+	mov	%cx, 1(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$3, %cl
+	sub	$3, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_3):
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$4, %cl
+	sub	$4, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_4):
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	1(%rsi), %edx
+	mov	%edx, 1(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	4(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$5, %cl
+	sub	$5, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_5):
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	2(%rsi), %edx
+	mov	%edx, 2(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	5(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$6, %cl
+	sub	$6, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_6):
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	3(%rsi), %edx
+	mov	%edx,3(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	6(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$7, %cl
+	sub	$7, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_7):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+#ifdef USE_AS_STPCPY
+	lea	7(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$8, %cl
+	sub	$8, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_8):
+
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	5(%rsi), %edx
+	mov	%edx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	8(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$9, %cl
+	sub	$9, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_9):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	6(%rsi), %edx
+	mov	%edx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	9(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$10, %cl
+	sub	$10, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_10):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	7(%rsi), %edx
+	mov	%edx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	10(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$11, %cl
+	sub	$11, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_11):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %edx
+	mov	%edx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	11(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$12, %cl
+	sub	$12, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_12):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	5(%rsi), %rcx
+	mov	%rcx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	12(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$13, %cl
+	sub	$13, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_13):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	6(%rsi), %rcx
+	mov	%rcx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	13(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$14, %cl
+	sub	$14, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_14):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	7(%rsi), %rcx
+	mov	%rcx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	14(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$15, %cl
+	sub	$15, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+LABEL(tail_15):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	15(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$16, %cl
+	sub	$16, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+
+	ret
+
+	.p2align 4
+LABEL(tail_16):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %cl
+	mov	%cl, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	16(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$17, %cl
+	sub	$17, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_17):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %cx
+	mov	%cx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	17(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$18, %cl
+	sub	$18, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_18):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	15(%rsi), %ecx
+	mov	%ecx,15(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	18(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$19, %cl
+	sub	$19, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_19):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %ecx
+	mov	%ecx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	19(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$20, %cl
+	sub	$20, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_20):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	13(%rsi), %rcx
+	mov	%rcx, 13(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	20(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$21, %cl
+	sub	$21, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_21):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	14(%rsi), %rcx
+	mov	%rcx, 14(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	21(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$22, %cl
+	sub	$22, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_22):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	15(%rsi), %rcx
+	mov	%rcx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	22(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$23, %cl
+	sub	$23, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_23):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	23(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$24, %cl
+	sub	$24, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+
+	ret
+
+	.p2align 4
+LABEL(tail_24):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	21(%rsi), %edx
+	mov	%edx, 21(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	24(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$25, %cl
+	sub	$25, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_25):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	22(%rsi), %edx
+	mov	%edx, 22(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	25(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$26, %cl
+	sub	$26, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_26):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	23(%rsi), %edx
+	mov	%edx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	26(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$27, %cl
+	sub	$27, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_27):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	24(%rsi), %edx
+	mov	%edx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	27(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$28, %cl
+	sub	$28, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	.p2align 4
+LABEL(tail_28):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	21(%rsi), %rdx
+	mov	%rdx, 21(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	28(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$29, %cl
+	sub	$29, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+
+	ret
+
+	.p2align 4
+LABEL(tail_29):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	22(%rsi), %rdx
+	mov	%rdx, 22(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	29(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$30, %cl
+	sub	$30, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+
+	ret
+
+
+	.p2align 4
+LABEL(tail_30):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	23(%rsi), %rdx
+	mov	%rdx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	30(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$31, %cl
+	sub	$31, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_31):
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	24(%rsi), %rdx
+	mov	%rdx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+	lea	31(%rdi), %rax
+#endif
+#ifdef USE_AS_STRNCPY
+	mov	$32, %cl
+	sub	$32, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#ifdef USE_AS_STPCPY
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+#endif
+#endif
+	ret
+	cfi_endproc
+	.size	STRCPY_SSSE3, .-STRCPY_SSSE3
+
+	.p2align 4
+	.section .rodata.ssse3,"a",@progbits
+LABEL(tail_table):
+	.int	LABEL(tail_0) - LABEL(tail_table)
+	.int	LABEL(tail_1) - LABEL(tail_table)
+	.int	LABEL(tail_2) - LABEL(tail_table)
+	.int	LABEL(tail_3) - LABEL(tail_table)
+	.int	LABEL(tail_4) - LABEL(tail_table)
+	.int	LABEL(tail_5) - LABEL(tail_table)
+	.int	LABEL(tail_6) - LABEL(tail_table)
+	.int	LABEL(tail_7) - LABEL(tail_table)
+	.int	LABEL(tail_8) - LABEL(tail_table)
+	.int	LABEL(tail_9) - LABEL(tail_table)
+	.int	LABEL(tail_10) - LABEL(tail_table)
+	.int	LABEL(tail_11) - LABEL(tail_table)
+	.int	LABEL(tail_12) - LABEL(tail_table)
+	.int	LABEL(tail_13) - LABEL(tail_table)
+	.int	LABEL(tail_14) - LABEL(tail_table)
+	.int	LABEL(tail_15) - LABEL(tail_table)
+	.int	LABEL(tail_16) - LABEL(tail_table)
+	.int	LABEL(tail_17) - LABEL(tail_table)
+	.int	LABEL(tail_18) - LABEL(tail_table)
+	.int	LABEL(tail_19) - LABEL(tail_table)
+	.int	LABEL(tail_20) - LABEL(tail_table)
+	.int	LABEL(tail_21) - LABEL(tail_table)
+	.int	LABEL(tail_22) - LABEL(tail_table)
+	.int	LABEL(tail_23) - LABEL(tail_table)
+	.int	LABEL(tail_24) - LABEL(tail_table)
+	.int	LABEL(tail_25) - LABEL(tail_table)
+	.int	LABEL(tail_26) - LABEL(tail_table)
+	.int	LABEL(tail_27) - LABEL(tail_table)
+	.int	LABEL(tail_28) - LABEL(tail_table)
+	.int	LABEL(tail_29) - LABEL(tail_table)
+	.int	LABEL(tail_30) - LABEL(tail_table)
+	.int	LABEL(tail_31) - LABEL(tail_table)
+
+	.p2align 4
+LABEL(unaligned_table):
+	.int	LABEL(ashr_0) - LABEL(unaligned_table)
+	.int	LABEL(ashr_1) - LABEL(unaligned_table)
+	.int	LABEL(ashr_2) - LABEL(unaligned_table)
+	.int	LABEL(ashr_3) - LABEL(unaligned_table)
+	.int	LABEL(ashr_4) - LABEL(unaligned_table)
+	.int	LABEL(ashr_5) - LABEL(unaligned_table)
+	.int	LABEL(ashr_6) - LABEL(unaligned_table)
+	.int	LABEL(ashr_7) - LABEL(unaligned_table)
+	.int	LABEL(ashr_8) - LABEL(unaligned_table)
+	.int	LABEL(ashr_9) - LABEL(unaligned_table)
+	.int	LABEL(ashr_10) - LABEL(unaligned_table)
+	.int	LABEL(ashr_11) - LABEL(unaligned_table)
+	.int	LABEL(ashr_12) - LABEL(unaligned_table)
+	.int	LABEL(ashr_13) - LABEL(unaligned_table)
+	.int	LABEL(ashr_14) - LABEL(unaligned_table)
+	.int	LABEL(ashr_15) - LABEL(unaligned_table)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCPY_SSE2, @function; \
+	STRCPY_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+	.globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
+#endif
+
+#ifndef USE_AS_STRNCPY
+#include "../strcpy.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
new file mode 100644
index 0000000..296c32c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_sse2
+#ifdef SHARED
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2);
+#endif
+
+#include "strncpy.c"
diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S
new file mode 100644
index 0000000..327a4ce
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy.S
@@ -0,0 +1,3 @@
+#define STRCPY strncpy
+#define USE_AS_STRNCPY
+#include "strcpy.S"

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                            |   19 +
 string/stpncpy.c                     |   15 +-
 string/strncpy.c                     |    9 +-
 sysdeps/x86_64/multiarch/Makefile    |    2 +-
 sysdeps/x86_64/multiarch/stpcpy.S    |    7 +
 sysdeps/x86_64/multiarch/stpncpy-c.c |    8 +
 sysdeps/x86_64/multiarch/stpncpy.S   |    6 +
 sysdeps/x86_64/multiarch/strcpy.S    | 1917 ++++++++++++++++++++++++++++++++++
 sysdeps/x86_64/multiarch/strncpy-c.c |    8 +
 sysdeps/x86_64/multiarch/strncpy.S   |    3 +
 10 files changed, 1982 insertions(+), 12 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/stpcpy.S
 create mode 100644 sysdeps/x86_64/multiarch/stpncpy-c.c
 create mode 100644 sysdeps/x86_64/multiarch/stpncpy.S
 create mode 100644 sysdeps/x86_64/multiarch/strcpy.S
 create mode 100644 sysdeps/x86_64/multiarch/strncpy-c.c
 create mode 100644 sysdeps/x86_64/multiarch/strncpy.S


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]