This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch, master, updated. glibc-2.14-115-g8002999


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  80029994814f0718aef9568c90f04b3d9a31802c (commit)
       via  99710781cc47002612e609c7dc5f34692b64e9b3 (commit)
      from  7dc6bd90c569c49807462b0740b18e32fab4d8b7 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=80029994814f0718aef9568c90f04b3d9a31802c

commit 80029994814f0718aef9568c90f04b3d9a31802c
Author: Ulrich Drepper <drepper@gmail.com>
Date:   Tue Jul 19 17:27:09 2011 -0400

    Fix whitespaces

diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 1150281..c02c6f0 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -52,4 +52,3 @@ L(StartStrcpyPart):
 
 # include "strcpy-sse2-unaligned.S"
 #endif
-
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
index 66736a7..34b61b8 100644
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -556,4 +556,3 @@ L(StrncatExit8Bytes):
 # endif
 END (STRCAT)
 #endif
-
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index 6de8c47..e73778a 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -1742,7 +1742,7 @@ L(Unaligned64LeaveCase2):
 #  ifndef USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
 #  else
-        jnz	L(CopyFrom1To16Bytes)
+	jnz	L(CopyFrom1To16Bytes)
 #  endif
 	pcmpeqb	%xmm5, %xmm0
 	pmovmskb %xmm0, %rdx
@@ -1754,7 +1754,7 @@ L(Unaligned64LeaveCase2):
 #  ifndef USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
 #  else
-        jnz	L(CopyFrom1To16Bytes)
+	jnz	L(CopyFrom1To16Bytes)
 #  endif
 
 	pcmpeqb	%xmm6, %xmm0
@@ -1767,7 +1767,7 @@ L(Unaligned64LeaveCase2):
 #  ifndef USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
 #  else
-        jnz	L(CopyFrom1To16Bytes)
+	jnz	L(CopyFrom1To16Bytes)
 #  endif
 
 	pcmpeqb	%xmm7, %xmm0
@@ -1888,4 +1888,3 @@ L(FillTable):
 #  endif
 # endif
 #endif
-
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
index 57778cf..6048072 100644
--- a/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
@@ -141,7 +141,7 @@ L(align16_start):
 	pmovmskb %xmm3, %edx
 	test	%edx, %edx
 	jnz	L(exit64)
-	
+
 
 	test	$0x3f, %rax
 	jz	L(align64_loop)
@@ -183,10 +183,10 @@ L(align16_start):
 	.p2align 4
 	L(align64_loop):
 	movaps	(%rax),	%xmm4
-	pminub	16(%rax), 	%xmm4
-	movaps	32(%rax), 	%xmm5
-	pminub	48(%rax), 	%xmm5
-	add	$64, 	%rax
+	pminub	16(%rax),	%xmm4
+	movaps	32(%rax),	%xmm5
+	pminub	48(%rax),	%xmm5
+	add	$64,	%rax
 	pminub	%xmm4,	%xmm5
 	pcmpeqb	%xmm0,	%xmm5
 	pmovmskb %xmm5,	%edx
@@ -195,7 +195,7 @@ L(align16_start):
 
 
 	pcmpeqb	-64(%rax), %xmm0
-	sub	$80, 	%rax
+	sub	$80,	%rax
 	pmovmskb %xmm0, %edx
 	test	%edx, %edx
 	jnz	L(exit16)

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=99710781cc47002612e609c7dc5f34692b64e9b3

commit 99710781cc47002612e609c7dc5f34692b64e9b3
Author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
Date:   Tue Jul 19 17:11:54 2011 -0400

    Improve 64 bit strcat functions with SSE2/SSSE3

diff --git a/ChangeLog b/ChangeLog
index 0932ae5..e3dc2ee 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,32 @@
+2011-07-15  Liubov Dmitrieva  <liubov.dmitrieva@intel.com>
+
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	strcat-ssse3 strcat-sse2-unaligned strncat-ssse3
+	strncat-sse2-unaligned strncat-c strlen-sse2-pminub
+	* sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/strcat.S: New file.
+	* sysdeps/x86_64/multiarch/strncat.S: New file.
+	* sysdeps/x86_64/multiarch/strncat-c.c: New file.
+	* sysdeps/x86_64/multiarch/strcat-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/strncat-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S
+	(USE_AS_STRCAT): Define.
+	Add strcat and strncat support.
+	* sysdeps/x86_64/multiarch/strlen-no-bsf.S: Likewise.
+	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
+	* sysdeps/x86_64/multiarch/strlen-sse2-pminub.S: New file.
+	* string/strncat.c: Update.
+	(USE_AS_STRNCAT): Define.
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+	Turn on bit_Prefer_PMINUB_for_stringop for Intel Core i3, i5
+	and i7.
+	* sysdeps/x86_64/multiarch/init-arch.h
+	(bit_Prefer_PMINUB_for_stringop): New.
+	(index_Prefer_PMINUB_for_stringop): Likewise.
+	* sysdeps/x86_64/multiarch/strlen.S (strlen): Check
+	bit_Prefer_PMINUB_for_stringop.
+
 2011-07-19  Ulrich Drepper  <drepper@gmail.com>
 
 	* crypt/sha512.h (struct sha512_ctx): Move buffer into union and add
diff --git a/NEWS b/NEWS
index f3cead3..fb2c15e 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes.  2011-7-6
+GNU C Library NEWS -- history of user-visible changes.  2011-7-19
 Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc.
 See the end for copying conditions.
 
@@ -23,6 +23,9 @@ Version 2.15
 
 * Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64.
   Contributed by HJ Lu.
+
+* Improved strcat and strncat on x86-64.
+  Contributed by Liubov Dmitrieva.
 
 Version 2.14
 
diff --git a/string/strncat.c b/string/strncat.c
index 2e2de11..72d9d69 100644
--- a/string/strncat.c
+++ b/string/strncat.c
@@ -24,10 +24,12 @@
 typedef char reg_char;
 #endif
 
-#undef strncat
+#ifndef STRNCAT
+# define STRNCAT  strncat
+#endif
 
 char *
-strncat (s1, s2, n)
+STRNCAT (s1, s2, n)
      char *s1;
      const char *s2;
      size_t n;
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 88410b3..c959dd1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -5,14 +5,16 @@ endif
 
 ifeq ($(subdir),string)
 
-sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
+sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
 		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
-		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned
+		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+		   strcat-sse2-unaligned strncat-sse2-unaligned \
+		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 81b2378..0a145ca 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -97,18 +97,22 @@ __init_cpu_features (void)
 	    case 0x2c:
 	    case 0x2e:
 	    case 0x2f:
-	      /* Rep string instructions, copy backward and unaligned loads
-		 are fast on Intel Core i3, i5 and i7.  */
+	      /* Rep string instructions, copy backward, unaligned loads
+		 and pminub are fast on Intel Core i3, i5 and i7.  */
 #if index_Fast_Rep_String != index_Fast_Copy_Backward
 # error index_Fast_Rep_String != index_Fast_Copy_Backward
 #endif
 #if index_Fast_Rep_String != index_Fast_Unaligned_Load
 # error index_Fast_Rep_String != index_Fast_Unaligned_Load
 #endif
+#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
+# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
+#endif
 	      __cpu_features.feature[index_Fast_Rep_String]
 		|= (bit_Fast_Rep_String
 		    | bit_Fast_Copy_Backward
-		    | bit_Fast_Unaligned_Load);
+		    | bit_Fast_Unaligned_Load
+		    | bit_Prefer_PMINUB_for_stringop);
 	      break;
 	    }
 	}
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index addf5f3..6cfdbdd 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -21,6 +21,7 @@
 #define bit_Slow_BSF			(1 << 2)
 #define bit_Prefer_SSE_for_memop	(1 << 3)
 #define bit_Fast_Unaligned_Load		(1 << 4)
+#define bit_Prefer_PMINUB_for_stringop	(1 << 5)
 
 #ifdef	__ASSEMBLER__
 
@@ -41,6 +42,7 @@
 # define index_Slow_BSF			FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Prefer_SSE_for_memop	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
 
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
new file mode 100644
index 0000000..1150281
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -0,0 +1,55 @@
+/* strcat with SSE2
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_sse2_unaligned
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+	mov	%rdi, %r9
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+# endif
+
+# define RETURN  jmp L(StartStrcpyPart)
+# include "strlen-sse2-pminub.S"
+# undef RETURN
+
+L(StartStrcpyPart):
+	lea	(%r9, %rax), %rdi
+	mov	%rsi, %rcx
+	mov	%r9, %rax      /* save result */
+
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-sse2-unaligned.S"
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
new file mode 100644
index 0000000..66736a7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -0,0 +1,559 @@
+/* strcat with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_ssse3
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+# endif
+
+# define RETURN  jmp L(StartStrcpyPart)
+# include "strlen-no-bsf.S"
+
+# undef RETURN
+
+L(StartStrcpyPart):
+	mov	%rsi, %rcx
+	lea	(%rdi, %rax), %rdx
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(StrncatExit0)
+	cmp	$8, %r8
+	jbe	L(StrncatExit8Bytes)
+# endif
+	cmpb	$0, (%rcx)
+	jz	L(Exit1)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	cmpb	$0, 7(%rcx)
+	jz	L(Exit8)
+	cmpb	$0, 8(%rcx)
+	jz	L(Exit9)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %r8
+	jb	L(StrncatExit15Bytes)
+# endif
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	cmpb	$0, 14(%rcx)
+	jz	L(Exit15)
+	cmpb	$0, 15(%rcx)
+	jz	L(Exit16)
+# ifdef USE_AS_STRNCAT
+	cmp	$16, %r8
+	je	L(StrncatExit16)
+#  define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-ssse3.S"
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	movlpd	(%rcx), %xmm0
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm0, (%rdx)
+	movlpd	%xmm1, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit1):
+	xor	%ah, %ah
+	movb	%ah, 1(%rdx)
+L(Exit1):
+	movb	(%rcx), %al
+	movb	%al, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit2):
+	xor	%ah, %ah
+	movb	%ah, 2(%rdx)
+L(Exit2):
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit3):
+	xor	%ah, %ah
+	movb	%ah, 3(%rdx)
+L(Exit3):
+	movw	(%rcx), %ax
+	movw	%ax, (%rdx)
+	movb	2(%rcx), %al
+	movb	%al, 2(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit4):
+	xor	%ah, %ah
+	movb	%ah, 4(%rdx)
+L(Exit4):
+	mov	(%rcx), %eax
+	mov	%eax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit5):
+	xor	%ah, %ah
+	movb	%ah, 5(%rdx)
+L(Exit5):
+	mov	(%rcx), %eax
+	mov	%eax, (%rdx)
+	movb	4(%rcx), %al
+	movb	%al, 4(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit6):
+	xor	%ah, %ah
+	movb	%ah, 6(%rdx)
+L(Exit6):
+	mov	(%rcx), %eax
+	mov	%eax, (%rdx)
+	movw	4(%rcx), %ax
+	movw	%ax, 4(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit7):
+	xor	%ah, %ah
+	movb	%ah, 7(%rdx)
+L(Exit7):
+	mov	(%rcx), %eax
+	mov	%eax, (%rdx)
+	mov	3(%rcx), %eax
+	mov	%eax, 3(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit8):
+	xor	%ah, %ah
+	movb	%ah, 8(%rdx)
+L(Exit8):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit9):
+	xor	%ah, %ah
+	movb	%ah, 9(%rdx)
+L(Exit9):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movb	8(%rcx), %al
+	movb	%al, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit10):
+	xor	%ah, %ah
+	movb	%ah, 10(%rdx)
+L(Exit10):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movw	8(%rcx), %ax
+	movw	%ax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit11):
+	xor	%ah, %ah
+	movb	%ah, 11(%rdx)
+L(Exit11):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	mov	7(%rcx), %eax
+	mov	%eax, 7(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit12):
+	xor	%ah, %ah
+	movb	%ah, 12(%rdx)
+L(Exit12):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit13):
+	xor	%ah, %ah
+	movb	%ah, 13(%rdx)
+L(Exit13):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	5(%rcx), %xmm1
+	movlpd	%xmm1, 5(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit14):
+	xor	%ah, %ah
+	movb	%ah, 14(%rdx)
+L(Exit14):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	6(%rcx), %xmm1
+	movlpd	%xmm1, 6(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit15):
+	xor	%ah, %ah
+	movb	%ah, 15(%rdx)
+L(Exit15):
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	7(%rcx), %xmm1
+	movlpd	%xmm1, 7(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit16):
+	xor	%ah, %ah
+	movb	%ah, 16(%rdx)
+L(Exit16):
+	movlpd	(%rcx), %xmm0
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm0, (%rdx)
+	movlpd	%xmm1, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+# ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %r8
+	add	%rsi, %rcx
+	lea	(%rsi, %rdx), %rsi
+	lea	-9(%r8), %rdx
+	and	$1<<7, %dh
+	or	%al, %dh
+	test	%dh, %dh
+	lea	(%rsi), %rdx
+	jz	L(ExitHighCase2)
+
+	test	$0x01, %al
+	jnz	L(Exit1)
+	cmp	$1, %r8
+	je	L(StrncatExit1)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	cmp	$2, %r8
+	je	L(StrncatExit2)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	cmp	$3, %r8
+	je	L(StrncatExit3)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	cmp	$4, %r8
+	je	L(StrncatExit4)
+	test	$0x10, %al
+	jnz	L(Exit5)
+	cmp	$5, %r8
+	je	L(StrncatExit5)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	cmp	$6, %r8
+	je	L(StrncatExit6)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	cmp	$7, %r8
+	je	L(StrncatExit7)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	lea	7(%rdx), %rax
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+	xor	%cl, %cl
+	movb	%cl, (%rax)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHighCase2):
+	test	$0x01, %ah
+	jnz	L(Exit9)
+	cmp	$9, %r8
+	je	L(StrncatExit9)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	cmp	$10, %r8
+	je	L(StrncatExit10)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	cmp	$11, %r8
+	je	L(StrncatExit11)
+	test	$0x8, %ah
+	jnz	L(Exit12)
+	cmp	$12, %r8
+	je	L(StrncatExit12)
+	test	$0x10, %ah
+	jnz	L(Exit13)
+	cmp	$13, %r8
+	je	L(StrncatExit13)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	cmp	$14, %r8
+	je	L(StrncatExit14)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	cmp	$15, %r8
+	je	L(StrncatExit15)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm1, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16BytesCase2)
+
+	.p2align 4
+L(CopyFrom1To16BytesCase3):
+	add	$16, %r8
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	cmp	$8, %r8
+	ja	L(ExitHighCase3)
+	cmp	$1, %r8
+	je	L(StrncatExit1)
+	cmp	$2, %r8
+	je	L(StrncatExit2)
+	cmp	$3, %r8
+	je	L(StrncatExit3)
+	cmp	$4, %r8
+	je	L(StrncatExit4)
+	cmp	$5, %r8
+	je	L(StrncatExit5)
+	cmp	$6, %r8
+	je	L(StrncatExit6)
+	cmp	$7, %r8
+	je	L(StrncatExit7)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	xor	%ah, %ah
+	movb	%ah, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHighCase3):
+	cmp	$9, %r8
+	je	L(StrncatExit9)
+	cmp	$10, %r8
+	je	L(StrncatExit10)
+	cmp	$11, %r8
+	je	L(StrncatExit11)
+	cmp	$12, %r8
+	je	L(StrncatExit12)
+	cmp	$13, %r8
+	je	L(StrncatExit13)
+	cmp	$14, %r8
+	je	L(StrncatExit14)
+	cmp	$15, %r8
+	je	L(StrncatExit15)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	8(%rcx), %xmm1
+	movlpd	%xmm1, 8(%rdx)
+	xor	%ah, %ah
+	movb	%ah, 16(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit0):
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit15Bytes):
+	cmp	$9, %r8
+	je	L(StrncatExit9)
+	cmpb	$0, 9(%rcx)
+	jz	L(Exit10)
+	cmp	$10, %r8
+	je	L(StrncatExit10)
+	cmpb	$0, 10(%rcx)
+	jz	L(Exit11)
+	cmp	$11, %r8
+	je	L(StrncatExit11)
+	cmpb	$0, 11(%rcx)
+	jz	L(Exit12)
+	cmp	$12, %r8
+	je	L(StrncatExit12)
+	cmpb	$0, 12(%rcx)
+	jz	L(Exit13)
+	cmp	$13, %r8
+	je	L(StrncatExit13)
+	cmpb	$0, 13(%rcx)
+	jz	L(Exit14)
+	cmp	$14, %r8
+	je	L(StrncatExit14)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	movlpd	7(%rcx), %xmm1
+	movlpd	%xmm1, 7(%rdx)
+	lea	14(%rdx), %rax
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+	xor	%cl, %cl
+	movb	%cl, (%rax)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(StrncatExit8Bytes):
+	cmpb	$0, (%rcx)
+	jz	L(Exit1)
+	cmp	$1, %r8
+	je	L(StrncatExit1)
+	cmpb	$0, 1(%rcx)
+	jz	L(Exit2)
+	cmp	$2, %r8
+	je	L(StrncatExit2)
+	cmpb	$0, 2(%rcx)
+	jz	L(Exit3)
+	cmp	$3, %r8
+	je	L(StrncatExit3)
+	cmpb	$0, 3(%rcx)
+	jz	L(Exit4)
+	cmp	$4, %r8
+	je	L(StrncatExit4)
+	cmpb	$0, 4(%rcx)
+	jz	L(Exit5)
+	cmp	$5, %r8
+	je	L(StrncatExit5)
+	cmpb	$0, 5(%rcx)
+	jz	L(Exit6)
+	cmp	$6, %r8
+	je	L(StrncatExit6)
+	cmpb	$0, 6(%rcx)
+	jz	L(Exit7)
+	cmp	$7, %r8
+	je	L(StrncatExit7)
+	movlpd	(%rcx), %xmm0
+	movlpd	%xmm0, (%rdx)
+	lea	7(%rdx), %rax
+	cmpb	$1, (%rax)
+	sbb	$-1, %rax
+	xor	%cl, %cl
+	movb	%cl, (%rax)
+	mov	%rdi, %rax
+	ret
+
+# endif
+END (STRCAT)
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
new file mode 100644
index 0000000..f3ccc8e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat.S
@@ -0,0 +1,85 @@
+/* Multiple versions of strcat
+   Copyright (C) 2009, 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT
+# ifndef STRCAT
+#  define STRCAT strcat
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT
+# define STRCAT_SSSE3	         	__strncat_ssse3
+# define STRCAT_SSE2	            	__strncat_sse2
+# define STRCAT_SSE2_UNALIGNED    	__strncat_sse2_unaligned
+# define __GI_STRCAT	            	__GI_strncat
+# define __GI___STRCAT              __GI___strncat
+#else
+# define STRCAT_SSSE3	         	__strcat_ssse3
+# define STRCAT_SSE2	            	__strcat_sse2
+# define STRCAT_SSE2_UNALIGNED    	__strcat_sse2_unaligned
+# define __GI_STRCAT	            	__GI_strcat
+# define __GI___STRCAT              __GI___strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(STRCAT)
+	.type	STRCAT, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	STRCAT_SSE2_UNALIGNED(%rip), %rax
+	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jnz	2f
+	leaq	STRCAT_SSE2(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jz	2f
+	leaq	STRCAT_SSSE3(%rip), %rax
+2:	ret
+END(STRCAT)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCAT_SSE2, @function; \
+	.align 16; \
+	STRCAT_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+	.globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../strcat.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index 9a8d186..6de8c47 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -20,10 +20,13 @@
 
 #ifndef NOT_IN_libc
 
-# include <sysdep.h>
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_sse2_unaligned
+#  endif
 
-# ifndef STRCPY
-#  define STRCPY  __strcpy_sse2_unaligned
 # endif
 
 # define JMPTBL(I, B)	I - B
@@ -33,16 +36,20 @@
 	lea	(%r11, %rcx), %rcx;                             \
 	jmp	*%rcx
 
-	.text
+# ifndef USE_AS_STRCAT
+
+.text
 ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	mov	%rdx, %r8
 	test	%r8, %r8
 	jz	L(ExitZero)
-# endif
+#  endif
 	mov	%rsi, %rcx
-# ifndef USE_AS_STPCPY
+#  ifndef USE_AS_STPCPY
 	mov	%rdi, %rax      /* save result */
+#  endif
+
 # endif
 
 	and	$15, %rcx
@@ -59,7 +66,7 @@ ENTRY (STRCPY)
 	pmovmskb %xmm1, %rdx
 	shr	%cl, %rdx
 # ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
 	cmp	$16, %r8
 #  else
 	cmp	$17, %r8
@@ -72,7 +79,7 @@ ENTRY (STRCPY)
 	pcmpeqb	16(%rsi), %xmm0
 	pmovmskb %xmm0, %rdx
 # ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
 	cmp	$32, %r8
 #  else
 	cmp	$33, %r8
@@ -102,7 +109,7 @@ L(Unalign16Both):
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -118,7 +125,7 @@ L(Unalign16Both):
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -134,7 +141,7 @@ L(Unalign16Both):
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -150,7 +157,7 @@ L(Unalign16Both):
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -166,7 +173,7 @@ L(Unalign16Both):
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -182,7 +189,7 @@ L(Unalign16Both):
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 # endif
 	test	%rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
 # else
 	jnz	L(CopyFrom1To16Bytes)
@@ -264,10 +271,10 @@ L(Unaligned64Leave):
 	movdqu	%xmm4, (%rdi)
 	movdqu	%xmm5, 16(%rdi)
 	movdqu	%xmm6, 32(%rdi)
-# if defined USE_AS_STRNCPY
-#  ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
 	lea	48(%rdi, %rdx), %rax
-#  endif
+# endif
 	movdqu	%xmm7, 48(%rdi)
 	add	$15, %r8
 	sub	%rdx, %r8
@@ -288,7 +295,7 @@ L(SourceStringAlignmentZero):
 	pmovmskb %xmm0, %rdx
 
 # ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
 	cmp	$16, %r8
 #  else
 	cmp	$17, %r8
@@ -303,7 +310,7 @@ L(SourceStringAlignmentZero):
 	pmovmskb %xmm0, %rdx
 
 # ifdef USE_AS_STRNCPY
-#  if defined USE_AS_STPCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
 	cmp	$32, %r8
 #  else
 	cmp	$33, %r8
@@ -314,11 +321,11 @@ L(SourceStringAlignmentZero):
 	jnz	L(CopyFrom1To32Bytes1)
 	jmp	L(Unalign16Both)
 
-/* ------End of main part with loops--------------------- */
+/*------End of main part with loops---------------------*/
 
 /* Case1 */
 
-# if (!defined USE_AS_STRNCPY)
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
 	.p2align 4
 L(CopyFrom1To16Bytes):
 	add	%rcx, %rdi
@@ -328,7 +335,7 @@ L(CopyFrom1To16Bytes):
 # endif
 	.p2align 4
 L(CopyFrom1To16BytesTail):
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	%rcx, %r8
 # endif
 	add	%rcx, %rsi
@@ -339,7 +346,7 @@ L(CopyFrom1To16BytesTail):
 L(CopyFrom1To32Bytes1):
 	add	$16, %rsi
 	add	$16, %rdi
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$16, %r8
 # endif
 L(CopyFrom1To16BytesTail1):
@@ -348,7 +355,7 @@ L(CopyFrom1To16BytesTail1):
 
 	.p2align 4
 L(CopyFrom1To32Bytes):
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	%rcx, %r8
 # endif
 	bsf	%rdx, %rdx
@@ -360,10 +367,10 @@ L(CopyFrom1To32Bytes):
 	.p2align 4
 L(CopyFrom1To16BytesUnaligned_0):
 	bsf	%rdx, %rdx
-# if defined USE_AS_STRNCPY
-#  ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
 	lea	(%rdi, %rdx), %rax
-#  endif
+# endif
 	movdqu	%xmm4, (%rdi)
 	add	$63, %r8
 	sub	%rdx, %r8
@@ -377,10 +384,10 @@ L(CopyFrom1To16BytesUnaligned_0):
 L(CopyFrom1To16BytesUnaligned_16):
 	bsf	%rcx, %rdx
 	movdqu	%xmm4, (%rdi)
-# if defined USE_AS_STRNCPY
-#  ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
 	lea	16(%rdi, %rdx), %rax
-#  endif
+# endif
 	movdqu	%xmm5, 16(%rdi)
 	add	$47, %r8
 	sub	%rdx, %r8
@@ -397,10 +404,10 @@ L(CopyFrom1To16BytesUnaligned_32):
 	bsf	%rdx, %rdx
 	movdqu	%xmm4, (%rdi)
 	movdqu	%xmm5, 16(%rdi)
-# if defined USE_AS_STRNCPY
-#  ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
 	lea	32(%rdi, %rdx), %rax
-#  endif
+# endif
 	movdqu	%xmm6, 32(%rdi)
 	add	$31, %r8
 	sub	%rdx, %r8
@@ -413,6 +420,7 @@ L(CopyFrom1To16BytesUnaligned_32):
 # endif
 
 # ifdef USE_AS_STRNCPY
+#  ifndef USE_AS_STRCAT
 	.p2align 4
 L(CopyFrom1To16BytesUnalignedXmm6):
 	movdqu	%xmm6, (%rdi, %rcx)
@@ -437,6 +445,7 @@ L(CopyFrom1To16BytesUnalignedXmm3):
 L(CopyFrom1To16BytesUnalignedXmm1):
 	movdqu	%xmm1, (%rdi, %rcx)
 	jmp	L(CopyFrom1To16BytesXmmExit)
+#  endif
 
 	.p2align 4
 L(CopyFrom1To16BytesExit):
@@ -519,7 +528,7 @@ L(CopyFrom1To16BytesTail1Case2OrCase3):
 
 # endif
 
-/* ----End labels regarding with copying 1-16 bytes--and 1-32 bytes---- */
+/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/
 
 	.p2align 4
 L(Exit1):
@@ -527,7 +536,7 @@ L(Exit1):
 # ifdef USE_AS_STPCPY
 	lea	(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$1, %r8
 	lea	1(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -541,7 +550,7 @@ L(Exit2):
 # ifdef USE_AS_STPCPY
 	lea	1(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$2, %r8
 	lea	2(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -556,7 +565,7 @@ L(Exit3):
 # ifdef USE_AS_STPCPY
 	lea	2(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$3, %r8
 	lea	3(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -570,7 +579,7 @@ L(Exit4):
 # ifdef USE_AS_STPCPY
 	lea	3(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$4, %r8
 	lea	4(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -585,7 +594,7 @@ L(Exit5):
 # ifdef USE_AS_STPCPY
 	lea	4(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$5, %r8
 	lea	5(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -601,7 +610,7 @@ L(Exit6):
 # ifdef USE_AS_STPCPY
 	lea	5(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$6, %r8
 	lea	6(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -617,7 +626,7 @@ L(Exit7):
 # ifdef USE_AS_STPCPY
 	lea	6(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$7, %r8
 	lea	7(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -631,7 +640,7 @@ L(Exit8):
 # ifdef USE_AS_STPCPY
 	lea	7(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$8, %r8
 	lea	8(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -646,7 +655,7 @@ L(Exit9):
 # ifdef USE_AS_STPCPY
 	lea	8(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$9, %r8
 	lea	9(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -662,7 +671,7 @@ L(Exit10):
 # ifdef USE_AS_STPCPY
 	lea	9(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$10, %r8
 	lea	10(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -678,7 +687,7 @@ L(Exit11):
 # ifdef USE_AS_STPCPY
 	lea	10(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$11, %r8
 	lea	11(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -694,7 +703,7 @@ L(Exit12):
 # ifdef USE_AS_STPCPY
 	lea	11(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$12, %r8
 	lea	12(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -710,7 +719,7 @@ L(Exit13):
 # ifdef USE_AS_STPCPY
 	lea	12(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$13, %r8
 	lea	13(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -726,7 +735,7 @@ L(Exit14):
 # ifdef USE_AS_STPCPY
 	lea	13(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$14, %r8
 	lea	14(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -742,7 +751,7 @@ L(Exit15):
 # ifdef USE_AS_STPCPY
 	lea	14(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$15, %r8
 	lea	15(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -756,7 +765,7 @@ L(Exit16):
 # ifdef USE_AS_STPCPY
 	lea	15(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$16, %r8
 	lea	16(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -771,7 +780,7 @@ L(Exit17):
 # ifdef USE_AS_STPCPY
 	lea	16(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$17, %r8
 	lea	17(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -787,7 +796,7 @@ L(Exit18):
 # ifdef USE_AS_STPCPY
 	lea	17(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$18, %r8
 	lea	18(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -803,7 +812,7 @@ L(Exit19):
 # ifdef USE_AS_STPCPY
 	lea	18(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$19, %r8
 	lea	19(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -819,7 +828,7 @@ L(Exit20):
 # ifdef USE_AS_STPCPY
 	lea	19(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$20, %r8
 	lea	20(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -836,7 +845,7 @@ L(Exit21):
 # ifdef USE_AS_STPCPY
 	lea	20(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$21, %r8
 	lea	21(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -852,7 +861,7 @@ L(Exit22):
 # ifdef USE_AS_STPCPY
 	lea	21(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$22, %r8
 	lea	22(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -868,7 +877,7 @@ L(Exit23):
 # ifdef USE_AS_STPCPY
 	lea	22(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$23, %r8
 	lea	23(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -884,7 +893,7 @@ L(Exit24):
 # ifdef USE_AS_STPCPY
 	lea	23(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$24, %r8
 	lea	24(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -901,7 +910,7 @@ L(Exit25):
 # ifdef USE_AS_STPCPY
 	lea	24(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$25, %r8
 	lea	25(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -919,7 +928,7 @@ L(Exit26):
 # ifdef USE_AS_STPCPY
 	lea	25(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$26, %r8
 	lea	26(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -937,7 +946,7 @@ L(Exit27):
 # ifdef USE_AS_STPCPY
 	lea	26(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$27, %r8
 	lea	27(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -955,7 +964,7 @@ L(Exit28):
 # ifdef USE_AS_STPCPY
 	lea	27(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$28, %r8
 	lea	28(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -971,7 +980,7 @@ L(Exit29):
 # ifdef USE_AS_STPCPY
 	lea	28(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$29, %r8
 	lea	29(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -987,7 +996,7 @@ L(Exit30):
 # ifdef USE_AS_STPCPY
 	lea	29(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$30, %r8
 	lea	30(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -1003,7 +1012,7 @@ L(Exit31):
 # ifdef USE_AS_STPCPY
 	lea	30(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$31, %r8
 	lea	31(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -1019,7 +1028,7 @@ L(Exit32):
 # ifdef USE_AS_STPCPY
 	lea	31(%rdi), %rax
 # endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 	sub	$32, %r8
 	lea	32(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
@@ -1030,27 +1039,39 @@ L(Exit32):
 
 	.p2align 4
 L(StrncpyExit0):
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	mov	%rdi, %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, (%rdi)
+#  endif
 	ret
 
 	.p2align 4
 L(StrncpyExit1):
 	mov	(%rsi), %dl
 	mov	%dl, (%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	1(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 1(%rdi)
+#  endif
 	ret
 
 	.p2align 4
 L(StrncpyExit2):
 	mov	(%rsi), %dx
 	mov	%dx, (%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	2(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 2(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1059,18 +1080,26 @@ L(StrncpyExit3):
 	mov	2(%rsi), %dl
 	mov	%cx, (%rdi)
 	mov	%dl, 2(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	3(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 3(%rdi)
+#  endif
 	ret
 
 	.p2align 4
 L(StrncpyExit4):
 	mov	(%rsi), %edx
 	mov	%edx, (%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	4(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 4(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1079,9 +1108,13 @@ L(StrncpyExit5):
 	mov	4(%rsi), %dl
 	mov	%ecx, (%rdi)
 	mov	%dl, 4(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	5(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 5(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1090,9 +1123,13 @@ L(StrncpyExit6):
 	mov	4(%rsi), %dx
 	mov	%ecx, (%rdi)
 	mov	%dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	6(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 6(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1101,18 +1138,26 @@ L(StrncpyExit7):
 	mov	3(%rsi), %edx
 	mov	%ecx, (%rdi)
 	mov	%edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	7(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 7(%rdi)
+#  endif
 	ret
 
 	.p2align 4
 L(StrncpyExit8):
 	mov	(%rsi), %rdx
 	mov	%rdx, (%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	8(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 8(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1121,9 +1166,13 @@ L(StrncpyExit9):
 	mov	8(%rsi), %dl
 	mov	%rcx, (%rdi)
 	mov	%dl, 8(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	9(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 9(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1132,9 +1181,13 @@ L(StrncpyExit10):
 	mov	8(%rsi), %dx
 	mov	%rcx, (%rdi)
 	mov	%dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	10(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 10(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1143,9 +1196,13 @@ L(StrncpyExit11):
 	mov	7(%rsi), %edx
 	mov	%rcx, (%rdi)
 	mov	%edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	11(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 11(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1154,9 +1211,13 @@ L(StrncpyExit12):
 	mov	8(%rsi), %edx
 	mov	%rcx, (%rdi)
 	mov	%edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	12(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 12(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1165,9 +1226,13 @@ L(StrncpyExit13):
 	mov	5(%rsi), %rdx
 	mov	%rcx, (%rdi)
 	mov	%rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	13(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 13(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1176,9 +1241,13 @@ L(StrncpyExit14):
 	mov	6(%rsi), %rdx
 	mov	%rcx, (%rdi)
 	mov	%rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	14(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 14(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1187,18 +1256,26 @@ L(StrncpyExit15):
 	mov	7(%rsi), %rdx
 	mov	%rcx, (%rdi)
 	mov	%rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	15(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 15(%rdi)
+#  endif
 	ret
 
 	.p2align 4
 L(StrncpyExit16):
 	movdqu	(%rsi), %xmm0
 	movdqu	%xmm0, (%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	16(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 16(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1207,9 +1284,13 @@ L(StrncpyExit17):
 	mov	16(%rsi), %cl
 	movdqu	%xmm0, (%rdi)
 	mov	%cl, 16(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	17(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 17(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1218,9 +1299,13 @@ L(StrncpyExit18):
 	mov	16(%rsi), %cx
 	movdqu	%xmm0, (%rdi)
 	mov	%cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	18(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 18(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1229,9 +1314,13 @@ L(StrncpyExit19):
 	mov	15(%rsi), %ecx
 	movdqu	%xmm0, (%rdi)
 	mov	%ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	19(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 19(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1240,9 +1329,13 @@ L(StrncpyExit20):
 	mov	16(%rsi), %ecx
 	movdqu	%xmm0, (%rdi)
 	mov	%ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	20(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 20(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1253,9 +1346,13 @@ L(StrncpyExit21):
 	movdqu	%xmm0, (%rdi)
 	mov	%ecx, 16(%rdi)
 	mov	%dl, 20(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	21(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 21(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1264,9 +1361,13 @@ L(StrncpyExit22):
 	mov	14(%rsi), %rcx
 	movdqu	%xmm0, (%rdi)
 	mov	%rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	22(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 22(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1275,9 +1376,13 @@ L(StrncpyExit23):
 	mov	15(%rsi), %rcx
 	movdqu	%xmm0, (%rdi)
 	mov	%rcx, 15(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	23(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 23(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1286,9 +1391,13 @@ L(StrncpyExit24):
 	mov	16(%rsi), %rcx
 	movdqu	%xmm0, (%rdi)
 	mov	%rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	24(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 24(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1299,9 +1408,13 @@ L(StrncpyExit25):
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%cl, 24(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	25(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 25(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1312,9 +1425,13 @@ L(StrncpyExit26):
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%cx, 24(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	26(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 26(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1325,9 +1442,13 @@ L(StrncpyExit27):
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%ecx, 23(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	27(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 27(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1338,9 +1459,13 @@ L(StrncpyExit28):
 	movdqu	%xmm0, (%rdi)
 	mov	%rdx, 16(%rdi)
 	mov	%ecx, 24(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	28(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 28(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1349,9 +1474,13 @@ L(StrncpyExit29):
 	movdqu	13(%rsi), %xmm2
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	29(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 29(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1360,9 +1489,13 @@ L(StrncpyExit30):
 	movdqu	14(%rsi), %xmm2
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	30(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 30(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1371,9 +1504,13 @@ L(StrncpyExit31):
 	movdqu	15(%rsi), %xmm2
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	31(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 31(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1382,9 +1519,13 @@ L(StrncpyExit32):
 	movdqu	16(%rsi), %xmm2
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 16(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	32(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 32(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1395,8 +1536,14 @@ L(StrncpyExit33):
 	movdqu	%xmm0, (%rdi)
 	movdqu	%xmm2, 16(%rdi)
 	mov	%cl, 32(%rdi)
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 33(%rdi)
+#  endif
 	ret
 
+#  ifndef USE_AS_STRCAT
+
 	.p2align 4
 L(Fill0):
 	ret
@@ -1498,9 +1645,9 @@ L(CopyFrom1To16BytesXmmExit):
 	bsf	%rdx, %rdx
 	add	$15, %r8
 	add	%rcx, %rdi
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	lea	(%rdi, %rdx), %rax
-# endif
+#   endif
 	sub	%rdx, %r8
 	lea	1(%rdi, %rdx), %rdi
 
@@ -1553,6 +1700,9 @@ L(StrncpyFillExit):
 	add	$16, %r8
 	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
 
+/* end of ifndef USE_AS_STRCAT */
+#  endif
+
 	.p2align 4
 L(UnalignedLeaveCase2OrCase3):
 	test	%rdx, %rdx
@@ -1572,9 +1722,13 @@ L(Unaligned64LeaveCase3):
 	sub	$16, %r8
 	jb	L(CopyFrom1To16BytesCase3)
 	movdqu	%xmm7, 48(%rdi)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	64(%rdi), %rax
-# endif
+#  endif
+#  ifdef USE_AS_STRCAT
+	xor	%ch, %ch
+	movb	%ch, 64(%rdi)
+#  endif
 	ret
 
 	.p2align 4
@@ -1585,8 +1739,11 @@ L(Unaligned64LeaveCase2):
 	add	$48, %r8
 	jle	L(CopyFrom1To16BytesCase2OrCase3)
 	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
-
+#  else
+        jnz	L(CopyFrom1To16Bytes)
+#  endif
 	pcmpeqb	%xmm5, %xmm0
 	pmovmskb %xmm0, %rdx
 	movdqu	%xmm4, (%rdi)
@@ -1594,7 +1751,11 @@ L(Unaligned64LeaveCase2):
 	sub	$16, %r8
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+#  else
+        jnz	L(CopyFrom1To16Bytes)
+#  endif
 
 	pcmpeqb	%xmm6, %xmm0
 	pmovmskb %xmm0, %rdx
@@ -1603,7 +1764,11 @@ L(Unaligned64LeaveCase2):
 	sub	$16, %r8
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
 	test	%rdx, %rdx
+#  ifndef USE_AS_STRCAT
 	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+#  else
+        jnz	L(CopyFrom1To16Bytes)
+#  endif
 
 	pcmpeqb	%xmm7, %xmm0
 	pmovmskb %xmm0, %rdx
@@ -1617,13 +1782,18 @@ L(Unaligned64LeaveCase2):
 
 	.p2align 4
 L(ExitZero):
+#  ifndef USE_AS_STRCAT
 	mov	%rdi, %rax
+#  endif
 	ret
 
 # endif
 
+# ifndef USE_AS_STRCAT
 END (STRCPY)
-
+# else
+END (STRCAT)
+# endif
 	.p2align 4
 	.section .rodata
 L(ExitTable):
@@ -1695,6 +1865,7 @@ L(ExitStrncpyTable):
 	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
 	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+#  ifndef USE_AS_STRCAT
 	.p2align 4
 L(FillTable):
 	.int	JMPTBL(L(Fill0), L(FillTable))
@@ -1714,5 +1885,7 @@ L(FillTable):
 	.int	JMPTBL(L(Fill14), L(FillTable))
 	.int	JMPTBL(L(Fill15), L(FillTable))
 	.int	JMPTBL(L(Fill16), L(FillTable))
+#  endif
 # endif
 #endif
+
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index efbd3bf..05faf0d 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -20,25 +20,26 @@
 
 #ifndef NOT_IN_libc
 
-# include <sysdep.h>
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
 
-# ifndef STRCPY
-#  define STRCPY  __strcpy_ssse3
-# endif
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_ssse3
+#  endif
 
 	.section .text.ssse3,"ax",@progbits
 ENTRY (STRCPY)
 	mov	%rsi, %rcx
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	mov	%rdx, %r8
-# endif
+#  endif
 	mov	%rdi, %rdx
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	test	%r8, %r8
 	jz	L(Exit0)
 	cmp	$8, %r8
 	jbe	L(StrncpyExit8Bytes)
-# endif
+#  endif
 	cmpb	$0, (%rcx)
 	jz	L(Exit1)
 	cmpb	$0, 1(%rcx)
@@ -55,10 +56,10 @@ ENTRY (STRCPY)
 	jz	L(Exit7)
 	cmpb	$0, 7(%rcx)
 	jz	L(Exit8)
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	cmp	$16, %r8
 	jb	L(StrncpyExit15Bytes)
-# endif
+#  endif
 	cmpb	$0, 8(%rcx)
 	jz	L(Exit9)
 	cmpb	$0, 9(%rcx)
@@ -73,12 +74,13 @@ ENTRY (STRCPY)
 	jz	L(Exit14)
 	cmpb	$0, 14(%rcx)
 	jz	L(Exit15)
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	cmp	$16, %r8
 	je	L(Exit16)
-# endif
+#  endif
 	cmpb	$0, 15(%rcx)
 	jz	L(Exit16)
+# endif
 
 # ifdef USE_AS_STRNCPY
 	mov	%rcx, %rsi
@@ -2180,12 +2182,12 @@ L(Shl15LoopExit):
 	jmp	L(CopyFrom1To16Bytes)
 # endif
 
-
+# ifndef USE_AS_STRCAT
 	.p2align 4
 L(CopyFrom1To16Bytes):
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	add	$16, %r8
-# endif
+#  endif
 	add	%rsi, %rdx
 	add	%rsi, %rcx
 
@@ -2210,20 +2212,20 @@ L(CopyFrom1To16Bytes):
 L(Exit8):
 	mov	(%rcx), %rax
 	mov	%rax, (%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	7(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$8, %r8
 	lea	8(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2249,23 +2251,23 @@ L(Exit16):
 	mov	%rax, (%rdx)
 	mov	8(%rcx), %rax
 	mov	%rax, 8(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	15(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	lea	16(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 
 	.p2align 4
 L(CopyFrom1To16BytesCase2):
@@ -2381,46 +2383,46 @@ L(Less12Case3): /* but more than 8 */
 	jl	L(Exit9)
 	je	L(Exit10)
 	jg	L(Exit11)
-# endif
+#  endif
 
 	.p2align 4
 L(Exit1):
 	movb	(%rcx), %al
 	movb	%al, (%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$1, %r8
 	lea	1(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
 L(Exit2):
 	movw	(%rcx), %ax
 	movw	%ax, (%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	1(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$2, %r8
 	lea	2(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2429,40 +2431,40 @@ L(Exit3):
 	movw	%ax, (%rdx)
 	movb	2(%rcx), %al
 	movb	%al, 2(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	2(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$3, %r8
 	lea	3(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
 L(Exit4):
 	movl	(%rcx), %eax
 	movl	%eax, (%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	3(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$4, %r8
 	lea	4(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#  endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2471,20 +2473,20 @@ L(Exit5):
 	movl	%eax, (%rdx)
 	movb	4(%rcx), %al
 	movb	%al, 4(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	4(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$5, %r8
 	lea	5(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2493,20 +2495,20 @@ L(Exit6):
 	movl	%eax, (%rdx)
 	movw	4(%rcx), %ax
 	movw	%ax, 4(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	5(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$6, %r8
 	lea	6(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2515,20 +2517,20 @@ L(Exit7):
 	movl	%eax, (%rdx)
 	movl	3(%rcx), %eax
 	movl	%eax, 3(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	6(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$7, %r8
 	lea	7(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2537,20 +2539,20 @@ L(Exit9):
 	mov	%rax, (%rdx)
 	mov	5(%rcx), %eax
 	mov	%eax, 5(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	8(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$9, %r8
 	lea	9(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2559,20 +2561,20 @@ L(Exit10):
 	mov	%rax, (%rdx)
 	mov	6(%rcx), %eax
 	mov	%eax, 6(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	9(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$10, %r8
 	lea	10(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2581,20 +2583,20 @@ L(Exit11):
 	mov	%rax, (%rdx)
 	mov	7(%rcx), %eax
 	mov	%eax, 7(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	10(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$11, %r8
 	lea	11(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2603,20 +2605,20 @@ L(Exit12):
 	mov	%rax, (%rdx)
 	mov	8(%rcx), %eax
 	mov	%eax, 8(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	11(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$12, %r8
 	lea	12(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2625,20 +2627,20 @@ L(Exit13):
 	mov	%rax, (%rdx)
 	mov	5(%rcx), %rax
 	mov	%rax, 5(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	12(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$13, %r8
 	lea	13(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2647,20 +2649,20 @@ L(Exit14):
 	mov	%rax, (%rdx)
 	mov	6(%rcx), %rax
 	mov	%rax, 6(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	13(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$14, %r8
 	lea	14(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
 	.p2align 4
@@ -2669,23 +2671,23 @@ L(Exit15):
 	mov	%rax, (%rdx)
 	mov	7(%rcx), %rax
 	mov	%rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
+#  ifdef USE_AS_STPCPY
 	lea	14(%rdx), %rax
-# else
+#  else
 	mov	%rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+#  endif
+#  ifdef USE_AS_STRNCPY
 	sub	$15, %r8
 	lea	15(%rdx), %rcx
 	jnz	L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# endif
-# endif
+#   endif
+#  endif
 	ret
 
-# ifdef USE_AS_STRNCPY
+#  ifdef USE_AS_STRNCPY
 	.p2align 4
 L(Fill0):
 	ret
@@ -2902,13 +2904,13 @@ L(StrncpyExit15Bytes):
 	mov	%rax, (%rdx)
 	mov	7(%rcx), %rax
 	mov	%rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	lea	14(%rdx), %rax
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# else
+#   else
 	mov	%rdi, %rax
-# endif
+#   endif
 	ret
 
 	.p2align 4
@@ -2943,15 +2945,17 @@ L(StrncpyExit8Bytes):
 	jz	L(Exit7)
 	mov	(%rcx), %rax
 	mov	%rax, (%rdx)
-# ifdef USE_AS_STPCPY
+#   ifdef USE_AS_STPCPY
 	lea	7(%rdx), %rax
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-# else
+#   else
 	mov	%rdi, %rax
-# endif
+#   endif
 	ret
 
+#  endif
+
 # endif
 
 # ifdef USE_AS_STRNCPY
@@ -3715,7 +3719,7 @@ L(StrncpyExit15):
 	lea	1(%rsi), %rsi
 	jmp	L(CopyFrom1To16BytesCase3)
 # endif
-
+# ifndef USE_AS_STRCAT
 END (STRCPY)
-
+# endif
 #endif
diff --git a/sysdeps/x86_64/multiarch/strlen-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-no-bsf.S
index 3e52f81..c730e0a 100644
--- a/sysdeps/x86_64/multiarch/strlen-no-bsf.S
+++ b/sysdeps/x86_64/multiarch/strlen-no-bsf.S
@@ -1,5 +1,5 @@
-/* strlen without BSF
-   Copyright (C) 2010 Free Software Foundation, Inc.
+/* strlen SSE2 without bsf
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -18,12 +18,17 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
-#if defined SHARED && !defined NOT_IN_libc
+#if (defined SHARED || defined USE_AS_STRCAT) && !defined NOT_IN_libc
 
-#include <sysdep.h>
+# ifndef USE_AS_STRCAT
 
-	.section .text.slow,"ax",@progbits
+#  include <sysdep.h>
+
+#  define RETURN ret
+
+	.section .text.sse2,"ax",@progbits
 ENTRY (__strlen_no_bsf)
+# endif
 	xor	%eax, %eax
 	cmpb	$0, (%rdi)
 	jz	L(exit_tail0)
@@ -165,39 +170,37 @@ ENTRY (__strlen_no_bsf)
 	jnz	L(exit)
 
 	and	$-0x40, %rax
-	xor	%r8d, %r8d
 L(aligned_64):
 	pcmpeqb	(%rax), %xmm0
 	pcmpeqb	16(%rax), %xmm1
 	pcmpeqb	32(%rax), %xmm2
 	pcmpeqb	48(%rax), %xmm3
 	pmovmskb %xmm0, %edx
-	pmovmskb %xmm1, %esi
-	pmovmskb %xmm2, %edi
+	pmovmskb %xmm1, %r11d
+	pmovmskb %xmm2, %r10d
 	pmovmskb %xmm3, %r9d
-	or	%edx, %r8d
-	or	%esi, %r8d
-	or	%edi, %r8d
-	or	%r9d, %r8d
+	or	%edx, %r9d
+	or	%r11d, %r9d
+	or	%r10d, %r9d
 	lea	64(%rax), %rax
 	jz	L(aligned_64)
 
 	test	%edx, %edx
 	jnz	L(aligned_64_exit_16)
-	test	%esi, %esi
+	test	%r11d, %r11d
 	jnz	L(aligned_64_exit_32)
-	test	%edi, %edi
+	test	%r10d, %r10d
 	jnz	L(aligned_64_exit_48)
 L(aligned_64_exit_64):
-	mov	%r9d, %edx
+	pmovmskb %xmm3, %edx
 	jmp	L(aligned_64_exit)
 L(aligned_64_exit_48):
 	lea	-16(%rax), %rax
-	mov	%edi, %edx
+	mov	%r10d, %edx
 	jmp	L(aligned_64_exit)
 L(aligned_64_exit_32):
 	lea	-32(%rax), %rax
-	mov	%esi, %edx
+	mov	%r11d, %edx
 	jmp	L(aligned_64_exit)
 L(aligned_64_exit_16):
 	lea	-48(%rax), %rax
@@ -228,7 +231,7 @@ L(exit):
 	jnz	L(exit_tail6)
 	add	$7, %eax
 L(exit_tail0):
-	ret
+	RETURN
 
 L(exit_high):
 	add	$8, %eax
@@ -253,57 +256,58 @@ L(exit_high):
 	test	$0x40, %dh
 	jnz	L(exit_tail6)
 	add	$7, %eax
-	ret
+	RETURN
 	.p2align 4
 L(exit_tail1):
 	add	$1, %eax
-	ret
+	RETURN
 
 L(exit_tail2):
 	add	$2, %eax
-	ret
+	RETURN
 
 L(exit_tail3):
 	add	$3, %eax
-	ret
+	RETURN
 
 L(exit_tail4):
 	add	$4, %eax
-	ret
+	RETURN
 
 L(exit_tail5):
 	add	$5, %eax
-	ret
+	RETURN
 L(exit_tail6):
 	add	$6, %eax
-	ret
+	RETURN
 L(exit_tail7):
 	add	$7, %eax
-	ret
+	RETURN
 L(exit_tail8):
 	add	$8, %eax
-	ret
+	RETURN
 L(exit_tail9):
 	add	$9, %eax
-	ret
+	RETURN
 L(exit_tail10):
 	add	$10, %eax
-	ret
+	RETURN
 L(exit_tail11):
 	add	$11, %eax
-	ret
+	RETURN
 L(exit_tail12):
 	add	$12, %eax
-	ret
+	RETURN
 L(exit_tail13):
 	add	$13, %eax
-	ret
+	RETURN
 L(exit_tail14):
 	add	$14, %eax
-	ret
+	RETURN
 L(exit_tail15):
 	add	$15, %eax
-	ret
+# ifndef USE_AS_STRCAT
+	RETURN
 END (__strlen_no_bsf)
-
+# endif
 #endif
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
new file mode 100644
index 0000000..57778cf
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
@@ -0,0 +1,260 @@
+/* strlen SSE2
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT)
+
+# ifndef USE_AS_STRCAT
+
+#  include <sysdep.h>
+
+#  define RETURN ret
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (__strlen_sse2_pminub)
+
+# endif
+	xor	%rax, %rax
+	mov	%edi, %ecx
+	and	$0x3f, %ecx
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)
+	movdqu	(%rdi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	mov	%rdi, %rax
+	and	$-16, %rax
+	jmp	L(align16_start)
+L(next):
+	mov	%rdi, %rax
+	and	$-16, %rax
+	pcmpeqb	(%rax), %xmm0
+	mov	$-1, %r10d
+	sub	%rax, %rcx
+	shl	%cl, %r10d
+	pmovmskb %xmm0, %edx
+	and	%r10d, %edx
+	jnz	L(exit)
+L(align16_start):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	pcmpeqb	16(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+	
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$80, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm1
+	add	$16, %rax
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm2
+	add	$16, %rax
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm3
+	add	$16, %rax
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$16, %rax
+	.p2align 4
+	L(align64_loop):
+	movaps	(%rax),	%xmm4
+	pminub	16(%rax), 	%xmm4
+	movaps	32(%rax), 	%xmm5
+	pminub	48(%rax), 	%xmm5
+	add	$64, 	%rax
+	pminub	%xmm4,	%xmm5
+	pcmpeqb	%xmm0,	%xmm5
+	pmovmskb %xmm5,	%edx
+	test	%edx,	%edx
+	jz	L(align64_loop)
+
+
+	pcmpeqb	-64(%rax), %xmm0
+	sub	$80, 	%rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+	RETURN
+
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax
+L(exit_less16):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	RETURN
+	.p2align 4
+L(exit16):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$16, %rax
+	RETURN
+	.p2align 4
+L(exit32):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$32, %rax
+	RETURN
+	.p2align 4
+L(exit48):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$48, %rax
+	RETURN
+	.p2align 4
+L(exit64):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+# ifndef USE_AS_STRCAT
+	RETURN
+
+END (__strlen_sse2_pminub)
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 83a88ec..d789707 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -32,7 +32,10 @@ ENTRY(strlen)
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	__strlen_sse2(%rip), %rax
+1:	leaq	__strlen_sse2_pminub(%rip), %rax
+	testl	$bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
+	jnz	2f
+	leaq	__strlen_sse2(%rip), %rax
 	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
 	jz	2f
 	leaq	__strlen_sse42(%rip), %rax
diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
new file mode 100644
index 0000000..a3cdbff
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-c.c
@@ -0,0 +1,8 @@
+#define STRNCAT __strncat_sse2
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+  __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
+#endif
+
+#include "string/strncat.c"
diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
new file mode 100644
index 0000000..133e1d2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_sse2_unaligned
+#include "strcat-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
new file mode 100644
index 0000000..6c45ff3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_ssse3
+#include "strcat-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strncat.S b/sysdeps/x86_64/multiarch/strncat.S
new file mode 100644
index 0000000..fd569c2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat.S
@@ -0,0 +1,3 @@
+#define STRCAT strncat
+#define USE_AS_STRNCAT
+#include "strcat.S"

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                         |   29 ++
 NEWS                                              |    5 +-
 string/strncat.c                                  |    6 +-
 sysdeps/x86_64/multiarch/Makefile                 |    6 +-
 sysdeps/x86_64/multiarch/init-arch.c              |   10 +-
 sysdeps/x86_64/multiarch/init-arch.h              |    2 +
 sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S  |   54 ++
 sysdeps/x86_64/multiarch/strcat-ssse3.S           |  558 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strcat.S                 |   85 ++++
 sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S  |  450 ++++++++++++-----
 sysdeps/x86_64/multiarch/strcpy-ssse3.S           |  280 ++++++-----
 sysdeps/x86_64/multiarch/strlen-no-bsf.S          |   74 ++--
 sysdeps/x86_64/multiarch/strlen-sse2-pminub.S     |  260 ++++++++++
 sysdeps/x86_64/multiarch/strlen.S                 |    5 +-
 sysdeps/x86_64/multiarch/strncat-c.c              |    8 +
 sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S |    3 +
 sysdeps/x86_64/multiarch/strncat-ssse3.S          |    3 +
 sysdeps/x86_64/multiarch/strncat.S                |    3 +
 18 files changed, 1520 insertions(+), 321 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
 create mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
 create mode 100644 sysdeps/x86_64/multiarch/strcat.S
 create mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
 create mode 100644 sysdeps/x86_64/multiarch/strncat-c.c
 create mode 100644 sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
 create mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S
 create mode 100644 sysdeps/x86_64/multiarch/strncat.S


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]