This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH RFC V4] Improve 64bit memcpy/memmove for Corei7 with unaligned AVX instructions
- From: ling dot ma dot program at gmail dot com
- To: libc-alpha at sourceware dot org
- Cc: aj at suse dot com, neleai at seznam dot cz, liubov dot dmitrieva at gmail dot com, Ma Ling <ling dot ml at alibaba-inc dot com>
- Date: Mon, 29 Jul 2013 05:40:55 -0400
- Subject: [PATCH RFC V4] Improve 64bit memcpy/memmove for Corei7 with unaligned AVX instructions
From: Ma Ling <ling.ml@alibaba-inc.com>
We manage to avoid branch instructions and force the destination to be aligned
using AVX instructions. We then modified gcc.403 so that only the memcpy function is measured;
the gcc.403 benchmarks indicate this version improves performance by 4% to 14%
compared with memcpy_sse2_unaligned on a Haswell machine.
case avx_unaligned sse2_unaligned AVX vs SSE2
200i 146833745 168384142 1.146767332
g23 1431207341 1557405243 1.088175835
166i 350901531 379068674 1.08027079
cp-decl 370750774 395890196 1.067806796
c-type 763780824 810806468 1.061569553
expr2 986698539 1067232192 1.081619309
expr 727016829 758953883 1.043928906
s04 1117900758 1185159528 1.060165242
scilab 63309111 66893431 1.05661618
(We will send test patch on memcpy for above cases)
Thanks
Ling
---
In this version our patch is based on commit id 641aa7b45991b6564a8fa825c681ad6ad1c7721f,
so the compared results differ from those of previous versions.
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +
sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S | 393 +++++++++++++++++++++++
sysdeps/x86_64/multiarch/memmove-avx-unaligned.S | 4 +
sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S | 4 +
5 files changed, 414 insertions(+)
create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 203d16e..f622429 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -9,6 +9,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+ memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 28d3579..449f75b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -46,6 +46,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
IFUNC_IMPL (i, name, __memmove_chk,
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+ __memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
@@ -55,6 +57,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memmove.S. */
IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+ __memmove_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
@@ -215,6 +219,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+ __memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
@@ -224,6 +230,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memcpy.S. */
IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
+ __memcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
@@ -232,6 +240,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
IFUNC_IMPL (i, name, __mempcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+ __mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
@@ -241,6 +251,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/mempcpy.S. */
IFUNC_IMPL (i, name, mempcpy,
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+ __mempcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
new file mode 100644
index 0000000..005cfb7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -0,0 +1,393 @@
+/* memcpy with AVX
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+#ifndef MEMCPY
+# define MEMCPY __memcpy_avx_unaligned
+# define MEMCPY_CHK __memcpy_chk_avx_unaligned
+#endif
+
+ .section .text.avx,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	cmpq %rdx, %rcx	# rcx = destination object size (fortify convention — confirm); len in rdx
+	jb HIDDEN_JUMPTARGET (__chk_fail)	# size < len: buffer overflow; else fall through to MEMCPY
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+	mov %rdi, %rax	# return value: dst (memcpy/memmove convention)
+
+#ifdef USE_AS_MEMPCPY
+	add %rdx, %rax	# mempcpy returns dst + len instead
+#endif
+
+	lea (%rsi, %rdx), %r8	# r8 = src + len (one past end of source)
+	lea (%rdi, %rdx), %r9	# r9 = dst + len (one past end of destination)
+	cmp $256, %rdx
+	ja L(256bytesormore)	# large copies: loop paths below
+	cmp $128, %edx
+	jb L(less_128bytes)
+	vmovups (%rsi), %xmm0	# 128..256 bytes: read the whole head (128B) and tail (128B) before any store, so src/dst overlap is safe
+	vmovups 0x10(%rsi), %xmm1
+	vmovups 0x20(%rsi), %xmm2
+	vmovups 0x30(%rsi), %xmm3
+	vmovups 0x40(%rsi), %xmm4
+	vmovups 0x50(%rsi), %xmm5
+	vmovups 0x60(%rsi), %xmm6
+	vmovups 0x70(%rsi), %xmm7
+	vmovups -0x80(%r8), %xmm8	# tail: last 128 bytes, addressed back from the end pointer
+	vmovups -0x70(%r8), %xmm9
+	vmovups -0x60(%r8), %xmm10
+	vmovups -0x50(%r8), %xmm11
+	vmovups -0x40(%r8), %xmm12
+	vmovups -0x30(%r8), %xmm13
+	vmovups -0x20(%r8), %xmm14
+	vmovups -0x10(%r8), %xmm15
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm1, 0x10(%rdi)
+	vmovups %xmm2, 0x20(%rdi)
+	vmovups %xmm3, 0x30(%rdi)
+	vmovups %xmm4, 0x40(%rdi)
+	vmovups %xmm5, 0x50(%rdi)
+	vmovups %xmm6, 0x60(%rdi)
+	vmovups %xmm7, 0x70(%rdi)
+	vmovups %xmm8, -0x80(%r9)	# head and tail ranges may overlap in the middle; tail stores win
+	vmovups %xmm9, -0x70(%r9)
+	vmovups %xmm10, -0x60(%r9)
+	vmovups %xmm11, -0x50(%r9)
+	vmovups %xmm12, -0x40(%r9)
+	vmovups %xmm13, -0x30(%r9)
+	vmovups %xmm14, -0x20(%r9)
+	vmovups %xmm15, -0x10(%r9)
+	ret
+	ALIGN(4)
+L(less_128bytes):
+	cmp $64, %edx
+	jb L(less_64bytes)
+	vmovups (%rsi), %xmm0	# 64..128 bytes: 4 regs from the head + 4 from the tail, same overlap trick
+	vmovups 0x10(%rsi), %xmm1
+	vmovups 0x20(%rsi), %xmm2
+	vmovups 0x30(%rsi), %xmm3
+	vmovups -0x40(%r8), %xmm4
+	vmovups -0x30(%r8), %xmm5
+	vmovups -0x20(%r8), %xmm6
+	vmovups -0x10(%r8), %xmm7
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm1, 0x10(%rdi)
+	vmovups %xmm2, 0x20(%rdi)
+	vmovups %xmm3, 0x30(%rdi)
+	vmovups %xmm4, -0x40(%r9)
+	vmovups %xmm5, -0x30(%r9)
+	vmovups %xmm6, -0x20(%r9)
+	vmovups %xmm7, -0x10(%r9)
+	ret
+	ALIGN(4)
+L(less_64bytes):
+	cmp $32, %edx
+	jb L(less_32bytes)
+	vmovups (%rsi), %xmm0	# 32..64 bytes
+	vmovups 0x10(%rsi), %xmm1
+	vmovups -0x20(%r8), %xmm6
+	vmovups -0x10(%r8), %xmm7
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm1, 0x10(%rdi)
+	vmovups %xmm6, -0x20(%r9)
+	vmovups %xmm7, -0x10(%r9)
+	ret
+	ALIGN(4)
+L(less_32bytes):
+	cmp $16, %edx
+	jb L(less_16bytes)
+	vmovups (%rsi), %xmm0	# 16..32 bytes: two possibly-overlapping 16B moves
+	vmovups -0x10(%r8), %xmm7
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm7, -0x10(%r9)
+	ret
+	ALIGN(4)
+L(less_16bytes):
+	cmp $8, %edx
+	jb L(less_8bytes)
+	movq (%rsi), %rcx	# 8..16 bytes: two possibly-overlapping qwords
+	movq -0x08(%r8), %r10
+	movq %rcx, (%rdi)
+	movq %r10, -0x08(%r9)
+	ret
+	ALIGN(4)
+L(less_8bytes):
+	cmp $4, %edx
+	jb L(less_4bytes)
+	mov (%rsi), %ecx	# 4..8 bytes; rdx is dead after this compare, reused as scratch
+	mov -0x04(%r8), %edx
+	mov %ecx, (%rdi)
+	mov %edx, -0x04(%r9)
+	ret
+	ALIGN(4)
+L(less_4bytes):
+	cmp $2, %edx
+	jb L(less_2bytes)
+	mov (%rsi), %cx	# 2..4 bytes
+	mov -0x02(%r8), %dx
+	mov %cx, (%rdi)
+	mov %dx, -0x02(%r9)
+	ret
+	ALIGN(4)
+L(less_2bytes):
+	cmp $1, %rdx	# rdx is 0 or 1 here
+	jb L(less_0bytes)
+	mov (%rsi), %cl	# copy the single byte
+	mov %cl, (%rdi)
+L(less_0bytes):
+	ret
+
+	ALIGN(4)
+L(256bytesormore):
+
+#ifdef USE_AS_MEMMOVE
+	cmp %rsi, %rdi
+	jae L(copy_backward)	# dst >= src: a forward copy could clobber unread source
+#endif
+	cmp $2048, %rdx
+	jae L(gobble_data_movsb)	# very large: rep movsb or non-temporal path
+
+	vmovups -0x80(%r8), %xmm8	# save the last 128 source bytes now, before len/pointers are adjusted
+	vmovups -0x70(%r8), %xmm9
+	vmovups -0x60(%r8), %xmm10
+	vmovups -0x50(%r8), %xmm11
+	vmovups -0x40(%r8), %xmm12
+	vmovups -0x30(%r8), %xmm13
+	vmovups -0x20(%r8), %xmm14
+	vmovups -0x10(%r8), %xmm15
+	vmovups (%rsi), %ymm4	# save the first 32 bytes: dst is about to be rounded up to 32
+	mov %rdi, %r10	# r10 = original dst (for the saved head store)
+	and $-32, %rdi
+	add $32, %rdi	# rdi = dst rounded up to the next 32-byte boundary
+	mov %rdi, %r11
+	sub %r10, %r11	# r11 = head bytes skipped by the alignment (1..32)
+	sub %r11, %rdx
+	add %r11, %rsi	# advance src and shrink len by the same amount
+	sub $0x80, %rdx	# bias len so 'jae' loops while >= 128 bytes remain
+L(goble_128_loop):
+	vmovups (%rsi), %ymm0	# 128 bytes per iteration; destination stores are 32B-aligned
+	vmovups 0x20(%rsi), %ymm1
+	vmovups 0x40(%rsi), %ymm2
+	vmovups 0x60(%rsi), %ymm3
+	lea 0x80(%rsi), %rsi
+	vmovaps %ymm0, (%rdi)
+	vmovaps %ymm1, 0x20(%rdi)
+	vmovaps %ymm2, 0x40(%rdi)
+	vmovaps %ymm3, 0x60(%rdi)
+	lea 0x80(%rdi), %rdi
+	sub $0x80, %rdx
+	jae L(goble_128_loop)
+	vmovups %ymm4, (%r10)	# store the saved unaligned head
+	vzeroupper	# clear upper ymm state before returning to SSE/C code
+	vmovups %xmm8, -0x80(%r9)	# store the saved tail from the end pointer
+	vmovups %xmm9, -0x70(%r9)
+	vmovups %xmm10, -0x60(%r9)
+	vmovups %xmm11, -0x50(%r9)
+	vmovups %xmm12, -0x40(%r9)
+	vmovups %xmm13, -0x30(%r9)
+	vmovups %xmm14, -0x20(%r9)
+	vmovups %xmm15, -0x10(%r9)
+	ret
+
+L(gobble_data_movsb):
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov __x86_shared_cache_size_half(%rip), %rcx
+#endif
+	shl $3, %rcx	# rcx = 4 * shared cache size (half-size << 3) — NT-store threshold
+
+#ifdef USE_AS_MEMMOVE
+	mov %rsi, %r10
+	sub %rdi, %r10	# r10 = src - dst (forward overlap distance)
+	cmp %rdx, %r10
+	jae L(memmove_use_memcpy_fwd)	# distance >= len: ranges don't overlap
+	cmp %rcx, %r10
+	jae L(memmove_use_memcpy_fwd)	# distance >= threshold: memcpy path still safe
+	jmp L(gobble_mem_fwd_llc_start)	# close overlap: use cached rep movsb
+L(memmove_use_memcpy_fwd):
+#endif
+
+	cmp %rcx, %rdx
+	jae L(gobble_big_data_fwd)	# len >= threshold: bypass caches with NT stores
+
+#ifdef USE_AS_MEMMOVE
+L(gobble_mem_fwd_llc_start):
+#endif
+	mov %rdx, %rcx
+	rep movsb	# medium copy: fast-string forward byte copy
+	ret
+
+L(gobble_big_data_fwd):
+	vmovups (%rsi), %ymm4	# save head + tail before adjusting, as in the aligned loop above
+	vmovups -0x80(%r8), %xmm5
+	vmovups -0x70(%r8), %xmm6
+	vmovups -0x60(%r8), %xmm7
+	vmovups -0x50(%r8), %xmm8
+	vmovups -0x40(%r8), %xmm9
+	vmovups -0x30(%r8), %xmm10
+	vmovups -0x20(%r8), %xmm11
+	vmovups -0x10(%r8), %xmm12
+	mov %rdi, %r8	# r8 now = original dst (src-end value no longer needed)
+	and $-32, %rdi
+	add $32, %rdi	# round dst up to 32
+	mov %rdi, %r10
+	sub %r8, %r10	# r10 = head bytes consumed by alignment
+	sub %r10, %rdx
+	add %r10, %rsi
+	sub $0x80, %rdx	# bias for the jae loop
+L(gobble_mem_fwd_loop):
+	prefetchnta 0x1c0(%rsi)	# prefetch well ahead of the read stream
+	prefetchnta 0x280(%rsi)
+	vmovups (%rsi), %ymm0
+	vmovups 0x20(%rsi), %ymm1
+	vmovups 0x40(%rsi), %ymm2
+	vmovups 0x60(%rsi), %ymm3
+	lea 0x80(%rsi), %rsi
+	vmovntdq %ymm0, (%rdi)	# non-temporal stores: avoid polluting caches on huge copies
+	vmovntdq %ymm1, 0x20(%rdi)
+	vmovntdq %ymm2, 0x40(%rdi)
+	vmovntdq %ymm3, 0x60(%rdi)
+	lea 0x80(%rdi), %rdi
+	sub $0x80, %rdx
+	jae L(gobble_mem_fwd_loop)
+	sfence	# order the NT stores before the ordinary stores/return
+	vmovups %ymm4, (%r8)	# saved head
+	vzeroupper
+	vmovups %xmm5, -0x80(%r9)	# saved tail
+	vmovups %xmm6, -0x70(%r9)
+	vmovups %xmm7, -0x60(%r9)
+	vmovups %xmm8, -0x50(%r9)
+	vmovups %xmm9, -0x40(%r9)
+	vmovups %xmm10, -0x30(%r9)
+	vmovups %xmm11, -0x20(%r9)
+	vmovups %xmm12, -0x10(%r9)
+	ret
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+L(copy_backward):
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov __x86_shared_cache_size_half(%rip), %rcx
+#endif
+	shl $3, %rcx	# rcx = 4 * shared cache size, same threshold as forward path
+	vmovups (%rsi), %xmm8	# save the first 128 source bytes (head) — stored last
+	vmovups 0x10(%rsi), %xmm9
+	vmovups 0x20(%rsi), %xmm10
+	vmovups 0x30(%rsi), %xmm11
+	vmovups 0x40(%rsi), %xmm12
+	vmovups 0x50(%rsi), %xmm13
+	vmovups 0x60(%rsi), %xmm14
+	vmovups 0x70(%rsi), %xmm15
+	mov %rdi, %r9	# r9 = dst start (end pointer no longer needed; head stores go here)
+	add %rdx, %rsi	# point both at the end; the copy runs backward
+	add %rdx, %rdi
+	vmovups -0x20(%rsi), %ymm4	# save the last 32 bytes (tail)
+	lea -0x20(%rdi), %r10	# r10 = where the saved tail belongs
+	mov %rdi, %r11
+	and $0x1f, %r11	# r11 = dst-end misalignment (dst & 31)
+	xor %r11, %rdi	# clear those bits: round dst end down to 32
+	sub %r11, %rsi	# drop the same misalignment from src and len
+	sub %r11, %rdx
+#ifdef USE_AS_MEMMOVE
+	mov %rdi, %r11
+	sub %rsi, %r11	# r11 = dst - src (backward overlap distance)
+	cmp %rdx, %r11
+	jae L(memmove_use_memcpy_bwd)	# distance >= len: no overlap
+	cmp %rcx, %r11
+	jae L(memmove_use_memcpy_bwd)	# far apart: NT path is safe
+	jmp L(gobble_mem_bwd_llc_start)	# close overlap: cached copy loop
+#endif
+L(memmove_use_memcpy_bwd):
+	cmp %rcx, %rdx
+	ja L(gobble_big_data_bwd)	# huge copy: non-temporal backward loop
+L(gobble_mem_bwd_llc_start):
+	sub $0x80, %rdx	# bias for the jae loop
+L(gobble_mem_bwd_llc):
+	vmovups -0x20(%rsi), %ymm0	# 128 bytes per iteration, top-down; 32B-aligned stores
+	vmovups -0x40(%rsi), %ymm1
+	vmovups -0x60(%rsi), %ymm2
+	vmovups -0x80(%rsi), %ymm3
+	lea -0x80(%rsi), %rsi
+	vmovaps %ymm0, -0x20(%rdi)
+	vmovaps %ymm1, -0x40(%rdi)
+	vmovaps %ymm2, -0x60(%rdi)
+	vmovaps %ymm3, -0x80(%rdi)
+	lea -0x80(%rdi), %rdi
+	sub $0x80, %rdx
+	jae L(gobble_mem_bwd_llc)
+	vmovups %ymm4, (%r10)	# saved tail
+	vzeroupper
+	vmovups %xmm8, (%r9)	# saved head at the original dst
+	vmovups %xmm9, 0x10(%r9)
+	vmovups %xmm10, 0x20(%r9)
+	vmovups %xmm11, 0x30(%r9)
+	vmovups %xmm12, 0x40(%r9)
+	vmovups %xmm13, 0x50(%r9)
+	vmovups %xmm14, 0x60(%r9)
+	vmovups %xmm15, 0x70(%r9)
+	ret
+
+L(gobble_big_data_bwd):
+	sub $0x80, %rdx	# bias for the jae loop
+L(gobble_mem_bwd_loop):
+	prefetchnta -0x1c0(%rsi)	# prefetch behind the descending read stream
+	prefetchnta -0x280(%rsi)
+	vmovups -0x20(%rsi), %ymm0
+	vmovups -0x40(%rsi), %ymm1
+	vmovups -0x60(%rsi), %ymm2
+	vmovups -0x80(%rsi), %ymm3
+	lea -0x80(%rsi), %rsi
+	vmovntdq %ymm0, -0x20(%rdi)	# non-temporal backward stores
+	vmovntdq %ymm1, -0x40(%rdi)
+	vmovntdq %ymm2, -0x60(%rdi)
+	vmovntdq %ymm3, -0x80(%rdi)
+	lea -0x80(%rdi), %rdi
+	sub $0x80, %rdx
+	jae L(gobble_mem_bwd_loop)
+	sfence	# order NT stores before the ordinary stores/return
+	vmovups %ymm4, (%r10)	# saved tail
+	vzeroupper
+	vmovups %xmm8, (%r9)	# saved head
+	vmovups %xmm9, 0x10(%r9)
+	vmovups %xmm10, 0x20(%r9)
+	vmovups %xmm11, 0x30(%r9)
+	vmovups %xmm12, 0x40(%r9)
+	vmovups %xmm13, 0x50(%r9)
+	vmovups %xmm14, 0x60(%r9)
+	vmovups %xmm15, 0x70(%r9)
+	ret
+#endif
+END (MEMCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
new file mode 100644
index 0000000..352a2c3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* enable the overlap checks and backward-copy path */
+#define MEMCPY __memmove_avx_unaligned	/* rename the entry points for memmove */
+#define MEMCPY_CHK __memmove_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"	/* shared implementation */
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
new file mode 100644
index 0000000..b31394e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY	/* return dst + len instead of dst */
+#define MEMCPY __mempcpy_avx_unaligned	/* rename the entry points for mempcpy */
+#define MEMCPY_CHK __mempcpy_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"	/* shared implementation */
--
1.8.1.4