This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH RFC 2/2 V3] Improve 64bit memset for Corei7 with avx2 instruction


From: Ma Ling <ling.ml@alibaba-inc.com>

In this patch we use an approach similar to the one used for memcpy:
we avoid branch instructions and force the destination to be aligned
for the avx instructions. With the gcc.403 benchmark we found that memset
spends 5~20 times more time than memcpy.

The benchmark also indicates that this patch improves performance
by 19% to 59% compared with the original memset implemented with sse2.

case	avx2			sse2			avx2 vs sse2
200i	3464384877		4908415928		1.416821774
g23		5383480843		6412897622		1.191217691
166i	1918420168		2641790543		1.377065664
cp-decl	1827844868		2482888167		1.358369198
c-type	9541987519		11918389781		1.249046884
expr2	5198536021		6715679414		1.291840508
expr	3521295060		4557375470		1.294232773
s04		8899036518		11009111003		1.23711269
scilab	1205644741		1925246735		1.596860725


Thanks
Ling
---
In this version our patch is based on commit-id:641aa7b45991b6564a8fa825c681ad6ad1c7721f,
so the comparison results differ from those of the previous versions.

 sysdeps/x86_64/multiarch/Makefile      |   3 +-
 sysdeps/x86_64/multiarch/memset-avx2.S | 202 +++++++++++++++++++++++++++++++++
 sysdeps/x86_64/multiarch/memset.S      |  59 ++++++++++
 sysdeps/x86_64/multiarch/memset_chk.S  |  44 +++++++
 4 files changed, 307 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/memset.S
 create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 203d16e..aae7aae 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
+		   strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3 \
+		   memset-avx2
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
new file mode 100644
index 0000000..1c3796b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2.S
@@ -0,0 +1,202 @@
+/* memset with AVX2
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc
+
+#include "asm-syntax.h"
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+#ifndef MEMSET
+# define MEMSET	__memset_avx2
+# define MEMSET_CHK	__memset_chk_avx2
+#endif
+
+	.section .text.avx2,"ax",@progbits
+#if defined PIC
+ENTRY (MEMSET_CHK)	/* Checked entry; %rcx is presumably the destination object size (confirm against callers).  */
+	cmpq	%rdx, %rcx	/* Compare %rcx with %rdx, which MEMSET below uses as the length.  */
+	jb	HIDDEN_JUMPTARGET (__chk_fail)	/* %rcx < %rdx (unsigned): go to __chk_fail.  */
+END (MEMSET_CHK)	/* No ret here: execution continues into the code that follows.  */
+#endif
+
+ENTRY (MEMSET)	/* In: %rdi = dst, %sil = fill byte, %rdx = length.  Out: %rax = dst.  */
+	vpxor	%xmm0, %xmm0, %xmm0	/* All-zero shuffle mask: every lane selects byte 0.  */
+	vmovd %esi, %xmm1
+	lea	(%rdi, %rdx), %r8	/* %r8 = dst + length (one past the last byte).  */
+	vpshufb	%xmm0, %xmm1, %xmm0	/* %xmm0 = fill byte replicated 16 times.  */
+	mov	%rdi, %rax	/* Return value is the original dst.  */
+	cmp	$256, %rdx
+	jae	L(256bytesormore)
+	xor	%ecx, %ecx
+	mov %sil, %cl
+	mov %cl, %ch	/* %ecx = fill byte x2, upper bits zero.  */
+	cmp	$128, %rdx
+	jb	L(less_128bytes)
+	vmovups %xmm0, (%rdi)	/* 128..255 bytes: first 128 from dst ...  */
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, 0x20(%rdi)
+	vmovups %xmm0, 0x30(%rdi)
+	vmovups %xmm0, 0x40(%rdi)
+	vmovups %xmm0, 0x50(%rdi)
+	vmovups %xmm0, 0x60(%rdi)
+	vmovups %xmm0, 0x70(%rdi)
+	vmovups %xmm0, -0x80(%r8)	/* ... and last 128 ending at %r8; the halves may overlap.  */
+	vmovups %xmm0, -0x70(%r8)
+	vmovups %xmm0, -0x60(%r8)
+	vmovups %xmm0, -0x50(%r8)
+	vmovups %xmm0, -0x40(%r8)
+	vmovups %xmm0, -0x30(%r8)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_128bytes):	/* length < 128.  */
+	mov	%ecx, %esi	/* 32-bit move zero-extends: %rsi = fill x2 (no prior clearing needed).  */
+	shl	$16, %ecx	/* %ecx = fill x2 in bits 16..31.  */
+	cmp	$64, %edx
+	jb	L(less_64bytes)
+	vmovups %xmm0, (%rdi)	/* 64..127 bytes: first 64 and last 64, possibly overlapping.  */
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, 0x20(%rdi)
+	vmovups %xmm0, 0x30(%rdi)
+	vmovups %xmm0, -0x40(%r8)
+	vmovups %xmm0, -0x30(%r8)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_64bytes):	/* length < 64.  */
+	orl	%esi, %ecx	/* %ecx = fill x4.  */
+	mov	%ecx, %esi	/* %rsi = fill x4, zero-extended.  */
+	cmp	$32, %edx
+	jb	L(less_32bytes)
+	vmovups %xmm0, (%rdi)	/* 32..63 bytes: first 32 and last 32.  */
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_32bytes):	/* length < 32.  */
+	shl	$32, %rcx	/* Fill x4 moved to the upper half of %rcx.  */
+	cmp	$16, %edx
+	jb	L(less_16bytes)
+	vmovups %xmm0, (%rdi)	/* 16..31 bytes: first 16 and last 16.  */
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_16bytes):	/* length < 16.  */
+	or	%rsi, %rcx	/* %rcx = fill x8.  */
+	cmp	$8, %edx
+	jb	L(less_8bytes)
+	mov %rcx, (%rdi)	/* 8..15 bytes: first 8 and last 8.  */
+	mov %rcx, -0x08(%r8)
+	ret
+	ALIGN(4)
+L(less_8bytes):	/* length < 8.  */
+	cmp	$4, %edx
+	jb	L(less_4bytes)
+	mov %ecx, (%rdi)	/* 4..7 bytes: first 4 and last 4.  */
+	mov %ecx, -0x04(%r8)
+	ret	/* Fully covered; do not fall through into the 2-byte stores.  */
+	ALIGN(4)
+L(less_4bytes):	/* length < 4.  */
+	cmp	$2, %edx
+	jb	L(less_2bytes)
+	mov	%cx, (%rdi)	/* 2..3 bytes: first 2 and last 2.  */
+	mov	%cx, -0x02(%r8)
+	ret
+	ALIGN(4)
+L(less_2bytes):	/* length is 0 or 1.  */
+	cmp	$1, %edx
+	jb	L(less_1bytes)
+	mov	%cl, (%rdi)
+L(less_1bytes):
+	ret
+
+	ALIGN(4)
+L(256bytesormore):	/* length >= 256.  */
+	vinserti128 $1, %xmm0, %ymm0, %ymm0	/* %ymm0 = fill byte x32.  */
+	vmovups	%ymm0, (%rdi)	/* Unaligned head store covers the bytes skipped by the alignment below.  */
+	mov	%rdi, %r9
+	and	$-0x20, %rdi
+	add	$32, %rdi	/* %rdi = next 32-byte boundary strictly above dst.  */
+	sub	%rdi, %r9	/* %r9 = dst - aligned %rdi (negative).  */
+	add	%r9, %rdx	/* %rdx = bytes remaining from the aligned %rdi.  */
+	cmp	$4096, %rdx
+	ja	L(gobble_data)
+
+	sub	$0x80, %rdx	/* Bias the count so the loop's sub/jae tests for 128 more bytes.  */
+L(gobble_128_loop):
+	prefetcht0	0x1c0(%rdi)
+	vmovaps	%ymm0, (%rdi)	/* Aligned stores: %rdi is 32-byte aligned here.  */
+	prefetcht0	0x280(%rdi)
+	vmovaps	%ymm0, 0x20(%rdi)
+	vmovaps	%ymm0, 0x40(%rdi)
+	vmovaps	%ymm0, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi	/* lea leaves the flags untouched.  */
+	sub	$0x80, %rdx
+	jae	L(gobble_128_loop)	/* No borrow: at least 128 more bytes remain.  */
+	vmovups	%ymm0, -0x80(%r8)	/* Tail: last 128 bytes, unaligned, ending at %r8.  */
+	vmovups	%ymm0, -0x60(%r8)
+	vmovups	%ymm0, -0x40(%r8)
+	vmovups	%ymm0, -0x20(%r8)
+	vzeroupper
+	ret
+
+	ALIGN(4)
+L(gobble_data):	/* More than 4096 bytes remain.  */
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %r9
+#else
+	mov	__x86_shared_cache_size_half(%rip), %r9
+#endif
+	shl	$4, %r9	/* Threshold = 16 * the value loaded above.  */
+	cmp	%r9, %rdx
+	ja	L(gobble_big_data)
+	mov	%rax, %r9	/* Save the return value; %al is needed by stosb.  */
+	mov	%esi, %eax	/* %al = fill byte (%esi is unmodified on this path).  */
+	mov	%rdx, %rcx	/* %rcx = remaining byte count.  */
+	rep	stosb
+	mov	%r9, %rax	/* Restore the return value.  */
+	vzeroupper
+	ret
+
+	ALIGN(4)
+L(gobble_big_data):	/* Above the threshold: use non-temporal stores.  */
+	sub	$0x80, %rdx	/* Same biased count as in L(gobble_128_loop).  */
+L(gobble_big_data_loop):
+	vmovntdq	%ymm0, (%rdi)
+	vmovntdq	%ymm0, 0x20(%rdi)
+	vmovntdq	%ymm0, 0x40(%rdi)
+	vmovntdq	%ymm0, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_big_data_loop)
+	vmovups	%ymm0, -0x80(%r8)	/* Tail: last 128 bytes, unaligned, ending at %r8.  */
+	vmovups	%ymm0, -0x60(%r8)
+	vmovups	%ymm0, -0x40(%r8)
+	vmovups	%ymm0, -0x20(%r8)
+	vzeroupper
+	sfence	/* Order the non-temporal stores before returning.  */
+	ret
+
+END (MEMSET)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
new file mode 100644
index 0000000..c15a80a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -0,0 +1,59 @@
+/* Multiple versions of memset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+ENTRY(memset)	/* Indirect-function resolver: returns the selected implementation's address in %rax.  */
+	.type	memset, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)	/* Nonzero field: skip the init call below.  */
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__memset_sse2(%rip), %rax	/* Default choice.  */
+	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	jz	2f	/* Bit clear: keep __memset_sse2.  */
+	leaq	__memset_avx2(%rip), %rax	/* Bit set: select __memset_avx2.  */
+2:	ret
+END(memset)
+#endif
+
+#if !defined NOT_IN_libc
+# undef memset
+# define memset __memset_sse2	/* Uses of "memset" in the file included below become __memset_sse2.  */
+
+# undef __memset_chk
+# define __memset_chk __memset_chk_sse2	/* Likewise for __memset_chk.  */
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memset calls through a PLT.
+   The speedup we get from using GPR instruction is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_sse2
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)	/* Expands to nothing for the included file.  */
+#endif
+
+#include "../memset.S"	/* Assembled with the macro overrides above in effect.  */
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
new file mode 100644
index 0000000..2dee256
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -0,0 +1,44 @@
+/* Multiple versions of __memset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib.  */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ENTRY(__memset_chk)	/* Indirect-function resolver: returns the selected implementation's address in %rax.  */
+	.type	__memset_chk, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)	/* Nonzero field: skip the init call below.  */
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__memset_chk_sse2(%rip), %rax	/* Default choice.  */
+	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	jz	2f	/* Bit clear: keep __memset_chk_sse2.  */
+	leaq	__memset_chk_avx2(%rip), %rax	/* Bit set: select __memset_chk_avx2.  */
+2:	ret
+END(__memset_chk)
+
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+	.section .gnu.warning.__memset_zero_constant_len_parameter
+	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+#  include "../memset_chk.S"
+# endif
+#endif
-- 
1.8.1.4


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]