This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH RFC 2/2 V3] Improve 64bit memset for Corei7 with avx2 instruction
- From: ling dot ma dot program at gmail dot com
- To: libc-alpha at sourceware dot org
- Cc: aj at suse dot com, neleai at seznam dot cz, liubov dot dmitrieva at gmail dot com, Ma Ling <ling dot ml at alibaba-inc dot com>
- Date: Mon, 29 Jul 2013 05:42:02 -0400
- Subject: [PATCH RFC 2/2 V3] Improve 64bit memset for Corei7 with avx2 instruction
From: Ma Ling <ling.ml@alibaba-inc.com>
In this patch we use an approach similar to the one used for memcpy:
avoid branch instructions and force the destination to be aligned
for the AVX instructions. With the gcc.403 benchmark we find that
memset takes 5~20 times more time than memcpy.
The benchmark also indicates that this patch improves performance
by 19% to 59% compared with the original memset implemented with SSE2.
case avx2 sse2 avx2 vs sse2
200i 3464384877 4908415928 1.416821774
g23 5383480843 6412897622 1.191217691
166i 1918420168 2641790543 1.377065664
cp-decl 1827844868 2482888167 1.358369198
c-type 9541987519 11918389781 1.249046884
expr2 5198536021 6715679414 1.291840508
expr 3521295060 4557375470 1.294232773
s04 8899036518 11009111003 1.23711269
scilab 1205644741 1925246735 1.596860725
Thanks
Ling
---
In this version our patch is based on commit-id:641aa7b45991b6564a8fa825c681ad6ad1c7721f,
so the comparison results differ from those of the previous versions.
sysdeps/x86_64/multiarch/Makefile | 3 +-
sysdeps/x86_64/multiarch/memset-avx2.S | 202 +++++++++++++++++++++++++++++++++
sysdeps/x86_64/multiarch/memset.S | 59 ++++++++++
sysdeps/x86_64/multiarch/memset_chk.S | 44 +++++++
4 files changed, 307 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/memset.S
create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 203d16e..aae7aae 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
+ strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3 \
+ memset-avx2
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
new file mode 100644
index 0000000..1c3796b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2.S
@@ -0,0 +1,202 @@
+/* memset with AVX2
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc
+
+#include "asm-syntax.h"
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+#ifndef MEMSET
+# define MEMSET __memset_avx2
+# define MEMSET_CHK __memset_chk_avx2
+#endif
+
+ .section .text.avx2,"ax",@progbits
+#if defined PIC
+ENTRY (MEMSET_CHK) /* Checked entry point: validates %rcx against the length in %rdx.  */
+ cmpq %rdx, %rcx /* Flags from %rcx - %rdx; %rcx is presumably the destination object size -- TODO confirm against callers.  */
+ jb HIDDEN_JUMPTARGET (__chk_fail) /* Unsigned %rcx < %rdx: jump to __chk_fail.  */
+END (MEMSET_CHK) /* No ret here: execution continues into the code assembled next.  */
+#endif
+/* memset: %rdi = dest, %esi = fill byte, %rdx = length; returns dest in %rax.  */
+ENTRY (MEMSET)
+	vpxor	%xmm0, %xmm0, %xmm0	/* All-zero shuffle mask.  */
+	vmovd	%esi, %xmm1
+	lea	(%rdi, %rdx), %r8	/* %r8 = dest + length (one past the end).  */
+	vpshufb	%xmm0, %xmm1, %xmm0	/* %xmm0 = fill byte replicated 16 times.  */
+	mov	%rdi, %rax	/* Return value, saved before %rdi is changed.  */
+	cmp	$256, %rdx
+	jae	L(256bytesormore)
+	xor	%ecx, %ecx
+	mov	%sil, %cl
+	mov	%cl, %ch	/* %ecx = 0x0000cccc (cc = fill byte).  */
+	cmp	$128, %rdx
+	jb	L(less_128bytes)
+	vmovups	%xmm0, (%rdi)	/* 128 <= length < 256: first 128 bytes ...  */
+	vmovups	%xmm0, 0x10(%rdi)
+	vmovups	%xmm0, 0x20(%rdi)
+	vmovups	%xmm0, 0x30(%rdi)
+	vmovups	%xmm0, 0x40(%rdi)
+	vmovups	%xmm0, 0x50(%rdi)
+	vmovups	%xmm0, 0x60(%rdi)
+	vmovups	%xmm0, 0x70(%rdi)
+	vmovups	%xmm0, -0x80(%r8)	/* ... and last 128 bytes; the two runs may overlap.  */
+	vmovups	%xmm0, -0x70(%r8)
+	vmovups	%xmm0, -0x60(%r8)
+	vmovups	%xmm0, -0x50(%r8)
+	vmovups	%xmm0, -0x40(%r8)
+	vmovups	%xmm0, -0x30(%r8)
+	vmovups	%xmm0, -0x20(%r8)
+	vmovups	%xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_128bytes):
+	mov	%ecx, %esi	/* %rsi = 0x0000cccc (32-bit mov clears the upper half).  */
+	shl	$16, %ecx	/* %ecx = 0xcccc0000.  */
+	cmp	$64, %edx
+	jb	L(less_64bytes)
+	vmovups	%xmm0, (%rdi)	/* 64 <= length < 128: first and last 64 bytes.  */
+	vmovups	%xmm0, 0x10(%rdi)
+	vmovups	%xmm0, 0x20(%rdi)
+	vmovups	%xmm0, 0x30(%rdi)
+	vmovups	%xmm0, -0x40(%r8)
+	vmovups	%xmm0, -0x30(%r8)
+	vmovups	%xmm0, -0x20(%r8)
+	vmovups	%xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_64bytes):
+	orl	%esi, %ecx	/* %ecx = 0xcccccccc.  */
+	mov	%ecx, %esi	/* Keep a copy for the 64-bit pattern below.  */
+	cmp	$32, %edx
+	jb	L(less_32bytes)
+	vmovups	%xmm0, (%rdi)	/* 32 <= length < 64: first and last 32 bytes.  */
+	vmovups	%xmm0, 0x10(%rdi)
+	vmovups	%xmm0, -0x20(%r8)
+	vmovups	%xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_32bytes):
+	shl	$32, %rcx	/* %rcx = 0xcccccccc00000000.  */
+	cmp	$16, %edx
+	jb	L(less_16bytes)
+	vmovups	%xmm0, (%rdi)	/* 16 <= length < 32: first and last 16 bytes.  */
+	vmovups	%xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_16bytes):
+	or	%rsi, %rcx	/* %rcx = fill byte replicated 8 times.  */
+	cmp	$8, %edx
+	jb	L(less_8bytes)
+	mov	%rcx, (%rdi)	/* 8 <= length < 16: first and last 8 bytes.  */
+	mov	%rcx, -0x08(%r8)
+	ret
+	ALIGN(4)
+L(less_8bytes):
+	cmp	$4, %edx
+	jb	L(less_4bytes)
+	mov	%ecx, (%rdi)	/* 4 <= length < 8: first and last 4 bytes.  */
+	mov	%ecx, -0x04(%r8)
+	ret	/* Done; do not run on into L(less_4bytes).  */
+	ALIGN(4)
+L(less_4bytes):
+	cmp	$2, %edx
+	jb	L(less_2bytes)
+	mov	%cx, (%rdi)	/* 2 <= length < 4: first and last 2 bytes.  */
+	mov	%cx, -0x02(%r8)
+	ret
+	ALIGN(4)
+L(less_2bytes):
+	cmp	$1, %edx
+	jb	L(less_1bytes)
+	mov	%cl, (%rdi)	/* length == 1.  */
+L(less_1bytes):
+	ret
+
+	ALIGN(4)
+L(256bytesormore):
+	vinserti128 $1, %xmm0, %ymm0, %ymm0	/* %ymm0 = fill byte replicated 32 times.  */
+	vmovups	%ymm0, (%rdi)	/* Unaligned store of the first 32 bytes.  */
+	mov	%rdi, %r9
+	and	$-0x20, %rdi
+	add	$32, %rdi	/* %rdi = next 32-byte boundary above dest.  */
+	sub	%rdi, %r9	/* %r9 = dest - %rdi (negative).  */
+	add	%r9, %rdx	/* %rdx = bytes from the aligned %rdi to the end.  */
+	cmp	$4096, %rdx
+	ja	L(gobble_data)
+
+	sub	$0x80, %rdx	/* From here %rdx = bytes remaining minus 128.  */
+L(gobble_128_loop):
+	prefetcht0 0x1c0(%rdi)	/* Prefetch ahead of the store pointer.  */
+	vmovaps	%ymm0, (%rdi)	/* Aligned stores: %rdi is 32-byte aligned here.  */
+	prefetcht0 0x280(%rdi)
+	vmovaps	%ymm0, 0x20(%rdi)
+	vmovaps	%ymm0, 0x40(%rdi)
+	vmovaps	%ymm0, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi	/* lea leaves the flags alone.  */
+	sub	$0x80, %rdx
+	jae	L(gobble_128_loop)	/* Loop while at least 128 bytes remain.  */
+	vmovups	%ymm0, -0x80(%r8)	/* Last 128 bytes, addressed from the end.  */
+	vmovups	%ymm0, -0x60(%r8)
+	vmovups	%ymm0, -0x40(%r8)
+	vmovups	%ymm0, -0x20(%r8)
+	vzeroupper
+	ret
+
+	ALIGN(4)
+L(gobble_data):
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %r9
+#else
+	mov	__x86_shared_cache_size_half(%rip), %r9
+#endif
+	shl	$4, %r9	/* Threshold = 16 * the value loaded above.  */
+	cmp	%r9, %rdx
+	ja	L(gobble_big_data)
+	mov	%rax, %r9	/* Save the return value; %al is needed by stosb.  */
+	mov	%esi, %eax	/* %al = fill byte (%esi is unchanged on this path).  */
+	mov	%rdx, %rcx	/* %rcx = byte count for rep stosb.  */
+	rep stosb
+	mov	%r9, %rax	/* Restore the return value.  */
+	vzeroupper
+	ret
+
+	ALIGN(4)
+L(gobble_big_data):
+	sub	$0x80, %rdx	/* From here %rdx = bytes remaining minus 128.  */
+L(gobble_big_data_loop):
+	vmovntdq	%ymm0, (%rdi)	/* Non-temporal stores to the aligned %rdi.  */
+	vmovntdq	%ymm0, 0x20(%rdi)
+	vmovntdq	%ymm0, 0x40(%rdi)
+	vmovntdq	%ymm0, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_big_data_loop)	/* Loop while at least 128 bytes remain.  */
+	vmovups	%ymm0, -0x80(%r8)	/* Last 128 bytes, addressed from the end.  */
+	vmovups	%ymm0, -0x60(%r8)
+	vmovups	%ymm0, -0x40(%r8)
+	vmovups	%ymm0, -0x20(%r8)
+	vzeroupper
+	sfence	/* Fence after the vmovntdq stores above.  */
+	ret
+
+END (MEMSET)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
new file mode 100644
index 0000000..c15a80a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -0,0 +1,59 @@
+/* Multiple versions of memset
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib. */
+#ifndef NOT_IN_libc
+ENTRY(memset) /* Selector: returns in %rax the address of the memset version to use.  */
+ .type memset, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip) /* Is the field at KIND_OFFSET still zero?  */
+ jne 1f /* Non-zero: skip the initialization call.  */
+ call __init_cpu_features
+1: leaq __memset_sse2(%rip), %rax /* Default choice: __memset_sse2.  */
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f /* AVX2_Usable bit clear: keep the default.  */
+ leaq __memset_avx2(%rip), %rax /* Bit set: choose __memset_avx2 instead.  */
+2: ret
+END(memset)
+#endif
+
+#if !defined NOT_IN_libc
+# undef memset
+# define memset __memset_sse2 /* Later memset tokens, including those in the file included below, become __memset_sse2.  */
+
+# undef __memset_chk
+# define __memset_chk __memset_chk_sse2 /* Same renaming for __memset_chk.  */
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memset calls through a PLT.
+ The speedup we get from using GPR instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memset; __GI_memset = __memset_sse2
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias) /* Expands to nothing from here on.  */
+#endif
+
+#include "../memset.S" /* Presumably the baseline implementation, assembled under the names set above -- confirm against that file.  */
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
new file mode 100644
index 0000000..2dee256
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -0,0 +1,44 @@
+/* Multiple versions of __memset_chk
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2010-2012 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib. */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ENTRY(__memset_chk) /* Selector: returns in %rax the address of the __memset_chk version to use.  */
+ .type __memset_chk, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip) /* Is the field at KIND_OFFSET still zero?  */
+ jne 1f /* Non-zero: skip the initialization call.  */
+ call __init_cpu_features
+1: leaq __memset_chk_sse2(%rip), %rax /* Default choice: __memset_chk_sse2.  */
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f /* AVX2_Usable bit clear: keep the default.  */
+ leaq __memset_chk_avx2(%rip), %rax /* Bit set: choose __memset_chk_avx2 instead.  */
+2: ret
+END(__memset_chk)
+
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter) /* Second name for the same symbol.  */
+ .section .gnu.warning.__memset_zero_constant_len_parameter /* The string below is placed in this section.  */
+ .string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else /* !SHARED */
+# include "../memset_chk.S"
+# endif
+#endif
--
1.8.1.4