This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH RFC] Improve 64-bit memcpy/memmove for Core i7 with unaligned AVX2 instructions


There is a mistake in the Copyright year:

> Copyright (C) 2010 Free Software Foundation, Inc.
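
The file is new in this submission, so presumably the year should be the
one it is contributed in, i.e. something like:

    Copyright (C) 2013 Free Software Foundation, Inc.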

You don't need to redefine macros that are already defined in sysdep.h:

+#ifndef L
+# define L(label)      .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc   .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)   \
+       .type name,  @function; \
+       .globl name;    \
+       ALIGN(4);       \
+name:  \
+       cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)     \
+       cfi_endproc;    \
+       .size name, .-name
+#endif
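
For illustration, a minimal sketch of how the preamble could look once
those fall-back definitions are dropped (keeping MEMCPY/MEMCPY_CHK and,
if sysdep.h really does not already provide it, the ALIGN fall-back):

    /* Sketch only: ENTRY, END, L() and the cfi_* macros all come from
       sysdep.h, so the file only needs its own names.  */
    #include <sysdep.h>
    #include "asm-syntax.h"

    #ifndef MEMCPY
    # define MEMCPY     __memcpy_avx2_unaligned
    # define MEMCPY_CHK __memcpy_chk_avx2_unaligned
    #endif

    #ifndef ALIGN
    # define ALIGN(n)   .p2align n
    #endif

            .section .text.avx2,"ax",@progbits
    ENTRY (MEMCPY)
            /* ... function body unchanged ... */
    END (MEMCPY)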

Why do you sometimes use "ret" and sometimes "retq"? It is confusing; in 64-bit mode both assemble to the same near-return instruction, so please pick one and use it consistently.

+       ret
+       ALIGN(4)
+L(less_64bytes):
+       cmp     $32, %edx
+       jb      L(less_32bytes)
+       vmovups (%rsi), %xmm0
+       vmovups 0x10(%rsi), %xmm1
+       vmovups -0x20(%r8), %xmm6
+       vmovups -0x10(%r8), %xmm7
+       vmovups %xmm0, (%rdi)
+       vmovups %xmm1, 0x10(%rdi)
+       vmovups %xmm6, -0x20(%r9)
+       vmovups %xmm7, -0x10(%r9)
+       retq


Which of the instructions you use are from the AVX2 set rather than plain
AVX? If there are none, I think you should rename the functions from AVX2
to AVX.
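
If none of them turn out to be AVX2-only, the rename should be mostly
mechanical; as a sketch (assuming the existing HAS_AVX feature check is
the right gate), the ifunc entries would become e.g.:

    IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
                    __memcpy_avx_unaligned)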


--
Liubov

On Tue, Jul 9, 2013 at 11:11 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Mon, Jul 08, 2013 at 11:53:24AM +0800, Ling Ma wrote:
>> Attached memcpy_profile result for  __mempcpy_avx2_unaligned.
>>
>> Thanks
>> Ling
>>
>>
>> 2013/7/8, ling.ma.program@gmail.com <ling.ma.program@gmail.com>:
>> > From: Ma Ling <ling.ml@alibaba-inc.com>
>> >
>> > In this version we manage to avoid branch instructions and force the
>> > destination to be aligned using AVX2 instructions. We modified gcc.403
>> > so that we measure only the memcpy function; the gcc.403 benchmarks
>> > indicate the new version improves performance by 4% to 16% across
>> > different cases.
>> >
>> > Ondra, I will send out results from your memcpy_profile.
>> >
> Nice, it improves performance, and at a glance the patch looks good. I will review it when the glibc freeze ends.
>
>> > Best Regards
>> > Ling
>> > ---
>> >  sysdeps/x86_64/multiarch/Makefile                 |   5 +-
>> >  sysdeps/x86_64/multiarch/ifunc-defines.sym        |   2 +
>> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c        |  11 +
>> >  sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S  | 438 ++++++++++++++++++++++
>> >  sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S |   4 +
>> >  sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S |   4 +
>> >  6 files changed, 462 insertions(+), 2 deletions(-)
>> >  create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S
>> >  create mode 100644 sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S
>> >  create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S
>> >
>> > diff --git a/sysdeps/x86_64/multiarch/Makefile
>> > b/sysdeps/x86_64/multiarch/Makefile
>> > index dd6c27d..02c0a2a 100644
>> > --- a/sysdeps/x86_64/multiarch/Makefile
>> > +++ b/sysdeps/x86_64/multiarch/Makefile
>> > @@ -8,8 +8,9 @@ ifeq ($(subdir),string)
>> >
>> >  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3
>> > \
>> >                strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
>> > -              memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>> > -              memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
>> > +              memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back memmove-ssse3-back
>> > \
>> > +              memcpy-avx2-unaligned mempcpy-avx2-unaligned memmove-avx2-unaligned \
>> > +              strcasestr-nonascii strcasecmp_l-ssse3 \
>> >                strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
>> >                strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>> >                strcpy-sse2-unaligned strncpy-sse2-unaligned \
>> > diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym
>> > b/sysdeps/x86_64/multiarch/ifunc-defines.sym
>> > index eb1538a..448b8c4 100644
>> > --- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
>> > +++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
>> > @@ -17,4 +17,6 @@ FEATURE_OFFSET            offsetof (struct cpu_features, feature)
>> >  FEATURE_SIZE               sizeof (unsigned int)
>> >
>> >  COMMON_CPUID_INDEX_1
>> > +COMMON_CPUID_INDEX_7
>> >  FEATURE_INDEX_1
>> > +FEATURE_INDEX_7
>> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> > index 332a60d..5fb5663 100644
>> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> > @@ -50,6 +50,8 @@ __libc_ifunc_impl_list (const char *name, struct
>> > libc_ifunc_impl *array,
>> >                           __memmove_chk_ssse3_back)
>> >           IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
>> >                           __memmove_chk_ssse3)
>> > +         IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX2,
>> > +                     __memmove_chk_avx2_unaligned)
>> >           IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
>> >                           __memmove_chk_sse2))
>> >
>> > @@ -59,6 +61,8 @@ __libc_ifunc_impl_list (const char *name, struct
>> > libc_ifunc_impl *array,
>> >                           __memmove_ssse3_back)
>> >           IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
>> >                           __memmove_ssse3)
>> > +         IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX2,
>> > +                     __memmove_avx2_unaligned)
>> >           IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
>> >
>> >    /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
>> > @@ -235,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct
>> > libc_ifunc_impl *array,
>> >                           __memcpy_chk_ssse3_back)
>> >           IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
>> >                           __memcpy_chk_ssse3)
>> > +         IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX2,
>> > +                         __memcpy_chk_avx2_unaligned)
>> >           IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
>> >                           __memcpy_chk_sse2))
>> >
>> > @@ -243,6 +249,7 @@ __libc_ifunc_impl_list (const char *name, struct
>> > libc_ifunc_impl *array,
>> >           IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
>> >                           __memcpy_ssse3_back)
>> >           IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
>> > +         IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX2,
>> > __memcpy_avx2_unaligned)
>> >           IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>> >
>> >    /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
>> > @@ -251,6 +258,8 @@ __libc_ifunc_impl_list (const char *name, struct
>> > libc_ifunc_impl *array,
>> >                           __mempcpy_chk_ssse3_back)
>> >           IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
>> >                           __mempcpy_chk_ssse3)
>> > +         IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX2,
>> > +                         __mempcpy_chk_avx2_unaligned)
>> >           IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
>> >                           __mempcpy_chk_sse2))
>> >
>> > @@ -260,6 +269,8 @@ __libc_ifunc_impl_list (const char *name, struct
>> > libc_ifunc_impl *array,
>> >                           __mempcpy_ssse3_back)
>> >           IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
>> >                           __mempcpy_ssse3)
>> > +         IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX2,
>> > +                         __mempcpy_avx2_unaligned)
>> >           IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
>> >
>> >    /* Support sysdeps/x86_64/multiarch/strlen.S.  */
>> > diff --git a/sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S
>> > b/sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S
>> > new file mode 100644
>> > index 0000000..d32cfad
>> > --- /dev/null
>> > +++ b/sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S
>> > @@ -0,0 +1,438 @@
>> > +/* memcpy with AVX2
>> > +   Copyright (C) 2010 Free Software Foundation, Inc.
>> > +   Contributed by Intel Corporation.
>> > +   This file is part of the GNU C Library.
>> > +
>> > +   The GNU C Library is free software; you can redistribute it and/or
>> > +   modify it under the terms of the GNU Lesser General Public
>> > +   License as published by the Free Software Foundation; either
>> > +   version 2.1 of the License, or (at your option) any later version.
>> > +
>> > +   The GNU C Library is distributed in the hope that it will be useful,
>> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> > +   Lesser General Public License for more details.
>> > +
>> > +   You should have received a copy of the GNU Lesser General Public
>> > +   License along with the GNU C Library; if not, see
>> > +   <http://www.gnu.org/licenses/>.  */
>> > +
>> > +#include <sysdep.h>
>> > +
>> > +#if !defined NOT_IN_libc \
>> > +    && (defined SHARED \
>> > +        || defined USE_AS_MEMMOVE \
>> > +   || !defined USE_MULTIARCH)
>> > +
>> > +#include "asm-syntax.h"
>> > +
>> > +#ifndef MEMCPY
>> > +# define MEMCPY    __memcpy_avx2_unaligned
>> > +# define MEMCPY_CHK        __memcpy_chk_avx2_unaligned
>> > +#endif
>> > +
>> > +#ifndef L
>> > +# define L(label)  .L##label
>> > +#endif
>> > +
>> > +#ifndef ALIGN
>> > +# define ALIGN(n)  .p2align n
>> > +#endif
>> > +
>> > +#ifndef cfi_startproc
>> > +# define cfi_startproc     .cfi_startproc
>> > +#endif
>> > +
>> > +#ifndef cfi_endproc
>> > +# define cfi_endproc       .cfi_endproc
>> > +#endif
>> > +
>> > +#ifndef ENTRY
>> > +# define ENTRY(name)       \
>> > +   .type name,  @function; \
>> > +   .globl name;    \
>> > +   ALIGN(4);       \
>> > +name:      \
>> > +   cfi_startproc
>> > +#endif
>> > +
>> > +#ifndef END
>> > +# define END(name) \
>> > +   cfi_endproc;    \
>> > +   .size name, .-name
>> > +#endif
>> > +
>> > +   .section .text.avx2,"ax",@progbits
>> > +#if !defined USE_AS_BCOPY
>> > +ENTRY (MEMCPY_CHK)
>> > +   cmpq    %rdx, %rcx
>> > +   jb      HIDDEN_JUMPTARGET (__chk_fail)
>> > +END (MEMCPY_CHK)
>> > +#endif
>> > +
>> > +ENTRY (MEMCPY)
>> > +   vzeroupper
>> > +   mov     %rdi, %rax
>> > +
>> > +#ifdef USE_AS_MEMPCPY
>> > +   add     %rdx, %rax
>> > +#endif
>> > +
>> > +   lea     (%rsi, %rdx), %r8
>> > +   lea     (%rdi, %rdx), %r9
>> > +   cmp     $256, %rdx
>> > +   ja      L(256bytesormore)
>> > +   cmp     $128, %edx
>> > +   jb      L(less_128bytes)
>> > +   vmovups (%rsi), %xmm0
>> > +   vmovups 0x10(%rsi), %xmm1
>> > +   vmovups 0x20(%rsi), %xmm2
>> > +   vmovups 0x30(%rsi), %xmm3
>> > +   vmovups 0x40(%rsi), %xmm4
>> > +   vmovups 0x50(%rsi), %xmm5
>> > +   vmovups 0x60(%rsi), %xmm6
>> > +   vmovups 0x70(%rsi), %xmm7
>> > +   vmovups -0x80(%r8), %xmm8
>> > +   vmovups -0x70(%r8), %xmm9
>> > +   vmovups -0x60(%r8), %xmm10
>> > +   vmovups -0x50(%r8), %xmm11
>> > +   vmovups -0x40(%r8), %xmm12
>> > +   vmovups -0x30(%r8), %xmm13
>> > +   vmovups -0x20(%r8), %xmm14
>> > +   vmovups -0x10(%r8), %xmm15
>> > +   vmovups %xmm0, (%rdi)
>> > +   vmovups %xmm1, 0x10(%rdi)
>> > +   vmovups %xmm2, 0x20(%rdi)
>> > +   vmovups %xmm3, 0x30(%rdi)
>> > +   vmovups %xmm4, 0x40(%rdi)
>> > +   vmovups %xmm5, 0x50(%rdi)
>> > +   vmovups %xmm6, 0x60(%rdi)
>> > +   vmovups %xmm7, 0x70(%rdi)
>> > +   vmovups %xmm8, -0x80(%r9)
>> > +   vmovups %xmm9, -0x70(%r9)
>> > +   vmovups %xmm10, -0x60(%r9)
>> > +   vmovups %xmm11, -0x50(%r9)
>> > +   vmovups %xmm12, -0x40(%r9)
>> > +   vmovups %xmm13, -0x30(%r9)
>> > +   vmovups %xmm14, -0x20(%r9)
>> > +   vmovups %xmm15, -0x10(%r9)
>> > +   ret
>> > +   ALIGN(4)
>> > +L(less_128bytes):
>> > +   cmp     $64, %edx
>> > +   jb      L(less_64bytes)
>> > +   vmovups (%rsi), %xmm0
>> > +   vmovups 0x10(%rsi), %xmm1
>> > +   vmovups 0x20(%rsi), %xmm2
>> > +   vmovups 0x30(%rsi), %xmm3
>> > +   vmovups -0x40(%r8), %xmm4
>> > +   vmovups -0x30(%r8), %xmm5
>> > +   vmovups -0x20(%r8), %xmm6
>> > +   vmovups -0x10(%r8), %xmm7
>> > +   vmovups %xmm0, (%rdi)
>> > +   vmovups %xmm1, 0x10(%rdi)
>> > +   vmovups %xmm2, 0x20(%rdi)
>> > +   vmovups %xmm3, 0x30(%rdi)
>> > +   vmovups %xmm4, -0x40(%r9)
>> > +   vmovups %xmm5, -0x30(%r9)
>> > +   vmovups %xmm6, -0x20(%r9)
>> > +   vmovups %xmm7, -0x10(%r9)
>> > +   ret
>> > +   ALIGN(4)
>> > +L(less_64bytes):
>> > +   cmp     $32, %edx
>> > +   jb      L(less_32bytes)
>> > +   vmovups (%rsi), %xmm0
>> > +   vmovups 0x10(%rsi), %xmm1
>> > +   vmovups -0x20(%r8), %xmm6
>> > +   vmovups -0x10(%r8), %xmm7
>> > +   vmovups %xmm0, (%rdi)
>> > +   vmovups %xmm1, 0x10(%rdi)
>> > +   vmovups %xmm6, -0x20(%r9)
>> > +   vmovups %xmm7, -0x10(%r9)
>> > +   retq
>> > +   ALIGN(4)
>> > +L(less_32bytes):
>> > +   cmp     $16, %edx
>> > +   jb      L(less_16bytes)
>> > +   vmovups (%rsi), %xmm0
>> > +   vmovups -0x10(%r8), %xmm7
>> > +   vmovups %xmm0, (%rdi)
>> > +   vmovups %xmm7, -0x10(%r9)
>> > +   retq
>> > +   ALIGN(4)
>> > +L(less_16bytes):
>> > +   cmp     $8, %edx
>> > +   jb      L(less_8bytes)
>> > +   movq (%rsi),    %rcx
>> > +   movq -0x08(%r8),        %r10
>> > +   movq %rcx, (%rdi)
>> > +   movq %r10, -0x08(%r9)
>> > +   retq
>> > +   ALIGN(4)
>> > +L(less_8bytes):
>> > +   cmp     $4, %edx
>> > +   jb      L(less_4bytes)
>> > +   mov (%rsi),     %ecx
>> > +   mov -0x04(%r8), %edx
>> > +   mov %ecx, (%rdi)
>> > +   mov %edx, -0x04(%r9)
>> > +   ret
>> > +   ALIGN(4)
>> > +L(less_4bytes):
>> > +   cmp     $2, %edx
>> > +   jb      L(less_2bytes)
>> > +   mov (%rsi),     %cx
>> > +   mov -0x02(%r8), %dx
>> > +   mov %cx, (%rdi)
>> > +   mov %dx, -0x02(%r9)
>> > +   ret
>> > +   ALIGN(4)
>> > +L(less_2bytes):
>> > +   cmp     $1, %rdx
>> > +   jb      L(less_0bytes)
>> > +   mov     (%rsi), %cl
>> > +   mov     %cl,    (%rdi)
>> > +L(less_0bytes):
>> > +   retq
>> > +
>> > +   ALIGN(4)
>> > +L(256bytesormore):
>> > +
>> > +#ifdef USE_AS_MEMMOVE
>> > +   cmp     %rsi, %rdi
>> > +   jae     L(copy_backward)
>> > +#endif
>> > +   cmp     $2048, %rdx
>> > +   jae     L(gobble_data_movsb)
>> > +
>> > +   vmovups -0x80(%r8), %xmm8
>> > +   vmovups -0x70(%r8), %xmm9
>> > +   vmovups -0x60(%r8), %xmm10
>> > +   vmovups -0x50(%r8), %xmm11
>> > +   vmovups -0x40(%r8), %xmm12
>> > +   vmovups -0x30(%r8), %xmm13
>> > +   vmovups -0x20(%r8), %xmm14
>> > +   vmovups -0x10(%r8), %xmm15
>> > +   vmovups (%rsi), %ymm4
>> > +   mov     %rdi, %r10
>> > +   and     $-32, %rdi
>> > +   add     $32, %rdi
>> > +   mov     %rdi, %r11
>> > +   sub     %r10, %r11
>> > +   sub     %r11, %rdx
>> > +   add     %r11, %rsi
>> > +   sub     $0x80, %rdx
>> > +L(goble_128_loop):
>> > +   vmovups (%rsi), %ymm0
>> > +   vmovups 0x20(%rsi), %ymm1
>> > +   vmovups 0x40(%rsi), %ymm2
>> > +   vmovups 0x60(%rsi), %ymm3
>> > +   lea     0x80(%rsi), %rsi
>> > +   vmovaps %ymm0, (%rdi)
>> > +   vmovaps %ymm1, 0x20(%rdi)
>> > +   vmovaps %ymm2, 0x40(%rdi)
>> > +   vmovaps %ymm3, 0x60(%rdi)
>> > +   lea     0x80(%rdi), %rdi
>> > +   sub     $0x80, %rdx
>> > +   jae     L(goble_128_loop)
>> > +   vmovups %ymm4, (%r10)
>> > +   vzeroupper
>> > +   vmovups %xmm8, -0x80(%r9)
>> > +   vmovups %xmm9, -0x70(%r9)
>> > +   vmovups %xmm10, -0x60(%r9)
>> > +   vmovups %xmm11, -0x50(%r9)
>> > +   vmovups %xmm12, -0x40(%r9)
>> > +   vmovups %xmm13, -0x30(%r9)
>> > +   vmovups %xmm14, -0x20(%r9)
>> > +   vmovups %xmm15, -0x10(%r9)
>> > +   ret
>> > +
>> > +L(gobble_data_movsb):
>> > +
>> > +#ifdef SHARED_CACHE_SIZE_HALF
>> > +   mov     $SHARED_CACHE_SIZE_HALF, %rcx
>> > +#else
>> > +   mov     __x86_64_shared_cache_size_half(%rip), %rcx
>> > +#endif
>> > +   shl     $3, %rcx
>> > +
>> > +#ifdef USE_AS_MEMMOVE
>> > +   mov     %rsi, %r10
>> > +   sub     %rdi, %r10
>> > +   cmp     %rdx, %r10
>> > +   jae     L(memmove_use_memcpy_fwd)
>> > +   cmp     %rcx, %r10
>> > +   jae     L(memmove_use_memcpy_fwd)
>> > +   jmp L(gobble_mem_fwd_llc_start)
>> > +L(memmove_use_memcpy_fwd):
>> > +#endif
>> > +
>> > +   cmp     %rcx, %rdx
>> > +   ja      L(gobble_big_data_fwd)
>> > +
>> > +#ifdef USE_AS_MEMMOVE
>> > +L(gobble_mem_fwd_llc_start):
>> > +#endif
>> > +   mov     %rdx, %rcx
>> > +   rep     movsb
>> > +   ret
>> > +
>> > +L(gobble_big_data_fwd):
>> > +   vmovups (%rsi), %ymm4
>> > +   vmovups -0x80(%r8), %xmm5
>> > +   vmovups -0x70(%r8), %xmm6
>> > +   vmovups -0x60(%r8), %xmm7
>> > +   vmovups -0x50(%r8), %xmm8
>> > +   vmovups -0x40(%r8), %xmm9
>> > +   vmovups -0x30(%r8), %xmm10
>> > +   vmovups -0x20(%r8), %xmm11
>> > +   vmovups -0x10(%r8), %xmm12
>> > +   mov     %rdi, %r8
>> > +   and     $-32, %rdi
>> > +   add     $32, %rdi
>> > +   mov     %rdi, %r10
>> > +   sub     %r8, %r10
>> > +   sub     %r10, %rdx
>> > +   add     %r10, %rsi
>> > +   sub     $0x80, %rdx
>> > +L(gobble_mem_fwd_loop):
>> > +   prefetcht0 0x1c0(%rsi)
>> > +   prefetcht0 0x280(%rsi)
>> > +   vmovups (%rsi), %xmm0
>> > +   vmovups 0x10(%rsi), %xmm1
>> > +   vmovups 0x20(%rsi), %xmm2
>> > +   vmovups 0x30(%rsi), %xmm3
>> > +   vmovntdq        %xmm0, (%rdi)
>> > +   vmovntdq        %xmm1, 0x10(%rdi)
>> > +   vmovntdq        %xmm2, 0x20(%rdi)
>> > +   vmovntdq        %xmm3, 0x30(%rdi)
>> > +   vmovups 0x40(%rsi), %xmm0
>> > +   vmovups 0x50(%rsi), %xmm1
>> > +   vmovups 0x60(%rsi), %xmm2
>> > +   vmovups 0x70(%rsi), %xmm3
>> > +   lea     0x80(%rsi), %rsi
>> > +   vmovntdq        %xmm0, 0x40(%rdi)
>> > +   vmovntdq        %xmm1, 0x50(%rdi)
>> > +   vmovntdq        %xmm2, 0x60(%rdi)
>> > +   vmovntdq        %xmm3, 0x70(%rdi)
>> > +   lea     0x80(%rdi), %rdi
>> > +   sub     $0x80, %rdx
>> > +   jae     L(gobble_mem_fwd_loop)
>> > +   sfence
>> > +   vmovups %ymm4, (%r8)
>> > +   vzeroupper
>> > +   vmovups %xmm5, -0x80(%r9)
>> > +   vmovups %xmm6, -0x70(%r9)
>> > +   vmovups %xmm7, -0x60(%r9)
>> > +   vmovups %xmm8, -0x50(%r9)
>> > +   vmovups %xmm9, -0x40(%r9)
>> > +   vmovups %xmm10, -0x30(%r9)
>> > +   vmovups %xmm11, -0x20(%r9)
>> > +   vmovups %xmm12, -0x10(%r9)
>> > +   ret
>> > +
>> > +   ALIGN (4)
>> > +L(copy_backward):
>> > +#ifdef SHARED_CACHE_SIZE_HALF
>> > +   mov     $SHARED_CACHE_SIZE_HALF, %rcx
>> > +#else
>> > +   mov     __x86_64_shared_cache_size_half(%rip), %rcx
>> > +#endif
>> > +   shl     $3, %rcx
>> > +   vmovups (%rsi), %xmm8
>> > +   vmovups 0x10(%rsi), %xmm9
>> > +   vmovups 0x20(%rsi), %xmm10
>> > +   vmovups 0x30(%rsi), %xmm11
>> > +   vmovups 0x40(%rsi), %xmm12
>> > +   vmovups 0x50(%rsi), %xmm13
>> > +   vmovups 0x60(%rsi), %xmm14
>> > +   vmovups 0x70(%rsi), %xmm15
>> > +   mov     %rdi, %r9
>> > +   add     %rdx, %rsi
>> > +   add     %rdx, %rdi
>> > +   vmovups -0x20(%rsi), %ymm4
>> > +   lea     -0x20(%rdi), %r10
>> > +   mov %rdi, %r11
>> > +   and     $0x1f, %r11
>> > +   xor     %r11, %rdi
>> > +   sub     %r11, %rsi
>> > +   sub     %r11, %rdx
>> > +#ifdef USE_AS_MEMMOVE
>> > +   mov     %rdi, %r11
>> > +   sub     %rsi, %r11
>> > +   cmp     %rdx, %r11
>> > +   jae     L(memmove_use_memcpy_bwd)
>> > +   cmp     %rcx, %r11
>> > +   jae     L(memmove_use_memcpy_bwd)
>> > +   jmp L(gobble_mem_bwd_llc_start)
>> > +#endif
>> > +L(memmove_use_memcpy_bwd):
>> > +   cmp     %rcx, %rdx
>> > +   ja      L(gobble_big_data_bwd)
>> > +L(gobble_mem_bwd_llc_start):
>> > +   sub     $0x80, %rdx
>> > +L(gobble_mem_bwd_llc):
>> > +   vmovups -0x20(%rsi), %ymm0
>> > +   vmovups -0x40(%rsi), %ymm1
>> > +   vmovups -0x60(%rsi), %ymm2
>> > +   vmovups -0x80(%rsi), %ymm3
>> > +   lea     -0x80(%rsi), %rsi
>> > +   vmovaps %ymm0, -0x20(%rdi)
>> > +   vmovaps %ymm1, -0x40(%rdi)
>> > +   vmovaps %ymm2, -0x60(%rdi)
>> > +   vmovaps %ymm3, -0x80(%rdi)
>> > +   lea     -0x80(%rdi), %rdi
>> > +   sub     $0x80, %rdx
>> > +   jae     L(gobble_mem_bwd_llc)
>> > +   vmovups %ymm4, (%r10)
>> > +   vzeroupper
>> > +   vmovups %xmm8, (%r9)
>> > +   vmovups %xmm9, 0x10(%r9)
>> > +   vmovups %xmm10, 0x20(%r9)
>> > +   vmovups %xmm11, 0x30(%r9)
>> > +   vmovups %xmm12, 0x40(%r9)
>> > +   vmovups %xmm13, 0x50(%r9)
>> > +   vmovups %xmm14, 0x60(%r9)
>> > +   vmovups %xmm15, 0x70(%r9)
>> > +   ret
>> > +
>> > +L(gobble_big_data_bwd):
>> > +   sub     $0x80, %rdx
>> > +L(gobble_mem_bwd_loop):
>> > +   prefetcht0 -0x1c0(%rsi)
>> > +   prefetcht0 -0x280(%rsi)
>> > +   vmovups -0x10(%rsi), %xmm0
>> > +   vmovups -0x20(%rsi), %xmm1
>> > +   vmovups -0x30(%rsi), %xmm2
>> > +   vmovups -0x40(%rsi), %xmm3
>> > +   vmovntdq        %xmm0, -0x10(%rdi)
>> > +   vmovntdq        %xmm1, -0x20(%rdi)
>> > +   vmovntdq        %xmm2, -0x30(%rdi)
>> > +   vmovntdq        %xmm3, -0x40(%rdi)
>> > +   vmovups -0x50(%rsi), %xmm0
>> > +   vmovups -0x60(%rsi), %xmm1
>> > +   vmovups -0x70(%rsi), %xmm2
>> > +   vmovups -0x80(%rsi), %xmm3
>> > +   lea     -0x80(%rsi), %rsi
>> > +   vmovntdq        %xmm0, -0x50(%rdi)
>> > +   vmovntdq        %xmm1, -0x60(%rdi)
>> > +   vmovntdq        %xmm2, -0x70(%rdi)
>> > +   vmovntdq        %xmm3, -0x80(%rdi)
>> > +   lea     -0x80(%rdi), %rdi
>> > +   sub     $0x80, %rdx
>> > +   jae     L(gobble_mem_bwd_loop)
>> > +   sfence
>> > +   vmovups %ymm4, (%r10)
>> > +   vzeroupper
>> > +   vmovups %xmm8, (%r9)
>> > +   vmovups %xmm9, 0x10(%r9)
>> > +   vmovups %xmm10, 0x20(%r9)
>> > +   vmovups %xmm11, 0x30(%r9)
>> > +   vmovups %xmm12, 0x40(%r9)
>> > +   vmovups %xmm13, 0x50(%r9)
>> > +   vmovups %xmm14, 0x60(%r9)
>> > +   vmovups %xmm15, 0x70(%r9)
>> > +   ret
>> > +END (MEMCPY)
>> > +#endif
>> > diff --git a/sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S
>> > b/sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S
>> > new file mode 100644
>> > index 0000000..ddb2090
>> > --- /dev/null
>> > +++ b/sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S
>> > @@ -0,0 +1,4 @@
>> > +#define USE_AS_MEMMOVE
>> > +#define MEMCPY             __memmove_avx2_unaligned
>> > +#define MEMCPY_CHK __memmove_chk_avx2_unaligned
>> > +#include "memcpy-avx2-unaligned.S"
>> > diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S
>> > b/sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S
>> > new file mode 100644
>> > index 0000000..a2f4af9
>> > --- /dev/null
>> > +++ b/sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S
>> > @@ -0,0 +1,4 @@
>> > +#define USE_AS_MEMPCPY
>> > +#define MEMCPY             __mempcpy_avx2_unaligned
>> > +#define MEMCPY_CHK __mempcpy_chk_avx2_unaligned
>> > +#include "memcpy-avx2-unaligned.S"
>> > --
>> > 1.8.1.4
>> >
>> >
>
>
>
> --
>
> Well fix that in the next (upgrade, update, patch release, service pack).

