This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Hi Liubov Dmitrieva,

We used memcpy-ssse3-back.S as a base because it resolves the memory false
dependence described in https://lkml.org/lkml/2010/11/1/464, and it almost
reaches the limit of previous processors for memcpy on L1: 16 bytes per cycle.

I append comparison results from the gcc test-memcpy file for
memcpy_sse2_unaligned, memcpy_ssse3_back, memcpy_ssse3, and our patch
memcpy_avx2. The results show that our patch improves performance by up to 2X
and reaches 32 bytes per cycle on L1, which is also the processor limit for
memcpy on Haswell.

Nehalem tolerates unaligned accesses, but Sandy Bridge, Ivy Bridge, and
Haswell are sensitive to unaligned cases.

Thanks
Ling

2013/6/5, Dmitrieva Liubov <liubov.dmitrieva@gmail.com>:
> Hello,
>
> It looks like you used memcpy-ssse3.S as a base to make the avx2 version,
> but there is a recent sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
> version which was the best on Core i7.
>
> Why don't you use this?
>
> --
> Liubov Dmitrieva
> Intel Corporation
>
> On Wed, Jun 5, 2013 at 1:23 PM,  <ling.ma.program@gmail.com> wrote:
>> From: Ling <ling.ml@alibaba-inc.com>
>>
>> This patch includes an optimized 64-bit memcpy/memmove for Core i7 with
>> AVX2 instructions.
>> It improves memcpy by up to 2X on Core i7, and memmove by up to 2X as
>> well.
>>
>> Any comments are appreciated.
>>
>> Thanks
>> Ling
>> ---
>>  sysdeps/x86_64/multiarch/Makefile          |    4 +-
>>  sysdeps/x86_64/multiarch/ifunc-defines.sym |    2 +
>>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   11 +
>>  sysdeps/x86_64/multiarch/memcpy-avx2.S     | 4732 ++++++++++++++++++++++++++++
>>  sysdeps/x86_64/multiarch/memcpy.S          |    5 +-
>>  sysdeps/x86_64/multiarch/memcpy_chk.S      |    3 +
>>  sysdeps/x86_64/multiarch/memmove-avx2.S    |    4 +
>>  sysdeps/x86_64/multiarch/memmove.c         |    7 +-
>>  sysdeps/x86_64/multiarch/memmove_chk.c     |    6 +-
>>  sysdeps/x86_64/multiarch/mempcpy-avx2.S    |    4 +
>>  sysdeps/x86_64/multiarch/mempcpy.S         |    3 +
>>  sysdeps/x86_64/multiarch/mempcpy_chk.S     |    3 +
>>  12 files changed, 4778 insertions(+), 6 deletions(-)
>>  create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx2.S
>>  create mode 100644 sysdeps/x86_64/multiarch/memmove-avx2.S
>>  create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx2.S
>>
>> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
>> index dd6c27d..f836231 100644
>> --- a/sysdeps/x86_64/multiarch/Makefile
>> +++ b/sysdeps/x86_64/multiarch/Makefile
>> @@ -9,7 +9,9 @@ ifeq ($(subdir),string)
>>  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
>>  		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
>>  		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>> -		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
>> +		   memmove-ssse3-back \
>> +		   memcpy-avx2 mempcpy-avx2 memmove-avx2 \
>> +		   strcasestr-nonascii strcasecmp_l-ssse3 \
>>  		   strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
>>  		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>>  		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
>> diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
>> index eb1538a..448b8c4 100644
>> --- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
>> +++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
>> @@ -17,4 +17,6 @@ FEATURE_OFFSET		offsetof (struct cpu_features, feature)
>>  FEATURE_SIZE		sizeof (unsigned int)
>>
>>  COMMON_CPUID_INDEX_1
>> +COMMON_CPUID_INDEX_7
>>  FEATURE_INDEX_1
>> +FEATURE_INDEX_7
>> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> index 332a60d..7f3b573 100644
>> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> @@ -50,6 +50,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>  			      __memmove_chk_ssse3_back)
>>  	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
>>  			      __memmove_chk_ssse3)
>> +	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX2,
>> +			      __memmove_chk_avx2)
>>  	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
>>  			      __memmove_chk_sse2))
>>
>> @@ -59,6 +61,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>  			      __memmove_ssse3_back)
>>  	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
>>  			      __memmove_ssse3)
>> +	      IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX2,
>> +			      __memmove_avx2)
>>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
>>
>>    /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
>> @@ -235,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>  			      __memcpy_chk_ssse3_back)
>>  	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
>>  			      __memcpy_chk_ssse3)
>> +	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX2,
>> +			      __memcpy_chk_avx2)
>>  	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
>>  			      __memcpy_chk_sse2))
>>
>> @@ -243,6 +249,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>  	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
>>  			      __memcpy_ssse3_back)
>>  	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
>> +	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX2, __memcpy_avx2)
>>  	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>>
>>    /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
>> @@ -251,6 +258,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>  			      __mempcpy_chk_ssse3_back)
>>  	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
>>  			      __mempcpy_chk_ssse3)
>> +	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX2,
>> +			      __mempcpy_chk_avx2)
>>  	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
>>  			      __mempcpy_chk_sse2))
>>
>> @@ -260,6 +269,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>  			      __mempcpy_ssse3_back)
>>  	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
>>  			      __mempcpy_ssse3)
>> +	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX2,
>> +			      __mempcpy_avx2)
>>  	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
>>
>>    /* Support sysdeps/x86_64/multiarch/strlen.S.  */
>> diff --git a/sysdeps/x86_64/multiarch/memcpy-avx2.S b/sysdeps/x86_64/multiarch/memcpy-avx2.S
>> new file mode 100644
>> index 0000000..b2245fe
>> --- /dev/null
>> +++ b/sysdeps/x86_64/multiarch/memcpy-avx2.S
>> @@ -0,0 +1,4732 @@
>> +/* memcpy with AVX2
>> +   Copyright (C) 2010 Free Software Foundation, Inc.
>> +   Contributed by Intel Corporation.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <http://www.gnu.org/licenses/>.
>> +   */
>> +
>> +#include <sysdep.h>
>> +
>> +#if !defined NOT_IN_libc \
>> +    && (defined SHARED \
>> +	|| defined USE_AS_MEMMOVE \
>> +	|| !defined USE_MULTIARCH)
>> +
>> +#include "asm-syntax.h"
>> +
>> +#ifndef MEMCPY
>> +# define MEMCPY		__memcpy_avx2
>> +# define MEMCPY_CHK	__memcpy_chk_avx2
>> +#endif
>> +
>> +#ifndef L
>> +# define L(label)	.L##label
>> +#endif
>> +
>> +#ifndef ALIGN
>> +# define ALIGN(n)	.p2align n
>> +#endif
>> +
>> +#ifndef cfi_startproc
>> +# define cfi_startproc	.cfi_startproc
>> +#endif
>> +
>> +#ifndef cfi_endproc
>> +# define cfi_endproc	.cfi_endproc
>> +#endif
>> +
>> +#ifndef ENTRY
>> +# define ENTRY(name)	\
>> +	.type name, @function;	\
>> +	.globl name;	\
>> +	ALIGN(4);	\
>> +name:	\
>> +	cfi_startproc
>> +#endif
>> +
>> +#ifndef END
>> +# define END(name)	\
>> +	cfi_endproc;	\
>> +	.size name, .-name
>> +#endif
>> +
>> +#define JMPTBL(I, B)	(I - B)
>> +
>> +/* Branch to an entry in a jump table.  TABLE is a jump table with
>> +   relative offsets.  INDEX is a register containing the index into the
>> +   jump table.  SCALE is the scale of INDEX.  */
>> +
>> +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
>> +	lea	TABLE(%rip), %r11;	\
>> +	movslq	(%r11, INDEX, SCALE), INDEX;	\
>> +	add	%r11, INDEX;	\
>> +	jmp	*INDEX;	\
>> +	ud2
>> +
>> +/* This routine handles the tail of each 256-byte loop: it checks
>> +   whether a tail exists and, if so, guarantees that the remaining
>> +   data is fewer than 128 bytes.  */
>> +
>> +#define HANDLE_256_LOOP_TAIL(LOOP, REPEAT, START, OPT_1, OPT_2, TABLE)	\
>> +	OPT_1	%r9, %rsi;	\
>> +	OPT_1	%r9, %rdi;	\
>> +	sub	%r9, %rdx;	\
>> +	jae	LOOP;	\
>> +	add	%r9, %rdx;	\
>> +	cmp	$0x80, %rdx;	\
>> +	jae	REPEAT;	\
>> +	OPT_1	%rdx, %rsi;	\
>> +	OPT_1	%rdx, %rdi;	\
>> +	vmovups	%ymm9, (%r8);	\
>> +	BRANCH_TO_JMPTBL_ENTRY (TABLE, %rdx, 4);	\
>> +REPEAT:	\
>> +	OPT_2	$0x80, %rsi;	\
>> +	OPT_2	$0x80, %rdi;	\
>> +	sub	$0x80, %rdx;	\
>> +	jmp	START;	\
>> +	ud2
>> +
>> +	.section .text.avx2,"ax",@progbits
>> +#if !defined USE_AS_BCOPY
>> +ENTRY (MEMCPY_CHK)
>> +	cmpq	%rdx, %rcx
>> +	jb	HIDDEN_JUMPTARGET (__chk_fail)
>> +END (MEMCPY_CHK)
>> +#endif
>> +
>> +ENTRY (MEMCPY)
>> +	mov	%rdi, %rax
>> +#ifdef USE_AS_MEMPCPY
>> +	add	%rdx, %rax
>> +#endif
>> +
>> +#ifdef USE_AS_MEMMOVE
>> +	cmp	%rsi, %rdi
>> +	jb	L(copy_forward)
>> +	je	L(bwd_write_0bytes)
>> +	cmp	$160, %rdx
>> +	jae	L(copy_backward)
>> +	BRANCH_TO_JMPTBL_ENTRY (L(table_159_bytes_bwd), %rdx, 4)
>> +L(copy_forward):
>> +#endif
>> +	cmp	$160, %rdx
>> +	jae	L(160bytesormore)
>> +
>> +#ifndef USE_AS_MEMMOVE
>> +	cmp	%dil, %sil
>> +	jbe	L(bk_write)
>> +#endif
>> +	add	%rdx, %rsi
>> +	add	%rdx, %rdi
>> +	BRANCH_TO_JMPTBL_ENTRY (L(table_159_bytes_fwd), %rdx, 4)
>> +#ifndef USE_AS_MEMMOVE
>> +L(bk_write):
>> +
>> +	BRANCH_TO_JMPTBL_ENTRY (L(table_159_bytes_bwd), %rdx, 4)
>> +#endif
>> +
>> +	ALIGN (5)
>> +L(160bytesormore):
>> +
>> +#ifndef USE_AS_MEMMOVE
>> +	cmp	%dil, %sil
>> +	jl	L(copy_backward)
>> +#endif
>> +
>> +#ifdef DATA_CACHE_SIZE_HALF
>> +	mov	$DATA_CACHE_SIZE_HALF, %r9
>> +#else
>> +	mov	__x86_64_data_cache_size_half(%rip), %r9
>> +#endif
>> +	shl	$2, %r9
>> +	cmp	%r9, %rdx
>> +	jae	L(gobble_mem_fwd)
>> +	vmovups	(%rsi), %ymm9
>> +	mov	%rdi, %r8
>> +	and	$-32, %rdi
>> +	add	$32, %rdi
>> +	mov	%rdi, %r9
>> +	sub	%r8, %r9
>> +	sub	%r9, %rdx
>> +	add	%r9, %rsi
>> +	mov	$0x100, %r9	/* each loop handles 256 bytes */
>> +	mov	%rsi, %rcx
>> +	and	$0x3f, %rcx
>> +	test	$0xf, %rcx
>> +	jz	L(shl_16_aligned_fwd)
>> +	xor	%r10, %r10
>> +	sub	%r9, %rdx
>> +	jae	L(above_0x100_fwd)
>> +	cmp	$0x20, %rcx
>> +	mov	$0x0e, %r11	/* the offset for 33 ~ 63
forward copy >> cases */ >> + mov $0x40, %r10 /* the offset for 01 ~ 31 forward copy >> cases */ >> + cmova %r11, %r10 >> + sub $0x80, %rsi >> + sub $0x80, %rdi >> + add $0x80, %rdx >> +L(above_0x100_fwd): >> + lea L(shl_table_fwd)(%rip), %r11 >> + movslq (%r11, %rcx, 4), %rcx >> + add %r11, %rcx >> + add %r10, %rcx >> + jmp *%rcx >> + ud2 >> + >> + ALIGN (5) >> +L(copy_backward): >> + >> +#ifdef DATA_CACHE_SIZE_HALF >> + mov $DATA_CACHE_SIZE_HALF, %r9 >> +#else >> + mov __x86_64_data_cache_size_half(%rip), %r9 >> +#endif >> + shl $2, %r9 >> + cmp %r9, %rdx >> + jae L(gobble_mem_bwd) >> + add %rdx, %rsi >> + add %rdx, %rdi >> + vmovups -0x20(%rsi), %ymm9 >> + lea -0x20(%rdi), %r8 >> + mov %rdi, %r9 >> + and $0x1f, %r9 >> + xor %r9, %rdi >> + sub %r9, %rsi >> + sub %r9, %rdx >> + mov $0x100, %r9 /* each loop handle 256 bytes */ >> + mov %rsi, %rcx >> + and $0x3f, %rcx >> + xor %r10, %r10 >> + test $0xf, %rcx >> + jz L(shl_16_aligned_bwd) >> + sub %r9, %rdx >> + jae L(above_0x100_bwd) >> + cmp $0x20, %rcx >> + mov $0x0e, %r11 /* the offset for 33 ~ 63 backward copy >> cases */ >> + mov $0x42, %r10 /* the offset for 01 ~ 31 backward copy >> cases */ >> + cmova %r11, %r10 >> + add $0x80, %rsi >> + add $0x80, %rdi >> + add $0x80, %rdx >> +L(above_0x100_bwd): >> + lea L(shl_table_bwd)(%rip), %r11 >> + movslq (%r11, %rcx, 4), %rcx >> + add %r11, %rcx >> + add %r10, %rcx >> + jmp *%rcx >> + ud2 >> + >> + ALIGN (4) >> +L(shl_16_aligned_fwd): >> + test $0x1f, %rcx >> + jz L(shl_32) >> + test $0x20, %rcx >> + jz L(shl_16) >> +L(shl_48): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + sub %r9, %rdx >> + jae L(shl_16_sub_32) >> + sub $0x80, %rsi >> + sub $0x80, %rdi >> + add $0x80, %rdx >> + jmp L(shl_16_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_16): >> + sub %r9, %rdx >> + jae L(shl_16_loop) >> + sub $0x80, %rsi >> + sub $0x80, %rdi >> + add $0x80, %rdx >> + jmp L(shl_16_loop_less_256) >> + ud2 >> + ALIGN (5) >> +L(shl_16_loop): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_16_sub_32): >> + vmovaps 0x30(%rsi), %ymm1 >> + vperm2f128 $3, 0x10(%rsi), %ymm1, %ymm2 >> + vmovaps %ymm2, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm3 >> + vmovaps %ymm3, 0x40(%rdi) >> + vmovaps 0x70(%rsi), %ymm4 >> + vperm2f128 $3, 0x50(%rsi), %ymm4, %ymm5 >> + vmovaps %ymm5, 0x60(%rdi) >> +L(shl_16_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_16_loop_less_256_sub_32): >> + vmovaps 0xb0(%rsi), %ymm1 >> + vperm2f128 $3, 0x90(%rsi), %ymm1, %ymm2 >> + vmovaps %ymm2, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm3 >> + vmovaps %ymm3, 0xc0(%rdi) >> + vmovaps 0xf0(%rsi), %ymm4 >> + vperm2f128 $3, 0xd0(%rsi), %ymm4, %ymm5 >> + vmovaps %ymm5, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_16_loop), >> L(shl_16_loop_tail_over_128), >> + L(shl_16_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (5) >> +L(shl_0):L(shl_32): >> + sub %r9, %rdx >> + jae L(shl_32_loop) >> + sub $0x80, %rsi >> + sub $0x80, %rdi >> + add $0x80, %rdx >> + jmp L(shl_32_loop_less_256) >> + ALIGN (5) >> +L(shl_32_loop): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> + vmovaps 0x20(%rsi), %ymm1 >> + vmovaps %ymm1, 0x20(%rdi) >> + vmovaps 0x40(%rsi), %ymm2 >> + vmovaps %ymm2, 0x40(%rdi) >> + vmovaps 0x60(%rsi), %ymm3 >> + vmovaps %ymm3, 0x60(%rdi) >> +L(shl_32_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> + vmovaps 0xa0(%rsi), %ymm1 >> + vmovaps %ymm1, 0xa0(%rdi) >> + vmovaps 0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, 0xc0(%rdi) 
>> + vmovaps 0xe0(%rsi), %ymm3 >> + vmovaps %ymm3, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_32_loop), >> L(shl_32_loop_tail_over_128), >> + L(shl_32_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_49): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_17_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_17_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_17): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_17_sub_32): >> + vmovaps 0x2f(%rsi), %ymm1 >> + vperm2f128 $3, 0xf(%rsi), %ymm1, %ymm2 >> + vpalignr $1, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x6f(%rsi), %ymm5 >> + vperm2f128 $3, 0x4f(%rsi), %ymm5, %ymm6 >> + vpalignr $1, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_17_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_17_loop_less_256_sub_32): >> + vmovaps 0xaf(%rsi), %ymm1 >> + vperm2f128 $3, 0x8f(%rsi), %ymm1, %ymm2 >> + vpalignr $1, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xef(%rsi), %ymm5 >> + vperm2f128 $3, 0xcf(%rsi), %ymm5, %ymm6 >> + vpalignr $1, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_17), L(shl_17_loop_tail_over_128), >> + L(shl_17_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_33): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_1_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_1_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_1): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_1_sub_32): >> + vmovaps 0x1f(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x3f(%rsi), %ymm1, %ymm2 >> + vpalignr $1, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x5f(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x7f(%rsi), %ymm5, %ymm6 >> + vpalignr $1, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_1_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_1_loop_less_256_sub_32): >> + vmovaps 0x9f(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xbf(%rsi), %ymm1, %ymm2 >> + vpalignr $1, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xdf(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xff(%rsi), %ymm5, %ymm6 >> + vpalignr $1, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_1), L(shl_1_loop_tail_over_128), >> + L(shl_1_loop_less_256), add, sub, L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_50): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_18_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_18_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_18): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_18_sub_32): >> + vmovaps 0x2e(%rsi), %ymm1 >> + vperm2f128 $3, 0xe(%rsi), %ymm1, %ymm2 >> + vpalignr $2, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x6e(%rsi), %ymm5 >> + vperm2f128 $3, 0x4e(%rsi), %ymm5, %ymm6 >> + vpalignr $2, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_18_loop_less_256): >> + vmovups 
0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_18_loop_less_256_sub_32): >> + vmovaps 0xae(%rsi), %ymm1 >> + vperm2f128 $3, 0x8e(%rsi), %ymm1, %ymm2 >> + vpalignr $2, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xee(%rsi), %ymm5 >> + vperm2f128 $3, 0xce(%rsi), %ymm5, %ymm6 >> + vpalignr $2, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_18), L(shl_18_loop_tail_over_128), >> + L(shl_18_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_34): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_2_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_2_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_2): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_2_sub_32): >> + vmovaps 0x1e(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x3e(%rsi), %ymm1, %ymm2 >> + vpalignr $2, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x5e(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x7e(%rsi), %ymm5, %ymm6 >> + vpalignr $2, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_2_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_2_loop_less_256_sub_32): >> + vmovaps 0x9e(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xbe(%rsi), %ymm1, %ymm2 >> + vpalignr $2, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xde(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xfe(%rsi), %ymm5, %ymm6 >> + vpalignr $2, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_2), L(shl_2_loop_tail_over_128), >> + L(shl_2_loop_less_256), add, sub, L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_51): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_19_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_19_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_19): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_19_sub_32): >> + vmovaps 0x2d(%rsi), %ymm1 >> + vperm2f128 $3, 0xd(%rsi), %ymm1, %ymm2 >> + vpalignr $3, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x6d(%rsi), %ymm5 >> + vperm2f128 $3, 0x4d(%rsi), %ymm5, %ymm6 >> + vpalignr $3, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_19_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_19_loop_less_256_sub_32): >> + vmovaps 0xad(%rsi), %ymm1 >> + vperm2f128 $3, 0x8d(%rsi), %ymm1, %ymm2 >> + vpalignr $3, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xed(%rsi), %ymm5 >> + vperm2f128 $3, 0xcd(%rsi), %ymm5, %ymm6 >> + vpalignr $3, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_19), L(shl_19_loop_256_again), >> + L(shl_19_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_35): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_3_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_3_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_3): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_3_sub_32): >> + vmovaps 0x1d(%rsi), %ymm1 >> + 
vperm2f128 $0x21, 0x3d(%rsi), %ymm1, %ymm2 >> + vpalignr $3, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x5d(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x7d(%rsi), %ymm5, %ymm6 >> + vpalignr $3, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_3_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_3_loop_less_256_sub_32): >> + vmovaps 0x9d(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xbd(%rsi), %ymm1, %ymm2 >> + vpalignr $3, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xdd(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xfd(%rsi), %ymm5, %ymm6 >> + vpalignr $3, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_3), L(shl_3_loop_tail_over_128), >> + L(shl_3_loop_less_256), add, sub, L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_52): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_20_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_20_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_20): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_20_sub_32): >> + vmovaps 0x2c(%rsi), %ymm1 >> + vperm2f128 $3, 0xc(%rsi), %ymm1, %ymm2 >> + vpalignr $4, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x6c(%rsi), %ymm5 >> + vperm2f128 $3, 0x4c(%rsi), %ymm5, %ymm6 >> + vpalignr $4, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_20_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_20_loop_less_256_sub_32): >> + vmovaps 0xac(%rsi), %ymm1 >> + vperm2f128 $3, 0x8c(%rsi), %ymm1, %ymm2 >> + vpalignr $4, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xec(%rsi), %ymm5 >> + vperm2f128 $3, 0xcc(%rsi), %ymm5, %ymm6 >> + vpalignr $4, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_20), L(shl_20_less_256_again), >> + L(shl_20_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_36): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_4_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_4_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_4): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_4_sub_32): >> + vmovaps 0x1c(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x3c(%rsi), %ymm1, %ymm2 >> + vpalignr $4, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x5c(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x7c(%rsi), %ymm5, %ymm6 >> + vpalignr $4, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_4_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_4_loop_less_256_sub_32): >> + vmovaps 0x9c(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xbc(%rsi), %ymm1, %ymm2 >> + vpalignr $4, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xdc(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xfc(%rsi), %ymm5, %ymm6 >> + vpalignr $4, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_4), L(shl_4_loop_tail_over_128), >> + L(shl_4_loop_less_256), add, sub, L(table_159_bytes_fwd)) >> + >> + 
ALIGN (4) >> +L(shl_53): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_21_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_21_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_21): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_21_sub_32): >> + vmovaps 0x2b(%rsi), %ymm1 >> + vperm2f128 $3, 0xb(%rsi), %ymm1, %ymm2 >> + vpalignr $5, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x6b(%rsi), %ymm5 >> + vperm2f128 $3, 0x4b(%rsi), %ymm5, %ymm6 >> + vpalignr $5, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_21_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_21_loop_less_256_sub_32): >> + vmovaps 0xab(%rsi), %ymm1 >> + vperm2f128 $3, 0x8b(%rsi), %ymm1, %ymm2 >> + vpalignr $5, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xeb(%rsi), %ymm5 >> + vperm2f128 $3, 0xcb(%rsi), %ymm5, %ymm6 >> + vpalignr $5, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_21), L(shl_21_loop_tail_over_128), >> + L(shl_21_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_37): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_5_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_5_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_5): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_5_sub_32): >> + vmovaps 0x1b(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x3b(%rsi), %ymm1, %ymm2 >> + vpalignr $5, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x5b(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x7b(%rsi), %ymm5, %ymm6 >> + vpalignr $5, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_5_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_5_loop_less_256_sub_32): >> + vmovaps 0x9b(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xbb(%rsi), %ymm1, %ymm2 >> + vpalignr $5, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xdb(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xfb(%rsi), %ymm5, %ymm6 >> + vpalignr $5, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_5), L(shl_5_loop_tail_over_128), >> + L(shl_5_loop_less_256), add, sub, L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_54): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_22_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_22_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_22): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_22_sub_32): >> + vmovaps 0x2a(%rsi), %ymm1 >> + vperm2f128 $3, 0xa(%rsi), %ymm1, %ymm2 >> + vpalignr $6, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x6a(%rsi), %ymm5 >> + vperm2f128 $3, 0x4a(%rsi), %ymm5, %ymm6 >> + vpalignr $6, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_22_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_22_loop_less_256_sub_32): >> + vmovaps 0xaa(%rsi), %ymm1 >> + vperm2f128 $3, 0x8a(%rsi), %ymm1, %ymm2 >> + vpalignr $6, %ymm2, %ymm1, %ymm3 >> + vmovaps 
%ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xea(%rsi), %ymm5 >> + vperm2f128 $3, 0xca(%rsi), %ymm5, %ymm6 >> + vpalignr $6, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_22), L(shl_22_loop_tail_over_128), >> + L(shl_22_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_38): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_6_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_6_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_6): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_6_sub_32): >> + vmovaps 0x1a(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x3a(%rsi), %ymm1, %ymm2 >> + vpalignr $6, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x5a(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x7a(%rsi), %ymm5, %ymm6 >> + vpalignr $6, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_6_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_6_loop_less_256_sub_32): >> + vmovaps 0x9a(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xba(%rsi), %ymm1, %ymm2 >> + vpalignr $6, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xda(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xfa(%rsi), %ymm5, %ymm6 >> + vpalignr $6, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_6), >> L(shl_6_bit_16_zero_32_loop_again), >> + L(shl_6_loop_less_256), add, sub, L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_55): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_23_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_23_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_23): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_23_sub_32): >> + vmovaps 0x29(%rsi), %ymm1 >> + vperm2f128 $3, 0x9(%rsi), %ymm1, %ymm2 >> + vpalignr $7, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x69(%rsi), %ymm5 >> + vperm2f128 $3, 0x49(%rsi), %ymm5, %ymm6 >> + vpalignr $7, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_23_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_23_loop_less_256_sub_32): >> + vmovaps 0xa9(%rsi), %ymm1 >> + vperm2f128 $3, 0x89(%rsi), %ymm1, %ymm2 >> + vpalignr $7, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xe9(%rsi), %ymm5 >> + vperm2f128 $3, 0xc9(%rsi), %ymm5, %ymm6 >> + vpalignr $7, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_23), L(shl_23_loop_tail_over_128), >> + L(shl_23_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_39): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_7_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_7_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_7): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_7_sub_32): >> + vmovaps 0x19(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x39(%rsi), %ymm1, %ymm2 >> + vpalignr $7, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x59(%rsi), 
%ymm5 >> + vperm2f128 $0x21, 0x79(%rsi), %ymm5, %ymm6 >> + vpalignr $7, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_7_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_7_loop_less_256_sub_32): >> + vmovaps 0x99(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xb9(%rsi), %ymm1, %ymm2 >> + vpalignr $7, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xd9(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xf9(%rsi), %ymm5, %ymm6 >> + vpalignr $7, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_7), >> L(shl_7_bit_16_zero_32_loop_again), >> + L(shl_7_loop_less_256), add, sub, L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_56): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_24_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_24_loop_less_256_sub_32) >> + ud2 >> + ALIGN (4) >> +L(shl_24): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_24_sub_32): >> + vmovaps 0x28(%rsi), %ymm1 >> + vperm2f128 $3, 0x8(%rsi), %ymm1, %ymm2 >> + vpalignr $8, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x68(%rsi), %ymm5 >> + vperm2f128 $3, 0x48(%rsi), %ymm5, %ymm6 >> + vpalignr $8, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_24_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_24_loop_less_256_sub_32): >> + vmovaps 0xa8(%rsi), %ymm1 >> + vperm2f128 $3, 0x88(%rsi), %ymm1, %ymm2 >> + vpalignr $8, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xe8(%rsi), %ymm5 >> + vperm2f128 $3, 0xc8(%rsi), %ymm5, %ymm6 >> + vpalignr $8, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_24), L(shl_24_loop_tail_over_128), >> + L(shl_24_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_40): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_8_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_8_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_8): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_8_sub_32): >> + vmovaps 0x18(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x38(%rsi), %ymm1, %ymm2 >> + vpalignr $8, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x58(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x78(%rsi), %ymm5, %ymm6 >> + vpalignr $8, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_8_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_8_loop_less_256_sub_32): >> + vmovaps 0x98(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xb8(%rsi), %ymm1, %ymm2 >> + vpalignr $8, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xd8(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xf8(%rsi), %ymm5, %ymm6 >> + vpalignr $8, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_8), >> L(shl_8_bit_16_zero_32_loop_again), >> + L(shl_8_loop_less_256), add, sub, L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_57): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_25_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx 
>> + jmp L(shl_25_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_25): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_25_sub_32): >> + vmovaps 0x27(%rsi), %ymm1 >> + vperm2f128 $3, 0x7(%rsi), %ymm1, %ymm2 >> + vpalignr $9, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x67(%rsi), %ymm5 >> + vperm2f128 $3, 0x47(%rsi), %ymm5, %ymm6 >> + vpalignr $9, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_25_loop_less_256): >> + vmovups 0x80(%rsi), %ymm4 >> + vmovaps %ymm4, 0x80(%rdi) >> +L(shl_25_loop_less_256_sub_32): >> + vmovaps 0xa7(%rsi), %ymm1 >> + vperm2f128 $3, 0x87(%rsi), %ymm1, %ymm2 >> + vpalignr $9, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xe7(%rsi), %ymm5 >> + vperm2f128 $3, 0xc7(%rsi), %ymm5, %ymm6 >> + vpalignr $9, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_25), L(shl_25_loop_tail_over_128), >> + L(shl_25_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_41): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_9_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_9_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_9): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_9_sub_32): >> + vmovaps 0x17(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x37(%rsi), %ymm1, %ymm2 >> + vpalignr $9, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x57(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x77(%rsi), %ymm5, %ymm6 >> + vpalignr $9, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_9_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_9_loop_less_256_sub_32): >> + vmovaps 0x97(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xb7(%rsi), %ymm1, %ymm2 >> + vpalignr $9, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xd7(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xf7(%rsi), %ymm5, %ymm6 >> + vpalignr $9, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_9), >> L(shl_9_bit_16_zero_32_loop_again), >> + L(shl_9_loop_less_256), add, sub, L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_58): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_26_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_26_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_26): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_26_sub_32): >> + vmovaps 0x26(%rsi), %ymm1 >> + vperm2f128 $3, 0x6(%rsi), %ymm1, %ymm2 >> + vpalignr $10, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x66(%rsi), %ymm5 >> + vperm2f128 $3, 0x46(%rsi), %ymm5, %ymm6 >> + vpalignr $10, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_26_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_26_loop_less_256_sub_32): >> + vmovaps 0xa6(%rsi), %ymm1 >> + vperm2f128 $3, 0x86(%rsi), %ymm1, %ymm2 >> + vpalignr $10, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xe6(%rsi), %ymm5 >> + vperm2f128 $3, 0xc6(%rsi), %ymm5, %ymm6 >> + 
vpalignr $10, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_26), L(shl_26_loop_tail_over_128), >> + L(shl_26_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_42): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_10_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_10_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_10): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_10_sub_32): >> + vmovaps 0x16(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x36(%rsi), %ymm1, %ymm2 >> + vpalignr $10, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x56(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x76(%rsi), %ymm5, %ymm6 >> + vpalignr $10, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_10_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_10_loop_less_256_sub_32): >> + vmovaps 0x96(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xb6(%rsi), %ymm1, %ymm2 >> + vpalignr $10, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xd6(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xf6(%rsi), %ymm5, %ymm6 >> + vpalignr $10, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_10), >> L(shl_10_bit_16_zero_32_loop_again), >> + L(shl_10_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_59): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_27_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_27_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_27): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_27_sub_32): >> + vmovaps 0x25(%rsi), %ymm1 >> + vperm2f128 $3, 0x5(%rsi), %ymm1, %ymm2 >> + vpalignr $11, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x65(%rsi), %ymm5 >> + vperm2f128 $3, 0x45(%rsi), %ymm5, %ymm6 >> + vpalignr $11, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_27_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_27_loop_less_256_sub_32): >> + vmovaps 0xa5(%rsi), %ymm1 >> + vperm2f128 $3, 0x85(%rsi), %ymm1, %ymm2 >> + vpalignr $11, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xe5(%rsi), %ymm5 >> + vperm2f128 $3, 0xc5(%rsi), %ymm5, %ymm6 >> + vpalignr $11, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_27), L(shl_27_loop_tail_over_128), >> + L(shl_27_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_43): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_11_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_11_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_11): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_11_sub_32): >> + vmovaps 0x15(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x35(%rsi), %ymm1, %ymm2 >> + vpalignr $11, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x55(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x75(%rsi), %ymm5, %ymm6 >> + vpalignr $11, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> 
+L(shl_11_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_11_loop_less_256_sub_32): >> + vmovaps 0x95(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xb5(%rsi), %ymm1, %ymm2 >> + vpalignr $11, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xd5(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xf5(%rsi), %ymm5, %ymm6 >> + vpalignr $11, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_11), >> L(shl_11_bit_16_zero_32_loop_again), >> + L(shl_11_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_60): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_28_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_28_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_28): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_28_sub_32): >> + vmovaps 0x24(%rsi), %ymm1 >> + vperm2f128 $3, 0x4(%rsi), %ymm1, %ymm2 >> + vpalignr $12, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x64(%rsi), %ymm5 >> + vperm2f128 $3, 0x44(%rsi), %ymm5, %ymm6 >> + vpalignr $12, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_28_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_28_loop_less_256_sub_32): >> + vmovaps 0xa4(%rsi), %ymm1 >> + vperm2f128 $3, 0x84(%rsi), %ymm1, %ymm2 >> + vpalignr $12, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xe4(%rsi), %ymm5 >> + vperm2f128 $3, 0xc4(%rsi), %ymm5, %ymm6 >> + vpalignr $12, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_28), L(shl_28_loop_tail_over_128), >> + L(shl_28_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_44): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_12_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_12_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_12): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_12_sub_32): >> + vmovaps 0x14(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x34(%rsi), %ymm1, %ymm2 >> + vpalignr $12, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x54(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x74(%rsi), %ymm5, %ymm6 >> + vpalignr $12, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_12_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_12_loop_less_256_sub_32): >> + vmovaps 0x94(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xb4(%rsi), %ymm1, %ymm2 >> + vpalignr $12, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xd4(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xf4(%rsi), %ymm5, %ymm6 >> + vpalignr $12, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_12), >> L(shl_12_bit_16_zero_32_loop_again), >> + L(shl_12_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_61): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_29_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_29_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_29): >> + vmovups 
(%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_29_sub_32): >> + vmovaps 0x23(%rsi), %ymm1 >> + vperm2f128 $3, 0x3(%rsi), %ymm1, %ymm2 >> + vpalignr $13, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x63(%rsi), %ymm5 >> + vperm2f128 $3, 0x43(%rsi), %ymm5, %ymm6 >> + vpalignr $13, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_29_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_29_loop_less_256_sub_32): >> + vmovaps 0xa3(%rsi), %ymm1 >> + vperm2f128 $3, 0x83(%rsi), %ymm1, %ymm2 >> + vpalignr $13, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xe3(%rsi), %ymm5 >> + vperm2f128 $3, 0xc3(%rsi), %ymm5, %ymm6 >> + vpalignr $13, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_29), L(shl_29_loop_tail_over_128), >> + L(shl_29_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_45): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_13_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_13_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_13): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_13_sub_32): >> + vmovaps 0x13(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x33(%rsi), %ymm1, %ymm2 >> + vpalignr $13, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x53(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x73(%rsi), %ymm5, %ymm6 >> + vpalignr $13, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_13_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_13_loop_less_256_sub_32): >> + vmovaps 0x93(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xb3(%rsi), %ymm1, %ymm2 >> + vpalignr $13, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xd3(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xf3(%rsi), %ymm5, %ymm6 >> + vpalignr $13, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_13), >> L(shl_13_bit_16_zero_32_loop_again), >> + L(shl_13_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_62): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_30_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_30_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_30): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_30_sub_32): >> + vmovaps 0x22(%rsi), %ymm1 >> + vperm2f128 $3, 0x2(%rsi), %ymm1, %ymm2 >> + vpalignr $14, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x62(%rsi), %ymm5 >> + vperm2f128 $3, 0x42(%rsi), %ymm5, %ymm6 >> + vpalignr $14, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_30_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_30_loop_less_256_sub_32): >> + vmovaps 0xa2(%rsi), %ymm1 >> + vperm2f128 $3, 0x82(%rsi), %ymm1, %ymm2 >> + vpalignr $14, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xe2(%rsi), %ymm5 >> + vperm2f128 $3, 0xc2(%rsi), %ymm5, %ymm6 >> + vpalignr $14, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + 
HANDLE_256_LOOP_TAIL(L(shl_30), L(shl_30_loop_tail_over_128), >> + L(shl_30_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_46): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_14_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_14_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_14): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_14_sub_32): >> + vmovaps 0x12(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x32(%rsi), %ymm1, %ymm2 >> + vpalignr $14, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x52(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x72(%rsi), %ymm5, %ymm6 >> + vpalignr $14, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_14_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_14_loop_less_256_sub_32): >> + vmovaps 0x92(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xb2(%rsi), %ymm1, %ymm2 >> + vpalignr $14, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xd2(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xf2(%rsi), %ymm5, %ymm6 >> + vpalignr $14, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_14), >> L(shl_14_bit_16_zero_32_loop_again), >> + L(shl_14_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_63): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_31_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_31_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_31): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_31_sub_32): >> + vmovaps 0x21(%rsi), %ymm1 >> + vperm2f128 $3, 0x1(%rsi), %ymm1, %ymm2 >> + vpalignr $15, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x61(%rsi), %ymm5 >> + vperm2f128 $3, 0x41(%rsi), %ymm5, %ymm6 >> + vpalignr $15, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_31_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 0x80(%rdi) >> +L(shl_31_loop_less_256_sub_32): >> + vmovaps 0xa1(%rsi), %ymm1 >> + vperm2f128 $3, 0x81(%rsi), %ymm1, %ymm2 >> + vpalignr $15, %ymm2, %ymm1, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xe1(%rsi), %ymm5 >> + vperm2f128 $3, 0xc1(%rsi), %ymm5, %ymm6 >> + vpalignr $15, %ymm6, %ymm5, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_31), L(shl_31_loop_tail_over_128), >> + L(shl_31_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_47): >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_15_sub_32) >> + sub $0x20, %rdi >> + sub $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_15_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_15): >> + vmovups (%rsi), %ymm0 >> + vmovaps %ymm0, (%rdi) >> +L(shl_15_sub_32): >> + vmovaps 0x11(%rsi), %ymm1 >> + vperm2f128 $0x21, 0x31(%rsi), %ymm1, %ymm2 >> + vpalignr $15, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0x20(%rdi) >> + vmovups 0x40(%rsi), %ymm4 >> + vmovaps %ymm4, 0x40(%rdi) >> + vmovaps 0x51(%rsi), %ymm5 >> + vperm2f128 $0x21, 0x71(%rsi), %ymm5, %ymm6 >> + vpalignr $15, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0x60(%rdi) >> +L(shl_15_loop_less_256): >> + vmovups 0x80(%rsi), %ymm0 >> + vmovaps %ymm0, 
0x80(%rdi) >> +L(shl_15_loop_less_256_sub_32): >> + vmovaps 0x91(%rsi), %ymm1 >> + vperm2f128 $0x21, 0xb1(%rsi), %ymm1, %ymm2 >> + vpalignr $15, %ymm1, %ymm2, %ymm3 >> + vmovaps %ymm3, 0xa0(%rdi) >> + vmovups 0xc0(%rsi), %ymm4 >> + vmovaps %ymm4, 0xc0(%rdi) >> + vmovaps 0xd1(%rsi), %ymm5 >> + vperm2f128 $0x21, 0xf1(%rsi), %ymm5, %ymm6 >> + vpalignr $15, %ymm5, %ymm6, %ymm7 >> + vmovaps %ymm7, 0xe0(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_15), >> L(shl_15_bit_16_zero_32_loop_again), >> + L(shl_15_loop_less_256), add, sub, >> L(table_159_bytes_fwd)) >> + >> + ALIGN (4) >> +L(shl_16_aligned_bwd): >> + test $0x1f, %rcx >> + jz L(shl_32_bwd) >> + test $0x20, %rcx >> + jz L(shl_16_bwd) >> +L(shl_48_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + sub %r9, %rdx >> + jae L(shl_16_bwd_sub_32) >> + add $0x80, %rsi >> + add $0x80, %rdi >> + add $0x80, %rdx >> + jmp L(shl_16_bwd_loop_less_256_sub_32) >> + ud2 >> + ALIGN (5) >> +L(shl_16_bwd): >> + sub %r9, %rdx >> + jae L(shl_16_bwd_loop) >> + add $0x80, %rsi >> + add $0x80, %rdi >> + add $0x80, %rdx >> + jmp L(shl_16_bwd_loop_less_256) >> + ALIGN (5) >> +L(shl_16_bwd_loop): >> + vmovaps -0x30(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x10(%rsi), %ymm0, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_16_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x70(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x50(%rsi), %ymm0, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_16_bwd_loop_less_256): >> + vmovaps -0xb0(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x90(%rsi), %ymm0, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_16_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xf0(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xd0(%rsi), %ymm0, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_16_bwd_loop), >> L(shl_16_bwd_loop_tail_over_128), >> + L(shl_16_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (5) >> +L(shl_0_bwd):L(shl_32_bwd): >> + sub %r9, %rdx >> + jae L(shl_32_bwd_loop) >> + add $0x80, %rsi >> + add $0x80, %rdi >> + add $0x80, %rdx >> + jmp L(shl_32_bwd_loop_less_256) >> + ALIGN (5) >> +L(shl_32_bwd_loop): >> + vmovaps -0x20(%rsi), %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> + vmovaps -0x40(%rsi), %ymm1 >> + vmovaps %ymm1, -0x40(%rdi) >> + vmovaps -0x60(%rsi), %ymm2 >> + vmovaps %ymm2, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm3 >> + vmovaps %ymm3, -0x80(%rdi) >> +L(shl_32_bwd_loop_less_256): >> + vmovups -0xa0(%rsi), %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> + vmovaps -0xc0(%rsi), %ymm1 >> + vmovaps %ymm1, -0xc0(%rdi) >> + vmovaps -0xe0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xe0(%rdi) >> + vmovaps -0x100(%rsi), %ymm3 >> + vmovaps %ymm3, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_32_bwd_loop), >> L(shl_32_bwd_loop_tail_over_128), >> + L(shl_32_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_33_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_1_bwd_sub_32) >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_1_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_1_bwd): >> + vmovaps -0x21(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x01(%rsi), %ymm0, %ymm1 >> + vpalignr $1, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_1_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, 
>> +
>> +	ALIGN (4)
>> +L(shl_33_bwd):
>> +	add	$0x20, %rdi
>> +	add	$0x20, %rsi
>> +	add	$0x20, %rdx
>> +	jmp	L(shl_1_bwd_sub_32)
>> +	add	$0x20, %rdi
>> +	add	$0x20, %rsi
>> +	add	$0x20, %rdx
>> +	jmp	L(shl_1_bwd_loop_less_256_sub_32)
>> +	ALIGN (5)
>> +L(shl_1_bwd):
>> +	vmovaps	-0x21(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0x01(%rsi), %ymm0, %ymm1
>> +	vpalignr	$1, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0x20(%rdi)
>> +L(shl_1_bwd_sub_32):
>> +	vmovups	-0x40(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x40(%rdi)
>> +	vmovaps	-0x61(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0x41(%rsi), %ymm0, %ymm1
>> +	vpalignr	$1, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0x60(%rdi)
>> +	vmovups	-0x80(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x80(%rdi)
>> +L(shl_1_bwd_loop_less_256):
>> +	vmovaps	-0xa1(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0x81(%rsi), %ymm0, %ymm1
>> +	vpalignr	$1, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0xa0(%rdi)
>> +L(shl_1_bwd_loop_less_256_sub_32):
>> +	vmovups	-0xc0(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0xc0(%rdi)
>> +	vmovaps	-0xe1(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0xc1(%rsi), %ymm0, %ymm1
>> +	vpalignr	$1, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0xe0(%rdi)
>> +	vmovups	-0x100(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x100(%rdi)
>> +	HANDLE_256_LOOP_TAIL(L(shl_1_bwd), L(shl_1_bwd_loop_tail_over_128),
>> +			     L(shl_1_bwd_loop_less_256), sub, add, L(table_159_bytes_bwd))
>> +
>> +	ALIGN (4)
>> +L(shl_49_bwd):
>> +	add	$0x20, %rdi
>> +	add	$0x20, %rsi
>> +	add	$0x20, %rdx
>> +	jmp	L(shl_17_bwd_32)
>> +L(shl_49_bwd_less_256):
>> +	add	$0x20, %rdi
>> +	add	$0x20, %rsi
>> +	add	$0x20, %rdx
>> +	jmp	L(shl_17_bwd_loop_less_256_less)
>> +	ALIGN (5)
>> +L(shl_17_bwd):
>> +	vmovaps	-0x11(%rsi), %ymm0
>> +	vperm2f128	$0x03, -0x31(%rsi), %ymm0, %ymm1
>> +	vpalignr	$1, %ymm1, %ymm0, %ymm0
>> +	vmovaps	%ymm0, -0x20(%rdi)
>> +L(shl_17_bwd_32):
>> +	vmovups	-0x40(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x40(%rdi)
>> +	vmovaps	-0x51(%rsi), %ymm0
>> +	vperm2f128	$0x03, -0x71(%rsi), %ymm0, %ymm1
>> +	vpalignr	$1, %ymm1, %ymm0, %ymm0
>> +	vmovaps	%ymm0, -0x60(%rdi)
>> +	vmovups	-0x80(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x80(%rdi)
>> +
>> +L(shl_17_bwd_loop_less_256):
>> +	vmovaps	-0x91(%rsi), %ymm0
>> +	vperm2f128	$0x03, -0xb1(%rsi), %ymm0, %ymm1
>> +	vpalignr	$1, %ymm1, %ymm0, %ymm0
>> +	vmovaps	%ymm0, -0xa0(%rdi)
>> +L(shl_17_bwd_loop_less_256_less):
>> +	vmovups	-0xc0(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0xc0(%rdi)
>> +	vmovaps	-0xd1(%rsi), %ymm0
>> +	vperm2f128	$0x03, -0xf1(%rsi), %ymm0, %ymm1
>> +	vpalignr	$1, %ymm1, %ymm0, %ymm0
>> +	vmovaps	%ymm0, -0xe0(%rdi)
>> +	vmovups	-0x100(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x100(%rdi)
>> +	HANDLE_256_LOOP_TAIL(L(shl_17_bwd), L(shl_17_bwd_loop_tail_over_128),
>> +			     L(shl_17_bwd_loop_less_256), sub, add, L(table_159_bytes_bwd))
>> +
>> +	ALIGN (4)
>> +L(shl_34_bwd):
>> +	add	$0x20, %rdi
>> +	add	$0x20, %rsi
>> +	add	$0x20, %rdx
>> +	jmp	L(shl_2_bwd_sub_32)
>> +	add	$0x20, %rdi
>> +	add	$0x20, %rsi
>> +	add	$0x20, %rdx
>> +	jmp	L(shl_2_bwd_loop_less_256_sub_32)
>> +	ALIGN (5)
>> +L(shl_2_bwd):
>> +	vmovaps	-0x22(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0x02(%rsi), %ymm0, %ymm1
>> +	vpalignr	$2, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0x20(%rdi)
>> +L(shl_2_bwd_sub_32):
>> +	vmovups	-0x40(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x40(%rdi)
>> +	vmovaps	-0x62(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0x42(%rsi), %ymm0, %ymm1
>> +	vpalignr	$2, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0x60(%rdi)
>> +	vmovups	-0x80(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x80(%rdi)
>> +L(shl_2_bwd_loop_less_256):
>> +	vmovaps	-0xa2(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0x82(%rsi), %ymm0, %ymm1
>> +	vpalignr	$2, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0xa0(%rdi)
>> +L(shl_2_bwd_loop_less_256_sub_32):
>> +	vmovups	-0xc0(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0xc0(%rdi)
>> +	vmovaps	-0xe2(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0xc2(%rsi), %ymm0, %ymm1
>> +	vpalignr	$2, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0xe0(%rdi)
>> +	vmovups	-0x100(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x100(%rdi)
>> +	HANDLE_256_LOOP_TAIL(L(shl_2_bwd),
>>
L(shl_2_bwd_loop_tail_over_128), >> + L(shl_2_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_50_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_18_bwd_32) >> +L(shl_50_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_18_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_18_bwd): >> + vmovaps -0x12(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x32(%rsi), %ymm0, %ymm1 >> + vpalignr $2, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_18_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x52(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x72(%rsi), %ymm0, %ymm1 >> + vpalignr $2, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_18_bwd_loop_less_256): >> + vmovaps -0x92(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xb2(%rsi), %ymm0, %ymm1 >> + vpalignr $2, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_18_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xd2(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xf2(%rsi), %ymm0, %ymm1 >> + vpalignr $2, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_18_bwd), >> L(shl_18_bwd_loop_tail_over_128), >> + L(shl_18_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_35_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_3_bwd_sub_32) >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_3_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_3_bwd): >> + vmovaps -0x23(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x03(%rsi), %ymm0, %ymm1 >> + vpalignr $3, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_3_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x63(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x43(%rsi), %ymm0, %ymm1 >> + vpalignr $3, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_3_bwd_loop_less_256): >> + vmovaps -0xa3(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x83(%rsi), %ymm0, %ymm1 >> + vpalignr $3, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_3_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xe3(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xc3(%rsi), %ymm0, %ymm1 >> + vpalignr $3, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_3_bwd), >> L(shl_3_bwd_loop_tail_over_128), >> + L(shl_3_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_51_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_19_bwd_32) >> +L(shl_51_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_19_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_19_bwd): >> + vmovaps -0x13(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x33(%rsi), %ymm0, %ymm1 >> + vpalignr $3, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_19_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x53(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x73(%rsi), %ymm0, %ymm1 >> + vpalignr $3, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, 
-0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_19_bwd_loop_less_256): >> + vmovaps -0x93(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xb3(%rsi), %ymm0, %ymm1 >> + vpalignr $3, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_19_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xd3(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xf3(%rsi), %ymm0, %ymm1 >> + vpalignr $3, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_19_bwd), >> L(shl_19_bwd_loop_tail_over_128), >> + L(shl_19_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_36_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_4_bwd_sub_32) >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_4_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_4_bwd): >> + vmovaps -0x24(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x04(%rsi), %ymm0, %ymm1 >> + vpalignr $4, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_4_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x64(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x44(%rsi), %ymm0, %ymm1 >> + vpalignr $4, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_4_bwd_loop_less_256): >> + vmovaps -0xa4(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x84(%rsi), %ymm0, %ymm1 >> + vpalignr $4, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_4_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xe4(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xc4(%rsi), %ymm0, %ymm1 >> + vpalignr $4, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_4_bwd), >> L(shl_4_bwd_loop_tail_over_128), >> + L(shl_4_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_52_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_20_bwd_32) >> +L(shl_52_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_20_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_20_bwd): >> + vmovaps -0x14(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x34(%rsi), %ymm0, %ymm1 >> + vpalignr $4, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_20_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x54(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x74(%rsi), %ymm0, %ymm1 >> + vpalignr $4, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_20_bwd_loop_less_256): >> + vmovaps -0x94(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xb4(%rsi), %ymm0, %ymm1 >> + vpalignr $4, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_20_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xd4(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xf4(%rsi), %ymm0, %ymm1 >> + vpalignr $4, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_20_bwd), >> L(shl_20_bwd_loop_tail_over_128), >> + L(shl_20_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_37_bwd): >> + 
add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_5_bwd_sub_32) >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_5_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_5_bwd): >> + vmovaps -0x25(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x05(%rsi), %ymm0, %ymm1 >> + vpalignr $5, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_5_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x65(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x45(%rsi), %ymm0, %ymm1 >> + vpalignr $5, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_5_bwd_loop_less_256): >> + vmovaps -0xa5(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x85(%rsi), %ymm0, %ymm1 >> + vpalignr $5, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_5_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xe5(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xc5(%rsi), %ymm0, %ymm1 >> + vpalignr $5, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_5_bwd), >> L(shl_5_bwd_loop_tail_over_128), >> + L(shl_5_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_53_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_21_bwd_32) >> +L(shl_53_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_21_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_21_bwd): >> + vmovaps -0x15(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x35(%rsi), %ymm0, %ymm1 >> + vpalignr $5, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_21_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x55(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x75(%rsi), %ymm0, %ymm1 >> + vpalignr $5, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_21_bwd_loop_less_256): >> + vmovaps -0x95(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xb5(%rsi), %ymm0, %ymm1 >> + vpalignr $5, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_21_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xd5(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xf5(%rsi), %ymm0, %ymm1 >> + vpalignr $5, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_21_bwd), >> L(shl_21_bwd_loop_tail_over_128), >> + L(shl_21_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_38_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_6_bwd_sub_32) >> +L(shl_38_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_6_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_6_bwd): >> + vmovaps -0x26(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x06(%rsi), %ymm0, %ymm1 >> + vpalignr $6, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_6_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x66(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x46(%rsi), %ymm0, %ymm1 >> + vpalignr $6, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_6_bwd_loop_less_256): >> + vmovaps -0xa6(%rsi), %ymm0 >> 
+ vperm2f128 $0x21, -0x86(%rsi), %ymm0, %ymm1 >> + vpalignr $6, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_6_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xe6(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xc6(%rsi), %ymm0, %ymm1 >> + vpalignr $6, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_6_bwd), >> L(shl_6_bwd_loop_tail_over_128), >> + L(shl_6_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_54_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_22_bwd_32) >> +L(shl_54_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_22_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_22_bwd): >> + vmovaps -0x16(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x36(%rsi), %ymm0, %ymm1 >> + vpalignr $6, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_22_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x56(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x76(%rsi), %ymm0, %ymm1 >> + vpalignr $6, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_22_bwd_loop_less_256): >> + vmovaps -0x96(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xb6(%rsi), %ymm0, %ymm1 >> + vpalignr $6, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_22_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xd6(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xf6(%rsi), %ymm0, %ymm1 >> + vpalignr $6, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_22_bwd), >> L(shl_22_bwd_loop_tail_over_128), >> + L(shl_22_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_39_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_7_bwd_sub_32) >> +L(shl_39_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_7_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_7_bwd): >> + vmovaps -0x27(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x07(%rsi), %ymm0, %ymm1 >> + vpalignr $7, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_7_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x67(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x47(%rsi), %ymm0, %ymm1 >> + vpalignr $7, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_7_bwd_loop_less_256): >> + vmovaps -0xa7(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x87(%rsi), %ymm0, %ymm1 >> + vpalignr $7, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_7_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xe7(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xc7(%rsi), %ymm0, %ymm1 >> + vpalignr $7, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_7_bwd), >> L(shl_7_bwd_loop_tail_over_128), >> + L(shl_7_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_55_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_23_bwd_32) >> +L(shl_55_bwd_less_256): >> + add 
$0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_23_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_23_bwd): >> + vmovaps -0x17(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x37(%rsi), %ymm0, %ymm1 >> + vpalignr $7, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_23_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x57(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x77(%rsi), %ymm0, %ymm1 >> + vpalignr $7, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_23_bwd_loop_less_256): >> + vmovaps -0x97(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xb7(%rsi), %ymm0, %ymm1 >> + vpalignr $7, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_23_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xd7(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xf7(%rsi), %ymm0, %ymm1 >> + vpalignr $7, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_23_bwd), >> L(shl_23_bwd_loop_tail_over_128), >> + L(shl_23_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_40_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_8_bwd_sub_32) >> +L(shl_40_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_8_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_8_bwd): >> + vmovaps -0x28(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x08(%rsi), %ymm0, %ymm1 >> + vpalignr $8, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_8_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x68(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x48(%rsi), %ymm0, %ymm1 >> + vpalignr $8, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_8_bwd_loop_less_256): >> + vmovaps -0xa8(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x88(%rsi), %ymm0, %ymm1 >> + vpalignr $8, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_8_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xe8(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xc8(%rsi), %ymm0, %ymm1 >> + vpalignr $8, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_8_bwd), >> L(shl_8_bwd_loop_tail_over_128), >> + L(shl_8_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_56_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_24_bwd_32) >> +L(shl_56_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_24_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_24_bwd): >> + vmovaps -0x18(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x38(%rsi), %ymm0, %ymm1 >> + vpalignr $8, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_24_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x58(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x78(%rsi), %ymm0, %ymm1 >> + vpalignr $8, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_24_bwd_loop_less_256): >> + vmovaps -0x98(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xb8(%rsi), %ymm0, %ymm1 >> + vpalignr $8, %ymm1, %ymm0, %ymm0 >> + 
vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_24_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xd8(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xf8(%rsi), %ymm0, %ymm1 >> + vpalignr $8, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_24_bwd), >> L(shl_24_bwd_loop_tail_over_128), >> + L(shl_24_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_41_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_9_bwd_sub_32) >> +L(shl_41_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_9_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_9_bwd): >> + vmovaps -0x29(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x09(%rsi), %ymm0, %ymm1 >> + vpalignr $9, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_9_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x69(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x49(%rsi), %ymm0, %ymm1 >> + vpalignr $9, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_9_bwd_loop_less_256): >> + vmovaps -0xa9(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x89(%rsi), %ymm0, %ymm1 >> + vpalignr $9, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_9_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xe9(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xc9(%rsi), %ymm0, %ymm1 >> + vpalignr $9, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_9_bwd), >> L(shl_9_bwd_loop_tail_over_128), >> + L(shl_9_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_57_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_25_bwd_32) >> +L(shl_57_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_25_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_25_bwd): >> + vmovaps -0x19(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x39(%rsi), %ymm0, %ymm1 >> + vpalignr $9, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_25_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x59(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x79(%rsi), %ymm0, %ymm1 >> + vpalignr $9, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_25_bwd_loop_less_256): >> + vmovaps -0x99(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xb9(%rsi), %ymm0, %ymm1 >> + vpalignr $9, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_25_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xd9(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xf9(%rsi), %ymm0, %ymm1 >> + vpalignr $9, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_25_bwd), >> L(shl_25_bwd_loop_tail_over_128), >> + L(shl_25_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_42_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_10_bwd_sub_32) >> +L(shl_42_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp 
L(shl_10_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_10_bwd): >> + vmovaps -0x2a(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x0a(%rsi), %ymm0, %ymm1 >> + vpalignr $10, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_10_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x6a(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x4a(%rsi), %ymm0, %ymm1 >> + vpalignr $10, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_10_bwd_loop_less_256): >> + vmovaps -0xaa(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x8a(%rsi), %ymm0, %ymm1 >> + vpalignr $10, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_10_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xea(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xca(%rsi), %ymm0, %ymm1 >> + vpalignr $10, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_10_bwd), >> L(shl_10_bwd_loop_tail_over_128), >> + L(shl_10_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_58_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_26_bwd_32) >> +L(shl_58_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_26_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_26_bwd): >> + vmovaps -0x1a(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x3a(%rsi), %ymm0, %ymm1 >> + vpalignr $10, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_26_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x5a(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x7a(%rsi), %ymm0, %ymm1 >> + vpalignr $10, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_26_bwd_loop_less_256): >> + vmovaps -0x9a(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xba(%rsi), %ymm0, %ymm1 >> + vpalignr $10, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_26_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xda(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xfa(%rsi), %ymm0, %ymm1 >> + vpalignr $10, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_26_bwd), >> L(shl_26_bwd_loop_tail_over_128), >> + L(shl_26_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_43_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_11_bwd_sub_32) >> +L(shl_43_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_11_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_11_bwd): >> + vmovaps -0x2b(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x0b(%rsi), %ymm0, %ymm1 >> + vpalignr $11, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_11_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x6b(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x4b(%rsi), %ymm0, %ymm1 >> + vpalignr $11, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_11_bwd_loop_less_256): >> + vmovaps -0xab(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x8b(%rsi), %ymm0, %ymm1 >> + vpalignr $11, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> 
+L(shl_11_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xeb(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xcb(%rsi), %ymm0, %ymm1 >> + vpalignr $11, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_11_bwd), >> L(shl_11_bwd_loop_tail_over_128), >> + L(shl_11_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_59_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_27_bwd_32) >> +L(shl_59_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_27_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_27_bwd): >> + vmovaps -0x1b(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x3b(%rsi), %ymm0, %ymm1 >> + vpalignr $11, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_27_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x5b(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x7b(%rsi), %ymm0, %ymm1 >> + vpalignr $11, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_27_bwd_loop_less_256): >> + vmovaps -0x9b(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xbb(%rsi), %ymm0, %ymm1 >> + vpalignr $11, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_27_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xdb(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xfb(%rsi), %ymm0, %ymm1 >> + vpalignr $11, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_27_bwd), >> L(shl_27_bwd_loop_tail_over_128), >> + L(shl_27_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_44_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_12_bwd_sub_32) >> +L(shl_44_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_12_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_12_bwd): >> + vmovaps -0x2c(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x0c(%rsi), %ymm0, %ymm1 >> + vpalignr $12, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_12_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x6c(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x4c(%rsi), %ymm0, %ymm1 >> + vpalignr $12, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_12_bwd_loop_less_256): >> + vmovaps -0xac(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x8c(%rsi), %ymm0, %ymm1 >> + vpalignr $12, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_12_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xec(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xcc(%rsi), %ymm0, %ymm1 >> + vpalignr $12, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_12_bwd), >> L(shl_12_bwd_loop_tail_over_128), >> + L(shl_12_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_60_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_28_bwd_32) >> +L(shl_60_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_28_bwd_loop_less_256_less) >> 
+ ALIGN (5) >> +L(shl_28_bwd): >> + vmovaps -0x1c(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x3c(%rsi), %ymm0, %ymm1 >> + vpalignr $12, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_28_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x5c(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x7c(%rsi), %ymm0, %ymm1 >> + vpalignr $12, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_28_bwd_loop_less_256): >> + vmovaps -0x9c(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xbc(%rsi), %ymm0, %ymm1 >> + vpalignr $12, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_28_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xdc(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xfc(%rsi), %ymm0, %ymm1 >> + vpalignr $12, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_28_bwd), >> L(shl_28_bwd_loop_tail_over_128), >> + L(shl_28_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_45_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_13_bwd_sub_32) >> +L(shl_45_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_13_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_13_bwd): >> + vmovaps -0x2d(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x0d(%rsi), %ymm0, %ymm1 >> + vpalignr $13, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_13_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x6d(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x4d(%rsi), %ymm0, %ymm1 >> + vpalignr $13, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_13_bwd_loop_less_256): >> + vmovaps -0xad(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x8d(%rsi), %ymm0, %ymm1 >> + vpalignr $13, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_13_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xed(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xcd(%rsi), %ymm0, %ymm1 >> + vpalignr $13, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_13_bwd), >> L(shl_13_bwd_loop_tail_over_128), >> + L(shl_13_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_61_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_29_bwd_32) >> +L(shl_61_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_29_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_29_bwd): >> + vmovaps -0x1d(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x3d(%rsi), %ymm0, %ymm1 >> + vpalignr $13, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_29_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x5d(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x7d(%rsi), %ymm0, %ymm1 >> + vpalignr $13, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_29_bwd_loop_less_256): >> + vmovaps -0x9d(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xbd(%rsi), %ymm0, %ymm1 >> + vpalignr $13, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_29_bwd_loop_less_256_less): >> + vmovups 
-0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xdd(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xfd(%rsi), %ymm0, %ymm1 >> + vpalignr $13, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_29_bwd), >> L(shl_29_bwd_loop_tail_over_128), >> + L(shl_29_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_46_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_14_bwd_sub_32) >> +L(shl_46_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_14_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_14_bwd): >> + vmovaps -0x2e(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x0e(%rsi), %ymm0, %ymm1 >> + vpalignr $14, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x20(%rdi) >> +L(shl_14_bwd_sub_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x6e(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x4e(%rsi), %ymm0, %ymm1 >> + vpalignr $14, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> +L(shl_14_bwd_loop_less_256): >> + vmovaps -0xae(%rsi), %ymm0 >> + vperm2f128 $0x21, -0x8e(%rsi), %ymm0, %ymm1 >> + vpalignr $14, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xa0(%rdi) >> +L(shl_14_bwd_loop_less_256_sub_32): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xee(%rsi), %ymm0 >> + vperm2f128 $0x21, -0xce(%rsi), %ymm0, %ymm1 >> + vpalignr $14, %ymm0, %ymm1, %ymm1 >> + vmovaps %ymm1, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_14_bwd), >> L(shl_14_bwd_loop_tail_over_128), >> + L(shl_14_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_62_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_30_bwd_32) >> +L(shl_62_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_30_bwd_loop_less_256_less) >> + ALIGN (5) >> +L(shl_30_bwd): >> + vmovaps -0x1e(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x3e(%rsi), %ymm0, %ymm1 >> + vpalignr $14, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x20(%rdi) >> +L(shl_30_bwd_32): >> + vmovups -0x40(%rsi), %ymm2 >> + vmovaps %ymm2, -0x40(%rdi) >> + vmovaps -0x5e(%rsi), %ymm0 >> + vperm2f128 $0x03, -0x7e(%rsi), %ymm0, %ymm1 >> + vpalignr $14, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0x60(%rdi) >> + vmovups -0x80(%rsi), %ymm2 >> + vmovaps %ymm2, -0x80(%rdi) >> + >> +L(shl_30_bwd_loop_less_256): >> + vmovaps -0x9e(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xbe(%rsi), %ymm0, %ymm1 >> + vpalignr $14, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xa0(%rdi) >> +L(shl_30_bwd_loop_less_256_less): >> + vmovups -0xc0(%rsi), %ymm2 >> + vmovaps %ymm2, -0xc0(%rdi) >> + vmovaps -0xde(%rsi), %ymm0 >> + vperm2f128 $0x03, -0xfe(%rsi), %ymm0, %ymm1 >> + vpalignr $14, %ymm1, %ymm0, %ymm0 >> + vmovaps %ymm0, -0xe0(%rdi) >> + vmovups -0x100(%rsi), %ymm2 >> + vmovaps %ymm2, -0x100(%rdi) >> + HANDLE_256_LOOP_TAIL(L(shl_30_bwd), >> L(shl_30_bwd_loop_tail_over_128), >> + L(shl_30_bwd_loop_less_256), sub, add, >> L(table_159_bytes_bwd)) >> + >> + ALIGN (4) >> +L(shl_47_bwd): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_15_bwd_sub_32) >> +L(shl_47_bwd_less_256): >> + add $0x20, %rdi >> + add $0x20, %rsi >> + add $0x20, %rdx >> + jmp L(shl_15_bwd_loop_less_256_sub_32) >> + ALIGN (5) >> +L(shl_15_bwd): >> + vmovaps 
-0x2f(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0x0f(%rsi), %ymm0, %ymm1
>> +	vpalignr	$15, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0x20(%rdi)
>> +L(shl_15_bwd_sub_32):
>> +	vmovups	-0x40(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x40(%rdi)
>> +	vmovaps	-0x6f(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0x4f(%rsi), %ymm0, %ymm1
>> +	vpalignr	$15, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0x60(%rdi)
>> +	vmovups	-0x80(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x80(%rdi)
>> +L(shl_15_bwd_loop_less_256):
>> +	vmovaps	-0xaf(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0x8f(%rsi), %ymm0, %ymm1
>> +	vpalignr	$15, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0xa0(%rdi)
>> +L(shl_15_bwd_loop_less_256_sub_32):
>> +	vmovups	-0xc0(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0xc0(%rdi)
>> +	vmovaps	-0xef(%rsi), %ymm0
>> +	vperm2f128	$0x21, -0xcf(%rsi), %ymm0, %ymm1
>> +	vpalignr	$15, %ymm0, %ymm1, %ymm1
>> +	vmovaps	%ymm1, -0xe0(%rdi)
>> +	vmovups	-0x100(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x100(%rdi)
>> +	HANDLE_256_LOOP_TAIL(L(shl_15_bwd), L(shl_15_bwd_loop_tail_over_128),
>> +			     L(shl_15_bwd_loop_less_256), sub, add, L(table_159_bytes_bwd))
>> +
>> +	ALIGN (4)
>> +L(shl_63_bwd):
>> +	add	$0x20, %rdi
>> +	add	$0x20, %rsi
>> +	add	$0x20, %rdx
>> +	jmp	L(shl_31_bwd_32)
>> +L(shl_63_bwd_less_256):
>> +	add	$0x20, %rdi
>> +	add	$0x20, %rsi
>> +	add	$0x20, %rdx
>> +	jmp	L(shl_31_bwd_loop_less_256_less)
>> +	ALIGN (5)
>> +L(shl_31_bwd):
>> +	vmovaps	-0x1f(%rsi), %ymm0
>> +	vperm2f128	$0x03, -0x3f(%rsi), %ymm0, %ymm1
>> +	vpalignr	$15, %ymm1, %ymm0, %ymm0
>> +	vmovaps	%ymm0, -0x20(%rdi)
>> +L(shl_31_bwd_32):
>> +	vmovups	-0x40(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x40(%rdi)
>> +	vmovaps	-0x5f(%rsi), %ymm0
>> +	vperm2f128	$0x03, -0x7f(%rsi), %ymm0, %ymm1
>> +	vpalignr	$15, %ymm1, %ymm0, %ymm0
>> +	vmovaps	%ymm0, -0x60(%rdi)
>> +	vmovups	-0x80(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x80(%rdi)
>> +
>> +L(shl_31_bwd_loop_less_256):
>> +	vmovaps	-0x9f(%rsi), %ymm0
>> +	vperm2f128	$0x03, -0xbf(%rsi), %ymm0, %ymm1
>> +	vpalignr	$15, %ymm1, %ymm0, %ymm0
>> +	vmovaps	%ymm0, -0xa0(%rdi)
>> +L(shl_31_bwd_loop_less_256_less):
>> +	vmovups	-0xc0(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0xc0(%rdi)
>> +	vmovaps	-0xdf(%rsi), %ymm0
>> +	vperm2f128	$0x03, -0xff(%rsi), %ymm0, %ymm1
>> +	vpalignr	$15, %ymm1, %ymm0, %ymm0
>> +	vmovaps	%ymm0, -0xe0(%rdi)
>> +	vmovups	-0x100(%rsi), %ymm2
>> +	vmovaps	%ymm2, -0x100(%rdi)
>> +	HANDLE_256_LOOP_TAIL(L(shl_31_bwd), L(shl_31_bwd_loop_tail_over_128),
>> +			     L(shl_31_bwd_loop_less_256), sub, add, L(table_159_bytes_bwd))
>> +
>> +	ALIGN (4)
>> +L(gobble_mem_fwd):
>> +
>> +#ifdef SHARED_CACHE_SIZE_HALF
>> +	mov	$SHARED_CACHE_SIZE_HALF, %rcx
>> +#else
>> +	mov	__x86_64_shared_cache_size_half(%rip), %rcx
>> +#endif
>> +	vmovups	(%rsi), %ymm9
>> +	mov	%rdi, %r8
>> +	and	$-32, %rdi
>> +	add	$32, %rdi
>> +	mov	%rdi, %r10
>> +	sub	%r8, %r10
>> +	sub	%r10, %rdx
>> +	add	%r10, %rsi
>> +#ifdef USE_AS_MEMMOVE
>> +	mov	%rsi, %r10
>> +	sub	%rdi, %r10
>> +	cmp	%rdx, %r10
>> +	jae	L(memmove_use_memcpy_fwd)
>> +	cmp	%r9, %r10
>> +	jb	L(ll_cache_copy_fwd_start)
>> +L(memmove_use_memcpy_fwd):
>> +#endif
>> +	add	%rcx, %rcx
>> +	cmp	%rcx, %rdx
>> +	ja	L(bigger_in_fwd)
>> +	mov	%rdx, %rcx
>> +
>> +L(bigger_in_fwd):
>> +	sub	%rcx, %rdx
>> +	cmp	$0x1000, %rdx
>> +	jbe	L(ll_cache_copy_fwd)
>> +	mov	%rcx, %r10
>> +	shl	$2, %r10
>> +	cmp	%r10, %rdx
>> +	jbe	L(2steps_copy_fwd)
>> +	add	%rcx, %rdx
>> +	xor	%rcx, %rcx
>> +L(2steps_copy_fwd):
>> +	sub	$0x80, %rdx
>> +L(gobble_mem_fwd_loop):
>> +	prefetcht0 0x1c0(%rsi)
>> +	prefetcht0 0x280(%rsi)
>> +	vmovups	(%rsi), %xmm0
>> +	vmovups	0x10(%rsi), %xmm1
>> +	vmovups	0x20(%rsi), %xmm2
>> +	vmovups	0x30(%rsi), %xmm3
>> +	vmovups	0x40(%rsi), %xmm4
>> +	vmovups	0x50(%rsi), %xmm5
>> +	vmovups	0x60(%rsi), %xmm6
>> +	vmovups	0x70(%rsi), %xmm7
>> +	vmovntdq	%xmm0, (%rdi)
>> +	vmovntdq	%xmm1, 0x10(%rdi)
>> +	vmovntdq	%xmm2, 0x20(%rdi)
>> +	vmovntdq	%xmm3, 0x30(%rdi)
>> +	vmovntdq	%xmm4, 0x40(%rdi)
>> +	vmovntdq	%xmm5, 0x50(%rdi)
>> +	vmovntdq	%xmm6, 0x60(%rdi)
>> +	vmovntdq	%xmm7, 0x70(%rdi)
>> +	lea	0x80(%rsi), %rsi
>> +	lea	0x80(%rdi), %rdi
>> +	sub	$0x80, %rdx
>> +	jae	L(gobble_mem_fwd_loop)
>> +	sfence
>> +	cmp	$0x80, %rcx
>> +	jae	L(gobble_mem_fwd_end)
>> +	add	$0x80, %rdx
>> +	add	%rdx, %rsi
>> +	add	%rdx, %rdi
>> +	vmovups	%ymm9, (%r8)
>> +	BRANCH_TO_JMPTBL_ENTRY (L(table_159_bytes_fwd), %rdx, 4)
>> +L(gobble_mem_fwd_end):
>> +	add	$0x80, %rdx
>> +L(ll_cache_copy_fwd):
>> +	add	%rcx, %rdx
>> +L(ll_cache_copy_fwd_start):
>> +	mov	%rdx, %rcx
>> +	rep	movsb
>> +	vmovups	%ymm9, (%r8)
>> +	ret
>> +
>> +	ALIGN (4)
>> +L(gobble_mem_bwd):
>> +
>> +#ifdef SHARED_CACHE_SIZE_HALF
>> +	mov	$SHARED_CACHE_SIZE_HALF, %rcx
>> +#else
>> +	mov	__x86_64_shared_cache_size_half(%rip), %rcx
>> +#endif
>> +	add	%rdx, %rsi
>> +	add	%rdx, %rdi
>> +	vmovups	-0x20(%rsi), %ymm9
>> +	lea	-0x20(%rdi), %r8
>> +	mov	%rdi, %r10
>> +	and	$0x1f, %r10
>> +	xor	%r10, %rdi
>> +	sub	%r10, %rsi
>> +	sub	%r10, %rdx
>> +#ifdef USE_AS_MEMMOVE
>> +	mov	%rdi, %r10
>> +	sub	%rsi, %r10
>> +	cmp	%rdx, %r10
>> +	jae	L(memmove_use_memcpy_bwd)
>> +	cmp	%r9, %r10
>> +	jb	L(ll_cache_copy_bwd_start)
>> +L(memmove_use_memcpy_bwd):
>> +#endif
>> +	add	%rcx, %rcx
>> +	cmp	%rcx, %rdx
>> +	ja	L(bigger)
>> +	mov	%rdx, %rcx
>> +L(bigger):
>> +	sub	%rcx, %rdx
>> +	cmp	$0x1000, %rdx
>> +	jbe	L(ll_cache_copy)
>> +	mov	%rcx, %r10
>> +	shl	$2, %r10
>> +	cmp	%r10, %rdx
>> +	jbe	L(2steps_copy)
>> +	add	%rcx, %rdx
>> +	xor	%rcx, %rcx
>> +L(2steps_copy):
>> +	sub	$0x80, %rdx
>> +L(gobble_mem_bwd_loop):
>> +	prefetcht0 -0x1c0(%rsi)
>> +	prefetcht0 -0x280(%rsi)
>> +	vmovups	-0x10(%rsi), %xmm1
>> +	vmovups	-0x20(%rsi), %xmm2
>> +	vmovups	-0x30(%rsi), %xmm3
>> +	vmovups	-0x40(%rsi), %xmm4
>> +	vmovups	-0x50(%rsi), %xmm5
>> +	vmovups	-0x60(%rsi), %xmm6
>> +	vmovups	-0x70(%rsi), %xmm7
>> +	vmovups	-0x80(%rsi), %xmm8
>> +	vmovntdq	%xmm1, -0x10(%rdi)
>> +	vmovntdq	%xmm2, -0x20(%rdi)
>> +	vmovntdq	%xmm3, -0x30(%rdi)
>> +	vmovntdq	%xmm4, -0x40(%rdi)
>> +	vmovntdq	%xmm5, -0x50(%rdi)
>> +	vmovntdq	%xmm6, -0x60(%rdi)
>> +	vmovntdq	%xmm7, -0x70(%rdi)
>> +	vmovntdq	%xmm8, -0x80(%rdi)
>> +	sub	$0x80, %rsi
>> +	sub	$0x80, %rdi
>> +	sub	$0x80, %rdx
>> +	jae	L(gobble_mem_bwd_loop)
>> +	sfence
>> +	cmp	$0x80, %rcx
>> +	jb	L(gobble_mem_bwd_end)
>> +	add	$0x80, %rdx
>> +L(ll_cache_copy):
>> +	add	%rcx, %rdx
>> +L(ll_cache_copy_bwd_start):
>> +	sub	$0x80, %rdx
>> +L(gobble_ll_loop):
>> +	prefetchnta -0x3c0(%rsi)
>> +	prefetchnta -0x400(%rsi)
>> +	prefetchnta -0x3c0(%rdi)
>> +	prefetchnta -0x400(%rdi)
>> +	vmovups	-0x10(%rsi), %xmm1
>> +	vmovups	-0x20(%rsi), %xmm2
>> +	vmovups	-0x30(%rsi), %xmm3
>> +	vmovups	-0x40(%rsi), %xmm4
>> +	vmovups	-0x50(%rsi), %xmm5
>> +	vmovups	-0x60(%rsi), %xmm6
>> +	vmovups	-0x70(%rsi), %xmm7
>> +	vmovups	-0x80(%rsi), %xmm8
>> +	lea	-0x80(%rsi), %rsi
>> +	vmovaps	%xmm1, -0x10(%rdi)
>> +	vmovaps	%xmm2, -0x20(%rdi)
>> +	vmovaps	%xmm3, -0x30(%rdi)
>> +	vmovaps	%xmm4, -0x40(%rdi)
>> +	vmovaps	%xmm5, -0x50(%rdi)
>> +	vmovaps	%xmm6, -0x60(%rdi)
>> +	vmovaps	%xmm7, -0x70(%rdi)
>> +	vmovaps	%xmm8, -0x80(%rdi)
>> +	lea	-0x80(%rdi), %rdi
>> +	sub	$0x80, %rdx
>> +	jae	L(gobble_ll_loop)
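[Editor's note on the L(gobble_mem_fwd)/L(gobble_mem_bwd) paths above:
%rcx starts as half the shared cache size and is doubled, so the
comparisons are against the full shared cache.  As I read the control
flow, whatever exceeds one cache's worth is streamed with vmovntdq
when the excess is worth it (more than 0x1000 bytes, with everything
going non-temporal once the excess is over four times the cache),
while the final cache-sized chunk is finished with rep movsb so it
ends up cache-resident; the sfence orders the write-combining stores
before that cached tail.  The L(gobble_ll_loop) variant is the
memmove fallback for closely overlapping buffers and stays with
ordinary cached stores plus prefetchnta.  A minimal sketch of the
streaming idiom in isolation, with a hypothetical routine name,
assuming the destination is 16-byte aligned (vmovntdq faults
otherwise) and n is a nonzero multiple of 32:

	.text
	.globl	nt_copy			/* hypothetical: nt_copy(dst=%rdi, src=%rsi, n=%rdx) */
	.type	nt_copy, @function
nt_copy:
	prefetcht0 0x1c0(%rsi)		/* stay ~448 bytes ahead, as the patch's loop does */
	vmovups	(%rsi), %xmm0
	vmovups	0x10(%rsi), %xmm1
	vmovntdq	%xmm0, (%rdi)	/* streaming store: no RFO, no cache fill */
	vmovntdq	%xmm1, 0x10(%rdi)
	add	$0x20, %rsi
	add	$0x20, %rdi
	sub	$0x20, %rdx
	jnz	nt_copy
	sfence				/* drain the WC buffers before ordinary stores */
	ret
	.size	nt_copy, .-nt_copy
]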
>> +L(gobble_mem_bwd_end):
>> +	add	$0x80, %rdx
>> +	sub	%rdx, %rsi
>> +	sub	%rdx, %rdi
>> +	vmovups	%ymm9, (%r8)
>> +	BRANCH_TO_JMPTBL_ENTRY (L(table_159_bytes_bwd), %rdx, 4)
>> +
>> +	ALIGN(4)
>> +L(fwd_write_144bytes):
>> +	vmovups	-144(%rsi), %xmm0
>> +	vmovups	%xmm0, -144(%rdi)
>> +L(fwd_write_128bytes):
>> +	vmovups	-128(%rsi), %xmm0
>> +	vmovups	%xmm0, -128(%rdi)
>> +L(fwd_write_112bytes):
>> +	vmovups	-112(%rsi), %xmm0
>> +	vmovups	%xmm0, -112(%rdi)
>> +L(fwd_write_96bytes):
>> +	vmovups	-96(%rsi), %xmm0
>> +	vmovups	%xmm0, -96(%rdi)
>> +L(fwd_write_80bytes):
>> +	vmovups	-80(%rsi), %xmm0
>> +	vmovups	%xmm0, -80(%rdi)
>> +L(fwd_write_64bytes):
>> +	vmovups	-64(%rsi), %xmm0
>> +	vmovups	%xmm0, -64(%rdi)
>> +L(fwd_write_48bytes):
>> +	vmovups	-48(%rsi), %xmm0
>> +	vmovups	%xmm0, -48(%rdi)
>> +L(fwd_write_32bytes):
>> +	vmovups	-32(%rsi), %xmm0
>> +	vmovups	%xmm0, -32(%rdi)
>> +L(fwd_write_16bytes):
>> +	vmovups	-16(%rsi), %xmm0
>> +	vmovups	%xmm0, -16(%rdi)
>> +L(fwd_write_0bytes):
>> +	ret
>> +
>> +	ALIGN(4)
>> +L(fwd_write_159bytes):
>> +	vmovups	-159(%rsi), %xmm0
>> +	vmovups	%xmm0, -159(%rdi)
>> +L(fwd_write_143bytes):
>> +	vmovups	-143(%rsi), %xmm0
>> +	vmovups	%xmm0, -143(%rdi)
>> +L(fwd_write_127bytes):
>> +	vmovups	-127(%rsi), %xmm0
>> +	vmovups	%xmm0, -127(%rdi)
>> +L(fwd_write_111bytes):
>> +	vmovups	-111(%rsi), %xmm0
>> +	vmovups	%xmm0, -111(%rdi)
>> +L(fwd_write_95bytes):
>> +	vmovups	-95(%rsi), %xmm0
>> +	vmovups	%xmm0, -95(%rdi)
>> +L(fwd_write_79bytes):
>> +	vmovups	-79(%rsi), %xmm0
>> +	vmovups	%xmm0, -79(%rdi)
>> +L(fwd_write_63bytes):
>> +	vmovups	-63(%rsi), %xmm0
>> +	vmovups	%xmm0, -63(%rdi)
>> +L(fwd_write_47bytes):
>> +	vmovups	-47(%rsi), %xmm0
>> +	vmovups	%xmm0, -47(%rdi)
>> +L(fwd_write_31bytes):
>> +	vmovups	-31(%rsi), %xmm0
>> +	vmovups	%xmm0, -31(%rdi)
>> +L(fwd_write_15bytes):
>> +	mov	-15(%rsi), %rdx
>> +	mov	-8(%rsi), %rcx
>> +	mov	%rdx, -15(%rdi)
>> +	mov	%rcx, -8(%rdi)
>> +	ret
>> +
>> +	ALIGN(4)
>> +L(fwd_write_158bytes):
>> +	vmovups	-158(%rsi), %xmm0
>> +	vmovups	%xmm0, -158(%rdi)
>> +L(fwd_write_142bytes):
>> +	vmovups	-142(%rsi), %xmm0
>> +	vmovups	%xmm0, -142(%rdi)
>> +L(fwd_write_126bytes):
>> +	vmovups	-126(%rsi), %xmm0
>> +	vmovups	%xmm0, -126(%rdi)
>> +L(fwd_write_110bytes):
>> +	vmovups	-110(%rsi), %xmm0
>> +	vmovups	%xmm0, -110(%rdi)
>> +L(fwd_write_94bytes):
>> +	vmovups	-94(%rsi), %xmm0
>> +	vmovups	%xmm0, -94(%rdi)
>> +L(fwd_write_78bytes):
>> +	vmovups	-78(%rsi), %xmm0
>> +	vmovups	%xmm0, -78(%rdi)
>> +L(fwd_write_62bytes):
>> +	vmovups	-62(%rsi), %xmm0
>> +	vmovups	%xmm0, -62(%rdi)
>> +L(fwd_write_46bytes):
>> +	vmovups	-46(%rsi), %xmm0
>> +	vmovups	%xmm0, -46(%rdi)
>> +L(fwd_write_30bytes):
>> +	vmovups	-30(%rsi), %xmm0
>> +	vmovups	%xmm0, -30(%rdi)
>> +L(fwd_write_14bytes):
>> +	mov	-14(%rsi), %rdx
>> +	mov	-8(%rsi), %rcx
>> +	mov	%rdx, -14(%rdi)
>> +	mov	%rcx, -8(%rdi)
>> +	ret
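[Editor's note: these tables handle residual lengths of 0..159 bytes.
BRANCH_TO_JMPTBL_ENTRY indexes a table of 32-bit offsets by the
remaining byte count and jumps into one of the chains; each entry
falls through successive 16-byte vmovups pairs and ends in a few
scalar moves whose ranges overlap (e.g. the 15-byte tail is two
8-byte moves overlapping by one byte), so every residue is copied
without a byte loop.  The macro itself is defined outside the quoted
context; in the ssse3 memcpy files it expands to roughly the
following, shown only to make the dispatch concrete (do not take the
exact register use as given):

	lea	L(table_159_bytes_bwd)(%rip), %r11
	movslq	(%r11, %rdx, 4), %rdx	/* entries are offsets relative to the table */
	lea	(%r11, %rdx), %rdx
	jmp	*%rdx			/* enter the chain for this residual count */
]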
>> +
>> +	ALIGN(4)
>> +L(fwd_write_157bytes):
>> +	vmovups	-157(%rsi), %xmm0
>> +	vmovups	%xmm0, -157(%rdi)
>> +L(fwd_write_141bytes):
>> +	vmovups	-141(%rsi), %xmm0
>> +	vmovups	%xmm0, -141(%rdi)
>> +L(fwd_write_125bytes):
>> +	vmovups	-125(%rsi), %xmm0
>> +	vmovups	%xmm0, -125(%rdi)
>> +L(fwd_write_109bytes):
>> +	vmovups	-109(%rsi), %xmm0
>> +	vmovups	%xmm0, -109(%rdi)
>> +L(fwd_write_93bytes):
>> +	vmovups	-93(%rsi), %xmm0
>> +	vmovups	%xmm0, -93(%rdi)
>> +L(fwd_write_77bytes):
>> +	vmovups	-77(%rsi), %xmm0
>> +	vmovups	%xmm0, -77(%rdi)
>> +L(fwd_write_61bytes):
>> +	vmovups	-61(%rsi), %xmm0
>> +	vmovups	%xmm0, -61(%rdi)
>> +L(fwd_write_45bytes):
>> +	vmovups	-45(%rsi), %xmm0
>> +	vmovups	%xmm0, -45(%rdi)
>> +L(fwd_write_29bytes):
>> +	vmovups	-29(%rsi), %xmm0
>> +	vmovups	%xmm0, -29(%rdi)
>> +L(fwd_write_13bytes):
>> +	mov	-13(%rsi), %rdx
>> +	mov	-8(%rsi), %rcx
>> +	mov	%rdx, -13(%rdi)
>> +	mov	%rcx, -8(%rdi)
>> +	ret
>> +
>> +	ALIGN(4)
>> +L(fwd_write_156bytes):
>> +	vmovups	-156(%rsi), %xmm0
>> +	vmovups	%xmm0, -156(%rdi)
>> +L(fwd_write_140bytes):
>> +	vmovups	-140(%rsi), %xmm0
>> +	vmovups	%xmm0, -140(%rdi)
>> +L(fwd_write_124bytes):
>> +	vmovups	-124(%rsi), %xmm0
>> +	vmovups	%xmm0, -124(%rdi)
>> +L(fwd_write_108bytes):
>> +	vmovups	-108(%rsi), %xmm0
>> +	vmovups	%xmm0, -108(%rdi)
>> +L(fwd_write_92bytes):
>> +	vmovups	-92(%rsi), %xmm0
>> +	vmovups	%xmm0, -92(%rdi)
>> +L(fwd_write_76bytes):
>> +	vmovups	-76(%rsi), %xmm0
>> +	vmovups	%xmm0, -76(%rdi)
>> +L(fwd_write_60bytes):
>> +	vmovups	-60(%rsi), %xmm0
>> +	vmovups	%xmm0, -60(%rdi)
>> +L(fwd_write_44bytes):
>> +	vmovups	-44(%rsi), %xmm0
>> +	vmovups	%xmm0, -44(%rdi)
>> +L(fwd_write_28bytes):
>> +	vmovups	-28(%rsi), %xmm0
>> +	vmovups	%xmm0, -28(%rdi)
>> +L(fwd_write_12bytes):
>> +	mov	-12(%rsi), %rdx
>> +	mov	%rdx, -12(%rdi)
>> +	mov	-4(%rsi), %edx
>> +	mov	%edx, -4(%rdi)
>> +	ret
>> +
>> +	ALIGN(4)
>> +L(fwd_write_155bytes):
>> +	vmovups	-155(%rsi), %xmm0
>> +	vmovups	%xmm0, -155(%rdi)
>> +L(fwd_write_139bytes):
>> +	vmovups	-139(%rsi), %xmm0
>> +	vmovups	%xmm0, -139(%rdi)
>> +L(fwd_write_123bytes):
>> +	vmovups	-123(%rsi), %xmm0
>> +	vmovups	%xmm0, -123(%rdi)
>> +L(fwd_write_107bytes):
>> +	vmovups	-107(%rsi), %xmm0
>> +	vmovups	%xmm0, -107(%rdi)
>> +L(fwd_write_91bytes):
>> +	vmovups	-91(%rsi), %xmm0
>> +	vmovups	%xmm0, -91(%rdi)
>> +L(fwd_write_75bytes):
>> +	vmovups	-75(%rsi), %xmm0
>> +	vmovups	%xmm0, -75(%rdi)
>> +L(fwd_write_59bytes):
>> +	vmovups	-59(%rsi), %xmm0
>> +	vmovups	%xmm0, -59(%rdi)
>> +L(fwd_write_43bytes):
>> +	vmovups	-43(%rsi), %xmm0
>> +	vmovups	%xmm0, -43(%rdi)
>> +L(fwd_write_27bytes):
>> +	vmovups	-27(%rsi), %xmm0
>> +	vmovups	%xmm0, -27(%rdi)
>> +L(fwd_write_11bytes):
>> +	mov	-11(%rsi), %rdx
>> +	mov	-4(%rsi), %ecx
>> +	mov	%rdx, -11(%rdi)
>> +	mov	%ecx, -4(%rdi)
>> +	ret
>> +
>> +	ALIGN(4)
>> +L(fwd_write_154bytes):
>> +	vmovups	-154(%rsi), %xmm0
>> +	vmovups	%xmm0, -154(%rdi)
>> +L(fwd_write_138bytes):
>> +	vmovups	-138(%rsi), %xmm0
>> +	vmovups	%xmm0, -138(%rdi)
>> +L(fwd_write_122bytes):
>> +	vmovups	-122(%rsi), %xmm0
>> +	vmovups	%xmm0, -122(%rdi)
>> +L(fwd_write_106bytes):
>> +	vmovups	-106(%rsi), %xmm0
>> +	vmovups	%xmm0, -106(%rdi)
>> +L(fwd_write_90bytes):
>> +	vmovups	-90(%rsi), %xmm0
>> +	vmovups	%xmm0, -90(%rdi)
>> +L(fwd_write_74bytes):
>> +	vmovups	-74(%rsi), %xmm0
>> +	vmovups	%xmm0, -74(%rdi)
>> +L(fwd_write_58bytes):
>> +	vmovups	-58(%rsi), %xmm0
>> +	vmovups	%xmm0, -58(%rdi)
>> +L(fwd_write_42bytes):
>> +	vmovups	-42(%rsi), %xmm0
>> +	vmovups	%xmm0, -42(%rdi)
>> +L(fwd_write_26bytes):
>> +	vmovups	-26(%rsi), %xmm0
>> +	vmovups	%xmm0, -26(%rdi)
>> +L(fwd_write_10bytes):
>> +	mov	-10(%rsi), %rdx
>> +	mov	-4(%rsi), %ecx
>> +	mov	%rdx, -10(%rdi)
>> +	mov	%ecx, -4(%rdi)
>> +	ret
>> +
>> +	ALIGN(4)
>> +L(fwd_write_153bytes):
>> +	vmovups	-153(%rsi), %xmm0
>> +	vmovups	%xmm0, -153(%rdi)
>> +L(fwd_write_137bytes):
>> +	vmovups	-137(%rsi), %xmm0
>> +	vmovups	%xmm0, -137(%rdi)
>> +L(fwd_write_121bytes):
>> +	vmovups	-121(%rsi), %xmm0
>> +	vmovups	%xmm0, -121(%rdi)
>> +L(fwd_write_105bytes):
>> +	vmovups	-105(%rsi), %xmm0
>> +	vmovups	%xmm0, -105(%rdi)
>> +L(fwd_write_89bytes):
>> +	vmovups	-89(%rsi), %xmm0
>> +	vmovups	%xmm0, -89(%rdi)
>>
+L(fwd_write_73bytes): >> + vmovups -73(%rsi), %xmm0 >> + vmovups %xmm0, -73(%rdi) >> +L(fwd_write_57bytes): >> + vmovups -57(%rsi), %xmm0 >> + vmovups %xmm0, -57(%rdi) >> +L(fwd_write_41bytes): >> + vmovups -41(%rsi), %xmm0 >> + vmovups %xmm0, -41(%rdi) >> +L(fwd_write_25bytes): >> + vmovups -25(%rsi), %xmm0 >> + vmovups %xmm0, -25(%rdi) >> +L(fwd_write_9bytes): >> + mov -9(%rsi), %rdx >> + mov %rdx, -9(%rdi) >> + mov -1(%rsi), %dl >> + mov %dl, -1(%rdi) >> + ret >> + >> + ALIGN(4) >> +L(fwd_write_152bytes): >> + vmovups -152(%rsi), %xmm0 >> + vmovups %xmm0, -152(%rdi) >> +L(fwd_write_136bytes): >> + vmovups -136(%rsi), %xmm0 >> + vmovups %xmm0, -136(%rdi) >> +L(fwd_write_120bytes): >> + vmovups -120(%rsi), %xmm0 >> + vmovups %xmm0, -120(%rdi) >> +L(fwd_write_104bytes): >> + vmovups -104(%rsi), %xmm0 >> + vmovups %xmm0, -104(%rdi) >> +L(fwd_write_88bytes): >> + vmovups -88(%rsi), %xmm0 >> + vmovups %xmm0, -88(%rdi) >> +L(fwd_write_72bytes): >> + vmovups -72(%rsi), %xmm0 >> + vmovups %xmm0, -72(%rdi) >> +L(fwd_write_56bytes): >> + vmovups -56(%rsi), %xmm0 >> + vmovups %xmm0, -56(%rdi) >> +L(fwd_write_40bytes): >> + vmovups -40(%rsi), %xmm0 >> + vmovups %xmm0, -40(%rdi) >> +L(fwd_write_24bytes): >> + vmovups -24(%rsi), %xmm0 >> + vmovups %xmm0, -24(%rdi) >> +L(fwd_write_8bytes): >> + mov -8(%rsi), %rdx >> + mov %rdx, -8(%rdi) >> + ret >> + >> + ALIGN(4) >> +L(fwd_write_151bytes): >> + vmovups -151(%rsi), %xmm0 >> + vmovups %xmm0, -151(%rdi) >> +L(fwd_write_135bytes): >> + vmovups -135(%rsi), %xmm0 >> + vmovups %xmm0, -135(%rdi) >> +L(fwd_write_119bytes): >> + vmovups -119(%rsi), %xmm0 >> + vmovups %xmm0, -119(%rdi) >> +L(fwd_write_103bytes): >> + vmovups -103(%rsi), %xmm0 >> + vmovups %xmm0, -103(%rdi) >> +L(fwd_write_87bytes): >> + vmovups -87(%rsi), %xmm0 >> + vmovups %xmm0, -87(%rdi) >> +L(fwd_write_71bytes): >> + vmovups -71(%rsi), %xmm0 >> + vmovups %xmm0, -71(%rdi) >> +L(fwd_write_55bytes): >> + vmovups -55(%rsi), %xmm0 >> + vmovups %xmm0, -55(%rdi) >> +L(fwd_write_39bytes): >> + vmovups -39(%rsi), %xmm0 >> + vmovups %xmm0, -39(%rdi) >> +L(fwd_write_23bytes): >> + vmovups -23(%rsi), %xmm0 >> + vmovups %xmm0, -23(%rdi) >> +L(fwd_write_7bytes): >> + mov -7(%rsi), %edx >> + mov -4(%rsi), %ecx >> + mov %edx, -7(%rdi) >> + mov %ecx, -4(%rdi) >> + ret >> + >> + ALIGN(4) >> +L(fwd_write_150bytes): >> + vmovups -150(%rsi), %xmm0 >> + vmovups %xmm0, -150(%rdi) >> +L(fwd_write_134bytes): >> + vmovups -134(%rsi), %xmm0 >> + vmovups %xmm0, -134(%rdi) >> +L(fwd_write_118bytes): >> + vmovups -118(%rsi), %xmm0 >> + vmovups %xmm0, -118(%rdi) >> +L(fwd_write_102bytes): >> + vmovups -102(%rsi), %xmm0 >> + vmovups %xmm0, -102(%rdi) >> +L(fwd_write_86bytes): >> + vmovups -86(%rsi), %xmm0 >> + vmovups %xmm0, -86(%rdi) >> +L(fwd_write_70bytes): >> + vmovups -70(%rsi), %xmm0 >> + vmovups %xmm0, -70(%rdi) >> +L(fwd_write_54bytes): >> + vmovups -54(%rsi), %xmm0 >> + vmovups %xmm0, -54(%rdi) >> +L(fwd_write_38bytes): >> + vmovups -38(%rsi), %xmm0 >> + vmovups %xmm0, -38(%rdi) >> +L(fwd_write_22bytes): >> + vmovups -22(%rsi), %xmm0 >> + vmovups %xmm0, -22(%rdi) >> +L(fwd_write_6bytes): >> + mov -6(%rsi), %edx >> + mov -4(%rsi), %ecx >> + mov %edx, -6(%rdi) >> + mov %ecx, -4(%rdi) >> + ret >> + >> + ALIGN(4) >> +L(fwd_write_149bytes): >> + vmovups -149(%rsi), %xmm0 >> + vmovups %xmm0, -149(%rdi) >> +L(fwd_write_133bytes): >> + vmovups -133(%rsi), %xmm0 >> + vmovups %xmm0, -133(%rdi) >> +L(fwd_write_117bytes): >> + vmovups -117(%rsi), %xmm0 >> + vmovups %xmm0, -117(%rdi) >> +L(fwd_write_101bytes): >> + vmovups 
-101(%rsi), %xmm0 >> + vmovups %xmm0, -101(%rdi) >> +L(fwd_write_85bytes): >> + vmovups -85(%rsi), %xmm0 >> + vmovups %xmm0, -85(%rdi) >> +L(fwd_write_69bytes): >> + vmovups -69(%rsi), %xmm0 >> + vmovups %xmm0, -69(%rdi) >> +L(fwd_write_53bytes): >> + vmovups -53(%rsi), %xmm0 >> + vmovups %xmm0, -53(%rdi) >> +L(fwd_write_37bytes): >> + vmovups -37(%rsi), %xmm0 >> + vmovups %xmm0, -37(%rdi) >> +L(fwd_write_21bytes): >> + vmovups -21(%rsi), %xmm0 >> + vmovups %xmm0, -21(%rdi) >> +L(fwd_write_5bytes): >> + mov -5(%rsi), %edx >> + mov %edx, -5(%rdi) >> + mov -1(%rsi), %dl >> + mov %dl, -1(%rdi) >> + ret >> + >> + ALIGN(4) >> +L(fwd_write_148bytes): >> + vmovups -148(%rsi), %xmm0 >> + vmovups %xmm0, -148(%rdi) >> +L(fwd_write_132bytes): >> + vmovups -132(%rsi), %xmm0 >> + vmovups %xmm0, -132(%rdi) >> +L(fwd_write_116bytes): >> + vmovups -116(%rsi), %xmm0 >> + vmovups %xmm0, -116(%rdi) >> +L(fwd_write_100bytes): >> + vmovups -100(%rsi), %xmm0 >> + vmovups %xmm0, -100(%rdi) >> +L(fwd_write_84bytes): >> + vmovups -84(%rsi), %xmm0 >> + vmovups %xmm0, -84(%rdi) >> +L(fwd_write_68bytes): >> + vmovups -68(%rsi), %xmm0 >> + vmovups %xmm0, -68(%rdi) >> +L(fwd_write_52bytes): >> + vmovups -52(%rsi), %xmm0 >> + vmovups %xmm0, -52(%rdi) >> +L(fwd_write_36bytes): >> + vmovups -36(%rsi), %xmm0 >> + vmovups %xmm0, -36(%rdi) >> +L(fwd_write_20bytes): >> + vmovups -20(%rsi), %xmm0 >> + vmovups %xmm0, -20(%rdi) >> +L(fwd_write_4bytes): >> + mov -4(%rsi), %edx >> + mov %edx, -4(%rdi) >> + ret >> + >> + ALIGN(4) >> +L(fwd_write_147bytes): >> + vmovups -147(%rsi), %xmm0 >> + vmovups %xmm0, -147(%rdi) >> +L(fwd_write_131bytes): >> + vmovups -131(%rsi), %xmm0 >> + vmovups %xmm0, -131(%rdi) >> +L(fwd_write_115bytes): >> + vmovups -115(%rsi), %xmm0 >> + vmovups %xmm0, -115(%rdi) >> +L(fwd_write_99bytes): >> + vmovups -99(%rsi), %xmm0 >> + vmovups %xmm0, -99(%rdi) >> +L(fwd_write_83bytes): >> + vmovups -83(%rsi), %xmm0 >> + vmovups %xmm0, -83(%rdi) >> +L(fwd_write_67bytes): >> + vmovups -67(%rsi), %xmm0 >> + vmovups %xmm0, -67(%rdi) >> +L(fwd_write_51bytes): >> + vmovups -51(%rsi), %xmm0 >> + vmovups %xmm0, -51(%rdi) >> +L(fwd_write_35bytes): >> + vmovups -35(%rsi), %xmm0 >> + vmovups %xmm0, -35(%rdi) >> +L(fwd_write_19bytes): >> + vmovups -19(%rsi), %xmm0 >> + vmovups %xmm0, -19(%rdi) >> +L(fwd_write_3bytes): >> + movzwl -3(%rsi), %edx >> + mov %dx, -3(%rdi) >> + movzbl -1(%rsi), %edx >> + mov %dl, -1(%rdi) >> + ret >> + >> + ALIGN(4) >> +L(fwd_write_146bytes): >> + vmovups -146(%rsi), %xmm0 >> + vmovups %xmm0, -146(%rdi) >> +L(fwd_write_130bytes): >> + vmovups -130(%rsi), %xmm0 >> + vmovups %xmm0, -130(%rdi) >> +L(fwd_write_114bytes): >> + vmovups -114(%rsi), %xmm0 >> + vmovups %xmm0, -114(%rdi) >> +L(fwd_write_98bytes): >> + vmovups -98(%rsi), %xmm0 >> + vmovups %xmm0, -98(%rdi) >> +L(fwd_write_82bytes): >> + vmovups -82(%rsi), %xmm0 >> + vmovups %xmm0, -82(%rdi) >> +L(fwd_write_66bytes): >> + vmovups -66(%rsi), %xmm0 >> + vmovups %xmm0, -66(%rdi) >> +L(fwd_write_50bytes): >> + vmovups -50(%rsi), %xmm0 >> + vmovups %xmm0, -50(%rdi) >> +L(fwd_write_34bytes): >> + vmovups -34(%rsi), %xmm0 >> + vmovups %xmm0, -34(%rdi) >> +L(fwd_write_18bytes): >> + vmovups -18(%rsi), %xmm0 >> + vmovups %xmm0, -18(%rdi) >> +L(fwd_write_2bytes): >> + movzwl -2(%rsi), %edx >> + mov %dx, -2(%rdi) >> + ret >> + >> + ALIGN(4) >> +L(fwd_write_145bytes): >> + vmovups -145(%rsi), %xmm0 >> + vmovups %xmm0, -145(%rdi) >> +L(fwd_write_129bytes): >> + vmovups -129(%rsi), %xmm0 >> + vmovups %xmm0, -129(%rdi) >> +L(fwd_write_113bytes): >> + vmovups 
>> + vmovups -113(%rsi), %xmm0
>> + vmovups %xmm0, -113(%rdi)
>> +L(fwd_write_97bytes):
>> + vmovups -97(%rsi), %xmm0
>> + vmovups %xmm0, -97(%rdi)
>> +L(fwd_write_81bytes):
>> + vmovups -81(%rsi), %xmm0
>> + vmovups %xmm0, -81(%rdi)
>> +L(fwd_write_65bytes):
>> + vmovups -65(%rsi), %xmm0
>> + vmovups %xmm0, -65(%rdi)
>> +L(fwd_write_49bytes):
>> + vmovups -49(%rsi), %xmm0
>> + vmovups %xmm0, -49(%rdi)
>> +L(fwd_write_33bytes):
>> + vmovups -33(%rsi), %xmm0
>> + vmovups %xmm0, -33(%rdi)
>> +L(fwd_write_17bytes):
>> + vmovups -17(%rsi), %xmm0
>> + vmovups %xmm0, -17(%rdi)
>> +L(fwd_write_1bytes):
>> + movzbl -1(%rsi), %edx
>> + mov %dl, -1(%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_144bytes):
>> + vmovups 128(%rsi), %xmm0
>> + vmovups %xmm0, 128(%rdi)
>> +L(bwd_write_128bytes):
>> + vmovups 112(%rsi), %xmm0
>> + vmovups %xmm0, 112(%rdi)
>> +L(bwd_write_112bytes):
>> + vmovups 96(%rsi), %xmm0
>> + vmovups %xmm0, 96(%rdi)
>> +L(bwd_write_96bytes):
>> + vmovups 80(%rsi), %xmm0
>> + vmovups %xmm0, 80(%rdi)
>> +L(bwd_write_80bytes):
>> + vmovups 64(%rsi), %xmm0
>> + vmovups %xmm0, 64(%rdi)
>> +L(bwd_write_64bytes):
>> + vmovups 48(%rsi), %xmm0
>> + vmovups %xmm0, 48(%rdi)
>> +L(bwd_write_48bytes):
>> + vmovups 32(%rsi), %xmm0
>> + vmovups %xmm0, 32(%rdi)
>> +L(bwd_write_32bytes):
>> + vmovups 16(%rsi), %xmm0
>> + vmovups %xmm0, 16(%rdi)
>> +L(bwd_write_16bytes):
>> + vmovups (%rsi), %xmm0
>> + vmovups %xmm0, (%rdi)
>> +L(bwd_write_0bytes):
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_159bytes):
>> + vmovups 143(%rsi), %xmm0
>> + vmovups %xmm0, 143(%rdi)
>> +L(bwd_write_143bytes):
>> + vmovups 127(%rsi), %xmm0
>> + vmovups %xmm0, 127(%rdi)
>> +L(bwd_write_127bytes):
>> + vmovups 111(%rsi), %xmm0
>> + vmovups %xmm0, 111(%rdi)
>> +L(bwd_write_111bytes):
>> + vmovups 95(%rsi), %xmm0
>> + vmovups %xmm0, 95(%rdi)
>> +L(bwd_write_95bytes):
>> + vmovups 79(%rsi), %xmm0
>> + vmovups %xmm0, 79(%rdi)
>> +L(bwd_write_79bytes):
>> + vmovups 63(%rsi), %xmm0
>> + vmovups %xmm0, 63(%rdi)
>> +L(bwd_write_63bytes):
>> + vmovups 47(%rsi), %xmm0
>> + vmovups %xmm0, 47(%rdi)
>> +L(bwd_write_47bytes):
>> + vmovups 31(%rsi), %xmm0
>> + vmovups %xmm0, 31(%rdi)
>> +L(bwd_write_31bytes):
>> + vmovups 15(%rsi), %xmm0
>> + vmovups %xmm0, 15(%rdi)
>> +L(bwd_write_15bytes):
>> + mov (%rsi), %rcx
>> + mov 7(%rsi), %rdx
>> + mov %rcx, (%rdi)
>> + mov %rdx, 7(%rdi)
>> + ret
>> +
>> +
>> + ALIGN(4)
>> +L(bwd_write_158bytes):
>> + vmovups 142(%rsi), %xmm0
>> + vmovups %xmm0, 142(%rdi)
>> +L(bwd_write_142bytes):
>> + vmovups 126(%rsi), %xmm0
>> + vmovups %xmm0, 126(%rdi)
>> +L(bwd_write_126bytes):
>> + vmovups 110(%rsi), %xmm0
>> + vmovups %xmm0, 110(%rdi)
>> +L(bwd_write_110bytes):
>> + vmovups 94(%rsi), %xmm0
>> + vmovups %xmm0, 94(%rdi)
>> +L(bwd_write_94bytes):
>> + vmovups 78(%rsi), %xmm0
>> + vmovups %xmm0, 78(%rdi)
>> +L(bwd_write_78bytes):
>> + vmovups 62(%rsi), %xmm0
>> + vmovups %xmm0, 62(%rdi)
>> +L(bwd_write_62bytes):
>> + vmovups 46(%rsi), %xmm0
>> + vmovups %xmm0, 46(%rdi)
>> +L(bwd_write_46bytes):
>> + vmovups 30(%rsi), %xmm0
>> + vmovups %xmm0, 30(%rdi)
>> +L(bwd_write_30bytes):
>> + vmovups 14(%rsi), %xmm0
>> + vmovups %xmm0, 14(%rdi)
>> +L(bwd_write_14bytes):
>> + mov (%rsi), %rcx
>> + mov 6(%rsi), %rdx
>> + mov %rcx, (%rdi)
>> + mov %rdx, 6(%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_157bytes):
>> + vmovups 141(%rsi), %xmm0
>> + vmovups %xmm0, 141(%rdi)
>> +L(bwd_write_141bytes):
>> + vmovups 125(%rsi), %xmm0
>> + vmovups %xmm0, 125(%rdi)
>> +L(bwd_write_125bytes):
>> + vmovups 109(%rsi), %xmm0
>> + vmovups %xmm0, 109(%rdi)
>> +L(bwd_write_109bytes):
>> + vmovups 93(%rsi), %xmm0
>> + vmovups %xmm0, 93(%rdi)
>> +L(bwd_write_93bytes):
>> + vmovups 77(%rsi), %xmm0
>> + vmovups %xmm0, 77(%rdi)
>> +L(bwd_write_77bytes):
>> + vmovups 61(%rsi), %xmm0
>> + vmovups %xmm0, 61(%rdi)
>> +L(bwd_write_61bytes):
>> + vmovups 45(%rsi), %xmm0
>> + vmovups %xmm0, 45(%rdi)
>> +L(bwd_write_45bytes):
>> + vmovups 29(%rsi), %xmm0
>> + vmovups %xmm0, 29(%rdi)
>> +L(bwd_write_29bytes):
>> + vmovups 13(%rsi), %xmm0
>> + vmovups %xmm0, 13(%rdi)
>> +L(bwd_write_13bytes):
>> + mov (%rsi), %rcx
>> + mov 5(%rsi), %rdx
>> + mov %rcx, (%rdi)
>> + mov %rdx, 5(%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_156bytes):
>> + vmovups 140(%rsi), %xmm0
>> + vmovups %xmm0, 140(%rdi)
>> +L(bwd_write_140bytes):
>> + vmovups 124(%rsi), %xmm0
>> + vmovups %xmm0, 124(%rdi)
>> +L(bwd_write_124bytes):
>> + vmovups 108(%rsi), %xmm0
>> + vmovups %xmm0, 108(%rdi)
>> +L(bwd_write_108bytes):
>> + vmovups 92(%rsi), %xmm0
>> + vmovups %xmm0, 92(%rdi)
>> +L(bwd_write_92bytes):
>> + vmovups 76(%rsi), %xmm0
>> + vmovups %xmm0, 76(%rdi)
>> +L(bwd_write_76bytes):
>> + vmovups 60(%rsi), %xmm0
>> + vmovups %xmm0, 60(%rdi)
>> +L(bwd_write_60bytes):
>> + vmovups 44(%rsi), %xmm0
>> + vmovups %xmm0, 44(%rdi)
>> +L(bwd_write_44bytes):
>> + vmovups 28(%rsi), %xmm0
>> + vmovups %xmm0, 28(%rdi)
>> +L(bwd_write_28bytes):
>> + vmovups 12(%rsi), %xmm0
>> + vmovups %xmm0, 12(%rdi)
>> +L(bwd_write_12bytes):
>> + mov 4(%rsi), %rdx
>> + mov %rdx, 4(%rdi)
>> + mov (%rsi), %edx
>> + mov %edx, (%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_155bytes):
>> + vmovups 139(%rsi), %xmm0
>> + vmovups %xmm0, 139(%rdi)
>> +L(bwd_write_139bytes):
>> + vmovups 123(%rsi), %xmm0
>> + vmovups %xmm0, 123(%rdi)
>> +L(bwd_write_123bytes):
>> + vmovups 107(%rsi), %xmm0
>> + vmovups %xmm0, 107(%rdi)
>> +L(bwd_write_107bytes):
>> + vmovups 91(%rsi), %xmm0
>> + vmovups %xmm0, 91(%rdi)
>> +L(bwd_write_91bytes):
>> + vmovups 75(%rsi), %xmm0
>> + vmovups %xmm0, 75(%rdi)
>> +L(bwd_write_75bytes):
>> + vmovups 59(%rsi), %xmm0
>> + vmovups %xmm0, 59(%rdi)
>> +L(bwd_write_59bytes):
>> + vmovups 43(%rsi), %xmm0
>> + vmovups %xmm0, 43(%rdi)
>> +L(bwd_write_43bytes):
>> + vmovups 27(%rsi), %xmm0
>> + vmovups %xmm0, 27(%rdi)
>> +L(bwd_write_27bytes):
>> + vmovups 11(%rsi), %xmm0
>> + vmovups %xmm0, 11(%rdi)
>> +L(bwd_write_11bytes):
>> + mov (%rsi), %rcx
>> + mov 7(%rsi), %edx
>> + mov %rcx, (%rdi)
>> + mov %edx, 7(%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_154bytes):
>> + vmovups 138(%rsi), %xmm0
>> + vmovups %xmm0, 138(%rdi)
>> +L(bwd_write_138bytes):
>> + vmovups 122(%rsi), %xmm0
>> + vmovups %xmm0, 122(%rdi)
>> +L(bwd_write_122bytes):
>> + vmovups 106(%rsi), %xmm0
>> + vmovups %xmm0, 106(%rdi)
>> +L(bwd_write_106bytes):
>> + vmovups 90(%rsi), %xmm0
>> + vmovups %xmm0, 90(%rdi)
>> +L(bwd_write_90bytes):
>> + vmovups 74(%rsi), %xmm0
>> + vmovups %xmm0, 74(%rdi)
>> +L(bwd_write_74bytes):
>> + vmovups 58(%rsi), %xmm0
>> + vmovups %xmm0, 58(%rdi)
>> +L(bwd_write_58bytes):
>> + vmovups 42(%rsi), %xmm0
>> + vmovups %xmm0, 42(%rdi)
>> +L(bwd_write_42bytes):
>> + vmovups 26(%rsi), %xmm0
>> + vmovups %xmm0, 26(%rdi)
>> +L(bwd_write_26bytes):
>> + vmovups 10(%rsi), %xmm0
>> + vmovups %xmm0, 10(%rdi)
>> +L(bwd_write_10bytes):
>> + mov (%rsi), %rcx
>> + mov 6(%rsi), %edx
>> + mov %rcx, (%rdi)
>> + mov %edx, 6(%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_153bytes):
>> + vmovups 137(%rsi), %xmm0
>> + vmovups %xmm0, 137(%rdi)
>> +L(bwd_write_137bytes):
>> + vmovups 121(%rsi), %xmm0
>> + vmovups %xmm0, 121(%rdi)
>> +L(bwd_write_121bytes):
>> + vmovups 105(%rsi), %xmm0
>> + vmovups %xmm0, 105(%rdi)
>> +L(bwd_write_105bytes):
>> + vmovups 89(%rsi), %xmm0
>> + vmovups %xmm0, 89(%rdi)
>> +L(bwd_write_89bytes):
>> + vmovups 73(%rsi), %xmm0
>> + vmovups %xmm0, 73(%rdi)
>> +L(bwd_write_73bytes):
>> + vmovups 57(%rsi), %xmm0
>> + vmovups %xmm0, 57(%rdi)
>> +L(bwd_write_57bytes):
>> + vmovups 41(%rsi), %xmm0
>> + vmovups %xmm0, 41(%rdi)
>> +L(bwd_write_41bytes):
>> + vmovups 25(%rsi), %xmm0
>> + vmovups %xmm0, 25(%rdi)
>> +L(bwd_write_25bytes):
>> + vmovups 9(%rsi), %xmm0
>> + vmovups %xmm0, 9(%rdi)
>> +L(bwd_write_9bytes):
>> + mov (%rsi), %rcx
>> + mov 5(%rsi), %edx
>> + mov %rcx, (%rdi)
>> + mov %edx, 5(%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_152bytes):
>> + vmovups 136(%rsi), %xmm0
>> + vmovups %xmm0, 136(%rdi)
>> +L(bwd_write_136bytes):
>> + vmovups 120(%rsi), %xmm0
>> + vmovups %xmm0, 120(%rdi)
>> +L(bwd_write_120bytes):
>> + vmovups 104(%rsi), %xmm0
>> + vmovups %xmm0, 104(%rdi)
>> +L(bwd_write_104bytes):
>> + vmovups 88(%rsi), %xmm0
>> + vmovups %xmm0, 88(%rdi)
>> +L(bwd_write_88bytes):
>> + vmovups 72(%rsi), %xmm0
>> + vmovups %xmm0, 72(%rdi)
>> +L(bwd_write_72bytes):
>> + vmovups 56(%rsi), %xmm0
>> + vmovups %xmm0, 56(%rdi)
>> +L(bwd_write_56bytes):
>> + vmovups 40(%rsi), %xmm0
>> + vmovups %xmm0, 40(%rdi)
>> +L(bwd_write_40bytes):
>> + vmovups 24(%rsi), %xmm0
>> + vmovups %xmm0, 24(%rdi)
>> +L(bwd_write_24bytes):
>> + vmovups 8(%rsi), %xmm0
>> + vmovups %xmm0, 8(%rdi)
>> +L(bwd_write_8bytes):
>> + mov (%rsi), %rdx
>> + mov %rdx, (%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_151bytes):
>> + vmovups 135(%rsi), %xmm0
>> + vmovups %xmm0, 135(%rdi)
>> +L(bwd_write_135bytes):
>> + vmovups 119(%rsi), %xmm0
>> + vmovups %xmm0, 119(%rdi)
>> +L(bwd_write_119bytes):
>> + vmovups 103(%rsi), %xmm0
>> + vmovups %xmm0, 103(%rdi)
>> +L(bwd_write_103bytes):
>> + vmovups 87(%rsi), %xmm0
>> + vmovups %xmm0, 87(%rdi)
>> +L(bwd_write_87bytes):
>> + vmovups 71(%rsi), %xmm0
>> + vmovups %xmm0, 71(%rdi)
>> +L(bwd_write_71bytes):
>> + vmovups 55(%rsi), %xmm0
>> + vmovups %xmm0, 55(%rdi)
>> +L(bwd_write_55bytes):
>> + vmovups 39(%rsi), %xmm0
>> + vmovups %xmm0, 39(%rdi)
>> +L(bwd_write_39bytes):
>> + vmovups 23(%rsi), %xmm0
>> + vmovups %xmm0, 23(%rdi)
>> +L(bwd_write_23bytes):
>> + vmovups 7(%rsi), %xmm0
>> + vmovups %xmm0, 7(%rdi)
>> +L(bwd_write_7bytes):
>> + mov (%rsi), %ecx
>> + mov 3(%rsi), %edx
>> + mov %ecx, (%rdi)
>> + mov %edx, 3(%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_150bytes):
>> + vmovups 134(%rsi), %xmm0
>> + vmovups %xmm0, 134(%rdi)
>> +L(bwd_write_134bytes):
>> + vmovups 118(%rsi), %xmm0
>> + vmovups %xmm0, 118(%rdi)
>> +L(bwd_write_118bytes):
>> + vmovups 102(%rsi), %xmm0
>> + vmovups %xmm0, 102(%rdi)
>> +L(bwd_write_102bytes):
>> + vmovups 86(%rsi), %xmm0
>> + vmovups %xmm0, 86(%rdi)
>> +L(bwd_write_86bytes):
>> + vmovups 70(%rsi), %xmm0
>> + vmovups %xmm0, 70(%rdi)
>> +L(bwd_write_70bytes):
>> + vmovups 54(%rsi), %xmm0
>> + vmovups %xmm0, 54(%rdi)
>> +L(bwd_write_54bytes):
>> + vmovups 38(%rsi), %xmm0
>> + vmovups %xmm0, 38(%rdi)
>> +L(bwd_write_38bytes):
>> + vmovups 22(%rsi), %xmm0
>> + vmovups %xmm0, 22(%rdi)
>> +L(bwd_write_22bytes):
>> + vmovups 6(%rsi), %xmm0
>> + vmovups %xmm0, 6(%rdi)
>> +L(bwd_write_6bytes):
>> + mov (%rsi), %ecx
>> + mov 2(%rsi), %edx
>> + mov %ecx, (%rdi)
>> + mov %edx, 2(%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_149bytes):
>> + vmovups 133(%rsi), %xmm0
>> + vmovups %xmm0, 133(%rdi)
>> +L(bwd_write_133bytes):
>> + vmovups 117(%rsi), %xmm0
>> + vmovups %xmm0, 117(%rdi)
>> +L(bwd_write_117bytes):
>> + vmovups 101(%rsi), %xmm0
>> + vmovups %xmm0, 101(%rdi)
>> +L(bwd_write_101bytes):
>> + vmovups 85(%rsi), %xmm0
>> + vmovups %xmm0, 85(%rdi)
>> +L(bwd_write_85bytes):
>> + vmovups 69(%rsi), %xmm0
>> + vmovups %xmm0, 69(%rdi)
>> +L(bwd_write_69bytes):
>> + vmovups 53(%rsi), %xmm0
>> + vmovups %xmm0, 53(%rdi)
>> +L(bwd_write_53bytes):
>> + vmovups 37(%rsi), %xmm0
>> + vmovups %xmm0, 37(%rdi)
>> +L(bwd_write_37bytes):
>> + vmovups 21(%rsi), %xmm0
>> + vmovups %xmm0, 21(%rdi)
>> +L(bwd_write_21bytes):
>> + vmovups 5(%rsi), %xmm0
>> + vmovups %xmm0, 5(%rdi)
>> +L(bwd_write_5bytes):
>> + mov (%rsi), %ecx
>> + mov 1(%rsi), %edx
>> + mov %ecx, (%rdi)
>> + mov %edx, 1(%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_148bytes):
>> + vmovups 132(%rsi), %xmm0
>> + vmovups %xmm0, 132(%rdi)
>> +L(bwd_write_132bytes):
>> + vmovups 116(%rsi), %xmm0
>> + vmovups %xmm0, 116(%rdi)
>> +L(bwd_write_116bytes):
>> + vmovups 100(%rsi), %xmm0
>> + vmovups %xmm0, 100(%rdi)
>> +L(bwd_write_100bytes):
>> + vmovups 84(%rsi), %xmm0
>> + vmovups %xmm0, 84(%rdi)
>> +L(bwd_write_84bytes):
>> + vmovups 68(%rsi), %xmm0
>> + vmovups %xmm0, 68(%rdi)
>> +L(bwd_write_68bytes):
>> + vmovups 52(%rsi), %xmm0
>> + vmovups %xmm0, 52(%rdi)
>> +L(bwd_write_52bytes):
>> + vmovups 36(%rsi), %xmm0
>> + vmovups %xmm0, 36(%rdi)
>> +L(bwd_write_36bytes):
>> + vmovups 20(%rsi), %xmm0
>> + vmovups %xmm0, 20(%rdi)
>> +L(bwd_write_20bytes):
>> + vmovups 4(%rsi), %xmm0
>> + vmovups %xmm0, 4(%rdi)
>> +L(bwd_write_4bytes):
>> + mov (%rsi), %edx
>> + mov %edx, (%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_147bytes):
>> + vmovups 131(%rsi), %xmm0
>> + vmovups %xmm0, 131(%rdi)
>> +L(bwd_write_131bytes):
>> + vmovups 115(%rsi), %xmm0
>> + vmovups %xmm0, 115(%rdi)
>> +L(bwd_write_115bytes):
>> + vmovups 99(%rsi), %xmm0
>> + vmovups %xmm0, 99(%rdi)
>> +L(bwd_write_99bytes):
>> + vmovups 83(%rsi), %xmm0
>> + vmovups %xmm0, 83(%rdi)
>> +L(bwd_write_83bytes):
>> + vmovups 67(%rsi), %xmm0
>> + vmovups %xmm0, 67(%rdi)
>> +L(bwd_write_67bytes):
>> + vmovups 51(%rsi), %xmm0
>> + vmovups %xmm0, 51(%rdi)
>> +L(bwd_write_51bytes):
>> + vmovups 35(%rsi), %xmm0
>> + vmovups %xmm0, 35(%rdi)
>> +L(bwd_write_35bytes):
>> + vmovups 19(%rsi), %xmm0
>> + vmovups %xmm0, 19(%rdi)
>> +L(bwd_write_19bytes):
>> + vmovups 3(%rsi), %xmm0
>> + vmovups %xmm0, 3(%rdi)
>> +L(bwd_write_3bytes):
>> + movzwl 1(%rsi), %edx
>> + mov %dx, 1(%rdi)
>> + movzbl (%rsi), %edx
>> + mov %dl, (%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_146bytes):
>> + vmovups 130(%rsi), %xmm0
>> + vmovups %xmm0, 130(%rdi)
>> +L(bwd_write_130bytes):
>> + vmovups 114(%rsi), %xmm0
>> + vmovups %xmm0, 114(%rdi)
>> +L(bwd_write_114bytes):
>> + vmovups 98(%rsi), %xmm0
>> + vmovups %xmm0, 98(%rdi)
>> +L(bwd_write_98bytes):
>> + vmovups 82(%rsi), %xmm0
>> + vmovups %xmm0, 82(%rdi)
>> +L(bwd_write_82bytes):
>> + vmovups 66(%rsi), %xmm0
>> + vmovups %xmm0, 66(%rdi)
>> +L(bwd_write_66bytes):
>> + vmovups 50(%rsi), %xmm0
>> + vmovups %xmm0, 50(%rdi)
>> +L(bwd_write_50bytes):
>> + vmovups 34(%rsi), %xmm0
>> + vmovups %xmm0, 34(%rdi)
>> +L(bwd_write_34bytes):
>> + vmovups 18(%rsi), %xmm0
>> + vmovups %xmm0, 18(%rdi)
>> +L(bwd_write_18bytes):
>> + vmovups 2(%rsi), %xmm0
>> + vmovups %xmm0, 2(%rdi)
>> +L(bwd_write_2bytes):
>> + movzwl (%rsi), %edx
>> + mov %dx, (%rdi)
>> + ret
>> +
>> + ALIGN(4)
>> +L(bwd_write_145bytes):
>> + vmovups 129(%rsi), %xmm0
>> + vmovups %xmm0, 129(%rdi)
>> +L(bwd_write_129bytes):
>> + vmovups 113(%rsi), %xmm0
>> + vmovups %xmm0, 113(%rdi)
>> +L(bwd_write_113bytes):
>> + vmovups 97(%rsi), %xmm0
>> + vmovups %xmm0, 97(%rdi)
>> +L(bwd_write_97bytes):
>> + vmovups 81(%rsi), %xmm0
>> + vmovups %xmm0, 81(%rdi)
>> +L(bwd_write_81bytes):
>> + vmovups 65(%rsi), %xmm0
>> + vmovups %xmm0, 65(%rdi)
>> +L(bwd_write_65bytes):
>> + vmovups 49(%rsi), %xmm0
>> + vmovups %xmm0, 49(%rdi)
>> +L(bwd_write_49bytes):
>> + vmovups 33(%rsi), %xmm0
>> + vmovups %xmm0, 33(%rdi)
>> +L(bwd_write_33bytes):
>> + vmovups 17(%rsi), %xmm0
>> + vmovups %xmm0, 17(%rdi)
>> +L(bwd_write_17bytes):
>> + vmovups 1(%rsi), %xmm0
>> + vmovups %xmm0, 1(%rdi)
>> +L(bwd_write_1bytes):
>> + movzbl (%rsi), %edx
>> + mov %dl, (%rdi)
>> + ret
>> +
>> +END (MEMCPY)
>> +
>> + .section .rodata.avx2,"a",@progbits
>> + ALIGN (3)
>> +L(table_159_bytes_bwd):
>> + .long JMPTBL (L(bwd_write_0bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_1bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_2bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_3bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_4bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_5bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_6bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_7bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_8bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_9bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_10bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_11bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_12bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_13bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_14bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_15bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_16bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_17bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_18bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_19bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_20bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_21bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_22bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_23bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_24bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_25bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_26bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_27bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_28bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_29bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_30bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_31bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_32bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_33bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_34bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_35bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_36bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_37bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_38bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_39bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_40bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_41bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_42bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_43bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_44bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_45bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_46bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_47bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_48bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_49bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_50bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_51bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_52bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_53bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_54bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_55bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_56bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_57bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_58bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_59bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_60bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_61bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_62bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_63bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_64bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_65bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_66bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_67bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_68bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_69bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_70bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_71bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_72bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_73bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_74bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_75bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_76bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_77bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_78bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_79bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_80bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_81bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_82bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_83bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_84bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_85bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_86bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_87bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_88bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_89bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_90bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_91bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_92bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_93bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_94bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_95bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_96bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_97bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_98bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_99bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_100bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_101bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_102bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_103bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_104bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_105bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_106bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_107bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_108bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_109bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_110bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_111bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_112bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_113bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_114bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_115bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_116bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_117bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_118bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_119bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_120bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_121bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_122bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_123bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_124bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_125bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_126bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_127bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_128bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_129bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_130bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_131bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_132bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_133bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_134bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_135bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_136bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_137bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_138bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_139bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_140bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_141bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_142bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_143bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_144bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_145bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_146bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_147bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_148bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_149bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_150bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_151bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_152bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_153bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_154bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_155bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_156bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_157bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_158bytes), L(table_159_bytes_bwd))
>> + .long JMPTBL (L(bwd_write_159bytes), L(table_159_bytes_bwd))
>> +
>> + ALIGN (4)
>> +L(table_159_bytes_fwd):
>> + .long JMPTBL (L(fwd_write_0bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_1bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_2bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_3bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_4bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_5bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_6bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_7bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_8bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_9bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_10bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_11bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_12bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_13bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_14bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_15bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_16bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_17bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_18bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_19bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_20bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_21bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_22bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_23bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_24bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_25bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_26bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_27bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_28bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_29bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_30bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_31bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_32bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_33bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_34bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_35bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_36bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_37bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_38bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_39bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_40bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_41bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_42bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_43bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_44bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_45bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_46bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_47bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_48bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_49bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_50bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_51bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_52bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_53bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_54bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_55bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_56bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_57bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_58bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_59bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_60bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_61bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_62bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_63bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_64bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_65bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_66bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_67bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_68bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_69bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_70bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_71bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_72bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_73bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_74bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_75bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_76bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_77bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_78bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_79bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_80bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_81bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_82bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_83bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_84bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_85bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_86bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_87bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_88bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_89bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_90bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_91bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_92bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_93bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_94bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_95bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_96bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_97bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_98bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_99bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_100bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_101bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_102bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_103bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_104bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_105bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_106bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_107bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_108bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_109bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_110bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_111bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_112bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_113bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_114bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_115bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_116bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_117bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_118bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_119bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_120bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_121bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_122bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_123bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_124bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_125bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_126bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_127bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_128bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_129bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_130bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_131bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_132bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_133bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_134bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_135bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_136bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_137bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_138bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_139bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_140bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_141bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_142bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_143bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_144bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_145bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_146bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_147bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_148bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_149bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_150bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_151bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_152bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_153bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_154bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_155bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_156bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_157bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_158bytes), L(table_159_bytes_fwd))
>> + .long JMPTBL (L(fwd_write_159bytes), L(table_159_bytes_fwd))
>> +
>> + ALIGN (4)
>> +L(shl_table_fwd):
>> + .long JMPTBL (L(shl_0), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_1), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_2), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_3), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_4), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_5), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_6), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_7), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_8), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_9), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_10), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_11), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_12), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_13), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_14), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_15), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_16), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_17), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_18), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_19), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_20), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_21), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_22), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_23), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_24), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_25), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_26), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_27), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_28), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_29), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_30), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_31), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_32), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_33), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_34), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_35), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_36), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_37), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_38), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_39), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_40), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_41), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_42), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_43), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_44), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_45), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_46), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_47), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_48), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_49), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_50), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_51), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_52), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_53), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_54), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_55), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_56), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_57), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_58), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_59), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_60), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_61), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_62), L(shl_table_fwd))
>> + .long JMPTBL (L(shl_63), L(shl_table_fwd))
>> +
>> + ALIGN (4)
>> +L(shl_table_bwd):
>> + .long JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_16_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_17_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_18_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_19_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_20_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_21_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_22_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_23_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_24_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_25_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_26_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_27_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_28_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_29_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_30_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_31_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_32_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_33_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_34_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_35_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_36_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_37_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_38_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_39_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_40_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_41_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_42_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_43_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_44_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_45_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_46_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_47_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_48_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_49_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_50_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_51_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_52_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_53_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_54_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_55_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_56_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_57_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_58_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_59_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_60_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_61_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_62_bwd), L(shl_table_bwd))
>> + .long JMPTBL (L(shl_63_bwd), L(shl_table_bwd))
>> +#endif
>> diff --git a/sysdeps/x86_64/multiarch/memcpy.S
>> b/sysdeps/x86_64/multiarch/memcpy.S
>> index c7a193f..ce78236 100644
>> --- a/sysdeps/x86_64/multiarch/memcpy.S
>> +++ b/sysdeps/x86_64/multiarch/memcpy.S
>> @@ -38,7 +38,10 @@ ENTRY(__new_memcpy)
>> leaq __memcpy_ssse3(%rip), %rax
>> testl $bit_Fast_Copy_Backward,
>> __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>> jz 2f
>> - leaq __memcpy_ssse3_back(%rip), %rax
>> + leaq __memcpy_ssse3_back(%rip), %rax
>> + testl $bit_AVX2, __cpu_features+CPUID_OFFSET+index_AVX2(%rip)
>> + jz 2f
>> + leaq __memcpy_avx2(%rip), %rax
>> 2: ret
>> END(__new_memcpy)
>>
>> diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S
>> b/sysdeps/x86_64/multiarch/memcpy_chk.S
>> index 2283cf6..3bf0497 100644
>> --- a/sysdeps/x86_64/multiarch/memcpy_chk.S
>> +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
>> @@ -39,6 +39,9 @@ ENTRY(__memcpy_chk)
>> testl $bit_Fast_Copy_Backward,
>> __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>> jz 2f
>> leaq __memcpy_chk_ssse3_back(%rip), %rax
>> + testl $bit_AVX2, __cpu_features+CPUID_OFFSET+index_AVX2(%rip)
>> + jz 2f
>> + leaq __memcpy_chk_avx2(%rip), %rax
>> 2: ret
>> END(__memcpy_chk)
>> # else
>> diff --git a/sysdeps/x86_64/multiarch/memmove-avx2.S
>> b/sysdeps/x86_64/multiarch/memmove-avx2.S
>> new file mode 100644
>> index 0000000..33bbbf0
>> --- /dev/null
>> +++ b/sysdeps/x86_64/multiarch/memmove-avx2.S
>> @@ -0,0 +1,4 @@
>> +#define USE_AS_MEMMOVE
>> +#define MEMCPY __memmove_avx2
>> +#define MEMCPY_CHK __memmove_chk_avx2
>> +#include "memcpy-avx2.S"
>> diff --git a/sysdeps/x86_64/multiarch/memmove.c
>> b/sysdeps/x86_64/multiarch/memmove.c
>> index af870d4..8bc9dc8 100644
>> --- a/sysdeps/x86_64/multiarch/memmove.c
>> +++ b/sysdeps/x86_64/multiarch/memmove.c
>> @@ -35,6 +35,8 @@
>> extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
>> extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
>> extern __typeof (__redirect_memmove) __memmove_ssse3_back
>> attribute_hidden;
>> +extern __typeof (__redirect_memmove) __memmove_avx2 attribute_hidden;
>> +
>> #endif
>>
>> #include "string/memmove.c"
>> @@ -47,10 +49,11 @@ extern __typeof (__redirect_memmove)
>> __memmove_ssse3_back attribute_hidden;
>> ifunc symbol properly. */
>> extern __typeof (__redirect_memmove) __libc_memmove;
>> libc_ifunc (__libc_memmove,
>> - HAS_SSSE3
>> + HAS_AVX2 ? __memmove_avx2 :
>> + (HAS_SSSE3
>> ? (HAS_FAST_COPY_BACKWARD
>> ? __memmove_ssse3_back : __memmove_ssse3)
>> - : __memmove_sse2)
>> + : __memmove_sse2));
>>
>> strong_alias (__libc_memmove, memmove)
>>
>> diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c
>> b/sysdeps/x86_64/multiarch/memmove_chk.c
>> index da8160d..a64811f 100644
>> --- a/sysdeps/x86_64/multiarch/memmove_chk.c
>> +++ b/sysdeps/x86_64/multiarch/memmove_chk.c
>> @@ -25,11 +25,13 @@
>> extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
>> extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
>> extern __typeof (__memmove_chk) __memmove_chk_ssse3_back
>> attribute_hidden;
>> +extern __typeof (__memmove_chk) __memmove_chk_avx2 attribute_hidden;
>>
>> #include "debug/memmove_chk.c"
>>
>> libc_ifunc (__memmove_chk,
>> - HAS_SSSE3
>> + HAS_AVX2 ? __memmove_chk_avx2 :
>> + (HAS_SSSE3
>> ? (HAS_FAST_COPY_BACKWARD
>> ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
>> - : __memmove_chk_sse2);
>> + : __memmove_chk_sse2));
>> diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx2.S
>> b/sysdeps/x86_64/multiarch/mempcpy-avx2.S
>> new file mode 100644
>> index 0000000..f1f8017
>> --- /dev/null
>> +++ b/sysdeps/x86_64/multiarch/mempcpy-avx2.S
>> @@ -0,0 +1,4 @@
>> +#define USE_AS_MEMPCPY
>> +#define MEMCPY __mempcpy_avx2
>> +#define MEMCPY_CHK __mempcpy_chk_avx2
>> +#include "memcpy-avx2.S"
>> diff --git a/sysdeps/x86_64/multiarch/mempcpy.S
>> b/sysdeps/x86_64/multiarch/mempcpy.S
>> index b5a5d6d..40b77fe 100644
>> --- a/sysdeps/x86_64/multiarch/mempcpy.S
>> +++ b/sysdeps/x86_64/multiarch/mempcpy.S
>> @@ -37,6 +37,9 @@ ENTRY(__mempcpy)
>> testl $bit_Fast_Copy_Backward,
>> __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>> jz 2f
>> leaq __mempcpy_ssse3_back(%rip), %rax
>> + testl $bit_AVX2, __cpu_features+CPUID_OFFSET+index_AVX2(%rip)
>> + jz 2f
>> + leaq __mempcpy_avx2(%rip), %rax
>> 2: ret
>> END(__mempcpy)
>>
>> diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S
>> b/sysdeps/x86_64/multiarch/mempcpy_chk.S
>> index a3d3a59..04050b5 100644
>> --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
>> +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
>> @@ -39,6 +39,9 @@ ENTRY(__mempcpy_chk)
>> testl $bit_Fast_Copy_Backward,
>> __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>> jz 2f
>> leaq __mempcpy_chk_ssse3_back(%rip), %rax
>> + testl $bit_AVX2, __cpu_features+CPUID_OFFSET+index_AVX2(%rip)
>> + jz 2f
>> + leaq __mempcpy_chk_avx2(%rip), %rax
>> 2: ret
>> END(__mempcpy_chk)
>> # else
>> --
>> 1.8.1.4
>>
>
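
A note for readers skimming the quoted tail-handling code: each fwd_write_Nbytes / bwd_write_Nbytes label copies 16 bytes with an unaligned vmovups pair and falls through to the label 16 bytes smaller, and the JMPTBL tables at the end store each label's address as a 32-bit offset relative to the table base, so the remaining length becomes a single computed jump into the unrolled chain. Lengths below 16 are finished with a pair of possibly overlapping scalar moves instead of a byte loop. The following is a rough, hypothetical C analog of one forward chain, not glibc code: dst_end/src_end point one byte past the end of the region, mirroring the negative offsets such as -73(%rsi), and the exact split chosen for small remainders varies slightly per length in the patch.

    #include <stddef.h>
    #include <string.h>

    static void
    copy_tail_fwd (unsigned char *dst_end, const unsigned char *src_end,
                   size_t n)
    {
      /* One iteration per fall-through label; the assembly jumps straight
         to the right unrolled position instead of looping.  */
      for (; n >= 16; n -= 16)
        memcpy (dst_end - n, src_end - n, 16);   /* vmovups load/store pair */
      if (n >= 8)
        {
          /* 8..15 bytes left: two 8-byte moves that may overlap in the
             middle; rewriting a few bytes beats extra branching.  */
          memcpy (dst_end - n, src_end - n, 8);
          memcpy (dst_end - 8, src_end - 8, 8);
        }
      else if (n >= 4)
        {
          memcpy (dst_end - n, src_end - n, 4);
          memcpy (dst_end - 4, src_end - 4, 4);
        }
      else if (n >= 2)
        {
          memcpy (dst_end - n, src_end - n, 2);
          memcpy (dst_end - 2, src_end - 2, 2);
        }
      else if (n == 1)
        dst_end[-1] = src_end[-1];
    }

The 160 table entries exist precisely so that every residual length from 0 to 159 dispatches in one jump with no further branching.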
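On the dispatch side, the memcpy.S and mempcpy.S hunks append one more test/leaq pair to the existing resolver, so __memcpy_avx2 is only reached after the SSSE3 test (which precedes the quoted hunk context) and the Fast_Copy_Backward test have both passed, whereas the C-side memmove.c and memmove_chk.c hunks check HAS_AVX2 first. A minimal C sketch of the chain the assembly resolver encodes, with stand-in feature-flag parameters for the __cpu_features bits:

    #include <stddef.h>

    /* Declarations mirror the variant names in the patch; in glibc the
       real selection happens inside the IFUNC resolver shown above.  */
    extern void *__memcpy_sse2 (void *, const void *, size_t);
    extern void *__memcpy_ssse3 (void *, const void *, size_t);
    extern void *__memcpy_ssse3_back (void *, const void *, size_t);
    extern void *__memcpy_avx2 (void *, const void *, size_t);

    typedef void *(*memcpy_fn) (void *, const void *, size_t);

    /* Each 'return fn' corresponds to a 'jz 2f'; each assignment to fn
       corresponds to a 'leaq variant(%rip), %rax'.  */
    static memcpy_fn
    select_memcpy (int has_ssse3, int has_fast_copy_backward, int has_avx2)
    {
      memcpy_fn fn = __memcpy_sse2;
      if (!has_ssse3)
        return fn;
      fn = __memcpy_ssse3;
      if (!has_fast_copy_backward)
        return fn;
      fn = __memcpy_ssse3_back;
      if (!has_avx2)
        return fn;
      return __memcpy_avx2;
    }

As written, an AVX2 machine without Fast_Copy_Backward set would keep __memcpy_ssse3 while memmove would still pick __memmove_avx2; whether that asymmetry is intended is worth stating explicitly in the patch.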
Attachment: gcc-test-memcpy-output
Description: Text document