This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH RFC V4] Improve 64bit memcpy/memmove for Corei7 with unaligned AVX instructions
- From: ling dot ma dot program at gmail dot com
- To: libc-alpha at sourceware dot org
- Cc: aj at suse dot com, neleai at seznam dot cz, liubov dot dmitrieva at gmail dot com, Ma Ling <ling dot ml at alibaba-inc dot com>
- Date: Mon, 29 Jul 2013 05:40:55 -0400
- Subject: [PATCH RFC V4] Improve 64bit memcpy/memmove for Corei7 with unaligned AVX instructions
From: Ma Ling <ling.ml@alibaba-inc.com>
We manage to avoid branch instructions and force the destination to be aligned
using AVX instructions. We then modified gcc.403 so that only the memcpy function is measured;
the gcc.403 benchmarks indicate this version improves performance by 4% to 14%
compared with memcpy_sse2_unaligned on a Haswell machine.
case avx_unaligned sse2_unaligned AVX vs SSE2
200i 146833745 168384142 1.146767332
g23 1431207341 1557405243 1.088175835
166i 350901531 379068674 1.08027079
cp-decl 370750774 395890196 1.067806796
c-type 763780824 810806468 1.061569553
expr2 986698539 1067232192 1.081619309
expr 727016829 758953883 1.043928906
s04 1117900758 1185159528 1.060165242
scilab 63309111 66893431 1.05661618
(We will send test patch on memcpy for above cases)
Thanks
Ling
---
In this version our patch is based on commit id 641aa7b45991b6564a8fa825c681ad6ad1c7721f,
so the compared results differ from those of previous versions.
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +
sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S | 393 +++++++++++++++++++++++
sysdeps/x86_64/multiarch/memmove-avx-unaligned.S | 4 +
sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S | 4 +
5 files changed, 414 insertions(+)
create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 203d16e..f622429 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -9,6 +9,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+ memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 28d3579..449f75b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -46,6 +46,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
IFUNC_IMPL (i, name, __memmove_chk,
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+ __memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
@@ -55,6 +57,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memmove.S. */
IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+ __memmove_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
@@ -215,6 +219,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+ __memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
@@ -224,6 +230,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memcpy.S. */
IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
+ __memcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
@@ -232,6 +240,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
IFUNC_IMPL (i, name, __mempcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+ __mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
@@ -241,6 +251,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/mempcpy.S. */
IFUNC_IMPL (i, name, mempcpy,
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+ __mempcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
new file mode 100644
index 0000000..005cfb7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -0,0 +1,393 @@
+/* memcpy with AVX
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+#ifndef MEMCPY
+# define MEMCPY __memcpy_avx_unaligned
+# define MEMCPY_CHK __memcpy_chk_avx_unaligned
+#endif
+
+ .section .text.avx,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	cmpq %rdx, %rcx	# rcx = destination object size (fortify convention — confirm); len in rdx
+	jb HIDDEN_JUMPTARGET (__chk_fail)	# size < len: buffer overflow; else fall through to MEMCPY
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+	mov %rdi, %rax	# return value: dst (memcpy/memmove convention)
+
+#ifdef USE_AS_MEMPCPY
+	add %rdx, %rax	# mempcpy returns dst + len instead
+#endif
+
+	lea (%rsi, %rdx), %r8	# r8 = src + len (one past end of source)
+	lea (%rdi, %rdx), %r9	# r9 = dst + len (one past end of destination)
+	cmp $256, %rdx
+	ja L(256bytesormore)	# large copies: loop paths below
+	cmp $128, %edx
+	jb L(less_128bytes)
+	vmovups (%rsi), %xmm0	# 128..256 bytes: read the whole head (128B) and tail (128B) before any store, so src/dst overlap is safe
+	vmovups 0x10(%rsi), %xmm1
+	vmovups 0x20(%rsi), %xmm2
+	vmovups 0x30(%rsi), %xmm3
+	vmovups 0x40(%rsi), %xmm4
+	vmovups 0x50(%rsi), %xmm5
+	vmovups 0x60(%rsi), %xmm6
+	vmovups 0x70(%rsi), %xmm7
+	vmovups -0x80(%r8), %xmm8	# tail: last 128 bytes, addressed back from the end pointer
+	vmovups -0x70(%r8), %xmm9
+	vmovups -0x60(%r8), %xmm10
+	vmovups -0x50(%r8), %xmm11
+	vmovups -0x40(%r8), %xmm12
+	vmovups -0x30(%r8), %xmm13
+	vmovups -0x20(%r8), %xmm14
+	vmovups -0x10(%r8), %xmm15
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm1, 0x10(%rdi)
+	vmovups %xmm2, 0x20(%rdi)
+	vmovups %xmm3, 0x30(%rdi)
+	vmovups %xmm4, 0x40(%rdi)
+	vmovups %xmm5, 0x50(%rdi)
+	vmovups %xmm6, 0x60(%rdi)
+	vmovups %xmm7, 0x70(%rdi)
+	vmovups %xmm8, -0x80(%r9)	# head and tail ranges may overlap in the middle; tail stores win
+	vmovups %xmm9, -0x70(%r9)
+	vmovups %xmm10, -0x60(%r9)
+	vmovups %xmm11, -0x50(%r9)
+	vmovups %xmm12, -0x40(%r9)
+	vmovups %xmm13, -0x30(%r9)
+	vmovups %xmm14, -0x20(%r9)
+	vmovups %xmm15, -0x10(%r9)
+	ret
+	ALIGN(4)
+L(less_128bytes):
+	cmp $64, %edx
+	jb L(less_64bytes)
+	vmovups (%rsi), %xmm0	# 64..128 bytes: 4 regs from the head + 4 from the tail, same overlap trick
+	vmovups 0x10(%rsi), %xmm1
+	vmovups 0x20(%rsi), %xmm2
+	vmovups 0x30(%rsi), %xmm3
+	vmovups -0x40(%r8), %xmm4
+	vmovups -0x30(%r8), %xmm5
+	vmovups -0x20(%r8), %xmm6
+	vmovups -0x10(%r8), %xmm7
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm1, 0x10(%rdi)
+	vmovups %xmm2, 0x20(%rdi)
+	vmovups %xmm3, 0x30(%rdi)
+	vmovups %xmm4, -0x40(%r9)
+	vmovups %xmm5, -0x30(%r9)
+	vmovups %xmm6, -0x20(%r9)
+	vmovups %xmm7, -0x10(%r9)
+	ret
+	ALIGN(4)
+L(less_64bytes):
+	cmp $32, %edx
+	jb L(less_32bytes)
+	vmovups (%rsi), %xmm0	# 32..64 bytes
+	vmovups 0x10(%rsi), %xmm1
+	vmovups -0x20(%r8), %xmm6
+	vmovups -0x10(%r8), %xmm7
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm1, 0x10(%rdi)
+	vmovups %xmm6, -0x20(%r9)
+	vmovups %xmm7, -0x10(%r9)
+	ret
+	ALIGN(4)
+L(less_32bytes):
+	cmp $16, %edx
+	jb L(less_16bytes)
+	vmovups (%rsi), %xmm0	# 16..32 bytes: two possibly-overlapping 16B moves
+	vmovups -0x10(%r8), %xmm7
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm7, -0x10(%r9)
+	ret
+	ALIGN(4)
+L(less_16bytes):
+	cmp $8, %edx
+	jb L(less_8bytes)
+	movq (%rsi), %rcx	# 8..16 bytes: two possibly-overlapping qwords
+	movq -0x08(%r8), %r10
+	movq %rcx, (%rdi)
+	movq %r10, -0x08(%r9)
+	ret
+	ALIGN(4)
+L(less_8bytes):
+	cmp $4, %edx
+	jb L(less_4bytes)
+	mov (%rsi), %ecx	# 4..8 bytes; rdx is dead after this compare, reused as scratch
+	mov -0x04(%r8), %edx
+	mov %ecx, (%rdi)
+	mov %edx, -0x04(%r9)
+	ret
+	ALIGN(4)
+L(less_4bytes):
+	cmp $2, %edx
+	jb L(less_2bytes)
+	mov (%rsi), %cx	# 2..4 bytes
+	mov -0x02(%r8), %dx
+	mov %cx, (%rdi)
+	mov %dx, -0x02(%r9)
+	ret
+	ALIGN(4)
+L(less_2bytes):
+	cmp $1, %rdx	# rdx is 0 or 1 here
+	jb L(less_0bytes)
+	mov (%rsi), %cl	# copy the single byte
+	mov %cl, (%rdi)
+L(less_0bytes):
+	ret
+
+	ALIGN(4)
+L(256bytesormore):
+
+#ifdef USE_AS_MEMMOVE
+	cmp %rsi, %rdi
+	jae L(copy_backward)	# dst >= src: a forward copy could clobber unread source
+#endif
+	cmp $2048, %rdx
+	jae L(gobble_data_movsb)	# very large: rep movsb or non-temporal path
+
+	vmovups -0x80(%r8), %xmm8	# save the last 128 source bytes now, before len/pointers are adjusted
+	vmovups -0x70(%r8), %xmm9
+	vmovups -0x60(%r8), %xmm10
+	vmovups -0x50(%r8), %xmm11
+	vmovups -0x40(%r8), %xmm12
+	vmovups -0x30(%r8), %xmm13
+	vmovups -0x20(%r8), %xmm14
+	vmovups -0x10(%r8), %xmm15
+	vmovups (%rsi), %ymm4	# save the first 32 bytes: dst is about to be rounded up to 32
+	mov %rdi, %r10	# r10 = original dst (for the saved head store)
+	and $-32, %rdi
+	add $32, %rdi	# rdi = dst rounded up to the next 32-byte boundary
+	mov %rdi, %r11
+	sub %r10, %r11	# r11 = head bytes skipped by the alignment (1..32)
+	sub %r11, %rdx
+	add %r11, %rsi	# advance src and shrink len by the same amount
+	sub $0x80, %rdx	# bias len so 'jae' loops while >= 128 bytes remain
+L(goble_128_loop):
+	vmovups (%rsi), %ymm0	# 128 bytes per iteration; destination stores are 32B-aligned
+	vmovups 0x20(%rsi), %ymm1
+	vmovups 0x40(%rsi), %ymm2
+	vmovups 0x60(%rsi), %ymm3
+	lea 0x80(%rsi), %rsi
+	vmovaps %ymm0, (%rdi)
+	vmovaps %ymm1, 0x20(%rdi)
+	vmovaps %ymm2, 0x40(%rdi)
+	vmovaps %ymm3, 0x60(%rdi)
+	lea 0x80(%rdi), %rdi
+	sub $0x80, %rdx
+	jae L(goble_128_loop)
+	vmovups %ymm4, (%r10)	# store the saved unaligned head
+	vzeroupper	# clear upper ymm state before returning to SSE/C code
+	vmovups %xmm8, -0x80(%r9)	# store the saved tail from the end pointer
+	vmovups %xmm9, -0x70(%r9)
+	vmovups %xmm10, -0x60(%r9)
+	vmovups %xmm11, -0x50(%r9)
+	vmovups %xmm12, -0x40(%r9)
+	vmovups %xmm13, -0x30(%r9)
+	vmovups %xmm14, -0x20(%r9)
+	vmovups %xmm15, -0x10(%r9)
+	ret
+
+L(gobble_data_movsb):
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov __x86_shared_cache_size_half(%rip), %rcx
+#endif
+	shl $3, %rcx	# rcx = 4 * shared cache size (half-size << 3) — NT-store threshold
+
+#ifdef USE_AS_MEMMOVE
+	mov %rsi, %r10
+	sub %rdi, %r10	# r10 = src - dst (forward overlap distance)
+	cmp %rdx, %r10
+	jae L(memmove_use_memcpy_fwd)	# distance >= len: ranges don't overlap
+	cmp %rcx, %r10
+	jae L(memmove_use_memcpy_fwd)	# distance >= threshold: memcpy path still safe
+	jmp L(gobble_mem_fwd_llc_start)	# close overlap: use cached rep movsb
+L(memmove_use_memcpy_fwd):
+#endif
+
+	cmp %rcx, %rdx
+	jae L(gobble_big_data_fwd)	# len >= threshold: bypass caches with NT stores
+
+#ifdef USE_AS_MEMMOVE
+L(gobble_mem_fwd_llc_start):
+#endif
+	mov %rdx, %rcx
+	rep movsb	# medium copy: fast-string forward byte copy
+	ret
+
+L(gobble_big_data_fwd):
+	vmovups (%rsi), %ymm4	# save head + tail before adjusting, as in the aligned loop above
+	vmovups -0x80(%r8), %xmm5
+	vmovups -0x70(%r8), %xmm6
+	vmovups -0x60(%r8), %xmm7
+	vmovups -0x50(%r8), %xmm8
+	vmovups -0x40(%r8), %xmm9
+	vmovups -0x30(%r8), %xmm10
+	vmovups -0x20(%r8), %xmm11
+	vmovups -0x10(%r8), %xmm12
+	mov %rdi, %r8	# r8 now = original dst (src-end value no longer needed)
+	and $-32, %rdi
+	add $32, %rdi	# round dst up to 32
+	mov %rdi, %r10
+	sub %r8, %r10	# r10 = head bytes consumed by alignment
+	sub %r10, %rdx
+	add %r10, %rsi
+	sub $0x80, %rdx	# bias for the jae loop
+L(gobble_mem_fwd_loop):
+	prefetchnta 0x1c0(%rsi)	# prefetch well ahead of the read stream
+	prefetchnta 0x280(%rsi)
+	vmovups (%rsi), %ymm0
+	vmovups 0x20(%rsi), %ymm1
+	vmovups 0x40(%rsi), %ymm2
+	vmovups 0x60(%rsi), %ymm3
+	lea 0x80(%rsi), %rsi
+	vmovntdq %ymm0, (%rdi)	# non-temporal stores: avoid polluting caches on huge copies
+	vmovntdq %ymm1, 0x20(%rdi)
+	vmovntdq %ymm2, 0x40(%rdi)
+	vmovntdq %ymm3, 0x60(%rdi)
+	lea 0x80(%rdi), %rdi
+	sub $0x80, %rdx
+	jae L(gobble_mem_fwd_loop)
+	sfence	# order the NT stores before the ordinary stores/return
+	vmovups %ymm4, (%r8)	# saved head
+	vzeroupper
+	vmovups %xmm5, -0x80(%r9)	# saved tail
+	vmovups %xmm6, -0x70(%r9)
+	vmovups %xmm7, -0x60(%r9)
+	vmovups %xmm8, -0x50(%r9)
+	vmovups %xmm9, -0x40(%r9)
+	vmovups %xmm10, -0x30(%r9)
+	vmovups %xmm11, -0x20(%r9)
+	vmovups %xmm12, -0x10(%r9)
+	ret
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+L(copy_backward):
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov __x86_shared_cache_size_half(%rip), %rcx
+#endif
+	shl $3, %rcx	# rcx = 4 * shared cache size, same threshold as forward path
+	vmovups (%rsi), %xmm8	# save the first 128 source bytes (head) — stored last
+	vmovups 0x10(%rsi), %xmm9
+	vmovups 0x20(%rsi), %xmm10
+	vmovups 0x30(%rsi), %xmm11
+	vmovups 0x40(%rsi), %xmm12
+	vmovups 0x50(%rsi), %xmm13
+	vmovups 0x60(%rsi), %xmm14
+	vmovups 0x70(%rsi), %xmm15
+	mov %rdi, %r9	# r9 = dst start (end pointer no longer needed; head stores go here)
+	add %rdx, %rsi	# point both at the end; the copy runs backward
+	add %rdx, %rdi
+	vmovups -0x20(%rsi), %ymm4	# save the last 32 bytes (tail)
+	lea -0x20(%rdi), %r10	# r10 = where the saved tail belongs
+	mov %rdi, %r11
+	and $0x1f, %r11	# r11 = dst-end misalignment (dst & 31)
+	xor %r11, %rdi	# clear those bits: round dst end down to 32
+	sub %r11, %rsi	# drop the same misalignment from src and len
+	sub %r11, %rdx
+#ifdef USE_AS_MEMMOVE
+	mov %rdi, %r11
+	sub %rsi, %r11	# r11 = dst - src (backward overlap distance)
+	cmp %rdx, %r11
+	jae L(memmove_use_memcpy_bwd)	# distance >= len: no overlap
+	cmp %rcx, %r11
+	jae L(memmove_use_memcpy_bwd)	# far apart: NT path is safe
+	jmp L(gobble_mem_bwd_llc_start)	# close overlap: cached copy loop
+#endif
+L(memmove_use_memcpy_bwd):
+	cmp %rcx, %rdx
+	ja L(gobble_big_data_bwd)	# huge copy: non-temporal backward loop
+L(gobble_mem_bwd_llc_start):
+	sub $0x80, %rdx	# bias for the jae loop
+L(gobble_mem_bwd_llc):
+	vmovups -0x20(%rsi), %ymm0	# 128 bytes per iteration, top-down; 32B-aligned stores
+	vmovups -0x40(%rsi), %ymm1
+	vmovups -0x60(%rsi), %ymm2
+	vmovups -0x80(%rsi), %ymm3
+	lea -0x80(%rsi), %rsi
+	vmovaps %ymm0, -0x20(%rdi)
+	vmovaps %ymm1, -0x40(%rdi)
+	vmovaps %ymm2, -0x60(%rdi)
+	vmovaps %ymm3, -0x80(%rdi)
+	lea -0x80(%rdi), %rdi
+	sub $0x80, %rdx
+	jae L(gobble_mem_bwd_llc)
+	vmovups %ymm4, (%r10)	# saved tail
+	vzeroupper
+	vmovups %xmm8, (%r9)	# saved head at the original dst
+	vmovups %xmm9, 0x10(%r9)
+	vmovups %xmm10, 0x20(%r9)
+	vmovups %xmm11, 0x30(%r9)
+	vmovups %xmm12, 0x40(%r9)
+	vmovups %xmm13, 0x50(%r9)
+	vmovups %xmm14, 0x60(%r9)
+	vmovups %xmm15, 0x70(%r9)
+	ret
+
+L(gobble_big_data_bwd):
+	sub $0x80, %rdx	# bias for the jae loop
+L(gobble_mem_bwd_loop):
+	prefetchnta -0x1c0(%rsi)	# prefetch behind the descending read stream
+	prefetchnta -0x280(%rsi)
+	vmovups -0x20(%rsi), %ymm0
+	vmovups -0x40(%rsi), %ymm1
+	vmovups -0x60(%rsi), %ymm2
+	vmovups -0x80(%rsi), %ymm3
+	lea -0x80(%rsi), %rsi
+	vmovntdq %ymm0, -0x20(%rdi)	# non-temporal backward stores
+	vmovntdq %ymm1, -0x40(%rdi)
+	vmovntdq %ymm2, -0x60(%rdi)
+	vmovntdq %ymm3, -0x80(%rdi)
+	lea -0x80(%rdi), %rdi
+	sub $0x80, %rdx
+	jae L(gobble_mem_bwd_loop)
+	sfence	# order NT stores before the ordinary stores/return
+	vmovups %ymm4, (%r10)	# saved tail
+	vzeroupper
+	vmovups %xmm8, (%r9)	# saved head
+	vmovups %xmm9, 0x10(%r9)
+	vmovups %xmm10, 0x20(%r9)
+	vmovups %xmm11, 0x30(%r9)
+	vmovups %xmm12, 0x40(%r9)
+	vmovups %xmm13, 0x50(%r9)
+	vmovups %xmm14, 0x60(%r9)
+	vmovups %xmm15, 0x70(%r9)
+	ret
+#endif
+END (MEMCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
new file mode 100644
index 0000000..352a2c3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE	/* enable the overlap checks and backward-copy path */
+#define MEMCPY __memmove_avx_unaligned	/* rename the entry points for memmove */
+#define MEMCPY_CHK __memmove_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"	/* shared implementation */
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
new file mode 100644
index 0000000..b31394e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY	/* return dst + len instead of dst */
+#define MEMCPY __mempcpy_avx_unaligned	/* rename the entry points for mempcpy */
+#define MEMCPY_CHK __mempcpy_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"	/* shared implementation */
--
1.8.1.4