From: drepper@sourceware.org
To: glibc-cvs@sourceware.org
Date: 16 Jul 2009 14:02:46 -0000
Subject: GNU C Library master sources branch, master, updated. glibc-2.10-153-g24a12a5
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 24a12a5a5f7ea63bc349f219b9fbb722c009a719 (commit)
via e26c9b84155f31b37730fec7621f1d9a805b314d (commit)
from ca419225a3c4f9f341eddf582b201211d1bf2aec (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=24a12a5a5f7ea63bc349f219b9fbb722c009a719
commit 24a12a5a5f7ea63bc349f219b9fbb722c009a719
Author: Ulrich Drepper <drepper@redhat.com>
Date: Thu Jul 16 07:02:27 2009 -0700
Fix up whitespaces in new memcmp for x86-64.
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index 165f42e..a9fe13a 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -26,7 +26,7 @@ ENTRY (memcmp)
jz L(finz)
cmpq $1, %rdx
jle L(finr1b)
- subq %rdi, %rsi
+ subq %rdi, %rsi
movq %rdx, %r10
cmpq $32, %r10
jge L(gt32)
@@ -37,7 +37,7 @@ L(small):
movzbl (%rdi), %eax
movzbl (%rdi, %rsi), %edx
subq $1, %r10
- je L(finz1)
+ je L(finz1)
addq $1, %rdi
subl %edx, %eax
jnz L(exit)
@@ -47,7 +47,7 @@ L(s2b):
movzwl (%rdi), %eax
movzwl (%rdi, %rsi), %edx
subq $2, %r10
- je L(fin2_7)
+ je L(fin2_7)
addq $2, %rdi
cmpl %edx, %eax
jnz L(fin2_7)
@@ -57,7 +57,7 @@ L(s4b):
movl (%rdi), %eax
movl (%rdi, %rsi), %edx
subq $4, %r10
- je L(fin2_7)
+ je L(fin2_7)
addq $4, %rdi
cmpl %edx, %eax
jnz L(fin2_7)
@@ -67,7 +67,7 @@ L(s8b):
movq (%rdi), %rax
movq (%rdi, %rsi), %rdx
subq $8, %r10
- je L(fin2_7)
+ je L(fin2_7)
addq $8, %rdi
cmpq %rdx, %rax
jnz L(fin2_7)
@@ -76,11 +76,11 @@ L(s16b):
movdqu (%rdi, %rsi), %xmm0
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %edx
- xorl %eax, %eax
+ xorl %eax, %eax
subl $0xffff, %edx
jz L(finz)
- bsfl %edx, %ecx
- leaq (%rdi, %rcx), %rcx
+ bsfl %edx, %ecx
+ leaq (%rdi, %rcx), %rcx
movzbl (%rcx), %eax
movzbl (%rsi, %rcx), %edx
jmp L(finz1)
@@ -88,7 +88,7 @@ L(s16b):
.p2align 4,, 4
L(finr1b):
movzbl (%rdi), %eax
- movzbl (%rsi), %edx
+ movzbl (%rsi), %edx
L(finz1):
subl %edx, %eax
L(exit):
@@ -98,24 +98,24 @@ L(exit):
L(fin2_7):
cmpq %rdx, %rax
jz L(finz)
- movq %rax, %r11
- subq %rdx, %r11
+ movq %rax, %r11
+ subq %rdx, %r11
bsfq %r11, %rcx
- sarq $3, %rcx
+ sarq $3, %rcx
salq $3, %rcx
- sarq %cl, %rax
+ sarq %cl, %rax
movzbl %al, %eax
- sarq %cl, %rdx
+ sarq %cl, %rdx
movzbl %dl, %edx
subl %edx, %eax
- ret
+ ret
.p2align 4,, 4
L(finz):
xorl %eax, %eax
ret
- /* For blocks bigger than 32 bytes
+ /* For blocks bigger than 32 bytes
1. Advance one of the addr pointers to be 16B aligned.
2. Treat the case of both addr pointers aligned to 16B
separately to avoid movdqu.
@@ -128,10 +128,10 @@ L(finz):
L(gt32):
movq %rdx, %r11
addq %rdi, %r11
- movq %rdi, %r8
+ movq %rdi, %r8
andq $15, %r8
- jz L(16am)
+ jz L(16am)
/* Both pointers may be misaligned. */
movdqu (%rdi), %xmm1
movdqu (%rdi, %rsi), %xmm0
@@ -156,8 +156,8 @@ L(16am):
L(A32):
movq %r11, %r10
andq $-32, %r10
- cmpq %r10, %rdi
- jge L(mt16)
+ cmpq %r10, %rdi
+ jge L(mt16)
/* Pre-unroll to be ready for unrolled 64B loop. */
testq $32, %rdi
jz L(A64)
@@ -167,7 +167,7 @@ L(A32):
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
-
+
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
@@ -178,9 +178,9 @@ L(A32):
L(A64):
movq %r11, %r10
andq $-64, %r10
- cmpq %r10, %rdi
- jge L(mt32)
-
+ cmpq %r10, %rdi
+ jge L(mt32)
+
L(A64main):
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
@@ -188,7 +188,7 @@ L(A64main):
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
-
+
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
@@ -216,8 +216,8 @@ L(A64main):
L(mt32):
movq %r11, %r10
andq $-32, %r10
- cmpq %r10, %rdi
- jge L(mt16)
+ cmpq %r10, %rdi
+ jge L(mt16)
L(A32main):
movdqu (%rdi,%rsi), %xmm0
@@ -226,7 +226,7 @@ L(A32main):
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
-
+
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
@@ -239,23 +239,23 @@ L(A32main):
L(mt16):
subq %rdi, %r11
je L(finz)
- movq %r11, %r10
- jmp L(small)
+ movq %r11, %r10
+ jmp L(small)
.p2align 4,, 4
L(neq):
- bsfl %edx, %ecx
+ bsfl %edx, %ecx
movzbl (%rdi, %rcx), %eax
- addq %rdi, %rsi
+ addq %rdi, %rsi
movzbl (%rsi,%rcx), %edx
jmp L(finz1)
.p2align 4,, 4
L(ATR):
movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jge L(mt16)
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
testq $16, %rdi
jz L(ATR32)
@@ -290,7 +290,7 @@ L(ATR32):
L(ATR64):
cmpq %rdi, %r10
- je L(mt32)
+ je L(mt32)
L(ATR64main):
movdqa (%rdi,%rsi), %xmm0
@@ -324,9 +324,9 @@ L(ATR64main):
jne L(ATR64main)
movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jge L(mt16)
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
L(ATR32res):
movdqa (%rdi,%rsi), %xmm0
@@ -343,13 +343,13 @@ L(ATR32res):
jnz L(neq)
addq $16, %rdi
- cmpq %r10, %rdi
+ cmpq %r10, %rdi
jne L(ATR32res)
subq %rdi, %r11
je L(finz)
- movq %r11, %r10
- jmp L(small)
+ movq %r11, %r10
+ jmp L(small)
/* Align to 16 bytes to improve instruction fetch. */
.p2align 4,, 4
END(memcmp)
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e26c9b84155f31b37730fec7621f1d9a805b314d
commit e26c9b84155f31b37730fec7621f1d9a805b314d
Author: H.J. Lu <hongjiu.lu@intel.com>
Date: Thu Jul 16 07:00:34 2009 -0700
memcmp implementation for x86-64 using SSE2.
diff --git a/ChangeLog b/ChangeLog
index c355ea4..87db19e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2009-07-15 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/x86_64/memcmp.S: New file.
+
2009-07-15 Ulrich Drepper <drepper@redhat.com>
* sysdeps/x86-64/dl-trampoline.h: Remove after integrating code into...
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
new file mode 100644
index 0000000..165f42e
--- /dev/null
+++ b/sysdeps/x86_64/memcmp.S
@@ -0,0 +1,359 @@
+/* memcmp with SSE2
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY (memcmp)
+ test %rdx, %rdx
+ jz L(finz)
+ cmpq $1, %rdx
+ jle L(finr1b)
+ subq %rdi, %rsi
+ movq %rdx, %r10
+ cmpq $32, %r10
+ jge L(gt32)
+ /* Handle small chunks and last block of less than 32 bytes. */
+L(small):
+ testq $1, %r10
+ jz L(s2b)
+ movzbl (%rdi), %eax
+ movzbl (%rdi, %rsi), %edx
+ subq $1, %r10
+ je L(finz1)
+ addq $1, %rdi
+ subl %edx, %eax
+ jnz L(exit)
+L(s2b):
+ testq $2, %r10
+ jz L(s4b)
+ movzwl (%rdi), %eax
+ movzwl (%rdi, %rsi), %edx
+ subq $2, %r10
+ je L(fin2_7)
+ addq $2, %rdi
+ cmpl %edx, %eax
+ jnz L(fin2_7)
+L(s4b):
+ testq $4, %r10
+ jz L(s8b)
+ movl (%rdi), %eax
+ movl (%rdi, %rsi), %edx
+ subq $4, %r10
+ je L(fin2_7)
+ addq $4, %rdi
+ cmpl %edx, %eax
+ jnz L(fin2_7)
+L(s8b):
+ testq $8, %r10
+ jz L(s16b)
+ movq (%rdi), %rax
+ movq (%rdi, %rsi), %rdx
+ subq $8, %r10
+ je L(fin2_7)
+ addq $8, %rdi
+ cmpq %rdx, %rax
+ jnz L(fin2_7)
+L(s16b):
+ movdqu (%rdi), %xmm1
+ movdqu (%rdi, %rsi), %xmm0
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ xorl %eax, %eax
+ subl $0xffff, %edx
+ jz L(finz)
+ bsfl %edx, %ecx
+ leaq (%rdi, %rcx), %rcx
+ movzbl (%rcx), %eax
+ movzbl (%rsi, %rcx), %edx
+ jmp L(finz1)
+
+ .p2align 4,, 4
+L(finr1b):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %edx
+L(finz1):
+ subl %edx, %eax
+L(exit):
+ ret
+
+ .p2align 4,, 4
+L(fin2_7):
+ cmpq %rdx, %rax
+ jz L(finz)
+ movq %rax, %r11
+ subq %rdx, %r11
+ bsfq %r11, %rcx
+ sarq $3, %rcx
+ salq $3, %rcx
+ sarq %cl, %rax
+ movzbl %al, %eax
+ sarq %cl, %rdx
+ movzbl %dl, %edx
+ subl %edx, %eax
+ ret
+
+ .p2align 4,, 4
+L(finz):
+ xorl %eax, %eax
+ ret
+
+ /* For blocks bigger than 32 bytes
+ 1. Advance one of the addr pointers to be 16B aligned.
+ 2. Treat the case of both addr pointers aligned to 16B
+ separately to avoid movdqu.
+ 3. Handle any blocks of greater than 64 consecutive bytes with
+ unrolling to reduce branches.
+ 4. At least one addr pointer is 16B aligned, use memory version
+ of pcmpeqb.
+ */
+ .p2align 4,, 4
+L(gt32):
+ movq %rdx, %r11
+ addq %rdi, %r11
+ movq %rdi, %r8
+
+ andq $15, %r8
+ jz L(16am)
+ /* Both pointers may be misaligned. */
+ movdqu (%rdi), %xmm1
+ movdqu (%rdi, %rsi), %xmm0
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ neg %r8
+ leaq 16(%rdi, %r8), %rdi
+L(16am):
+ /* Handle two 16B aligned pointers separately. */
+ testq $15, %rsi
+ jz L(ATR)
+ testq $16, %rdi
+ jz L(A32)
+ movdqu (%rdi, %rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+L(A32):
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+ /* Pre-unroll to be ready for unrolled 64B loop. */
+ testq $32, %rdi
+ jz L(A64)
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+L(A64):
+ movq %r11, %r10
+ andq $-64, %r10
+ cmpq %r10, %rdi
+ jge L(mt32)
+
+L(A64main):
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ cmpq %rdi, %r10
+ jne L(A64main)
+
+L(mt32):
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+
+L(A32main):
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqu (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ cmpq %rdi, %r10
+ jne L(A32main)
+L(mt16):
+ subq %rdi, %r11
+ je L(finz)
+ movq %r11, %r10
+ jmp L(small)
+
+ .p2align 4,, 4
+L(neq):
+ bsfl %edx, %ecx
+ movzbl (%rdi, %rcx), %eax
+ addq %rdi, %rsi
+ movzbl (%rsi,%rcx), %edx
+ jmp L(finz1)
+
+ .p2align 4,, 4
+L(ATR):
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+ testq $16, %rdi
+ jz L(ATR32)
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+ cmpq %rdi, %r10
+ je L(mt16)
+
+L(ATR32):
+ movq %r11, %r10
+ andq $-64, %r10
+ testq $32, %rdi
+ jz L(ATR64)
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+L(ATR64):
+ cmpq %rdi, %r10
+ je L(mt32)
+
+L(ATR64main):
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+ cmpq %rdi, %r10
+ jne L(ATR64main)
+
+ movq %r11, %r10
+ andq $-32, %r10
+ cmpq %r10, %rdi
+ jge L(mt16)
+
+L(ATR32res):
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ movdqa (%rdi,%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+ pmovmskb %xmm0, %edx
+ subl $0xffff, %edx
+ jnz L(neq)
+ addq $16, %rdi
+
+ cmpq %r10, %rdi
+ jne L(ATR32res)
+
+ subq %rdi, %r11
+ je L(finz)
+ movq %r11, %r10
+ jmp L(small)
+ /* Align to 16 bytes to improve instruction fetch. */
+ .p2align 4,, 4
+END(memcmp)
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
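The heart of the new routine is the pcmpeqb/pmovmskb/bsf sequence that
checks 16 bytes per iteration, plus the L(fin2_7) trick that extracts the
first differing byte from two unequal word-sized chunks. A rough C
equivalent using SSE2 intrinsics is sketched below; the helper names are
invented for illustration, assume a little-endian x86-64 target compiled
with SSE2 enabled, and are not part of the commit.

    #include <emmintrin.h>   /* SSE2 intrinsics */

    /* One 16-byte step: mask bit i is set iff byte i of the two blocks
       is equal.  A full 0xffff mask means "no difference"; otherwise
       the lowest clear bit indexes the first differing byte (the asm
       gets the same index from bsf (mask - 0xffff)).  */
    static int
    cmp16_sketch (const unsigned char *p, const unsigned char *q)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) p);   /* movdqu */
      __m128i b = _mm_loadu_si128 ((const __m128i *) q);   /* movdqu */
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b));
						/* pcmpeqb + pmovmskb */
      if (mask == 0xffff)
	return 0;				/* subl $0xffff, %edx; jz */
      int i = __builtin_ctz (~mask & 0xffff);	/* bsfl */
      return p[i] - q[i];			/* movzbl; movzbl; subl */
    }

    /* L(fin2_7) equivalent: for unequal chunks A and B loaded
       little-endian, the lowest set bit of A - B (equivalently A ^ B)
       falls inside the first differing byte in memory order; round the
       bit index down to a byte boundary, shift, and compare.  */
    static int
    word_diff_sketch (unsigned long a, unsigned long b)
    {
      int shift = (__builtin_ctzl (a ^ b) >> 3) << 3;
						/* bsfq; sarq $3; salq $3 */
      return (int) ((a >> shift) & 0xff) - (int) ((b >> shift) & 0xff);
						/* sarq %cl; movzbl; subl */
    }

The aligned (movdqa) and unrolled loops in the file differ only in how
they load and how far they advance per iteration; the comparison step is
this same sequence throughout.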
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 4 +
sysdeps/x86_64/memcmp.S | 359 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 363 insertions(+), 0 deletions(-)
create mode 100644 sysdeps/x86_64/memcmp.S
hooks/post-receive
--
GNU C Library master sources