This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch, ibm/2.11/master, updated. glibc-2.11.1-71-g0a89b6a


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, ibm/2.11/master has been updated
       via  0a89b6a6fac08b42533075d90d8693ec825bdac1 (commit)
      from  dd1f42ad8afb25e6930b0e7251520925208f5bc8 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0a89b6a6fac08b42533075d90d8693ec825bdac1

commit 0a89b6a6fac08b42533075d90d8693ec825bdac1
Author: Luis Machado <luisgpm@br.ibm.com>
Date:   Fri May 14 14:15:22 2010 -0500

    Power7 memset powerpc32 and powerpc64 .S optimizations.

diff --git a/ChangeLog b/ChangeLog
index a16a1fb..4fb7a29 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2010-05-15  Luis Machado  <luisgpm@br.ibm.com>
+	* sysdeps/powerpc/powerpc64/power7/memset.S: New POWER7-optimized
+	64-bit memset.
+	* sysdeps/powerpc/powerpc32/power7/memset.S: New POWER7-optimized
+	32-bit memset.
+
 2010-05-01  Alan Modra  <amodra@gmail.com>
 	* sysdeps/powerpc/powerpc32/power4/memcmp.S: Correct cfi for r24.
 	* sysdeps/powerpc/powerpc64/bsd-_setjmp.S: Move contents..
diff --git a/sysdeps/powerpc/powerpc32/power7/memset.S b/sysdeps/powerpc/powerpc32/power7/memset.S
new file mode 100644
index 0000000..990faa1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/memset.S
@@ -0,0 +1,434 @@
+/* Optimized memset implementation for PowerPC32/POWER7.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+   02110-1301 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+   Returns 's'.  */
+
+	.machine power7
+EALIGN (BP_SYM (memset), 5, 0)
+	CALL_MCOUNT
+
+	.align	4
+L(_memset):
+	cmplwi  cr7,5,31
+	cmplwi	cr6,5,8
+	mr	10,3		/* Save original argument for later.  */
+	mr      7,1             /* Save original r1 for later.  */
+	cfi_offset(31,-8)
+
+	/* Replicate byte to word.  */
+	rlwimi  4,4,8,16,23
+	rlwimi  4,4,16,0,15
+
+	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
+
+	neg     0,3
+	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */
+
+	/* Save our word twice to create a doubleword that we will later
+	   copy to a FPR.  */
+	stwu    1,-32(1)
+	andi.   11,10,7         /* Check alignment of DST.  */
+	mr      12,5
+	stw     4,24(1)
+	stw     4,28(1)
+	beq	L(big_aligned)
+
+	clrlwi  0,0,29
+	mtocrf	0x01,0
+	subf	5,0,5
+
+	/* Get DST aligned to 8 bytes.  */
+1:	bf      31,2f
+
+	stb     4,0(10)
+	addi    10,10,1
+2:	bf      30,4f
+
+	sth     4,0(10)
+	addi    10,10,2
+4:	bf      29,L(big_aligned)
+
+	stw     4,0(10)
+	addi    10,10,4
+
+	.align	4
+L(big_aligned):
+	cmplwi	cr5,5,255
+	li	0,32
+	cmplwi	cr1,5,160
+	dcbtst	0,10
+	cmplwi	cr6,4,0
+	srwi    9,5,3        /* Number of full doublewords remaining.  */
+	crand   27,26,21
+	mtocrf  0x01,9
+	bt      27,L(huge)
+
+	/* From this point on, we'll copy 32+ bytes and the value
+	   isn't 0 (so we can't use dcbz).  */
+
+	srwi    8,5,5
+	clrlwi  11,5,29
+	cmplwi  cr6,11,0
+	cmplwi	cr1,9,4
+	mtctr   8
+
+	/* Copy 1~3 doublewords so the main loop starts
+	at a multiple of 32 bytes.  */
+
+	bf      30,1f
+
+	stw     4,0(10)
+	stw     4,4(10)
+	stw     4,8(10)
+	stw     4,12(10)
+	addi    10,10,16
+	bf      31,L(big_loop)
+
+	stw     4,0(10)
+	stw     4,4(10)
+	addi    10,10,8
+	mr	12,10
+	blt     cr1,L(tail_bytes)
+
+	b       L(big_loop)
+
+	.align  4
+1:	/* Copy 1 doubleword.  */
+	bf      31,L(big_loop)
+
+	stw     4,0(10)
+	stw     4,4(10)
+	addi    10,10,8
+
+	/* First use a 32-byte loop with stw's to try and avoid the LHS due
+	   to the lfd we will do next.  Also, ping-pong through r10 and r12
+	   to avoid AGEN delays.  */
+	.align	4
+L(big_loop):
+	addi    12,10,32
+	stw     4,0(10)
+	stw     4,4(10)
+	stw     4,8(10)
+	stw     4,12(10)
+	stw     4,16(10)
+	stw     4,20(10)
+	stw     4,24(10)
+	stw     4,28(10)
+	bdz     L(tail_bytes)
+
+	addi    10,10,64
+	stw     4,0(12)
+	stw     4,4(12)
+	stw     4,8(12)
+	stw     4,12(12)
+	stw     4,16(12)
+	stw     4,20(12)
+	stw     4,24(12)
+	stw     4,28(12)
+	bdnz    L(big_loop_fast_setup)
+
+	mr      12,10
+	b       L(tail_bytes)
+
+	/* Now that we're probably past the LHS window, use the VSX to
+	   speed up the loop.  */
+L(big_loop_fast_setup):
+	li	11,24
+	li	6,16
+	lxvdsx	4,1,11
+
+	.align  4
+L(big_loop_fast):
+	addi	12,10,32
+	stxvd2x	4,10,0
+	stxvd2x	4,10,6
+	bdz     L(tail_bytes)
+
+	addi    10,10,64
+	stxvd2x 4,12,0
+	stxvd2x 4,12,6
+	bdnz    L(big_loop_fast)
+
+	mr	12,10
+
+	.align	4
+L(tail_bytes):
+
+	/* Check for tail bytes.  */
+	mr	1,7   /* Restore r1.  */
+	beqlr   cr6
+
+	clrlwi	0,5,29
+	mtocrf  0x01,0
+
+	/*  At this point we have a tail of 0-7 bytes and we know that the
+	destination is doubleword-aligned.  */
+4:	/* Copy 4 bytes.  */
+	bf      29,2f
+
+	stw     4,0(12)
+	addi    12,12,4
+2:	/* Copy 2 bytes.  */
+	bf      30,1f
+
+	sth     4,0(12)
+	addi    12,12,2
+1:	/* Copy 1 byte.  */
+	bflr      31
+
+	stb     4,0(12)
+	blr
+
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
+	   dcbz though, we need to get the destination 128-bytes aligned.  */
+	.align	4
+L(huge):
+	lfd     4,24(1)
+	andi.	11,10,127
+	neg	0,10
+	beq     L(huge_aligned)
+
+	clrlwi  0,0,25
+	subf    5,0,5
+	srwi	0,0,3
+	mtocrf  0x01,0
+
+	/* Get DST aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	stfd	4,0(10)
+	stfd    4,8(10)
+	stfd    4,16(10)
+	stfd    4,24(10)
+	stfd    4,32(10)
+	stfd    4,40(10)
+	stfd    4,48(10)
+	stfd    4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	stfd    4,0(10)
+	stfd    4,8(10)
+	stfd    4,16(10)
+	stfd    4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	stfd    4,0(10)
+	stfd    4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(huge_aligned)
+
+	stfd    4,0(10)
+	addi	10,10,8
+
+L(huge_aligned):
+	srwi    8,5,7
+	clrlwi  11,5,25
+	cmplwi  cr6,11,0
+	mtctr   8
+
+	/* Copies 128-bytes at a time.  */
+	.align	4
+L(huge_loop):
+	dcbz	0,10
+	addi    10,10,128
+	bdnz    L(huge_loop)
+
+	/* We have a tail of 0~127 bytes to handle.  */
+	mr	1,7   /* Restore r1.  */
+	beqlr   cr6
+
+	subf	9,3,10
+	subf	5,9,12
+	srwi	8,5,3
+	cmplwi	cr6,8,0
+	mtocrf  0x01,8
+
+	/* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
+	speed.  We'll handle the resulting tail bytes later.  */
+	beq	cr6,L(tail)
+
+8:	bf      28,4f
+
+	stfd    4,0(10)
+	stfd    4,8(10)
+	stfd    4,16(10)
+	stfd    4,24(10)
+	stfd    4,32(10)
+	stfd    4,40(10)
+	stfd    4,48(10)
+	stfd    4,56(10)
+	addi    10,10,64
+	.align  4
+4:	bf      29,2f
+
+	stfd    4,0(10)
+	stfd    4,8(10)
+	stfd    4,16(10)
+	stfd    4,24(10)
+	addi    10,10,32
+	.align  4
+2:	bf      30,1f
+
+	stfd    4,0(10)
+	stfd    4,8(10)
+	addi    10,10,16
+	.align  4
+1:	bf      31,L(tail)
+
+	stfd    4,0(10)
+	addi    10,10,8
+
+	/* Handle the rest of the tail bytes here.  */
+L(tail):
+	mtocrf	0x01,5
+
+	.align  4
+4:	bf      29,2f
+
+	stw     4,0(10)
+	addi    10,10,4
+	.align  4
+2:	bf      30,1f
+
+	sth     4,0(10)
+	addi    10,10,2
+	.align  4
+1:	bflr    31
+
+	stb     4,0(10)
+	blr
+
+
+	/* Expanded tree to copy tail bytes without increments.  */
+	.align	4
+L(copy_tail):
+	bf	29,L(FXX)
+
+	stw	4,0(10)
+	bf	30,L(TFX)
+
+	sth	4,4(10)
+	bflr	31
+
+	stb	4,6(10)
+	blr
+
+	.align	4
+L(FXX):	bf	30,L(FFX)
+
+	sth	4,0(10)
+	bflr	31
+
+	stb     4,2(10)
+	blr
+
+	.align  4
+L(TFX):	bflr	31
+
+	stb     4,4(10)
+	blr
+
+	.align  4
+L(FFX):	bflr	31
+
+	stb     4,0(10)
+	blr
+
+	/* Handle copies of 9~31 bytes.  */
+	.align  4
+L(medium):
+	/* At least 9 bytes to go.  */
+	andi.	11,10,3
+	clrlwi  0,0,30
+	beq	L(medium_aligned)
+
+	/* Force 4-bytes alignment for DST.  */
+	mtocrf  0x01,0
+	subf    5,0,5
+1:	/* Copy 1 byte.  */
+	bf      31,2f
+
+	stb     4,0(10)
+	addi    10,10,1
+2:	/* Copy 2 bytes.  */
+	bf	30,L(medium_aligned)
+
+	sth	4,0(10)
+	addi    10,10,2
+
+	.align  4
+L(medium_aligned):
+	/* At least 6 bytes to go, and DST is word-aligned.  */
+	cmplwi  cr1,5,16
+	mtocrf  0x01,5
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	stw     4,0(10)
+	stw     4,4(10)
+	stw     4,8(10)
+	stw     4,12(10)
+	addi    10,10,16
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	stw     4,0(10)
+	stw     4,4(10)
+	addi    10,10,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw     4,0(10)
+	addi    10,10,4
+2:	/* Copy 2-3 bytes.  */
+	bf	30,1f
+
+	sth     4,0(10)
+	addi	10,10,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align  4
+L(small):
+	mtocrf  0x01,5
+	bne	cr6,L(copy_tail)
+
+	stw     4,0(10)
+	stw     4,4(10)
+	blr
+
+END (BP_SYM (memset))
+libc_hidden_builtin_def (memset)
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
new file mode 100644
index 0000000..0f726d4
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -0,0 +1,398 @@
+/* Optimized memset implementation for PowerPC64/POWER7.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+   Returns 's'.  */
+
+	.machine power7
+EALIGN (BP_SYM (memset), 5, 0)
+	CALL_MCOUNT 3
+
+L(_memset):
+	cmpldi  cr7,5,31
+	cmpldi	cr6,5,8
+	mr	10,3
+
+	/* Replicate byte to word.  */
+	rlwimi  4,4,8,16,23
+	rlwimi  4,4,16,0,15
+	ble	cr6, L(small)	/* If length <= 8, use short copy code.  */
+
+	neg     0,3
+	ble	cr7, L(medium)	/* If length < 32, use medium copy code.  */
+
+	andi.   11,10,7		/* Check alignment of DST.  */
+	insrdi  4,4,32,0	/* Replicate word to double word.  */
+
+	mr	12,5
+	beq	L(big_aligned)
+
+	clrldi  0,0,61
+	mtocrf	0x01,0
+	subf	5,0,5
+
+	/* Get DST aligned to 8 bytes.  */
+1:	bf      31,2f
+
+	stb     4,0(10)
+	addi    10,10,1
+2:	bf      30,4f
+
+	sth     4,0(10)
+	addi    10,10,2
+4:	bf      29,L(big_aligned)
+
+	stw     4,0(10)
+	addi    10,10,4
+
+	.align	4
+L(big_aligned):
+
+	cmpldi	cr5,5,255
+	li	0,32
+	dcbtst	0,10
+	cmpldi	cr6,4,0
+	srdi    9,5,3        /* Number of full doublewords remaining.  */
+	crand   27,26,21
+	mtocrf  0x01,9
+	bt      27,L(huge)
+
+	/* From this point on, we'll copy 32+ bytes and the value
+	   isn't 0 (so we can't use dcbz).  */
+
+	srdi    8,5,5
+	clrldi  11,5,61
+	cmpldi  cr6,11,0
+	cmpldi	cr1,9,4
+	mtctr   8
+
+	/* Copy 1~3 doublewords so the main loop starts
+	at a multiple of 32 bytes.  */
+
+	bf      30,1f
+
+	std     4,0(10)
+	std     4,8(10)
+	addi    10,10,16
+	bf      31,L(big_loop)
+
+	std     4,0(10)
+	addi    10,10,8
+	mr	12,10
+	blt     cr1,L(tail_bytes)
+	b       L(big_loop)
+
+	.align  4
+1:	/* Copy 1 doubleword.  */
+	bf      31,L(big_loop)
+
+	std     4,0(10)
+	addi    10,10,8
+
+	/* Main aligned copy loop.  Copies 32-bytes at a time and
+	   ping-pong through r10 and r12 to avoid AGEN delays.  */
+	.align  4
+L(big_loop):
+	addi	12,10,32
+	std     4,0(10)
+	std     4,8(10)
+	std     4,16(10)
+	std     4,24(10)
+	bdz     L(tail_bytes)
+
+	addi    10,10,64
+	std     4,0(12)
+	std     4,8(12)
+	std     4,16(12)
+	std     4,24(12)
+	bdnz    L(big_loop)
+
+	mr	12,10
+	b	L(tail_bytes)
+
+	.align	4
+L(tail_bytes):
+
+	/* Check for tail bytes.  */
+	beqlr   cr6
+
+	clrldi  0,5,61
+	mtocrf  0x01,0
+
+	/*  At this point we have a tail of 0-7 bytes and we know that the
+	destination is doubleword-aligned.  */
+4:	/* Copy 4 bytes.  */
+	bf      29,2f
+
+	stw     4,0(12)
+	addi    12,12,4
+2:	/* Copy 2 bytes.  */
+	bf      30,1f
+
+	sth     4,0(12)
+	addi    12,12,2
+1:	/* Copy 1 byte.  */
+	bflr    31
+
+	stb     4,0(12)
+	blr
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
+	   dcbz though, we need to get the destination 128-bytes aligned.  */
+	.align	4
+L(huge):
+	andi.	11,10,127
+	neg	0,10
+	beq     L(huge_aligned)
+
+	clrldi  0,0,57
+	subf    5,0,5
+	srdi	0,0,3
+	mtocrf  0x01,0
+
+	/* Get DST aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	std	4,0(10)
+	std     4,8(10)
+	std     4,16(10)
+	std     4,24(10)
+	std     4,32(10)
+	std     4,40(10)
+	std     4,48(10)
+	std     4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	std     4,0(10)
+	std     4,8(10)
+	std     4,16(10)
+	std     4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	std     4,0(10)
+	std     4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(huge_aligned)
+
+	std     4,0(10)
+	addi	10,10,8
+
+
+L(huge_aligned):
+	srdi    8,5,7
+	clrldi  11,5,57
+	cmpldi  cr6,11,0
+	mtctr   8
+
+	.align	4
+L(huge_loop):
+	dcbz	0,10
+	addi    10,10,128
+	bdnz    L(huge_loop)
+
+	/* Check how many bytes are still left.  */
+	beqlr   cr6
+
+	subf	9,3,10
+	subf	5,9,12
+	srdi	8,5,3
+	cmpldi	cr6,8,0
+	mtocrf  0x01,8
+
+	/* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
+	speed.  We'll handle the resulting tail bytes later.  */
+	beq	cr6,L(tail)
+
+8:	bf      28,4f
+
+	std     4,0(10)
+	std     4,8(10)
+	std     4,16(10)
+	std     4,24(10)
+	std     4,32(10)
+	std     4,40(10)
+	std     4,48(10)
+	std     4,56(10)
+	addi    10,10,64
+	.align  4
+4:	bf      29,2f
+
+	std     4,0(10)
+	std     4,8(10)
+	std     4,16(10)
+	std     4,24(10)
+	addi    10,10,32
+	.align  4
+2:	bf      30,1f
+
+	std     4,0(10)
+	std     4,8(10)
+	addi    10,10,16
+	.align  4
+1:	bf      31,L(tail)
+
+	std     4,0(10)
+	addi    10,10,8
+
+	/* Handle the rest of the tail bytes here.  */
+L(tail):
+	mtocrf	0x01,5
+
+	.align  4
+4:	bf      29,2f
+
+	stw     4,0(10)
+	addi    10,10,4
+	.align  4
+2:	bf      30,1f
+
+	sth     4,0(10)
+	addi    10,10,2
+	.align  4
+1:	bflr    31
+
+	stb     4,0(10)
+	blr
+
+	/* Expanded tree to copy tail bytes without increments.  */
+	.align	4
+L(copy_tail):
+	bf	29,L(FXX)
+
+	stw	4,0(10)
+	bf	30,L(TFX)
+
+	sth	4,4(10)
+	bflr	31
+
+	stb	4,6(10)
+	blr
+
+	.align	4
+L(FXX):	bf	30,L(FFX)
+
+	sth	4,0(10)
+	bflr	31
+
+	stb     4,2(10)
+	blr
+
+	.align  4
+L(TFX):	bflr	31
+
+	stb     4,4(10)
+	blr
+
+	.align  4
+L(FFX):	bflr	31
+
+	stb     4,0(10)
+	blr
+
+	/* Handle copies of 9~31 bytes.  */
+	.align  4
+L(medium):
+	/* At least 9 bytes to go.  */
+	andi.	11,10,3
+	clrldi  0,0,62
+	beq	L(medium_aligned)
+
+	/* Force 4-byte alignment for DST.  */
+	mtocrf  0x01,0
+	subf    5,0,5
+1:	/* Copy 1 byte.  */
+	bf      31,2f
+
+	stb     4,0(10)
+	addi    10,10,1
+2:	/* Copy 2 bytes.  */
+	bf	30,L(medium_aligned)
+
+	sth	4,0(10)
+	addi    10,10,2
+
+	.align  4
+L(medium_aligned):
+	/* At least 6 bytes to go, and DST is word-aligned.  */
+	cmpldi  cr1,5,16
+	mtocrf  0x01,5
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	stw     4,0(10)
+	stw     4,4(10)
+	stw     4,8(10)
+	stw     4,12(10)
+	addi    10,10,16
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	stw     4,0(10)
+	stw     4,4(10)
+	addi    10,10,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw     4,0(10)
+	addi    10,10,4
+2:	/* Copy 2-3 bytes.  */
+	bf	30,1f
+
+	sth     4,0(10)
+	addi	10,10,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align  4
+L(small):
+	mtocrf  0x01,5
+	bne	cr6,L(copy_tail)
+
+	stw     4,0(10)
+	stw     4,4(10)
+	blr
+
+END_GEN_TB (BP_SYM (memset),TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+   between bzero and memset.  */
+ENTRY (BP_SYM (__bzero))
+	CALL_MCOUNT 3
+	mr	r5,r4
+	li	r4,0
+	b	L(_memset)
+END_GEN_TB (BP_SYM (__bzero),TB_TOCLESS)
+
+weak_alias (BP_SYM (__bzero), BP_SYM (bzero))

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                 |    6 +
 sysdeps/powerpc/powerpc32/power7/memset.S |  434 +++++++++++++++++++++++++++++
 sysdeps/powerpc/powerpc64/power7/memset.S |  398 ++++++++++++++++++++++++++
 3 files changed, 838 insertions(+), 0 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc32/power7/memset.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/memset.S


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]