This is the mail archive of the libc-ports@sources.redhat.com mailing list for the libc-ports project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [patch, mips] Improved memset for MIPS

From: Steve Ellcey <sellcey at mips dot com>
To: "Joseph S. Myers" <joseph at codesourcery dot com>, Carlos O'Donell <carlos at redhat dot com>
Cc: <libc-ports at sourceware dot org>
Date: Wed, 18 Sep 2013 10:40:35 -0700
Subject: Re: [patch, mips] Improved memset for MIPS
Authentication-results: sourceware.org; auth=none
References: <93a232b5-9d0b-4a27-bbb5-16e3ae7c4b89 at BAMAIL02 dot ba dot imgtec dot org> <Pine dot LNX dot 4 dot 64 dot 1309061430150 dot 5886 at digraph dot polyomino dot org dot uk> <1378483039 dot 5770 dot 302 dot camel at ubuntu-sellcey> <Pine dot LNX dot 4 dot 64 dot 1309061603380 dot 8532 at digraph dot polyomino dot org dot uk> <1378486241 dot 5770 dot 327 dot camel at ubuntu-sellcey> <Pine dot LNX dot 4 dot 64 dot 1309061653280 dot 8532 at digraph dot polyomino dot org dot uk>

Here is an updated version of my new MIPS memset.S routine.  I fixed the
format of the comments and the ifdef indenting and I ran 'make check'
and 'make bench' on little endian and big endian systems with the o32,
n32, and n64 ABIs.  The testing did find a bug that my original testing
missed and I have fixed that bug (it involved a negative value as the
constant being set).  Other then that, the only failures I saw were the
expected check-localplt and check-execstack errors.

I don't know if you want to see all the performance results from
bench-memset.out since it has a lot of output, but looking at the
average time for 131072 byte memsets, the original libc in o32 little
endian mode averaged 43732 (seconds I guess) and the new one was
27365.  n32 went from 21886 to 21881 and n64 went from 21882 to 21877.
So the 64 bit numbers only improved a little, but the 32 bit version
shows a very nice improvement.

Steve Ellcey
sellcey@mips.com


2013-09-18  Steve Ellcey  <sellcey@mips.com>

	* sysdeps/mips/memset.S: Change prefetching and add loop unrolling. 
	* sysdeps/mips/mips64/memset.S: Remove.

diff --git a/ports/sysdeps/mips/memset.S b/ports/sysdeps/mips/memset.S
index 85062fe..3e9c88f 100644
--- a/ports/sysdeps/mips/memset.S
+++ b/ports/sysdeps/mips/memset.S
@@ -1,6 +1,5 @@
-/* Copyright (C) 2002-2013 Free Software Foundation, Inc.
+/* Copyright (C) 2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -16,70 +15,353 @@
    License along with the GNU C Library.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
+#ifdef ANDROID_CHANGES
+# include "machine/asm.h"
+# include "machine/regdef.h"
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _LIBC
+# include <sysdep.h>
+# include <regdef.h>
+# include <sys/asm.h>
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _COMPILING_NEWLIB
+# include "machine/asm.h"
+# include "machine/regdef.h"
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#else
+# include <regdef.h>
+# include <sys/asm.h>
+#endif
 
-	.set	nomips16
+/* Check to see if the MIPS architecture we are compiling for supports
+   prefetching.  */
+
+#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
+# ifndef DISABLE_PREFETCH
+#  define USE_PREFETCH
+# endif
+#endif
+
+#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
+# ifndef DISABLE_DOUBLE
+#  define USE_DOUBLE
+# endif
+#endif
+
+#ifndef USE_DOUBLE
+# ifndef DISABLE_DOUBLE_ALIGN
+#  define DOUBLE_ALIGN
+# endif
+#endif
+
+/* Some asm.h files do not have the L macro definition.  */
+#ifndef L
+# if _MIPS_SIM == _ABIO32
+#  define L(label) $L ## label
+# else
+#  define L(label) .L ## label
+# endif
+#endif
+
+/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
+#ifndef PTR_ADDIU
+# ifdef USE_DOUBLE
+#  define PTR_ADDIU	daddiu
+# else
+#  define PTR_ADDIU	addiu
+# endif
+#endif
 
-/* void *memset(void *s, int c, size_t n).  */
+/* Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
+   or PREFETCH_STORE_STREAMED offers a large performance advantage
+   but PREPAREFORSTORE has some special restrictions to consider.
 
-#if __MIPSEB
-# define SWHI	swl		/* high part is left in big-endian	*/
+   Prefetch with the 'prepare for store' hint does not copy a memory
+   location into the cache, it just allocates a cache line and zeros
+   it out.  This means that if you do not write to the entire cache
+   line before writing it out to memory some data will get zero'ed out
+   when the cache line is written back to memory and data will be lost.
+
+   There are ifdef'ed sections of this memcpy to make sure that it does not
+   do prefetches on cache lines that are not going to be completely written.
+   This code is only needed and only used when PREFETCH_STORE_HINT is set to
+   PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
+   less than MAX_PREFETCH_SIZE bytes and if the cache line is larger it will
+   not work correctly.  */
+
+#ifdef USE_PREFETCH
+# define PREFETCH_HINT_STORE		1
+# define PREFETCH_HINT_STORE_STREAMED	5
+# define PREFETCH_HINT_STORE_RETAINED	7
+# define PREFETCH_HINT_PREPAREFORSTORE	30
+
+/* If we have not picked out what hints to use at this point use the
+   standard load and store prefetch hints.  */
+# ifndef PREFETCH_STORE_HINT
+#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
+# endif
+
+/* We double everything when USE_DOUBLE is true so we do 2 prefetches to
+   get 64 bytes in that case.  The assumption is that each individual
+   prefetch brings in 32 bytes.  */
+# ifdef USE_DOUBLE
+#  define PREFETCH_CHUNK 64
+#  define PREFETCH_FOR_STORE(chunk, reg) \
+    pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
+    pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
+# else
+#  define PREFETCH_CHUNK 32
+#  define PREFETCH_FOR_STORE(chunk, reg) \
+    pref PREFETCH_STORE_HINT, (chunk)*32(reg)
+# endif
+
+/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
+   than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
+   of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
+   hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
+   used than MAX_PREFETCH_SIZE does not matter.  */
+# define MAX_PREFETCH_SIZE 128
+/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
+   than 5 on a STORE prefetch and that a single prefetch can never be larger
+   than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
+   we actually do two prefetches in that case, one 32 bytes after the other.  */
+# ifdef USE_DOUBLE
+#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
+# else
+#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
+# endif
+
+# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
+    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
+/* We cannot handle this because the initial prefetches may fetch bytes that
+   are before the buffer being copied.  We start copies with an offset
+   of 4 so avoid this situation when using PREPAREFORSTORE.  */
+#  error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
+# endif
+#else /* USE_PREFETCH not defined */
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef MEMSET_NAME
+# define MEMSET_NAME memset
+#endif
+
+/* We load/store 64 bits at a time when USE_DOUBLE is true.
+   The C_ prefix stands for CHUNK and is used to avoid macro name
+   conflicts with system header files.  */
+
+#ifdef USE_DOUBLE
+# define C_ST	sd
+# if __MIPSEB
+#  define C_STHI	sdl	/* high part is left in big-endian	*/
+# else
+#  define C_STHI	sdr	/* high part is right in little-endian	*/
+# endif
 #else
-# define SWHI	swr		/* high part is right in little-endian	*/
+# define C_ST	sw
+# if __MIPSEB
+#  define C_STHI	swl	/* high part is left in big-endian	*/
+# else
+#  define C_STHI	swr	/* high part is right in little-endian	*/
+# endif
 #endif
 
-ENTRY (memset)
+/* Bookkeeping values for 32 vs. 64 bit mode.  */
+#ifdef USE_DOUBLE
+# define NSIZE 8
+# define NSIZEMASK 0x3f
+# define NSIZEDMASK 0x7f
+#else
+# define NSIZE 4
+# define NSIZEMASK 0x1f
+# define NSIZEDMASK 0x3f
+#endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
+
+#ifdef ANDROID_CHANGES
+LEAF(MEMSET_NAME,0)
+#else
+LEAF(MEMSET_NAME)
+#endif
+
+	.set	nomips16
 	.set	noreorder
+/* If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
+   size, copy dst pointer to v0 for the return value.  */
+	slti	t2,a2,(2 * NSIZE)
+	bne	t2,zero,L(lastb)
+	move	v0,a0
+
+/* If memset value is not zero, we copy it to all the bytes in a 32 or 64
+   bit word.  */
+	beq	a1,zero,L(set0)		/* If memset value is zero no smear  */
+	PTR_SUBU a3,zero,a0
+	nop
+
+	/* smear byte into 32 or 64 bit word */
+#if ((__mips == 64) || (__mips == 32)) && (__mips_isa_rev >= 2)
+# ifdef USE_DOUBLE
+	dins	a1, a1, 8, 8        /* Replicate fill byte into half-word.  */
+	dins	a1, a1, 16, 16      /* Replicate fill byte into word.       */
+	dins	a1, a1, 32, 32      /* Replicate fill byte into dbl word.   */
+# else
+	ins	a1, a1, 8, 8        /* Replicate fill byte into half-word.  */
+	ins	a1, a1, 16, 16      /* Replicate fill byte into word.       */
+# endif
+#else
+# ifdef USE_DOUBLE
+        and     a1,0xff
+	dsll	t2,a1,8
+	or	a1,t2
+	dsll	t2,a1,16
+	or	a1,t2
+	dsll	t2,a1,32
+	or	a1,t2
+# else
+        and     a1,0xff
+	sll	t2,a1,8
+	or	a1,t2
+	sll	t2,a1,16
+	or	a1,t2
+# endif
+#endif
+
+/* If the destination address is not aligned do a partial store to get it
+   aligned.  If it is already aligned just jump to L(aligned).  */
+L(set0):
+	andi	t2,a3,(NSIZE-1)		/* word-unaligned address?          */
+	beq	t2,zero,L(aligned)	/* t2 is the unalignment count      */
+	PTR_SUBU a2,a2,t2
+	C_STHI	a1,0(a0)
+	PTR_ADDU a0,a0,t2
+
+L(aligned):
+/* If USE_DOUBLE is not set we may still want to align the data on a 16
+   byte boundry instead of an 8 byte boundry to maximize the opportunity
+   of proAptive chips to do memory bonding (combining two sequential 4
+   byte stores into one 8 byte store).  We know there are at least 4 bytes
+   left to store or we would have jumped to L(lastb) earlier in the code.  */
+#ifdef DOUBLE_ALIGN
+	andi	t2,a3,4
+	beq	t2,zero,L(double_aligned)
+	PTR_SUBU a2,a2,t2
+	sw	a1,0(a0)
+	PTR_ADDU a0,a0,t2
+L(double_aligned):
+#endif
 
-	slti	t1, a2, 8		# Less than 8?
-	bne	t1, zero, L(last8)
-	move	v0, a0			# Setup exit value before too late
-
-	beq	a1, zero, L(ueven)	# If zero pattern, no need to extend
-	andi	a1, 0xff		# Avoid problems with bogus arguments
-	sll	t0, a1, 8
-	or	a1, t0
-	sll	t0, a1, 16
-	or	a1, t0			# a1 is now pattern in full word
-
-L(ueven):
-	subu	t0, zero, a0		# Unaligned address?
-	andi	t0, 0x3
-	beq	t0, zero, L(chkw)
-	subu	a2, t0
-	SWHI	a1, 0(a0)		# Yes, handle first unaligned part
-	addu	a0, t0			# Now both a0 and a2 are updated
+/* Now the destination is aligned to (word or double word) aligned address
+   Set a2 to count how many bytes we have to copy after all the 64/128 byte
+   chunks are copied and a3 to the dest pointer after all the 64/128 byte
+   chunks have been copied.  We will loop, incrementing a0 until it equals
+   a3.  */
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the reminder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
 
+/* When in the loop we may prefetch with the 'prepare to store' hint,
+   in this case the a0+x should not be past the "t0-32" address.  This
+   means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
+   for x=64 the last "safe" a0 address is "t0-96" In the current version we
+   will use "prefetch hint,128(a0)", so "t0-160" is the limit.  */
+#if defined(USE_PREFETCH) \
+    && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
+#endif
+#if defined(USE_PREFETCH) \
+    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
+	PREFETCH_FOR_STORE (1, a0)
+	PREFETCH_FOR_STORE (2, a0)
+	PREFETCH_FOR_STORE (3, a0)
+#endif
+
+L(loop16w):
+#if defined(USE_PREFETCH) \
+    && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
+	bgtz	v1,L(skip_pref)
+	nop
+#endif
+	PREFETCH_FOR_STORE (4, a0)
+	PREFETCH_FOR_STORE (5, a0)
+L(skip_pref):
+	C_ST	a1,UNIT(0)(a0)
+	C_ST	a1,UNIT(1)(a0)
+	C_ST	a1,UNIT(2)(a0)
+	C_ST	a1,UNIT(3)(a0)
+	C_ST	a1,UNIT(4)(a0)
+	C_ST	a1,UNIT(5)(a0)
+	C_ST	a1,UNIT(6)(a0)
+	C_ST	a1,UNIT(7)(a0)
+	C_ST	a1,UNIT(8)(a0)
+	C_ST	a1,UNIT(9)(a0)
+	C_ST	a1,UNIT(10)(a0)
+	C_ST	a1,UNIT(11)(a0)
+	C_ST	a1,UNIT(12)(a0)
+	C_ST	a1,UNIT(13)(a0)
+	C_ST	a1,UNIT(14)(a0)
+	C_ST	a1,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+	bne	a0,a3,L(loop16w)
+	nop
+	move	a2,t8
+
+/* Here we have dest word-aligned but less than 64-bytes or 128 bytes to go.
+   Check for a 32(64) byte chunk and copy if if there is one.  Otherwise
+   jump down to L(chk1w) to handle the tail end of the copy.  */
 L(chkw):
-	andi	t0, a2, 0x7		# Enough left for one loop iteration?
-	beq	t0, a2, L(chkl)
-	subu	a3, a2, t0
-	addu	a3, a0			# a3 is last loop address +1
-	move	a2, t0			# a2 is now # of bytes left after loop
-L(loopw):
-	addiu	a0, 8			# Handle 2 words pr. iteration
-	sw	a1, -8(a0)
-	bne	a0, a3, L(loopw)
-	sw	a1, -4(a0)
-
-L(chkl):
-	andi	t0, a2, 0x4		# Check if there is at least a full
-	beq	t0, zero, L(last8)	#  word remaining after the loop
-	subu	a2, t0
-	sw	a1, 0(a0)		# Yes...
-	addiu	a0, 4
-
-L(last8):
-	blez	a2, L(exit)		# Handle last 8 bytes (if cnt>0)
-	addu	a3, a2, a0		# a3 is last address +1
-L(lst8l):
-	addiu	a0, 1
-	bne	a0, a3, L(lst8l)
-	sb	a1, -1(a0)
-L(exit):
-	j	ra			# Bye, bye
+	andi	t8,a2,NSIZEMASK	/* is there a 32-byte/64-byte chunk.  */
+				/* the t8 is the reminder count past 32-bytes */
+	beq	a2,t8,L(chk1w)/* when a2==t8, no 32-byte chunk */
+	nop
+	C_ST	a1,UNIT(0)(a0)
+	C_ST	a1,UNIT(1)(a0)
+	C_ST	a1,UNIT(2)(a0)
+	C_ST	a1,UNIT(3)(a0)
+	C_ST	a1,UNIT(4)(a0)
+	C_ST	a1,UNIT(5)(a0)
+	C_ST	a1,UNIT(6)(a0)
+	C_ST	a1,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
+
+/* Here we have less than 32(64) bytes to set.  Set up for a loop to
+   copy one word (or double word) at a time.  Set a2 to count how many
+   bytes we have to copy after all the word (or double word) chunks are
+   copied and a3 to the dest pointer after all the (d)word chunks have
+   been copied.  We will loop, incrementing a0 until a0 equals a3.  */
+L(chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the reminder past one (d)word chunks */
+	beq	a2,t8,L(lastb)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8 byte chunks) */
+L(wordCopy_loop):
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(wordCopy_loop)
+	C_ST	a1,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+	blez	a2,L(leave)
+	PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
+L(lastbloop):
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(lastbloop)
+	sb	a1,-1(a0)
+L(leave):
+	j	ra
 	nop
 
+	.set	at
 	.set	reorder
-END (memset)
-libc_hidden_builtin_def (memset)
+END(MEMSET_NAME)
+#ifndef ANDROID_CHANGES
+# ifdef _LIBC
+libc_hidden_builtin_def (MEMSET_NAME)
+# endif
+#endif

Follow-Ups:
- Re: [patch, mips] Improved memset for MIPS
  - From: Carlos O'Donell
- Re: [patch, mips] Improved memset for MIPS
  - From: Joseph S. Myers

References:
- [patch, mips] Improved memset for MIPS
  - From: Steve Ellcey
- Re: [patch, mips] Improved memset for MIPS
  - From: Joseph S. Myers
- Re: [patch, mips] Improved memset for MIPS
  - From: Steve Ellcey
- Re: [patch, mips] Improved memset for MIPS
  - From: Joseph S. Myers
- Re: [patch, mips] Improved memset for MIPS
  - From: Steve Ellcey
- Re: [patch, mips] Improved memset for MIPS
  - From: Joseph S. Myers

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]