This is the mail archive of the libc-ports@sources.redhat.com mailing list for the libc-ports project.



Re: [PATCH] Optimize MIPS memcpy


On Mon, 2012-09-03 at 02:12 -0700, Andrew T Pinski wrote:
> Forgot to CC libc-ports@ .
> On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
> > This patch improves the MIPS assembly implementations of memcpy.  Two
> > optimizations are added: prefetching of data for subsequent iterations
> > of the memcpy loop, and pipelined expansion of unaligned memcpy.  These
> > optimizations speed up MIPS memcpy by about 10%.
> >
> > The prefetching part is straightforward: it adds prefetching of a cache
> > line (32 bytes) for the +1 iteration in the unaligned case and the +2
> > iteration in the aligned case.  The rationale is that a prefetch takes
> > about the same time to fetch its data as 1 iteration of the unaligned
> > loop or 2 iterations of the aligned loop.  Values for these parameters
> > were tuned on a modern MIPS processor.
> > 
> 
> This might hurt Octeon, as the cache line size there is 128 bytes.  Can
> you say which modern MIPS processor this has been tuned on?  And is
> there a way to not hard-code 32 in the assembly, but use a macro
> instead?
> 
> Thanks,
> Andrew Pinski

I've been looking at the MIPS memcpy and was planning on submitting a
new version based on the one that MIPS submitted to Android.  It has
prefetching like Maxim's, though I found that using the load and 'prepare
for store' hints instead of the 'load streaming' and 'store streaming'
hints gave me better results on the 74K and 24K cores that I did
performance testing on.
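
For reference, these are the 'pref' hint encodings involved (they match the
defines in the attached file; the fragment below is just an illustration,
not code from either patch):

	pref	0,  0(a1)	# plain load prefetch
	pref	4,  0(a1)	# load streamed: prefetch source, little reuse expected
	pref	5,  0(a0)	# store streamed: prefetch destination for writing
	pref	30, 0(a0)	# prepare for store: allocate the destination line
				#   without fetching its old contents from memory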

This version has more unrolling too, and between that and the difference
in hints I got a small performance improvement over Maxim's version on
small copies and a fairly substantial improvement on large ones.

I also merged the 32-bit and 64-bit versions together so we would only
have one copy to maintain.  I haven't tried building it as part of glibc
yet; I have been testing it standalone first and was going to try to
integrate it into glibc and submit it this week or next.  I'll attach it
to this email so folks can look at it, and I will see if I can
parameterize the cache line size.  This one also assumes that each
prefetch brings in 32 bytes.
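
As a rough sketch of that parameterization (PREF_CHUNK_SIZE is a name I am
making up here, not something in the attached file, and the Octeon 128-byte
case would also need the prefetch-ahead distance and PREF_LIMIT retuned):

	#ifndef PREF_CHUNK_SIZE
	# define PREF_CHUNK_SIZE 32	/* bytes brought in by one pref */
	#endif
	# define PREFETCH_FOR_LOAD(chunk, reg) \
	 pref PREF_LOAD_STREAMED, (chunk)*PREF_CHUNK_SIZE(reg)
	# define PREFETCH_FOR_STORE(chunk, reg) \
	 pref PREF_PREPAREFORSTORE, (chunk)*PREF_CHUNK_SIZE(reg)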

Steve Ellcey
sellcey@mips.com
/*
 * Copyright (c) 2009-2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 *
 *  memcpy.S
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#ifdef __BIONIC__
#include "machine/asm.h"
#include "machine/regdef.h"
#define ALLOW_OVERLAP
#define USE_PREFETCH
#else
#ifdef _LIBC
#include <sysdep.h>
#define USE_PREFETCH
#endif
#include <regdef.h>
#include <sys/asm.h>
#if _MIPS_ISA == _MIPS_ISA_MIPS32 || _MIPS_ISA == _MIPS_ISA_MIPS64
#define PREFETCH
#endif
#if _MIPS_SIM == _ABI64
#define USE_DOUBLE
#endif
#endif



/* Some asm.h files do not have the L macro definition.  */
#ifndef L
#if _MIPS_SIM == _ABIO32
# define L(label) $L ## label
#else
# define L(label) .L ## label
#endif
#endif

/* Some regdef.h files do not have the PTR_ADDIU macro definition.  */
#ifndef PTR_ADDIU
#ifdef USE_DOUBLE
#define PTR_ADDIU	daddiu
#else
#define PTR_ADDIU	addiu
#endif
#endif


/*
 * Using PREF_LOAD_STREAMED instead of PREF_LOAD on load prefetches offers
 * a slight performance advantage, while using PREF_PREPAREFORSTORE instead
 * of PREF_STORE_STREAMED or PREF_STORE offers a large performance advantage.
 */

#ifdef USE_PREFETCH
# define PREF_LOAD		0
# define PREF_STORE		1
# define PREF_LOAD_STREAMED	4
# define PREF_STORE_STREAMED	5
# define PREF_LOAD_RETAINED	6
# define PREF_STORE_RETAINED	7
# define PREF_WRITEBACK_INVAL	25
# define PREF_PREPAREFORSTORE	30

/*
 * We double everything when USE_DOUBLE is true so we do 2 prefetches to
 * get 64 bytes in that case.  The assumption is that each individual 
 * prefetch brings in 32 bytes.
 */
#ifdef USE_DOUBLE
# define PREF_CHUNK 64
# define PREFETCH_FOR_LOAD(chunk, reg) \
 pref PREF_LOAD_STREAMED, (chunk)*32(reg); \
 pref PREF_LOAD_STREAMED, ((chunk)+1)*32(reg)
# define PREFETCH_FOR_STORE(chunk, reg) \
 pref PREF_PREPAREFORSTORE, (chunk)*32(reg); \
 pref PREF_PREPAREFORSTORE, ((chunk)+1)*32(reg)
#else
# define PREF_CHUNK 32
# define PREFETCH_FOR_LOAD(chunk, reg) \
 pref PREF_LOAD_STREAMED, (chunk)*32(reg)
# define PREFETCH_FOR_STORE(chunk, reg) \
 pref PREF_PREPAREFORSTORE, (chunk)*32(reg)
#endif
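/*
 * A prepare-for-store prefetch allocates a destination line without first
 * fetching it from memory, so it must never touch a line beyond the end of
 * the destination buffer.  Prefetching therefore stops once the dst pointer
 * is within PREF_LIMIT bytes of the end (see the note above L(loop16w)).
 */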
#define PREF_LIMIT (5 * PREF_CHUNK)
#else
# define PREFETCH_FOR_LOAD(offset, reg)
# define PREFETCH_FOR_STORE(offset, reg)
#endif

/* Allow the routine to be named something else if desired.  */
#ifndef MEMCPY_NAME
#define MEMCPY_NAME memcpy
#endif

/* We use these 32/64 bit registers as temporaries to do the copying.  */
#define REG0 t0
#define REG1 t1
#define REG2 t2
#define REG3 t3
#ifdef USE_DOUBLE
#  define REG4 ta0
#  define REG5 ta1
#  define REG6 ta2
#  define REG7 ta3
#else
#  define REG4 t4
#  define REG5 t5
#  define REG6 t6
#  define REG7 t7
#endif

/* We load/store 64 bits at a time when USE_DOUBLE is true.  */
#ifdef USE_DOUBLE
#  define ST	sd
#  define LD	ld
#if __MIPSEB
#  define LDHI	ldl		/* high part is left in big-endian	*/
#  define STHI	sdl		/* high part is left in big-endian	*/
#  define LDLO	ldr		/* low part is right in big-endian	*/
#  define STLO	sdr		/* low part is right in big-endian	*/
#else
#  define LDHI	ldr		/* high part is right in little-endian	*/
#  define STHI	sdr		/* high part is right in little-endian	*/
#  define LDLO	ldl		/* low part is left in little-endian	*/
#  define STLO	sdl		/* low part is left in little-endian	*/
#endif
#else
#  define ST	sw
#  define LD	lw
#if __MIPSEB
#  define LDHI	lwl		/* high part is left in big-endian	*/
#  define STHI	swl		/* high part is left in big-endian	*/
#  define LDLO	lwr		/* low part is right in big-endian	*/
#  define STLO	swr		/* low part is right in big-endian	*/
#else
#  define LDHI	lwr		/* high part is right in little-endian	*/
#  define STHI	swr		/* high part is right in little-endian	*/
#  define LDLO	lwl		/* low part is left in little-endian	*/
#  define STLO	swl		/* low part is left in little-endian	*/
#endif
#endif
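/*
 * Example: the pair "LDHI reg, 0(a1); LDLO reg, 3(a1)" (7(a1) in 64-bit
 * mode) loads the word or doubleword starting at a1 even when a1 is not
 * aligned: one partial access supplies the leading bytes and the other
 * the trailing bytes.
 */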

/* Bookkeeping values for 32 vs. 64 bit mode.  */
#ifdef USE_DOUBLE
#  define NSIZE 8
#  define NSIZEMASK 0x3f
#  define NSIZEDMASK 0x7f
#else
#  define NSIZE 4
#  define NSIZEMASK 0x1f
#  define NSIZEDMASK 0x3f
#endif
#define UNIT(unit) ((unit)*NSIZE)
#define UNITM1(unit) (((unit)*NSIZE)-1)
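/*
 * For example, UNIT(16) is 64 bytes in 32-bit mode and 128 bytes in 64-bit
 * mode, and UNITM1(1) is the offset of the last byte of the first (d)word,
 * which is the offset the LDLO partial loads use.
 */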

#ifdef __BIONIC__
LEAF(MEMCPY_NAME, 0)
#else
LEAF(MEMCPY_NAME)
#endif
	.set	nomips16
	.set	noreorder
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android
 * like Skia rely on such usage. We call memmove to handle such cases.
 */
#ifdef ALLOW_OVERLAP
	PTR_SUBU t0,a0,a1
	PTR_SRA	t2,t0,31
	xor	t1,t0,t2
	PTR_SUBU t0,t1,t2
	sltu	t2,t0,a2
	beq	t2,zero,L(memcpy)
	la	t9,memmove
	jr	t9
	 nop
L(memcpy):
#endif
/*
 * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
 * size, copy dst pointer to v0 for the return value.
 */
	slti	t2,a2,(2 * NSIZE)
	bne	t2,zero,L(lastb)
	move	v0,a0
/*
 * If src and dst have different alignments, go to L(unaligned), if they
 * have the same alignment (but are not actually aligned) do a partial
 * load/store to make them aligned.  If they are both already aligned
 * we can start copying at L(aligned).
 */
	xor	t8,a1,a0
	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
	bne	t8,zero,L(unaligned)
	PTR_SUBU a3, zero, a0

	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1	  */
	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
	PTR_SUBU a2,a2,a3		/* a2 is the remaining byte count */

	LDHI	t8,0(a1)
	PTR_ADDU a1,a1,a3
	STHI	t8,0(a0)
	PTR_ADDU a0,a0,a3

/*
 * Now dst/src are both aligned to (word or double word) boundaries.
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte 
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
 * equals a3.
 */

L(aligned):
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8	 /* subtract the remainder from a2 */
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */

/* When the loop issues "pref 30,x(a0)", the address a0+x must not be past
 * the "t0-32" address.  This means that for x=128 the last "safe" a0 address
 * is "t0-160", and for x=64 the last "safe" a0 address is "t0-96".  The
 * current version uses "pref 30,128(a0)", so "t0-160" is the limit; v1 is
 * set below whenever a0 has passed it, and the loop then skips the
 * prepare-for-store prefetches.
 */
#ifdef USE_PREFETCH
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREF_LIMIT	/* t9 is the "last safe pref" address */
	PREFETCH_FOR_LOAD  (0, a1)
	PREFETCH_FOR_LOAD  (1, a1)
	PREFETCH_FOR_LOAD  (2, a1)
	PREFETCH_FOR_STORE (1, a0)
	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
	bgtz	v1,L(loop16w)
	nop
#endif
	PREFETCH_FOR_STORE (2, a0)
L(loop16w):
	PREFETCH_FOR_LOAD  (3, a1)
	LD	t0,UNIT(0)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(skip_pref30_96)
#endif
	LD	t1,UNIT(1)(a1)
	PREFETCH_FOR_STORE (3, a0)
L(skip_pref30_96):
	LD	REG2,UNIT(2)(a1)
	LD	REG3,UNIT(3)(a1)
	LD	REG4,UNIT(4)(a1)
	LD	REG5,UNIT(5)(a1)
	LD	REG6,UNIT(6)(a1)
	LD	REG7,UNIT(7)(a1)
	PREFETCH_FOR_LOAD (4, a1)

	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)

	LD	t0,UNIT(8)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(skip_pref30_128)
#endif
	LD	t1,UNIT(9)(a1)
	PREFETCH_FOR_STORE (4, a0)
L(skip_pref30_128):
	LD	REG2,UNIT(10)(a1)
	LD	REG3,UNIT(11)(a1)
	LD	REG4,UNIT(12)(a1)
	LD	REG5,UNIT(13)(a1)
	LD	REG6,UNIT(14)(a1)
	LD	REG7,UNIT(15)(a1)
	PREFETCH_FOR_LOAD (5, a1)
	ST	t0,UNIT(8)(a0)
	ST	t1,UNIT(9)(a0)
	ST	REG2,UNIT(10)(a0)
	ST	REG3,UNIT(11)(a0)
	ST	REG4,UNIT(12)(a0)
	ST	REG5,UNIT(13)(a0)
	ST	REG6,UNIT(14)(a0)
	ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
#ifdef USE_PREFETCH
	sltu	v1,t9,a0
#endif
	bne	a0,a3,L(loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
	move	a2,t8

/* Here we have src and dst word-aligned but less than 64 or 128 bytes to
 * go.  Check for a 32 (or 64) byte chunk and copy it if there is one.
 * Otherwise jump down to L(chk1w) to handle the tail end of the copy.
 */

L(chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk?  */
				/* t8 is the remainder count past 32 bytes */
	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk  */
	nop
	LD	t0,UNIT(0)(a1)
	LD	t1,UNIT(1)(a1)
	LD	REG2,UNIT(2)(a1)
	LD	REG3,UNIT(3)(a1)
	LD	REG4,UNIT(4)(a1)
	LD	REG5,UNIT(5)(a1)
	LD	REG6,UNIT(6)(a1)
	LD	REG7,UNIT(7)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)

/*
 * Here we have less than 32 (or 64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.  Set a2 to count how many
 * bytes we have to copy after all the word (or double word) chunks are
 * copied and a3 to the dst pointer after all the (d)word chunks have
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */
L(chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past the (d)word chunks */
	beq	a2,t8,L(lastb)
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(wordCopy_loop):
	LD	REG3,UNIT(0)(a1)
	PTR_ADDIU a1,a1,UNIT(1)
	PTR_ADDIU a0,a0,UNIT(1)
	bne	a0,a3,L(wordCopy_loop)
	ST	REG3,UNIT(-1)(a0)

/* Copy the last 8 (or 16) bytes */
L(lastb):
	blez	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
L(lastbloop):
	lb	v1,0(a1)
	PTR_ADDIU a1,a1,1
	PTR_ADDIU a0,a0,1
	bne	a0,a3,L(lastbloop)
	sb	v1,-1(a0)
L(leave):
	j	ra
	nop
/*
 * UNALIGNED case, got here with a3 = "negu a0"
 * This code is nearly identical to the aligned code above
 * but only the destination (not the source) gets aligned
 * so we need to do partial loads of the source followed
 * by normal stores to the destination (once we have aligned
 * the destination).
 */

L(unaligned):
	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0 (dst only) */
	beqz	a3,L(ua_chk16w) /* if a3=0, it is already aligned */
	PTR_SUBU a2,a2,a3	/* a2 is the remaining byte count */

	LDHI	v1,UNIT(0)(a1)
	LDLO	v1,UNITM1(1)(a1)
	PTR_ADDU a1,a1,a3
	STHI	v1,UNIT(0)(a0)
	PTR_ADDU a0,a0,a3

/*
 * Now the destination (but not the source) is aligned.
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
 * equals a3.
 */

L(ua_chk16w):
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8	 /* subtract the remainder from a2 */
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */

#ifdef USE_PREFETCH
	PTR_ADDU t0,a0,a2	  /* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREF_LIMIT /* t9 is the "last safe pref" address */
	PREFETCH_FOR_LOAD  (0, a1)
	PREFETCH_FOR_LOAD  (1, a1)
	PREFETCH_FOR_LOAD  (2, a1)
	PREFETCH_FOR_STORE (1, a0)
	sltu	v1,t9,a0
	bgtz	v1,L(ua_loop16w)  /* skip prefetch for too short arrays */
	nop
#endif
	PREFETCH_FOR_STORE (2, a0)
L(ua_loop16w):
	PREFETCH_FOR_LOAD  (3, a1)
	LDHI	t0,UNIT(0)(a1)
	LDLO	t0,UNITM1(1)(a1)
	LDHI	t1,UNIT(1)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(ua_skip_pref30_96)
#endif
	LDLO	t1,UNITM1(2)(a1)
	PREFETCH_FOR_STORE (3, a0)
L(ua_skip_pref30_96):
	LDHI	REG2,UNIT(2)(a1)
	LDLO	REG2,UNITM1(3)(a1)
	LDHI	REG3,UNIT(3)(a1)
	LDLO	REG3,UNITM1(4)(a1)
	LDHI	REG4,UNIT(4)(a1)
	LDLO	REG4,UNITM1(5)(a1)
	LDHI	REG5,UNIT(5)(a1)
	LDLO	REG5,UNITM1(6)(a1)
	LDHI	REG6,UNIT(6)(a1)
	LDLO	REG6,UNITM1(7)(a1)
	LDHI	REG7,UNIT(7)(a1)
	LDLO	REG7,UNITM1(8)(a1)
	PREFETCH_FOR_LOAD (4, a1)
	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)
	LDHI	t0,UNIT(8)(a1)
	LDLO	t0,UNITM1(9)(a1)
	LDHI	t1,UNIT(9)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(ua_skip_pref30_128)
#endif
	LDLO	t1,UNITM1(10)(a1)
	PREFETCH_FOR_STORE (4, a0)
L(ua_skip_pref30_128):
	LDHI	REG2,UNIT(10)(a1)
	LDLO	REG2,UNITM1(11)(a1)
	LDHI	REG3,UNIT(11)(a1)
	LDLO	REG3,UNITM1(12)(a1)
	LDHI	REG4,UNIT(12)(a1)
	LDLO	REG4,UNITM1(13)(a1)
	LDHI	REG5,UNIT(13)(a1)
	LDLO	REG5,UNITM1(14)(a1)
	LDHI	REG6,UNIT(14)(a1)
	LDLO	REG6,UNITM1(15)(a1)
	LDHI	REG7,UNIT(15)(a1)
	LDLO	REG7,UNITM1(16)(a1)
	PREFETCH_FOR_LOAD (5, a1)
	ST	t0,UNIT(8)(a0)
	ST	t1,UNIT(9)(a0)
	ST	REG2,UNIT(10)(a0)
	ST	REG3,UNIT(11)(a0)
	ST	REG4,UNIT(12)(a0)
	ST	REG5,UNIT(13)(a0)
	ST	REG6,UNIT(14)(a0)
	ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
#ifdef USE_PREFETCH
	sltu	v1,t9,a0
#endif
	bne	a0,a3,L(ua_loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
	move	a2,t8

/* Here we have the dst word-aligned but the src is not, with less than 64
 * or 128 bytes to go.  Check for a 32 (or 64) byte chunk and copy it if
 * there is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end
 * of the copy.  */

L(ua_chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk?  */
				  /* t8 is the remainder count past 32 bytes */
	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
	nop
	LDHI	t0,UNIT(0)(a1)
	LDLO	t0,UNITM1(1)(a1)
	LDHI	t1,UNIT(1)(a1)
	LDLO	t1,UNITM1(2)(a1)
	LDHI	REG2,UNIT(2)(a1)
	LDLO	REG2,UNITM1(3)(a1)
	LDHI	REG3,UNIT(3)(a1)
	LDLO	REG3,UNITM1(4)(a1)
	LDHI	REG4,UNIT(4)(a1)
	LDLO	REG4,UNITM1(5)(a1)
	LDHI	REG5,UNIT(5)(a1)
	LDLO	REG5,UNITM1(6)(a1)
	LDHI	REG6,UNIT(6)(a1)
	LDLO	REG6,UNITM1(7)(a1)
	LDHI	REG7,UNIT(7)(a1)
	LDLO	REG7,UNITM1(8)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)
/*
 * Here we have less than 32 (or 64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.
 */
L(ua_chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past the (d)word chunks */
	beq	a2,t8,L(ua_smallCopy)
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(ua_wordCopy_loop):
	LDHI	v1,UNIT(0)(a1)
	LDLO	v1,UNITM1(1)(a1)
	PTR_ADDIU a1,a1,UNIT(1)
	PTR_ADDIU a0,a0,UNIT(1)
	bne	a0,a3,L(ua_wordCopy_loop)
	ST	v1,UNIT(-1)(a0)

/* Copy the last 8 (or 16) bytes */
L(ua_smallCopy):
	beqz	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
L(ua_smallCopy_loop):
	lb	v1,0(a1)
	PTR_ADDIU a1,a1,1
	PTR_ADDIU a0,a0,1
	bne	a0,a3,L(ua_smallCopy_loop)
	sb	v1,-1(a0)

	j	ra
	nop

	.set	at
	.set	reorder
END(MEMCPY_NAME)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/
