This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] memset.S for PowerPC


Here is the revised memset patch for PowerPC (32-bit).  If this meets
your approval I'll start revising my PowerPC64 patches into this form.

To test this patch I devised a simple performance test that took parts of
./string/tester.c and added my own in-cache and cache-rollover performance 
tests and timers. 

The before test uses libc.so from the Suse SLES 7.0 distribution which 
appears to use ./sysdeps/generic/memset.c. Presumably so it will work 
correctly on all powerpc platforms. The results are:

start alignment test
unaligned memset      796.875 MB per sec
4kb buffer @0x40029000
page aligned bzero     1448.373 MB per sec
page aligned memset     1469.154 MB per sec
16mb buffer @0x41029000
page aligned bzero      278.564 MB per sec
page aligned memset      279.552 MB per sec

The after (this patch) test results are:

start alignment test
unaligned memset      796.875 MB per sec
4kb buffer @0x40002000
page aligned bzero     2221.258 MB per sec
page aligned memset     1523.810 MB per sec
16mb buffer @0x41002000
page aligned bzero      646.873 MB per sec
page aligned memset      353.469 MB per sec

Both tests were run on an IBM 7044-170, POWER3 (630+) 400MHz, with 512MB RAM.
This is a 64-bit system running the 64-bit 2.4.19-rc3 Linux kernel.


2002-08-20  Steven Munroe  <sjmunroe@us.ibm.com>
	* sysdeps/powerpc/elf/libc-start.c : Scan Aux Vector for 
	AT_DCACHEBSIZE and copy value to __cache_line_size.
	* sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c : Scan Aux Vector for 
	AT_DCACHEBSIZE and copy value to __cache_line_size.
	* sysdeps/powerpc/memset.S : Define __cache_line_size and use its
	value to select the correct stride for dcbz.
	
>>>>>>>>
diff -rc2P glibc-2.2.5/sysdeps/powerpc/elf/libc-start.c glibc-2.2.5-memset/sysdeps/powerpc/elf/libc-start.c
*** glibc-2.2.5/sysdeps/powerpc/elf/libc-start.c	Thu Jul  5 23:56:01 2001
--- glibc-2.2.5-memset/sysdeps/powerpc/elf/libc-start.c	Wed Aug 21 13:31:27 2002
***************
*** 27,30 ****
--- 27,34 ----
  extern int _dl_starting_up;
  weak_extern (_dl_starting_up)
+ 
+ extern int __cache_line_size;
+ weak_extern (__cache_line_size)
+ 
  extern int __libc_multiple_libcs;
  extern void *__libc_stack_end;
***************
*** 38,41 ****
--- 42,66 ----
  };
  
+ /* Scan the Aux Vector for the "Data Cache Block Size" entry.  If found
+    verify that the static extern __cache_line_size is defined by checking
+    for not NULL.  If it is defined then assign the cache block size 
+    value to __cache_line_size.  */
+ static inline void
+ __aux_init_cache (ElfW(auxv_t) *av)
+ {
+   for (; av->a_type != AT_NULL; ++av)
+     switch (av->a_type)
+       {
+       case AT_DCACHEBSIZE:
+         {
+ 	        int *cls = & __cache_line_size;
+           if (cls != NULL)
+             *cls = av->a_un.a_val;
+ 		    }
+         break;
+       }
+ }
+ 
+ 
  int
  /* GKM FIXME: GCC: this should get __BP_ prefix by virtue of the
***************
*** 43,47 ****
  BP_SYM (__libc_start_main) (int argc, char *__unbounded *__unbounded ubp_av,
  		   char *__unbounded *__unbounded ubp_ev,
! 		   void *__unbounded auxvec, void (*rtld_fini) (void),
  		   struct startup_info *__unbounded stinfo,
  		   char *__unbounded *__unbounded stack_on_entry)
--- 68,72 ----
  BP_SYM (__libc_start_main) (int argc, char *__unbounded *__unbounded ubp_av,
  		   char *__unbounded *__unbounded ubp_ev,
! 		   ElfW(auxv_t) *__unbounded auxvec, void (*rtld_fini) (void),
  		   struct startup_info *__unbounded stinfo,
  		   char *__unbounded *__unbounded stack_on_entry)
***************
*** 67,70 ****
--- 92,96 ----
    if (*stack_on_entry != NULL)
      {
+       char *__unbounded *__unbounded temp;
        /* ...in which case, we have argc as the top thing on the
  	 stack, followed by argv (NULL-terminated), envp (likewise),
***************
*** 73,80 ****
        ubp_av = stack_on_entry + 1;
        ubp_ev = ubp_av + argc + 1;
!       auxvec = ubp_ev;
!       while (*(char *__unbounded *__unbounded) auxvec != NULL)
! 	++auxvec;
!       ++auxvec;
  #ifndef SHARED
        _dl_aux_init ((ElfW(auxv_t) *) auxvec);
--- 99,108 ----
        ubp_av = stack_on_entry + 1;
        ubp_ev = ubp_av + argc + 1;
!       temp = ubp_ev;
!       while (*temp != NULL)
!         ++temp;
!       auxvec = (ElfW(auxv_t) *)++temp;
!       
! 
  #ifndef SHARED
        _dl_aux_init ((ElfW(auxv_t) *) auxvec);
***************
*** 84,87 ****
--- 112,118 ----
  
    INIT_ARGV_and_ENVIRON;
+     
+   /* Initialize the __cache_line_size variable from the aux vector.  */
+   __aux_init_cache((ElfW(auxv_t) *) auxvec);
  
    /* Store something that has some relationship to the end of the
diff -rc2P glibc-2.2.5/sysdeps/powerpc/memset.S glibc-2.2.5-memset/sysdeps/powerpc/memset.S
*** glibc-2.2.5/sysdeps/powerpc/memset.S	Thu Jul  5 23:56:01 2001
--- glibc-2.2.5-memset/sysdeps/powerpc/memset.S	Wed Aug 21 12:19:04 2002
***************
*** 22,31 ****
  #include <bp-asm.h>
  
  /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
     Returns 's'.
  
!    The memset is done in three sizes: byte (8 bits), word (32 bits),
!    cache line (256 bits). There is a special case for setting cache lines
!    to 0, to take advantage of the dcbz instruction.  */
  
  EALIGN (BP_SYM (memset), 5, 1)
--- 22,45 ----
  #include <bp-asm.h>
  
+ /* Define a global static that can hold the cache line size.  The 
+    assumption is that startup code will access the "aux vector" to
+    obtain the value set by the kernel and store it into this 
+    variable. */
+    
+ 	.globl __cache_line_size
+ 	.section	".data","aw"
+ 	.align 2
+ 	.type	 __cache_line_size,@object
+ 	.size	 __cache_line_size,4
+ __cache_line_size:
+ 	.long 0
+ 	.section	".text"
  /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
     Returns 's'.
  
!    The memset is done in four sizes: byte (8 bits), word (32 bits),
!    32-byte blocks (256 bits) and __cache_line_size (128, 256, 1024 bits).
!    There is a special case for setting whole cache lines to 0, which 
!    takes advantage of the dcbz instruction.  */
  
  EALIGN (BP_SYM (memset), 5, 1)
***************
*** 51,54 ****
--- 65,72 ----
  #define rNEG32	r9	/* constant -32 for clearing with dcbz */
  
+ #define rGOT	r9	/* Address of the Global Offset Table.  */
+ #define rCLS	r8	/* Cache line size obtained from static.  */
+ #define rCLM	r9	/* Cache line size mask to check for cache alignment.  */
+ 
  #if __BOUNDED_POINTERS__
  	cmplwi	cr1, rRTN, 0
***************
*** 106,110 ****
  	clrrwi.	rALIGN, rLEN, 5
  	mtcrf	0x01, rLEN	/* 40th instruction from .align */
! 	beq	cr1, L(zloopstart) /* special case for clearing memory using dcbz */
  	srwi	rTMP, rALIGN, 5
  	mtctr	rTMP
--- 124,138 ----
  	clrrwi.	rALIGN, rLEN, 5
  	mtcrf	0x01, rLEN	/* 40th instruction from .align */
! 	
! /* Check if we can use the special case for clearing memory using dcbz.
!    This requires that we know the correct cache line size for this    
!    processor.  Getting the __cache_line_size may require establishing GOT
!    addressability, so branch out of line to set this up.  */
! 	beq	cr1, L(checklinesize) 
! 	
! /* Store blocks of 32-bytes (256-bits) starting on a 32-byte boundary. 
!    Can't assume that rCHR is zero or that the cache line size is either
!    32-bytes or even known.  */
! L(nondcbz):
  	srwi	rTMP, rALIGN, 5
  	mtctr	rTMP
***************
*** 115,119 ****
  	bdz	L(cloopdone)	/* 48th instruction from .align */
  
! L(c3):	dcbz	rNEG64, rMEMP
  	stw	rCHR, -4(rMEMP)
  	stw	rCHR, -8(rMEMP)
--- 143,149 ----
  	bdz	L(cloopdone)	/* 48th instruction from .align */
  
! /* We can't use dcbz here as we don't know the cache line size.  We can
!    use "data cache block touch for store", which is safe.  */
! L(c3):	dcbtst rNEG64, rMEMP
  	stw	rCHR, -4(rMEMP)
  	stw	rCHR, -8(rMEMP)
***************
*** 143,147 ****
  	.align 5
  	nop
! /* Clear lines of memory in 128-byte chunks.  */
  L(zloopstart):
  	clrlwi	rLEN, rLEN, 27
--- 173,180 ----
  	.align 5
  	nop
! /* Clear cache lines of memory in 128-byte chunks.  
!    This code is optimized for processors with 32-byte cache lines.
!    It is further optimized for the 601 processor, which requires
!    some care in how the code is aligned in the i-cache.  */
  L(zloopstart):
  	clrlwi	rLEN, rLEN, 27
***************
*** 227,229 ****
--- 260,338 ----
  	stw	rCHR, -8(rMEMP)
  	blr
+ 	
+ L(checklinesize):
+ #ifdef SHARED
+ 	mflr rTMP
+ /* If the remaining length is less than 32 bytes then don't bother getting
+ 	 the cache line size.  */
+ 	beq	L(medium)	
+ /* Establishes GOT addressability so we can load __cache_line_size 
+    from static. This value was set from the aux vector during startup.  */
+ 	bl   _GLOBAL_OFFSET_TABLE_@local-4
+ 	mflr rGOT
+ 	lwz	 rGOT,__cache_line_size@got(rGOT)
+ 	lwz	 rCLS,0(rGOT)
+ 	mtlr rTMP
+ #else 
+ /* Load __cache_line_size from static. This value was set from the 
+    aux vector during startup.  */
+ 	lis	 rCLS,__cache_line_size@ha
+ /* If the remaining length is less than 32 bytes then don't bother getting
+ 	 the cache line size.  */
+ 	beq	L(medium)
+ 	lwz  rCLS,__cache_line_size@l(rCLS)
+ #endif
+ 	
+ /* If the cache line size was not set then go to L(nondcbz), which is
+ 	safe for any cache line size.  */	
+ 	cmplwi cr1,rCLS,0
+ 	beq	cr1,L(nondcbz)
+ 	
+ /* If the cache line size is 32 bytes then go to L(zloopstart),
+ 	 which is coded specifically for 32-byte lines (and 601).  */	
+ 	cmplwi cr1,rCLS,32
+ 	beq	cr1,L(zloopstart)
+ 	
+ /* Now we know the cache line size and it is not 32-bytes.  However 
+ 	 we may not yet be aligned to the cache line and may have a partial 
+ 	 line to fill.  Touch it 1st to fetch the cache line.  */	
+ 	dcbtst 0,rMEMP	
+ 	
+ 	addi rCLM,rCLS,-1
+ L(getCacheAligned):
+ 	cmplwi cr1,rLEN,32
+ 	and. rTMP,rCLM,rMEMP
+ 	blt	 cr1,L(handletail32)
+ 	beq	 L(cacheAligned)
+ /* We are not aligned to the start of a cache line yet.  Store 32 bytes
+    of data and test again.  */
+ 	addi rMEMP,rMEMP,32
+ 	addi rLEN,rLEN,-32
+ 	stw	 rCHR,-32(rMEMP)
+ 	stw	 rCHR,-28(rMEMP)
+ 	stw	 rCHR,-24(rMEMP)
+ 	stw	 rCHR,-20(rMEMP)
+ 	stw	 rCHR,-16(rMEMP)
+ 	stw	 rCHR,-12(rMEMP)
+ 	stw	 rCHR,-8(rMEMP)
+ 	stw	 rCHR,-4(rMEMP)
+ 	b	 L(getCacheAligned)
+ 	
+ /* Now we are aligned to the cache line and can use dcbz.  */	
+ L(cacheAligned):
+ 	cmplw cr1,rLEN,rCLS
+ 	blt	 cr1,L(handletail32)
+ 	dcbz 0,rMEMP
+ 	subf rLEN,rCLS,rLEN
+ 	add	 rMEMP,rMEMP,rCLS
+ 	b	 L(cacheAligned)
+ 
+ /* We are here because the cache line size was set, it is not 
+    32 bytes, and the remainder (rLEN) is now less than the actual cache 
+    line size.  Set up the preconditions for L(nondcbz) and go there to 
+    store the remaining bytes.  */			
+ L(handletail32):
+ 	clrrwi.	rALIGN, rLEN, 5
+ 	b		L(nondcbz)
+ 		
  END (BP_SYM (memset))
diff -rc2P glibc-2.2.5/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c glibc-2.2.5-memset/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c
*** glibc-2.2.5/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c	Thu Jul  5 23:56:19 2001
--- glibc-2.2.5-memset/sysdeps/unix/sysv/linux/powerpc/dl-sysdep.c	Wed Aug 21 13:02:54 2002
***************
*** 21,24 ****
--- 21,50 ----
  #include "config.h"
  #include "kernel-features.h"
+ #include <ldsodefs.h>
+ 
+ extern int __cache_line_size;
+ weak_extern (__cache_line_size)
+ 
+ #define DL_PLATFORM_INIT __aux_init_cache(_dl_auxv)
+ 
+ /* Scan the Aux Vector for the "Data Cache Block Size" entry.  If found
+    verify that the static extern __cache_line_size is defined by checking
+    for not NULL.  If it is defined then assign the cache block size 
+    value to __cache_line_size.  */
+ static inline void
+ __aux_init_cache (ElfW(auxv_t) *av)
+ {
+   for (; av->a_type != AT_NULL; ++av)
+     switch (av->a_type)
+       {
+         case AT_DCACHEBSIZE:
+       	  {
+ 			      int *cls = & __cache_line_size;
+             if (cls != NULL)
+               *cls = av->a_un.a_val;
+           }
+ 		    break;
+       }
+ }
  
  #ifndef __ASSUME_STD_AUXV


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]