This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


[PATCH] Rewritten v9/64-bit sparc strcmp.


This new code is heavily inspired by the powerpc 64-bit base strcmp.

It's faster than the existing code, especially on Niagara CPUs,
since the number of branches has been minimized to reduce CPU
thread switching.

On UltraSPARC-3 the tail code executes in a constant 5 cycles,
regardless of where the mismatching/zero byte is.  The main aligned
loop executes in 4 cycles, which with a 2 cycle load latency is
essentially optimal.

Committed to master.
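
The sketch above covers only the case where the two pointers end up with
the same 8-byte alignment.  In the mismatched case ("the barrel-shifter
version" in the comment below), the second string is still read with
aligned 64-bit loads, and each logical 8-byte chunk is assembled from two
consecutive aligned words.  A minimal illustration in C (again, the names
are mine, not the patch's):

#include <stdint.h>

/* 'prev' and 'next' are consecutive aligned big-endian words covering
   the misaligned string; sll = 8 * the pointers' relative misalignment
   in bytes ((s2 - s1) & 7), srl = 64 - sll, i.e. the shift counts the
   patch keeps in rSLL and rSRL.  In this path sll is always 8..56,
   never 0, so both shifts are well defined.  This mirrors the
   sllx/srlx/or sequence in .Lbarrel_shift_loop.  */
static uint64_t
merge_unaligned_word (uint64_t prev, uint64_t next,
                      unsigned int sll, unsigned int srl)
{
  return (prev << sll) | (next >> srl);
}

The entry code additionally takes an all-ones value shifted right by the
first string's byte offset within its word and uses orn to force the
garbage bytes read from before the true start of the strings to 0xff in
both words, so they can never produce a spurious NUL or mismatch.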

---
 ChangeLog                      |    4 +
 sysdeps/sparc/sparc64/strcmp.S |  416 ++++++++++++++++------------------------
 2 files changed, 173 insertions(+), 247 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4fde8c2..ab754e5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2011-08-24  David S. Miller  <davem@davemloft.net>
+
+	* sysdeps/sparc/sparc64/strcmp.S: Rewrite.
+
 2011-08-24  Andreas Schwab  <schwab@redhat.com>
 
 	* elf/Makefile: Add rules to build and run unload8 test.
diff --git a/sysdeps/sparc/sparc64/strcmp.S b/sysdeps/sparc/sparc64/strcmp.S
index fade4c4..263bb40 100644
--- a/sysdeps/sparc/sparc64/strcmp.S
+++ b/sysdeps/sparc/sparc64/strcmp.S
@@ -1,9 +1,8 @@
 /* Compare two strings for differences.
    For SPARC v9.
-   Copyright (C) 1997, 1999, 2003 Free Software Foundation, Inc.
+   Copyright (C) 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz> and
-                  Jakub Jelinek <jj@ultra.linux.cz>.
+   Contributed by David S. Miller <davem@davemloft.net>
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -22,259 +21,182 @@
 
 #include <sysdep.h>
 #include <asm/asi.h>
+
 #ifndef XCC
 	.register	%g2, #scratch
 	.register	%g3, #scratch
 	.register	%g6, #scratch
 #endif
 
-	/* Normally, this uses
-	   ((xword - 0x0101010101010101) & 0x8080808080808080) test
-	   to find out if any byte in xword could be zero. This is fast, but
-	   also gives false alarm for any byte in range 0x81-0xff. It does
-	   not matter for correctness, as if this test tells us there could
-	   be some zero byte, we check it byte by byte, but if bytes with
-	   high bits set are common in the strings, then this will give poor
-	   performance. You can #define EIGHTBIT_NOT_RARE and the algorithm
-	   will use one tick slower, but more precise test
-	   ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080),
-	   which does not give any false alarms (but if some bits are set,
-	   one cannot assume from it which bytes are zero and which are not).
-	   It is yet to be measured, what is the correct default for glibc
-	   in these days for an average user.
+#define rSTR1		%o0
+#define rSTR2		%o1
+#define r0101		%o2	/* 0x0101010101010101 */
+#define r8080		%o3	/* 0x8080808080808080 */
+#define rSTRXOR		%o4
+#define rWORD1		%o5
+#define rTMP1		%g1
+#define rTMP2		%g2
+#define rWORD2		%g3
+#define rSLL		%g4
+#define rSRL		%g5
+#define rBARREL		%g6
+
+	/* There are two cases: either the two pointers are aligned
+	 * identically or they are not.  If they have the same
+	 * alignment we can use the normal full-speed loop.  Otherwise
+	 * we have to use the barrel-shifter version.
 	 */
 
 	.text
-	.align		32
+	.align	32
 ENTRY(strcmp)
-	sethi		%hi(0x01010101), %g1			/* IEU0		Group		*/
-	andcc		%o0, 7, %g0				/* IEU1				*/
-	bne,pn		%icc, 7f				/* CTI				*/
-	 or		%g1, %lo(0x01010101), %g1		/* IEU0		Group		*/
-
-	andcc		%o1, 7, %g3				/* IEU1				*/
-	bne,pn		%icc, 9f				/* CTI				*/
-	 sllx		%g1, 32, %g2				/* IEU0		Group		*/
-	ldx		[%o0], %o2				/* Load				*/
-
-	or		%g1, %g2, %g1				/* IEU0		Group		*/
-1:	ldx		[%o1], %o3				/* Load				*/
-	sub		%o1, %o0, %o1				/* IEU1				*/
-	sllx		%g1, 7, %g2				/* IEU0		Group		*/
-
-2:	add		%o0, 8, %o0				/* IEU1				*/
-	sub		%o2, %g1, %g3				/* IEU0		Group		*/
-	subcc		%o2, %o3, %g0				/* IEU1				*/
-	bne,pn		%xcc, 13f				/* CTI				*/
-
-#ifdef EIGHTBIT_NOT_RARE
-	 andn		%g3, %o2, %g4				/* IEU0		Group		*/
-	ldxa		[%o0] ASI_PNF, %o2			/* Load				*/
-	andcc		%g4, %g2, %g0				/* IEU1		Group		*/
-#else
-	 ldxa		[%o0] ASI_PNF, %o2			/* Load		Group		*/
-	andcc		%g3, %g2, %g0				/* IEU1				*/
-#endif
-	be,a,pt		%xcc, 2b				/* CTI				*/
-	 ldxa		[%o1 + %o0] ASI_PNF, %o3		/* Load		Group		*/
-
-	addcc		%g3, %g1, %o4				/* IEU1				*/
-	srlx		%g3, 32, %g3				/* IEU0				*/
-	andcc		%g3, %g2, %g0				/* IEU1		Group		*/
-	be,pt		%xcc, 3f				/* CTI				*/
-
-	 srlx		%o4, 56, %o5				/* IEU0				*/
-	andcc		%o5, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4f				/* CTI				*/
-	 srlx		%o4, 48, %o5				/* IEU0				*/
-
-	andcc		%o5, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4f				/* CTI				*/
-	 srlx		%o4, 40, %o5				/* IEU0				*/
-	andcc		%o5, 0xff, %g0				/* IEU1		Group		*/
-
-	be,pn		%icc, 4f				/* CTI				*/
-	 srlx		%o4, 32, %o5				/* IEU0				*/
-	andcc		%o5, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4f				/* CTI				*/
-
-3:	 srlx		%o4, 24, %o5				/* IEU0				*/
-	andcc		%o5, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4f				/* CTI				*/
-	 srlx		%o4, 16, %o5				/* IEU0				*/
-
-	andcc		%o5, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4f				/* CTI				*/
-	 srlx		%o4, 8, %o5				/* IEU0				*/
-	andcc		%o5, 0xff, %g0				/* IEU1		Group		*/
-
-	be,pn		%icc, 4f				/* CTI				*/
-	 andcc		%o4, 0xff, %g0				/* IEU1		Group		*/
-	bne,a,pn	%icc, 2b				/* CTI				*/
-	 ldxa		[%o1 + %o0] ASI_PNF, %o3		/* Load				*/
-
-4:	retl							/* CTI+IEU1	Group		*/
-	 clr		%o0					/* IEU0				*/
-
-	.align		32
-13:	mov		0xff, %g6				/* IEU0		Group		*/
-#ifdef EIGHTBIT_NOT_RARE
-	andcc		%g4, %g2, %g0				/* IEU1				*/
-#else
-	andcc		%g3, %g2, %g0				/* IEU1				*/
-#endif
-	be,pt		%xcc, 25f				/* CTI				*/
-	 addcc		%g3, %g1, %o4				/* IEU1		Group		*/
-
-	srlx		%g3, 32, %g3				/* IEU0				*/
-	andcc		%g3, %g2, %g0				/* IEU1		Group		*/
-	be,pt		%xcc, 23f				/* CTI				*/
-	 sllx		%g6, 56, %o5				/* IEU0				*/
-
-	andcc		%o4, %o5, %g0				/* IEU1		Group		*/
-	be,pn		%xcc, 24f				/* CTI				*/
-	 sllx		%g6, 48, %o5				/* IEU0				*/
-	andcc		%o4, %o5, %g0				/* IEU1		Group		*/
-
-	be,pn		%xcc, 24f				/* CTI				*/
-	 sllx		%g6, 40, %o5				/* IEU0				*/
-	andcc		%o4, %o5, %g0				/* IEU1		Group		*/
-	be,pn		%xcc, 24f				/* CTI				*/
-
-	 sllx		%g6, 32, %o5				/* IEU0				*/
-	andcc		%o4, %o5, %g0				/* IEU1		Group		*/
-	be,pn		%xcc, 24f				/* CTI				*/
-23:	 sllx		%g6, 24, %o5				/* IEU0				*/
-
-	andcc		%o4, %o5, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 24f				/* CTI				*/
-	 sllx		%g6, 16, %o5				/* IEU0				*/
-	andcc		%o4, %o5, %g0				/* IEU1		Group		*/
-
-	be,pn		%icc, 24f				/* CTI				*/
-	 sllx		%g6, 8, %o5				/* IEU0				*/
-	andcc		%o4, %o5, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 24f				/* CTI				*/
-
-	 mov		%g6, %o5				/* IEU0				*/
-25:	cmp		%o4, %o3				/* IEU1		Group		*/
-5:	mov		-1, %o0					/* IEU0				*/
-	retl							/* CTI+IEU1	Group		*/
-
-	 movgu		%xcc, 1, %o0				/* Single	Group		*/
-
-	.align		16
-24:	sub		%o5, 1, %g6				/* IEU0		Group		*/
-	clr		%o0					/* IEU1				*/
-	or		%o5, %g6, %o5				/* IEU0		Group		*/
-	andn		%o4, %o5, %o4				/* IEU0		Group		*/
-
-	andn		%o3, %o5, %o3				/* IEU1				*/
-	cmp		%o4, %o3				/* IEU1		Group		*/
-	movgu		%xcc, 1, %o0				/* Single	Group		*/
-	retl							/* CTI+IEU1	Group		*/
-
-	 movlu		%xcc, -1, %o0				/* Single	Group		*/
-6:	retl							/* CTI+IEU1	Group		*/
-	 mov		%o4, %o0				/* IEU0				*/
-
-	.align		16
-7:	ldub		[%o0], %o2				/* Load				*/
-	add		%o0, 1, %o0				/* IEU1				*/
-	ldub		[%o1], %o3				/* Load		Group		*/
-	sllx		%g1, 32, %g2				/* IEU0				*/
-
-8:	add		%o1, 1, %o1				/* IEU1				*/
-	subcc		%o2, %o3, %o4				/* IEU1		Group		*/
-	bne,pn		%xcc, 6b				/* CTI				*/
-	 lduba		[%o0] ASI_PNF, %o2			/* Load				*/
-
-	brz,pn		%o3, 4b					/* CTI+IEU1	Group		*/
-	 lduba		[%o1] ASI_PNF, %o3			/* Load				*/
-	andcc		%o0, 7, %g0				/* IEU1		Group		*/
-	bne,a,pn	%icc, 8b				/* CTI				*/
-
-	 add		%o0, 1, %o0				/* IEU0				*/
-	or		%g1, %g2, %g1				/* IEU0		Group		*/
-	andcc		%o1, 7, %g3				/* IEU1				*/
-	be,a,pn		%icc, 1b				/* CTI				*/
-
-	 ldxa		[%o0] ASI_PNF, %o2			/* Load		Group		*/
-9:	sllx		%g3, 3, %g5				/* IEU0				*/
-	mov		64, %o5					/* IEU1				*/
-	sub		%o1, %g3, %o1				/* IEU0		Group		*/
-
-	sub		%o5, %g5, %o5				/* IEU1				*/
-	ldxa		[%o1] ASI_PNF, %g6			/* Load		Group		*/
-	or		%g1, %g2, %g1				/* IEU0				*/
-	sub		%o1, %o0, %o1				/* IEU1				*/
-
-	sllx		%g1, 7, %g2				/* IEU0		Group		*/
-	add		%o1, 8, %o1				/* IEU1				*/
-								/* %g1 = 0101010101010101
-								 * %g2 = 8080808080800880
-								 * %g5 = number of bits to shift left
-								 * %o5 = number of bits to shift right */
-10:	sllx		%g6, %g5, %o3				/* IEU0		Group		*/
-	ldxa		[%o1 + %o0] ASI_PNF, %g6		/* Load				*/
-
-11:	srlx		%g6, %o5, %o4				/* IEU0		Group		*/
-	ldxa		[%o0] ASI_PNF, %o2			/* Load				*/
-	or		%o3, %o4, %o3				/* IEU1				*/
-	add		%o0, 8, %o0				/* IEU0		Group		*/
-
-	subcc		%o2, %o3, %g0				/* IEU1				*/
-#ifdef EIGHTBIT_NOT_RARE
-	sub		%o2, %g1, %g3				/* IEU0		Group		*/
-	bne,pn		%xcc, 13b				/* CTI				*/
-	 andn		%g3, %o2, %g4				/* IEU0		Group		*/
-
-	andcc		%g4, %g2, %g0				/* IEU1		Group		*/
-	be,pt		%xcc, 10b				/* CTI				*/
-	 srlx		%g4, 32, %g4				/* IEU0				*/
-	andcc		%g4, %g2, %g0				/* IEU1		Group		*/
-#else
-	bne,pn		%xcc, 13b				/* CTI				*/
-	 sub		%o2, %g1, %g3				/* IEU0		Group		*/
-	andcc		%g3, %g2, %g0				/* IEU1		Group		*/
-
-	be,pt		%xcc, 10b				/* CTI				*/
-	 srlx		%g3, 32, %g3				/* IEU0				*/
-	andcc		%g3, %g2, %g0				/* IEU1		Group		*/
-#endif
-	be,pt		%xcc, 12f				/* CTI				*/
-
-	 srlx		%o2, 56, %g3				/* IEU0				*/
-	andcc		%g3, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4b				/* CTI				*/
-	 srlx		%o2, 48, %g3				/* IEU0				*/
-
-	andcc		%g3, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4b				/* CTI				*/
-	 srlx		%o2, 40, %g3				/* IEU0				*/
-	andcc		%g3, 0xff, %g0				/* IEU1		Group		*/
-
-	be,pn		%icc, 4b				/* CTI				*/
-	 srlx		%o2, 32, %g3				/* IEU0				*/
-	andcc		%g3, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4b				/* CTI				*/
-
-12:	 srlx		%o2, 24, %g3				/* IEU0				*/
-	andcc		%g3, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4b				/* CTI				*/
-	 srlx		%o2, 16, %g3				/* IEU0				*/
-
-	andcc		%g3, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4b				/* CTI				*/
-	 srlx		%o2, 8, %g3				/* IEU0				*/
-	andcc		%g3, 0xff, %g0				/* IEU1		Group		*/
-
-	be,pn		%icc, 4b				/* CTI				*/
-	 andcc		%o2, 0xff, %g0				/* IEU1		Group		*/
-	be,pn		%icc, 4b				/* CTI				*/
-	 sllx		%g6, %g5, %o3				/* IEU0				*/
-
-	ba,pt		%xcc, 11b				/* CTI		Group		*/
-	 ldxa		[%o1 + %o0] ASI_PNF, %g6		/* Load				*/
+	or	rSTR2, rSTR1, rTMP1
+	sethi	%hi(0x80808080), r8080
+
+	andcc	rTMP1, 0x7, %g0
+	bne,pn	%icc, .Lmaybe_barrel_shift
+	 or	r8080, %lo(0x80808080), r8080
+	ldx	[rSTR1], rWORD1
+
+	sub	rSTR2, rSTR1, rSTR2
+	sllx	r8080, 32, rTMP1
+
+	ldx	[rSTR1 + rSTR2], rWORD2
+	or	r8080, rTMP1, r8080
+
+	ba,pt	%xcc, .Laligned_loop_entry
+	 srlx	r8080, 7, r0101
+
+	.align	32
+.Laligned_loop_entry:
+.Laligned_loop:
+	add	rSTR1, 8, rSTR1
+
+	sub	rWORD1, r0101, rTMP2
+	xorcc	rWORD1, rWORD2, rSTRXOR
+	bne,pn	%xcc, .Lcommon_endstring
+
+	 andn	r8080, rWORD1, rTMP1
+
+	ldxa	[rSTR1] ASI_PNF, rWORD1
+	andcc	rTMP1, rTMP2, %g0
+	be,a,pt	%xcc, .Laligned_loop
+
+	 ldxa	[rSTR1 + rSTR2] ASI_PNF, rWORD2
+
+.Lcommon_equal:
+	retl
+	 mov	0, %o0
+
+	/* All loops terminate here once they find an unequal word.
+	 * If a zero byte appears in the word before the first unequal
+	 * byte, we must report zero.  Otherwise we report '1' or '-1'
+	 * depending upon whether the first mismatching byte is larger
+	 * in the first string or the second, respectively.
+	 *
+	 * First we compute a 64-bit mask value that has "0x01" in
+	 * each byte where a zero exists in rWORD1.  rSTRXOR holds the
+	 * value (rWORD1 ^ rWORD2).  Therefore, if, considered as
+	 * unsigned quantities, our "0x01" mask value is "greater than"
+	 * rSTRXOR, then a terminating zero byte comes first and we
+	 * report '0'.
+	 *
+	 * The formula for this mask is:
+	 *
+	 *    mask_tmp1 = ~rWORD1 & 0x8080808080808080;
+	 *    mask_tmp2 = ((rWORD1 & 0x7f7f7f7f7f7f7f7f) +
+	 *                 0x7f7f7f7f7f7f7f7f);
+	 *
+	 *    mask = ((mask_tmp1 & ~mask_tmp2) >> 7);
+	 */
+.Lcommon_endstring:
+	andn	rWORD1, r8080, rTMP2
+	or	r8080, 1, %o1
+
+	mov	1, %o0
+	sub	rTMP2, %o1, rTMP2
+
+	cmp	rWORD1, rWORD2
+	andn	rTMP1, rTMP2, rTMP1
+
+	movleu	%xcc, -1, %o0
+	srlx	rTMP1, 7, rTMP1
+
+	cmp	rTMP1, rSTRXOR
+	retl
+	 movgu	%xcc, 0, %o0
+
+.Lmaybe_barrel_shift:
+	sub	rSTR2, rSTR1, rSTR2
+	sllx	r8080, 32, rTMP1
+
+	or	r8080, rTMP1, r8080
+	and	rSTR1, 0x7, rTMP2
+
+	srlx	r8080, 7, r0101
+	andn	rSTR1, 0x7, rSTR1
+
+	ldxa	[rSTR1] ASI_PNF, rWORD1
+	andcc	rSTR2, 0x7, rSLL
+	sll	rTMP2, 3, rSTRXOR
+
+	bne,pn	%icc, .Lneed_barrel_shift
+	 mov	-1, rTMP1
+	ldxa	[rSTR1 + rSTR2] ASI_PNF, rBARREL
+
+	srlx	rTMP1, rSTRXOR, rTMP2
+
+	orn	rWORD1, rTMP2, rWORD1
+	ba,pt	%xcc, .Laligned_loop_entry
+	 orn	rBARREL, rTMP2, rWORD2
+
+.Lneed_barrel_shift:
+	sllx	rSLL, 3, rSLL
+	andn	rSTR2, 0x7, rSTR2
+
+	ldxa	[rSTR1 + rSTR2] ASI_PNF, rBARREL
+	mov	64, rTMP2
+	sub	rTMP2, rSLL, rSRL
+
+	srlx	rTMP1, rSTRXOR, rTMP1
+	add	rSTR2, 8, rSTR2
+
+	orn	rWORD1, rTMP1, rWORD1
+	sllx	rBARREL, rSLL, rWORD2
+	ldxa	[rSTR1 + rSTR2] ASI_PNF, rBARREL
+
+	add	rSTR1, 8, rSTR1
+	sub	rWORD1, r0101, rTMP2
+
+	srlx	rBARREL, rSRL, rSTRXOR
+
+	or	rWORD2, rSTRXOR, rWORD2
+
+	orn	rWORD2, rTMP1, rWORD2
+	ba,pt	%xcc, .Lbarrel_shift_loop_entry
+	 andn	r8080, rWORD1, rTMP1
+
+.Lbarrel_shift_loop:
+	sllx	rBARREL, rSLL, rWORD2
+	ldxa	[rSTR1 + rSTR2] ASI_PNF, rBARREL
+
+	add	rSTR1, 8, rSTR1
+	sub	rWORD1, r0101, rTMP2
+
+	srlx	rBARREL, rSRL, rSTRXOR
+	andn	r8080, rWORD1, rTMP1
+
+	or	rWORD2, rSTRXOR, rWORD2
+
+.Lbarrel_shift_loop_entry:
+	xorcc	rWORD1, rWORD2, rSTRXOR
+	bne,pn	%xcc, .Lcommon_endstring
+
+	 andcc	rTMP1, rTMP2, %g0
+	be,a,pt	%xcc, .Lbarrel_shift_loop
+	 ldxa	[rSTR1] ASI_PNF, rWORD1
+
+	retl
+	 mov	0, %o0
 END(strcmp)
 libc_hidden_builtin_def (strcmp)
-- 
1.7.6.401.g6a319

