This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] PPC faster sqrt functions


On 64-bit powerpc processors the optional instructions (including fsqrt and fsqrts) are always implemented. So for PPC64 and PPC32 running on PPC64 hardware we should take advantage of the hardware sqrt.

The trick is to use the AT_HWCAP via _dl_hwcap to check for PPC_FEATURE_64. So I redefined the current __ieee754_sqrt[f] implementations as __slow_ieee754_sqrt[f]. Then updated mathinline.h to defined __MATH_INLINE versions of __ieee754_sqrt() and __ieee754_sqrt() which use the following form:


+__MATH_INLINE double +__ieee754_sqrt (double __x) +{ + double __z; + + /* If the CPU is 64-bit we can use the optional FP instructions we. */ + if ((GLRO(dl_hwcap) & PPC_FEATURE_64) != 0) + { + /* Volatile is required to prevent the compiler from moving the + fsqrt instruction above the branch. */ + __asm __volatile ( + " fsqrt %0,%1\n" + : "=f" (__z) + : "f" (__x)); + } + else + __z = __slow_ieee754_sqrt(__x); + + return __z; +}

This inline is then used in the wapper functions __sqrt[f]() and internally to other libm functions that use __ieee_sqrt[f](). This the software implementation for PPC32 hardware but leverages the fsqrt instruction on PPC64 hardware. The performance gain is 2.5-3 times.

2004-05-25  Steven Munroe  <sjmunroe@us.ibm.com>

	* sysdeps/powerpc/fpu/Makefile: Make ld.so a dependency of libm.so.
	* sysdeps/powerpc/fpu/bits/mathinline.h [__LIBC_INERNAL_MATH_INLINES]
	(__ieee754_sqrt): Define as __MATH_INLINE using fsqrt instruction.
	(__ieee754_sqrtf): Define as __MATH_INLINE using fsqrts instruction.
	* sysdeps/powerpc/fpu/e_sqrt.c (__slow_ieee754_sqrt): Moved 
	implementation from w_sqrt.c
	* sysdeps/powerpc/fpu/e_sqrtf.c (__slow_ieee754_sqrtf): Moved 
	implementation from w_sqrtf.c
	* sysdeps/powerpc/fpu/w_sqrt.c (__sqrt): Wrapper implementation
	using inline __ieee754_sqrt().	
	* sysdeps/powerpc/fpu/w_sqrtf.c (__sqrtf): Wrapper implementation
	using inline __ieee754_sqrtf().
	* sysdeps/powerpc/powerpc32/sysdep.h [__ASSEMBLER__]: Include
	<sysdeps/powerpc/sysdep.h> independent of __ASSEMBLER__.
	* sysdeps/powerpc/sysdep.h [__ASSEMBLER__] (PPC_FEATURE_*): Define
	PPC_FEATURE_*  independent of __ASSEMBLER__.

diff -urN libc23-cvstip-20040520/sysdeps/powerpc/fpu/Makefile libc23/sysdeps/powerpc/fpu/Makefile
--- libc23-cvstip-20040520/sysdeps/powerpc/fpu/Makefile	2002-09-05 04:57:46.000000000 -0500
+++ libc23/sysdeps/powerpc/fpu/Makefile	2004-05-21 15:24:29.458906392 -0500
@@ -1,3 +1,6 @@
 ifeq ($(subdir),math)
 libm-support += fenv_const fe_nomask t_sqrt
+
+# libm needs ld.so to access dl_hwcap
+$(objpfx)libm.so: $(elfobjdir)/ld.so
 endif
diff -urN libc23-cvstip-20040520/sysdeps/powerpc/fpu/bits/mathinline.h libc23/sysdeps/powerpc/fpu/bits/mathinline.h
--- libc23-cvstip-20040520/sysdeps/powerpc/fpu/bits/mathinline.h	2004-04-01 14:30:12.000000000 -0600
+++ libc23/sysdeps/powerpc/fpu/bits/mathinline.h	2004-05-25 10:07:09.740971360 -0500
@@ -121,4 +121,56 @@
 
 #endif /* __USE_ISOC99 */
 #endif /* !__NO_MATH_INLINES && __OPTIMIZE__ */
+
+/* This code is used internally in the GNU libc.  */
+#  ifdef __LIBC_INTERNAL_MATH_INLINES
+
+#include <sysdep.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+extern double __slow_ieee754_sqrt (double);
+__MATH_INLINE double
+__ieee754_sqrt (double __x)
+{
+  double __z;
+  
+  /* If the CPU is 64-bit we can use the optional FP instructions we.  */
+  if ((GLRO(dl_hwcap) & PPC_FEATURE_64) != 0)
+  {
+    /* Volatile is required to prevent the compiler from moving the 
+       fsqrt instruction above the branch.  */
+     __asm __volatile (
+	"	fsqrt	%0,%1\n"
+		: "=f" (__z)
+		: "f" (__x));
+  }
+  else
+     __z = __slow_ieee754_sqrt(__x);
+     
+  return __z;
+}
+
+extern float __slow_ieee754_sqrtf (float);
+__MATH_INLINE float
+__ieee754_sqrtf (float __x)
+{
+  float __z;
+  
+  /* If the CPU is 64-bit we can use the optional FP instructions we.  */
+  if ((GLRO(dl_hwcap) & PPC_FEATURE_64) != 0)
+  {
+    /* Volatile is required to prevent the compiler from moving the 
+       fsqrts instruction above the branch.  */
+     __asm __volatile (
+	"	fsqrts	%0,%1\n"
+		: "=f" (__z)
+		: "f" (__x));
+  }
+  else
+     __z = __slow_ieee754_sqrtf(__x);
+     
+  return __z;
+}
+#  endif /* __LIBC_INTERNAL_MATH_INLINES */
 #endif /* __GNUC__ && !_SOFT_FLOAT */
diff -urN libc23-cvstip-20040520/sysdeps/powerpc/fpu/e_sqrt.c libc23/sysdeps/powerpc/fpu/e_sqrt.c
--- libc23-cvstip-20040520/sysdeps/powerpc/fpu/e_sqrt.c	1999-10-11 17:29:48.000000000 -0500
+++ libc23/sysdeps/powerpc/fpu/e_sqrt.c	2004-05-25 10:04:01.414982240 -0500
@@ -1 +1,185 @@
-/* __ieee754_sqrt is in w_sqrt.c  */
+/* Double-precision floating point square root.
+   Copyright (C) 1997, 2002, 2003, 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <math.h>
+#include <math_private.h>
+#include <fenv_libc.h>
+#include <inttypes.h>
+
+#include <sysdep.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+static const double almost_half = 0.5000000000000001;	/* 0.5 + 2^-53 */
+static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
+static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
+static const float two108 = 3.245185536584267269e+32;
+static const float twom54 = 5.551115123125782702e-17;
+extern const float __t_sqrt[1024];
+
+/* The method is based on a description in
+   Computation of elementary functions on the IBM RISC System/6000 processor,
+   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
+   Basically, it consists of two interleaved Newton-Rhapson approximations,
+   one to find the actual square root, and one to find its reciprocal
+   without the expense of a division operation.   The tricky bit here
+   is the use of the POWER/PowerPC multiply-add operation to get the
+   required accuracy with high speed.
+
+   The argument reduction works by a combination of table lookup to
+   obtain the initial guesses, and some careful modification of the
+   generated guesses (which mostly runs on the integer unit, while the
+   Newton-Rhapson is running on the FPU).  */
+
+#ifdef __STDC__
+double
+__slow_ieee754_sqrt (double x)
+#else
+double
+__slow_ieee754_sqrt (x)
+     double x;
+#endif
+{
+  const float inf = a_inf.value;
+
+  if (x > 0)
+    {
+      /* schedule the EXTRACT_WORDS to get separation between the store
+         and the load.  */
+      ieee_double_shape_type ew_u;
+      ieee_double_shape_type iw_u;
+      ew_u.value = (x);
+      if (x != inf)
+	{
+	  /* Variables named starting with 's' exist in the
+	     argument-reduced space, so that 2 > sx >= 0.5,
+	     1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
+	     Variables named ending with 'i' are integer versions of
+	     floating-point values.  */
+	  double sx;	/* The value of which we're trying to find the
+			   square root.  */
+	  double sg, g;	/* Guess of the square root of x.  */
+	  double sd, d;	/* Difference between the square of the guess and x.  */
+	  double sy;	/* Estimate of 1/2g (overestimated by 1ulp).  */
+	  double sy2;	/* 2*sy */
+	  double e;	/* Difference between y*g and 1/2 (se = e * fsy).  */
+	  double shx;	/* == sx * fsg */
+	  double fsg;	/* sg*fsg == g.  */
+	  fenv_t fe;	/* Saved floating-point environment (stores rounding
+			   mode and whether the inexact exception is
+			   enabled).  */
+	  uint32_t xi0, xi1, sxi, fsgi;
+	  const float *t_sqrt;
+
+	  fe = fegetenv_register ();
+	  /* complete the EXTRACT_WORDS (xi0,xi1,x) operation.  */
+	  xi0 = ew_u.parts.msw;
+	  xi1 = ew_u.parts.lsw;
+	  relax_fenv_state ();
+	  sxi = (xi0 & 0x3fffffff) | 0x3fe00000;
+	  /* schedule the INSERT_WORDS (sx, sxi, xi1) to get separation
+	     between the store and the load.  */
+	  iw_u.parts.msw = sxi;
+	  iw_u.parts.lsw = xi1;
+	  t_sqrt = __t_sqrt + (xi0 >> (52 - 32 - 8 - 1) & 0x3fe);
+	  sg = t_sqrt[0];
+	  sy = t_sqrt[1];
+	  /* complete the INSERT_WORDS (sx, sxi, xi1) operation.  */
+	  sx = iw_u.value;
+
+	  /* Here we have three Newton-Rhapson iterations each of a
+	     division and a square root and the remainder of the
+	     argument reduction, all interleaved.   */
+	  sd = -(sg * sg - sx);
+	  fsgi = (xi0 + 0x40000000) >> 1 & 0x7ff00000;
+	  sy2 = sy + sy;
+	  sg = sy * sd + sg;	/* 16-bit approximation to sqrt(sx). */
+
+	  /* schedule the INSERT_WORDS (fsg, fsgi, 0) to get separation
+	     between the store and the load.  */
+	  INSERT_WORDS (fsg, fsgi, 0);
+	  iw_u.parts.msw = fsgi;
+	  iw_u.parts.lsw = (0);
+	  e = -(sy * sg - almost_half);
+	  sd = -(sg * sg - sx);
+	  if ((xi0 & 0x7ff00000) == 0)
+	    goto denorm;
+	  sy = sy + e * sy2;
+	  sg = sg + sy * sd;	/* 32-bit approximation to sqrt(sx).  */
+	  sy2 = sy + sy;
+	  /* complete the INSERT_WORDS (fsg, fsgi, 0) operation.  */
+	  fsg = iw_u.value;
+	  e = -(sy * sg - almost_half);
+	  sd = -(sg * sg - sx);
+	  sy = sy + e * sy2;
+	  shx = sx * fsg;
+	  sg = sg + sy * sd;	/* 64-bit approximation to sqrt(sx),
+				   but perhaps rounded incorrectly.  */
+	  sy2 = sy + sy;
+	  g = sg * fsg;
+	  e = -(sy * sg - almost_half);
+	  d = -(g * sg - shx);
+	  sy = sy + e * sy2;
+	  fesetenv_register (fe);
+	  return g + sy * d;
+	denorm:
+	  /* For denormalised numbers, we normalise, calculate the
+	     square root, and return an adjusted result.  */
+	  fesetenv_register (fe);
+	  return __slow_ieee754_sqrt (x * two108) * twom54;
+	}
+    }
+  else if (x < 0)
+    {
+      /* For some reason, some PowerPC32 processors don't implement
+         FE_INVALID_SQRT.  */
+#ifdef FE_INVALID_SQRT
+      feraiseexcept (FE_INVALID_SQRT);
+      if (!fetestexcept (FE_INVALID))
+#endif
+	feraiseexcept (FE_INVALID);
+      x = a_nan.value;
+    }
+  return f_wash (x);
+}
+
+#ifdef __STDC__
+double
+__ieee754_sqrt (double x)
+#else
+double
+__ieee754_sqrt (x)
+     double x;
+#endif
+{
+  double z;
+
+  /* If the CPU is 64-bit we can use the optional FP instructions we.  */
+  if ((GLRO (dl_hwcap) & PPC_FEATURE_64) != 0)
+    {
+      /* Volatile is required to prevent the compiler from moving the 
+         fsqrt instruction above the branch.  */
+      __asm __volatile ("	fsqrt	%0,%1\n"
+				:"=f" (z):"f" (x));
+    }
+  else
+    z = __slow_ieee754_sqrt (x);
+
+  return z;
+}
diff -urN libc23-cvstip-20040520/sysdeps/powerpc/fpu/e_sqrtf.c libc23/sysdeps/powerpc/fpu/e_sqrtf.c
--- libc23-cvstip-20040520/sysdeps/powerpc/fpu/e_sqrtf.c	1999-10-11 17:29:48.000000000 -0500
+++ libc23/sysdeps/powerpc/fpu/e_sqrtf.c	2004-05-25 10:04:28.790945824 -0500
@@ -1 +1,162 @@
-/* __ieee754_sqrtf is in w_sqrtf.c  */
+/* Single-precision floating point square root.
+   Copyright (C) 1997, 2003, 2004 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <math.h>
+#include <math_private.h>
+#include <fenv_libc.h>
+#include <inttypes.h>
+
+#include <sysdep.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+static const float almost_half = 0.50000006;	/* 0.5 + 2^-24 */
+static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
+static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
+static const float two48 = 281474976710656.0;
+static const float twom24 = 5.9604644775390625e-8;
+extern const float __t_sqrt[1024];
+
+/* The method is based on a description in
+   Computation of elementary functions on the IBM RISC System/6000 processor,
+   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
+   Basically, it consists of two interleaved Newton-Rhapson approximations,
+   one to find the actual square root, and one to find its reciprocal
+   without the expense of a division operation.   The tricky bit here
+   is the use of the POWER/PowerPC multiply-add operation to get the
+   required accuracy with high speed.
+
+   The argument reduction works by a combination of table lookup to
+   obtain the initial guesses, and some careful modification of the
+   generated guesses (which mostly runs on the integer unit, while the
+   Newton-Rhapson is running on the FPU).  */
+
+#ifdef __STDC__
+float
+__slow_ieee754_sqrtf (float x)
+#else
+float
+__slow_ieee754_sqrtf (x)
+     float x;
+#endif
+{
+  const float inf = a_inf.value;
+
+  if (x > 0)
+    {
+      if (x != inf)
+	{
+	  /* Variables named starting with 's' exist in the
+	     argument-reduced space, so that 2 > sx >= 0.5,
+	     1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
+	     Variables named ending with 'i' are integer versions of
+	     floating-point values.  */
+	  float sx;		/* The value of which we're trying to find the square
+				   root.  */
+	  float sg, g;		/* Guess of the square root of x.  */
+	  float sd, d;		/* Difference between the square of the guess and x.  */
+	  float sy;		/* Estimate of 1/2g (overestimated by 1ulp).  */
+	  float sy2;		/* 2*sy */
+	  float e;		/* Difference between y*g and 1/2 (note that e==se).  */
+	  float shx;		/* == sx * fsg */
+	  float fsg;		/* sg*fsg == g.  */
+	  fenv_t fe;		/* Saved floating-point environment (stores rounding
+				   mode and whether the inexact exception is
+				   enabled).  */
+	  uint32_t xi, sxi, fsgi;
+	  const float *t_sqrt;
+
+	  GET_FLOAT_WORD (xi, x);
+	  fe = fegetenv_register ();
+	  relax_fenv_state ();
+	  sxi = (xi & 0x3fffffff) | 0x3f000000;
+	  SET_FLOAT_WORD (sx, sxi);
+	  t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe);
+	  sg = t_sqrt[0];
+	  sy = t_sqrt[1];
+
+	  /* Here we have three Newton-Rhapson iterations each of a
+	     division and a square root and the remainder of the
+	     argument reduction, all interleaved.   */
+	  sd = -(sg * sg - sx);
+	  fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
+	  sy2 = sy + sy;
+	  sg = sy * sd + sg;	/* 16-bit approximation to sqrt(sx). */
+	  e = -(sy * sg - almost_half);
+	  SET_FLOAT_WORD (fsg, fsgi);
+	  sd = -(sg * sg - sx);
+	  sy = sy + e * sy2;
+	  if ((xi & 0x7f800000) == 0)
+	    goto denorm;
+	  shx = sx * fsg;
+	  sg = sg + sy * sd;	/* 32-bit approximation to sqrt(sx),
+				   but perhaps rounded incorrectly.  */
+	  sy2 = sy + sy;
+	  g = sg * fsg;
+	  e = -(sy * sg - almost_half);
+	  d = -(g * sg - shx);
+	  sy = sy + e * sy2;
+	  fesetenv_register (fe);
+	  return g + sy * d;
+	denorm:
+	  /* For denormalised numbers, we normalise, calculate the
+	     square root, and return an adjusted result.  */
+	  fesetenv_register (fe);
+	  return __slow_ieee754_sqrtf (x * two48) * twom24;
+	}
+    }
+  else if (x < 0)
+    {
+      /* For some reason, some PowerPC32 processors don't implement
+         FE_INVALID_SQRT.  */
+#ifdef FE_INVALID_SQRT
+      feraiseexcept (FE_INVALID_SQRT);
+      if (!fetestexcept (FE_INVALID))
+#endif
+	feraiseexcept (FE_INVALID);
+      x = a_nan.value;
+    }
+  return f_washf (x);
+}
+
+
+#ifdef __STDC__
+float
+__ieee754_sqrtf (float x)
+#else
+float
+__ieee754_sqrtf (x)
+     float x;
+#endif
+{
+  double z;
+
+  /* If the CPU is 64-bit we can use the optional FP instructions we.  */
+  if ((GLRO (dl_hwcap) & PPC_FEATURE_64) != 0)
+    {
+      /* Volatile is required to prevent the compiler from moving the 
+         fsqrt instruction above the branch.  */
+      __asm __volatile ("	fsqrts	%0,%1\n"
+				:"=f" (z):"f" (x));
+    }
+  else
+    z = __slow_ieee754_sqrtf (x);
+
+  return z;
+}
diff -urN libc23-cvstip-20040520/sysdeps/powerpc/fpu/w_sqrt.c libc23/sysdeps/powerpc/fpu/w_sqrt.c
--- libc23-cvstip-20040520/sysdeps/powerpc/fpu/w_sqrt.c	2003-07-31 14:21:01.000000000 -0500
+++ libc23/sysdeps/powerpc/fpu/w_sqrt.c	2004-05-24 13:32:19.934010752 -0500
@@ -1,5 +1,5 @@
-/* Double-precision floating point square root.
-   Copyright (C) 1997, 2002, 2003 Free Software Foundation, Inc.
+/* Double-precision floating point square root wrapper.
+   Copyright (C) 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -17,130 +17,35 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
-#include <math.h>
-#include <math_private.h>
+#include "math.h"
+#include "math_private.h"
 #include <fenv_libc.h>
-#include <inttypes.h>
 
-static const double almost_half = 0.5000000000000001;  /* 0.5 + 2^-53 */
-static const ieee_float_shape_type a_nan = { .word = 0x7fc00000 };
-static const ieee_float_shape_type a_inf = { .word = 0x7f800000 };
-static const float two108 = 3.245185536584267269e+32;
-static const float twom54 = 5.551115123125782702e-17;
-extern const float __t_sqrt[1024];
-
-/* The method is based on a description in
-   Computation of elementary functions on the IBM RISC System/6000 processor,
-   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
-   Basically, it consists of two interleaved Newton-Rhapson approximations,
-   one to find the actual square root, and one to find its reciprocal
-   without the expense of a division operation.   The tricky bit here
-   is the use of the POWER/PowerPC multiply-add operation to get the
-   required accuracy with high speed.
-
-   The argument reduction works by a combination of table lookup to
-   obtain the initial guesses, and some careful modification of the
-   generated guesses (which mostly runs on the integer unit, while the
-   Newton-Rhapson is running on the FPU).  */
+#ifdef __STDC__
 double
-__sqrt(double x)
-{
-  const float inf = a_inf.value;
-  /* x = f_wash(x); *//* This ensures only one exception for SNaN. */
-  if (x > 0)
-    {
-      if (x != inf)
-	{
-	  /* Variables named starting with 's' exist in the
-	     argument-reduced space, so that 2 > sx >= 0.5,
-	     1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
-	     Variables named ending with 'i' are integer versions of
-	     floating-point values.  */
-	  double sx;   /* The value of which we're trying to find the
-			  square root.  */
-	  double sg,g; /* Guess of the square root of x.  */
-	  double sd,d; /* Difference between the square of the guess and x.  */
-	  double sy;   /* Estimate of 1/2g (overestimated by 1ulp).  */
-	  double sy2;  /* 2*sy */
-	  double e;    /* Difference between y*g and 1/2 (se = e * fsy).  */
-	  double shx;  /* == sx * fsg */
-	  double fsg;  /* sg*fsg == g.  */
-	  fenv_t fe;  /* Saved floating-point environment (stores rounding
-			 mode and whether the inexact exception is
-			 enabled).  */
-	  uint32_t xi0, xi1, sxi, fsgi;
-	  const float *t_sqrt;
-
-	  fe = fegetenv_register();
-	  EXTRACT_WORDS (xi0,xi1,x);
-	  relax_fenv_state();
-	  sxi = (xi0 & 0x3fffffff) | 0x3fe00000;
-	  INSERT_WORDS (sx, sxi, xi1);
-	  t_sqrt = __t_sqrt + (xi0 >> (52-32-8-1)  & 0x3fe);
-	  sg = t_sqrt[0];
-	  sy = t_sqrt[1];
-
-	  /* Here we have three Newton-Rhapson iterations each of a
-	     division and a square root and the remainder of the
-	     argument reduction, all interleaved.   */
-	  sd  = -(sg*sg - sx);
-	  fsgi = (xi0 + 0x40000000) >> 1 & 0x7ff00000;
-	  sy2 = sy + sy;
-	  sg  = sy*sd + sg;  /* 16-bit approximation to sqrt(sx). */
-	  INSERT_WORDS (fsg, fsgi, 0);
-	  e   = -(sy*sg - almost_half);
-	  sd  = -(sg*sg - sx);
-	  if ((xi0 & 0x7ff00000) == 0)
-	    goto denorm;
-	  sy  = sy + e*sy2;
-	  sg  = sg + sy*sd;  /* 32-bit approximation to sqrt(sx).  */
-	  sy2 = sy + sy;
-	  e   = -(sy*sg - almost_half);
-	  sd  = -(sg*sg - sx);
-	  sy  = sy + e*sy2;
-	  shx = sx * fsg;
-	  sg  = sg + sy*sd;  /* 64-bit approximation to sqrt(sx),
-				but perhaps rounded incorrectly.  */
-	  sy2 = sy + sy;
-	  g   = sg * fsg;
-	  e   = -(sy*sg - almost_half);
-	  d   = -(g*sg - shx);
-	  sy  = sy + e*sy2;
-	  fesetenv_register (fe);
-	  return g + sy*d;
-	denorm:
-	  /* For denormalised numbers, we normalise, calculate the
-	     square root, and return an adjusted result.  */
-	  fesetenv_register (fe);
-	  return __sqrt(x * two108) * twom54;
-	}
-    }
-  else if (x < 0)
-    {
-#ifdef FE_INVALID_SQRT
-      feraiseexcept (FE_INVALID_SQRT);
-      /* For some reason, some PowerPC processors don't implement
-	 FE_INVALID_SQRT.  I guess no-one ever thought they'd be
-	 used for square roots... :-) */
-      if (!fetestexcept (FE_INVALID))
+__sqrt (double x)		/* wrapper sqrt */
+#else
+double
+__sqrt (x)			/* wrapper sqrt */
+     double x;
 #endif
-	feraiseexcept (FE_INVALID);
-#ifndef _IEEE_LIBM
-      if (_LIB_VERSION != _IEEE_)
-	x = __kernel_standard(x,x,26);
-      else
+{
+#ifdef _IEEE_LIBM
+  return __ieee754_sqrt (x);
+#else
+  double z;
+  z = __ieee754_sqrt (x);
+  if (_LIB_VERSION == _IEEE_ || (x != x))
+    return z;
+
+  if (x < 0.0)
+    return __kernel_standard (x, x, 26);	/* sqrt(negative) */
+  else
+    return z;
 #endif
-      x = a_nan.value;
-    }
-  return f_wash(x);
 }
 
 weak_alias (__sqrt, sqrt)
-/* Strictly, this is wrong, but the only places where _ieee754_sqrt is
-   used will not pass in a negative result.  */
-strong_alias(__sqrt,__ieee754_sqrt)
-
 #ifdef NO_LONG_DOUBLE
-weak_alias (__sqrt, __sqrtl)
-weak_alias (__sqrt, sqrtl)
+  strong_alias (__sqrt, __sqrtl) weak_alias (__sqrt, sqrtl)
 #endif
diff -urN libc23-cvstip-20040520/sysdeps/powerpc/fpu/w_sqrtf.c libc23/sysdeps/powerpc/fpu/w_sqrtf.c
--- libc23-cvstip-20040520/sysdeps/powerpc/fpu/w_sqrtf.c	2003-07-31 14:21:22.000000000 -0500
+++ libc23/sysdeps/powerpc/fpu/w_sqrtf.c	2004-05-24 13:32:22.441029840 -0500
@@ -1,5 +1,5 @@
-/* Single-precision floating point square root.
-   Copyright (C) 1997, 2003 Free Software Foundation, Inc.
+/* Single-precision floating point square root wrapper.
+   Copyright (C) 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -17,120 +17,38 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
-#include <math.h>
-#include <math_private.h>
+#include "math.h"
+#include "math_private.h"
 #include <fenv_libc.h>
-#include <inttypes.h>
 
-static const float almost_half = 0.50000006;  /* 0.5 + 2^-24 */
-static const ieee_float_shape_type a_nan = { .word = 0x7fc00000 };
-static const ieee_float_shape_type a_inf = { .word = 0x7f800000 };
-static const float two48 = 281474976710656.0;
-static const float twom24 = 5.9604644775390625e-8;
-extern const float __t_sqrt[1024];
-
-/* The method is based on a description in
-   Computation of elementary functions on the IBM RISC System/6000 processor,
-   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
-   Basically, it consists of two interleaved Newton-Rhapson approximations,
-   one to find the actual square root, and one to find its reciprocal
-   without the expense of a division operation.   The tricky bit here
-   is the use of the POWER/PowerPC multiply-add operation to get the
-   required accuracy with high speed.
-
-   The argument reduction works by a combination of table lookup to
-   obtain the initial guesses, and some careful modification of the
-   generated guesses (which mostly runs on the integer unit, while the
-   Newton-Rhapson is running on the FPU).  */
+#include <sysdep.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+#ifdef __STDC__
 float
-__sqrtf(float x)
-{
-  const float inf = a_inf.value;
-  /* x = f_washf(x); *//* This ensures only one exception for SNaN. */
-  if (x > 0)
-    {
-      if (x != inf)
-	{
-	  /* Variables named starting with 's' exist in the
-	     argument-reduced space, so that 2 > sx >= 0.5,
-	     1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
-	     Variables named ending with 'i' are integer versions of
-	     floating-point values.  */
-	  float sx;   /* The value of which we're trying to find the square
-			 root.  */
-	  float sg,g; /* Guess of the square root of x.  */
-	  float sd,d; /* Difference between the square of the guess and x.  */
-	  float sy;   /* Estimate of 1/2g (overestimated by 1ulp).  */
-	  float sy2;  /* 2*sy */
-	  float e;    /* Difference between y*g and 1/2 (note that e==se).  */
-	  float shx;  /* == sx * fsg */
-	  float fsg;  /* sg*fsg == g.  */
-	  fenv_t fe;  /* Saved floating-point environment (stores rounding
-			 mode and whether the inexact exception is
-			 enabled).  */
-	  uint32_t xi, sxi, fsgi;
-	  const float *t_sqrt;
-
-	  GET_FLOAT_WORD (xi, x);
-	  fe = fegetenv_register ();
-	  relax_fenv_state ();
-	  sxi = (xi & 0x3fffffff) | 0x3f000000;
-	  SET_FLOAT_WORD (sx, sxi);
-	  t_sqrt = __t_sqrt + (xi >> (23-8-1)  & 0x3fe);
-	  sg = t_sqrt[0];
-	  sy = t_sqrt[1];
-	  
-	  /* Here we have three Newton-Rhapson iterations each of a
-	     division and a square root and the remainder of the
-	     argument reduction, all interleaved.   */
-	  sd  = -(sg*sg - sx);
-	  fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
-	  sy2 = sy + sy;
-	  sg  = sy*sd + sg;  /* 16-bit approximation to sqrt(sx). */
-	  e   = -(sy*sg - almost_half);
-	  SET_FLOAT_WORD (fsg, fsgi);
-	  sd  = -(sg*sg - sx);
-	  sy  = sy + e*sy2;
-	  if ((xi & 0x7f800000) == 0)
-	    goto denorm;
-	  shx = sx * fsg;
-	  sg  = sg + sy*sd;  /* 32-bit approximation to sqrt(sx),
-				but perhaps rounded incorrectly.  */
-	  sy2 = sy + sy;
-	  g   = sg * fsg;
-	  e   = -(sy*sg - almost_half);
-	  d   = -(g*sg - shx);
-	  sy  = sy + e*sy2;
-	  fesetenv_register (fe);
-	  return g + sy*d;
-	denorm:
-	  /* For denormalised numbers, we normalise, calculate the
-	     square root, and return an adjusted result.  */
-	  fesetenv_register (fe);
-	  return __sqrtf(x * two48) * twom24;
-	}
-    }
-  else if (x < 0)
-    {
-#ifdef FE_INVALID_SQRT
-      feraiseexcept (FE_INVALID_SQRT);
-      /* For some reason, some PowerPC processors don't implement
-	 FE_INVALID_SQRT.  I guess no-one ever thought they'd be
-	 used for square roots... :-) */
-      if (!fetestexcept (FE_INVALID))
+__sqrtf (float x)		/* wrapper sqrtf */
+#else
+float
+__sqrtf (x)			/* wrapper sqrtf */
+     float x;
 #endif
-	feraiseexcept (FE_INVALID);
-#ifndef _IEEE_LIBM
-      if (_LIB_VERSION != _IEEE_)
-	x = __kernel_standard(x,x,126);
-      else
+{
+#ifdef _IEEE_LIBM
+  return __ieee754_sqrtf (x);
+#else
+  float z;
+  z = __ieee754_sqrtf (x);
+
+  if (_LIB_VERSION == _IEEE_ || (x != x))
+    return z;
+
+  if (x < (float) 0.0)
+    /* sqrtf(negative) */
+    return (float) __kernel_standard ((double) x, (double) x, 126);
+  else
+    return z;
 #endif
-      x = a_nan.value;
-    }
-  return f_washf(x);
 }
 
 weak_alias (__sqrtf, sqrtf)
-/* Strictly, this is wrong, but the only places where _ieee754_sqrt is
-   used will not pass in a negative result.  */
-strong_alias(__sqrtf,__ieee754_sqrtf)
diff -urN libc23-cvstip-20040520/sysdeps/powerpc/powerpc32/sysdep.h libc23/sysdeps/powerpc/powerpc32/sysdep.h
--- libc23-cvstip-20040520/sysdeps/powerpc/powerpc32/sysdep.h	2003-08-16 19:33:13.000000000 -0500
+++ libc23/sysdeps/powerpc/powerpc32/sysdep.h	2004-05-21 15:24:29.470904568 -0500
@@ -17,10 +17,10 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
-#ifdef __ASSEMBLER__
-
 #include <sysdeps/powerpc/sysdep.h>
 
+#ifdef __ASSEMBLER__
+
 #ifdef __ELF__
 
 /* If compiled for profiling, call `_mcount' at the start of each
diff -urN libc23-cvstip-20040520/sysdeps/powerpc/sysdep.h libc23/sysdeps/powerpc/sysdep.h
--- libc23-cvstip-20040520/sysdeps/powerpc/sysdep.h	2004-01-15 22:43:07.000000000 -0600
+++ libc23/sysdeps/powerpc/sysdep.h	2004-05-21 15:24:29.472904264 -0500
@@ -16,6 +16,20 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
+/* 
+ * Powerpc Feature masks for the Aux Vector Hardware Capabilities (AT_HWCAP). 
+ * This entry is copied to _dl_hwcap or rtld_global._dl_hwcap during startup.
+ * The following must match the kernels linux/asm/cputable.h.  
+ */
+#define PPC_FEATURE_32			0x80000000 /* 32-bit mode. */
+#define PPC_FEATURE_64			0x40000000 /* 64-bit mode. */
+#define PPC_FEATURE_601_INSTR		0x20000000 /* 601 chip, Old POWER ISA.  */
+#define PPC_FEATURE_HAS_ALTIVEC		0x10000000 /* SIMD/Vector Unit.  */
+#define PPC_FEATURE_HAS_FPU		0x08000000 /* Floating Point Unit.  */
+#define PPC_FEATURE_HAS_MMU		0x04000000 /* Memory Management Unit.  */
+#define PPC_FEATURE_HAS_4xxMAC		0x02000000 /* 4xx Multiply Accumulator.  */
+#define PPC_FEATURE_UNIFIED_CACHE	0x01000000 /* Unified I/D cache.  */
+
 #ifdef __ASSEMBLER__
 
 /* Symbolic names for the registers.  The only portable way to write asm
@@ -146,19 +160,5 @@
 #define ASM_SIZE_DIRECTIVE(name) .size name,.-name
 
 #endif /* __ELF__ */
-
-/* 
- * Powerpc Feature masks for the Aux Vector Hardware Capabilities (AT_HWCAP). 
- * This entry is copied to _dl_hwcap or rtld_global._dl_hwcap during startup.
- * The following must match the kernels linux/asm/cputable.h.  
- */
-#define PPC_FEATURE_32			0x80000000 /* 32-bit mode. */
-#define PPC_FEATURE_64			0x40000000 /* 64-bit mode. */
-#define PPC_FEATURE_601_INSTR		0x20000000 /* 601 chip, Old POWER ISA.  */
-#define PPC_FEATURE_HAS_ALTIVEC		0x10000000 /* SIMD/Vector Unit.  */
-#define PPC_FEATURE_HAS_FPU		0x08000000 /* Floating Point Unit.  */
-#define PPC_FEATURE_HAS_MMU		0x04000000 /* Memory Management Unit.  */
-#define PPC_FEATURE_HAS_4xxMAC		0x02000000 /* 4xx Multiply Accumulator.  */
-#define PPC_FEATURE_UNIFIED_CACHE	0x01000000 /* Unified I/D cache.  */
-
 #endif	/* __ASSEMBLER__ */
+

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]