This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] Faster x^(2^m) for mpexp using __sqr
- From: Siddhesh Poyarekar <siddhesh at redhat dot com>
- To: libc-alpha at sourceware dot org
- Date: Tue, 12 Feb 2013 13:10:32 +0530
- Subject: [PATCH] Faster x^(2^m) for mpexp using __sqr
Hi,
This patch speeds up the third part of mpexp logic, which is raising
the range-reduced result to the power of 2^m. This is currently done
with __mul, which is not optimal since the mantissa calculation is
simpler when X == Y. Hence I implemented a new function __sqr which
is a special case of __mul. There were no regressions reported in the
testsuite on x86_64.
Performance:
I've used the same program and inputs as in my previous patch that
improves mpexp[1]. Also, the numbers are on top of the patch in [1].
Powerpc64 without patch:
Total:3995589335, Fastest:398579, Slowest:542930, Avg:399558.933500
Powerpc64 with patch:
Total:3090824209, Fastest:308145, Slowest:404187, Avg:309082.420900
That is an improvement of 22.64% in the average case and 22.69% in the
best case.
x86_64 without patch:
Total:2123489281, Fastest:203207, Slowest:805219, Avg:212348.928100
x86_64 with patch:
Total:1584336202, Fastest:154559, Slowest:613878, Avg:158433.620200
That makes it an improvement of 25.39% in the average case and 23.94%
in the best case.
OK to commit?
Siddhesh
[1] http://sourceware.org/ml/libc-alpha/2013-02/msg00198.html
* sysdeps/ieee754/dbl-64/mpa.c (__sqr): New function.
* sysdeps/ieee754/dbl-64/mpa.h (__sqr): Declare.
* sysdeps/ieee754/dbl-64/mpexp.c (__mpexp): use __sqr instead
of __mul for squares.
* sysdeps/powerpc/powerpc32/power4/fpu/mpa.c (__sqr): New
function
* sysdeps/powerpc/powerpc64/power4/fpu/mpa.c (__sqr):
Likewise.
* sysdeps/x86_64/fpu/multiarch/mpa-avx.c: Define __sqr.
* sysdeps/x86_64/fpu/multiarch/mpa-fma4.c: Likewise.
diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c
index ede8ed1..30272cb 100644
--- a/sysdeps/ieee754/dbl-64/mpa.c
+++ b/sysdeps/ieee754/dbl-64/mpa.c
@@ -654,6 +654,100 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
Z[0] = X[0] * Y[0];
}
+/* Square *X and store result in *Y. X and Y may not overlap. For P in
+ [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
+ error is bounded by 1.001 ULP. This is a faster special case of
+ multiplication. */
+void
+SECTION
+__sqr (const mp_no *x, mp_no *y, int p)
+{
+ long i, j, k, ip;
+ double u, yk;
+
+ /* Is z=0? */
+ if (__glibc_unlikely (X[0] == ZERO))
+ {
+ Y[0] = ZERO;
+ return;
+ }
+
+ /* We need not iterate through all X's since it's pointless to
+ multiply zeroes. */
+ for (ip = p; ip > 0; ip--)
+ if (X[ip] != ZERO)
+ break;
+
+ k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
+
+ while (k > 2 * ip + 1)
+ Y[k--] = ZERO;
+
+ yk = ZERO;
+
+ while (k > p)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = k - p, j = p; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ if (i == j)
+ yk += X[i] * X[i];
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+
+ while (k > 1)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = 1, j = k - 1; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+ Y[k] = yk;
+
+ /* Squares are always positive. */
+ Y[0] = 1.0;
+
+ EY = 2 * EX;
+ /* Is there a carry beyond the most significant digit? */
+ if (__glibc_unlikely (Y[1] == ZERO))
+ {
+ for (i = 1; i <= p; i++)
+ Y[i] = Y[i + 1];
+ EY--;
+ }
+}
+
/* Invert *X and store in *Y. Relative error bound:
- For P = 2: 1.001 * R ^ (1 - P)
- For P = 3: 1.063 * R ^ (1 - P)
diff --git a/sysdeps/ieee754/dbl-64/mpa.h b/sysdeps/ieee754/dbl-64/mpa.h
index 06343d4..168b334 100644
--- a/sysdeps/ieee754/dbl-64/mpa.h
+++ b/sysdeps/ieee754/dbl-64/mpa.h
@@ -115,6 +115,7 @@ void __dbl_mp (double, mp_no *, int);
void __add (const mp_no *, const mp_no *, mp_no *, int);
void __sub (const mp_no *, const mp_no *, mp_no *, int);
void __mul (const mp_no *, const mp_no *, mp_no *, int);
+void __sqr (const mp_no *, mp_no *, int);
void __dvd (const mp_no *, const mp_no *, mp_no *, int);
extern void __mpatan (mp_no *, mp_no *, int);
diff --git a/sysdeps/ieee754/dbl-64/mpexp.c b/sysdeps/ieee754/dbl-64/mpexp.c
index 35c4e80..2db4536 100644
--- a/sysdeps/ieee754/dbl-64/mpexp.c
+++ b/sysdeps/ieee754/dbl-64/mpexp.c
@@ -152,14 +152,14 @@ __mpexp (mp_no *x, mp_no *y, int p)
/* Raise polynomial value to the power of 2**m. Put result in y. */
for (k = 0, j = 0; k < m;)
{
- __mul (&mpt2, &mpt2, &mpt1, p);
+ __sqr (&mpt2, &mpt1, p);
k++;
if (k == m)
{
j = 1;
break;
}
- __mul (&mpt1, &mpt1, &mpt2, p);
+ __sqr (&mpt1, &mpt2, p);
k++;
}
if (j)
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c b/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c
index b1784f2..0de80f6 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c
@@ -687,6 +687,99 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
return;
}
+/* Square *X and store result in *Y. X and Y may not overlap. For P in
+ [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
+ error is bounded by 1.001 ULP. This is a faster special case of
+ multiplication. */
+void
+__sqr (const mp_no *x, mp_no *y, int p)
+{
+ long i, j, k, ip;
+ double u, yk;
+
+ /* Is z=0? */
+ if (__glibc_unlikely (X[0] == ZERO))
+ {
+ Y[0] = ZERO;
+ return;
+ }
+
+ /* We need not iterate through all X's since it's pointless to
+ multiply zeroes. */
+ for (ip = p; ip > 0; ip--)
+ if (X[ip] != ZERO)
+ break;
+
+ k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
+
+ while (k > 2 * ip + 1)
+ Y[k--] = ZERO;
+
+ yk = ZERO;
+
+ while (k > p)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = k - p, j = p; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ if (i == j)
+ yk += X[i] * X[i];
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+
+ while (k > 1)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = 1, j = k - 1; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+ Y[k] = yk;
+
+ /* Squares are always positive. */
+ Y[0] = 1.0;
+
+ EY = 2 * EX;
+ /* Is there a carry beyond the most significant digit? */
+ if (__glibc_unlikely (Y[1] == ZERO))
+ {
+ for (i = 1; i <= p; i++)
+ Y[i] = Y[i + 1];
+ EY--;
+ }
+}
+
/* Invert *X and store in *Y. Relative error bound:
- For P = 2: 1.001 * R ^ (1 - P)
- For P = 3: 1.063 * R ^ (1 - P)
diff --git a/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c b/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c
index b1784f2..0de80f6 100644
--- a/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c
+++ b/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c
@@ -687,6 +687,99 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
return;
}
+/* Square *X and store result in *Y. X and Y may not overlap. For P in
+ [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
+ error is bounded by 1.001 ULP. This is a faster special case of
+ multiplication. */
+void
+__sqr (const mp_no *x, mp_no *y, int p)
+{
+ long i, j, k, ip;
+ double u, yk;
+
+ /* Is z=0? */
+ if (__glibc_unlikely (X[0] == ZERO))
+ {
+ Y[0] = ZERO;
+ return;
+ }
+
+ /* We need not iterate through all X's since it's pointless to
+ multiply zeroes. */
+ for (ip = p; ip > 0; ip--)
+ if (X[ip] != ZERO)
+ break;
+
+ k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
+
+ while (k > 2 * ip + 1)
+ Y[k--] = ZERO;
+
+ yk = ZERO;
+
+ while (k > p)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = k - p, j = p; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ if (i == j)
+ yk += X[i] * X[i];
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+
+ while (k > 1)
+ {
+ double yk2 = 0.0;
+ long lim = k / 2;
+
+ if (k % 2 == 0)
+ {
+ yk += X[lim] * X[lim];
+ lim--;
+ }
+
+ for (i = 1, j = k - 1; i <= lim; i++, j--)
+ yk2 += X[i] * X[j];
+
+ yk += 2.0 * yk2;
+
+ u = (yk + CUTTER) - CUTTER;
+ if (u > yk)
+ u -= RADIX;
+ Y[k--] = yk - u;
+ yk = u * RADIXI;
+ }
+ Y[k] = yk;
+
+ /* Squares are always positive. */
+ Y[0] = 1.0;
+
+ EY = 2 * EX;
+ /* Is there a carry beyond the most significant digit? */
+ if (__glibc_unlikely (Y[1] == ZERO))
+ {
+ for (i = 1; i <= p; i++)
+ Y[i] = Y[i + 1];
+ EY--;
+ }
+}
+
/* Invert *X and store in *Y. Relative error bound:
- For P = 2: 1.001 * R ^ (1 - P)
- For P = 3: 1.063 * R ^ (1 - P)
diff --git a/sysdeps/x86_64/fpu/multiarch/mpa-avx.c b/sysdeps/x86_64/fpu/multiarch/mpa-avx.c
index d3f4d7a..366b0b7 100644
--- a/sysdeps/x86_64/fpu/multiarch/mpa-avx.c
+++ b/sysdeps/x86_64/fpu/multiarch/mpa-avx.c
@@ -1,5 +1,6 @@
#define __add __add_avx
#define __mul __mul_avx
+#define __sqr __sqr_avx
#define __sub __sub_avx
#define __dbl_mp __dbl_mp_avx
#define __dvd __dvd_avx
diff --git a/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c
index 6abb671..a4a7594 100644
--- a/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c
+++ b/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c
@@ -1,5 +1,6 @@
#define __add __add_fma4
#define __mul __mul_fma4
+#define __sqr __sqr_fma4
#define __sub __sub_fma4
#define __dbl_mp __dbl_mp_fma4
#define __dvd __dvd_fma4