This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Ping: [PATCH v3] Don't spin around multiplying zeroes in __mul


Ping!

On Wed, Jan 16, 2013 at 06:05:27PM +0530, Siddhesh Poyarekar wrote:
> Hi,
> 
> I managed to improve this patch a bit further.  Attached is an updated
> patch, this time also with comments describing the rationale.  The
> testsuite continues to run clean o x86_64 and the performance numbers
> are as follows:
> 
> Without the patch:
> 
> Total:41379782042, Fastest:402117, Slowest:795883, Avg:413797.820420
> 
> With the patch:
> 
> Total:26096919666, Fastest:256379, Slowest:811423, Avg:260969.196660
> 
> That makes it a 36.9% improvement on average runtime and 36.2%
> improvement on the fastest time.  So that's about 7% better than the
> last patch.
> 
> 
> Siddhesh
> 
> 	* sysdeps/ieee754/dbl-64/mpa.c (__mul): Don't bother with zero
> 	values in the mantissa.
> 
> 
> diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c
> index dffa9d8..0555493 100644
> --- a/sysdeps/ieee754/dbl-64/mpa.c
> +++ b/sysdeps/ieee754/dbl-64/mpa.c
> @@ -604,7 +604,7 @@ void
>  SECTION
>  __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
>  {
> -  int i, j, k, k2;
> +  int i, j, k, ip, ip2;
>    double u, zk;
>  
>    /* Is z=0?  */
> @@ -614,11 +614,35 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
>        return;
>      }
>  
> +  /* We need not iterate through all X's and Y's since it's pointless to
> +     multiply zeroes.  Here, both are zero...  */
> +  for (ip2 = p; ip2 > 0; ip2--)
> +    if (X[ip2] != ZERO || Y[ip2] != ZERO)
> +      break;
> +
> +  /* ... and here, at least one of them is still zero.  */
> +  for (ip = ip2; ip > 0; ip--)
> +    if (X[ip] * Y[ip] != ZERO)
> +      break;
> +
>    /* Multiply, add and carry.  */
> -  k2 = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
> -  zk = Z[k2] = ZERO;
> +  k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
> +
> +  /* For X with precision IP and Y with precision IP2, the term X[I]*Y[K-I] is
> +     non-zero only when the ranges of non-zero values overlap.  This happens
> +     only for values of K <= IP + IP2.  Note that we go from 1..K-1, which is
> +     why we come down to IP + IP2 + 1 and not just IP + IP2.  */
> +  while (k > ip + ip2 + 1)
> +    Z[k--] = ZERO;
> +
> +  zk = Z[k] = ZERO;
>  
> -  for (k = k2; k > p; k--)
> +  /* This gives us additional precision if required.  This is only executed
> +     when P < IP1 + IP2 + 1, i.e. at least one of the numbers have precision
> +     of greater than or equal to half of what's required (P).  Anything less
> +     and we're just wasting our time since we'll be spinning around
> +     multiplying zeroes.  */
> +  while (k > p)
>      {
>        for (i = k - p, j = p; i < p + 1; i++, j--)
>  	zk += X[i] * Y[j];
> @@ -626,10 +650,11 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
>        u = (zk + CUTTER) - CUTTER;
>        if (u > zk)
>  	u -= RADIX;
> -      Z[k] = zk - u;
> +      Z[k--] = zk - u;
>        zk = u * RADIXI;
>      }
>  
> +  /* The real deal.  */
>    while (k > 1)
>      {
>        for (i = 1, j = k - 1; i < k; i++, j--)
> @@ -638,9 +663,8 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
>        u = (zk + CUTTER) - CUTTER;
>        if (u > zk)
>  	u -= RADIX;
> -      Z[k] = zk - u;
> +      Z[k--] = zk - u;
>        zk = u * RADIXI;
> -      k--;
>      }
>    Z[k] = zk;
>  


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]