This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: new alpha division routines


Hi,

incidentally, I was also playing with the division routines lately, or
more precisely __remqu, since it seems to take most time (a whopping
30% of all libc cycles when compiling stuff with gcc). I do a few
things differently, maybe you can use some ideas:

* Special case power-of-two divisor. People do that quite often.
* Special case small dividends (compared to divisor).
* Never loop, use second divt/c if lacking precision. This avoids data
  dependent branches and gives good results for the important case of
  huge dividends (nearly all 64 bit set) and medium dividends which
  occurs in hash tables.
* Handle dividend cvtqt overflow branchless.

I briefly experimented with clz to determine whether a loop would be
useful, but it was only better in very few cases.

Here's a cycle count comparison on ev6 (not quite fair, since you
don't use itof). I've also attached the test program.

By the way, what do you think about inlining the 32-bit ops from gcc?
DEC C does that unconditionally, even on ev4 at -Os, so it is probably
not such a bad idea at least at -O2 on ev6... (I have a patch for that
too somewhere, I think)

your __remqu, not so hot branch prediction:

                   3   7  1f  7f  80  7f7 7ffff 7ffffd9 ffffffffff
               0  58  58  58  58  58   58    58      58         58
               f  58  58  58  58  58   58    58      58         58
              fe  58  58  58  58  58   58    58      58         58
             fed  58  58  58  58  58   58    58      58         58
            fedc  58  58  58  58  58   58    58      58         58
          fedcba  58  58  58  58  58   58    58      58         58
        fedcba98  58  58  58  58  58   58    58      58         58
      fedcba9876  58  58  58  58  58   58    58      58         58
    fedcba987654  58  58  58  58  58   58    58      58         58
  fedcba98765432 129 156 129 129 129  129   129     129        129
 3fb72ea61d950c8 175 195 156 129 129  129   129     129        129
 fedcba987654321 207 235 207 156 129  129   129     129        129


my __remqu, not so hot branch prediction:

                   3   7  1f  7f  80  7f7 7ffff 7ffffd9 ffffffffff
               0  38  38  38  38  29   38    38      38         38
               f  60  38  38  38  29   38    38      38         38
              fe  60  60  60  38  29   38    38      38         38
             fed  60  60  60  60  29   38    38      38         38
            fedc  60  60  60  60  29   60    38      38         38
          fedcba  60  60  60  60  29   60    60      38         38
        fedcba98  60  60  60  60  29   60    60      60         38
      fedcba9876  60  60  60  60  29   60    60      60         38
    fedcba987654  60  60  60  60  29   60    60      60         60
  fedcba98765432  60 113 113  60  29   60    60      60         60
 3fb72ea61d950c8 113 113 113  60  29   60    60      60         60
 fedcba987654321 113 113 113 113  29   60    60      60         60

your __remqu, hot branch prediction:
                   3   7  1f  7f  80  7f7 7ffff 7ffffd9 ffffffffff
               0  45  45  57  59  45   45    45      45         45
               f  45  46  45  50  46   50    50      50         50
              fe  50  57  59  57  57   57    45      46         57
             fed  46  57  46  57  46   57    57      57         57
            fedc  46  45  45  45  57   57    57      46         45
          fedcba  45  45  45  59  57   57    50      45         46
        fedcba98  46  45  59  59  57   57    46      46         46
      fedcba9876  45  57  57  46  57   45    46      45         46
    fedcba987654  57  57  46  50  46   46    46      46         57
  fedcba98765432 117 111 119 119 119  119   119     119        119
 3fb72ea61d950c8 124 128 117 119 116  116   117     117        117
 fedcba987654321 137 149 137 117 116  119   119     119        119

my __remqu, hot branch prediction:

                   3   7  1f  7f  80  7f7 7ffff 7ffffd9 ffffffffff
               0  12  12  12  12   5   12    11      11         11
               f  60  12  12  12   5   12    12      12         12
              fe  60  60  60  12   5   12    12      12         12
             fed  60  60  60  60   5   12    12      12         12
            fedc  60  60  60  60   5   60    12      12         12
          fedcba  60  60  60  60   5   60    60      12         11
        fedcba98  60  60  60  60   5   60    60      60         12
      fedcba9876  60  60  60  60   6   60    60      60         12
    fedcba987654  60  60  60  60   5   60    60      60         60
  fedcba98765432  60 104 104  60   5   60    60      60         60
 3fb72ea61d950c8 104 107 107  60   5   60    60      60         60
 fedcba987654321 104 107 107 104   5   60    60      60         60



# Alpha 64-bit unsigned remainder (__remqu), computing the quotient
# estimate with the FPU instead of a shift-and-subtract loop.
#
# Special division-routine calling convention (not the normal C ABI):
# the dividend arrives in t10, the divisor in t11, the remainder is
# returned in t12, and the return address is in t9.
#define dividend		t10
#define divisor			t11
#define result			t12
#define retaddr			t9

# Scratch registers; t0 and $f0/$f1 are saved on the stack below.
#define tmp0	AT
#define tmp1	t0
#define tmp2	t1
#define tmp3	t2
#define tmp4	t3

	
	.ent __remqu
	.globl __remqu

	.align 3
__remqu:
	lda	sp, -32(sp)
	.frame	sp, 32, retaddr
	.prologue 0
	# tmp0 = divisor - 1; combined with the 'and' below this is the
	# classic power-of-two test (divisor & (divisor - 1)) == 0.
	subq	divisor, 1, tmp0
	# result = sign bit of the dividend, used as a shift count below.
	cmplt	dividend, 0, result
	stt	$f0, 0*8(sp)

	stt	$f1, 1*8(sp)
	and	divisor, tmp0, tmp0
	# If the dividend has the high bit set, we divide it by 2 to
	# avoid cvtqt interpreting it as negative.
	# Pseudo-negative divisor does not pose a problem, since it will 
	# always bail out early.
	srl	dividend, result, result
	itoft	result, $f0

	itoft	divisor, $f1
	# NOTE(review): the two itoft instructions above have already
	# overwritten $f0/$f1, but the $powerof2 and $div_by_zero exits
	# return without reloading them from the stack -- confirm the
	# division-routine convention allows clobbering $f0/$f1 there.
	beq	tmp0, $powerof2	
	stq	tmp1, 2*8(sp)
	mov	dividend, result

	# Do two iterations of the standard loop while the FPU is busy.
	subq	dividend, divisor, tmp0
	cmpule	tmp0, dividend, tmp1
	cmovne	tmp1, tmp0, result
	subq	result, divisor, tmp0

	cmpule	tmp0, result, tmp1
	cmovne	tmp1, tmp0, result
	cvtqt/c	$f0, $f0
	cvtqt/c	$f1, $f1

	# If the two subtractions already reduced the value below the
	# divisor, the FP quotient is not needed.
	cmpule	divisor, result, tmp1
	beq	tmp1, $done
	divt/c	$f0, $f1, $f0
	cvttq/c	$f0, $f0

	# Undo the initial halving: shift the quotient estimate left by
	# one iff the dividend's sign bit was set (tmp0 is 0 or 1).
	cmplt   dividend, 0, tmp0
	ftoit	$f0, tmp1
	sll	tmp1, tmp0, tmp1
	mulq	tmp1, divisor, tmp1

	# result = dividend - quotient*divisor; done if below the divisor,
	# otherwise refine the residue with a second FP division.
	subq	dividend, tmp1, result
	itoft	result, $f0
	cmpule	divisor, result, tmp0
	beq	tmp0, $done
	# The divisor has lost up to 64-51 = 13 bits. Worst case is 
	# dividing by 3, remains about 12 bits loss. This difference
	# can always be handled precisely with a second divt.
	cvtqt/c	$f0, $f0
	divt/c	$f0, $f1, $f0
	cvttq/c	$f0, $f1
	ftoit	$f1, tmp1

	mulq	tmp1, divisor, tmp1
	subq	result, tmp1, result
	
	.align 4
	# Common exit: restore saved registers and pop the frame.
$done:	ldq	tmp1, 2*8(sp)
	ldt	$f0,  0*8(sp)
	ldt	$f1,  1*8(sp)
	lda	sp, 32(sp)

	ret	zero, (retaddr), 1

	.align 4
$powerof2:
	# Power-of-two divisor: the remainder is a simple mask.
	beq	divisor, $div_by_zero
	subq	divisor, 1, tmp0
	and	dividend, tmp0, result
	lda	sp, 32(sp)

	ret	zero, (retaddr), 1

$div_by_zero:
	# Raise the integer-divide trap via PALcode, preserving a0
	# around the call.
	mov	a0, tmp0
	lda	a0, GEN_INTDIV
	call_pal PAL_gentrap
	mov	tmp0, a0
	clr	result		# return 0 when somebody ignores the signal
	lda	sp, 32(sp)
	ret	zero, (retaddr), 1
		
	.end	__remqu


-- 
	Falk
#include <stdio.h>
#include <stdlib.h>

#ifdef __GNUC__
/* Read the Alpha cycle counter.  The extra operand is a dummy data
   dependency -- presumably it keeps the assembler/scheduler from moving
   the counter read across the timed operation; confirm against the
   assembler manual (the architectural form is rpcc Ra). */
#define rpcc(x) ({ unsigned long __r; asm volatile("rpcc %0, %1" : "=r"(__r) : "r"(x)) ; __r; })
#else
/* DEC C spelling of the same intrinsic. */
#include <c_asm.h>
#define rpcc(a) asm ("rpcc %v0, %a0", a)
#endif

/* Operand type and the operator under test (here: unsigned remainder). */
typedef unsigned long type;
#define OP %

/* Divisors under test: small odd values, an exact power of two (128),
   and values near successive powers of two up to 40 bits.
   volatile keeps the compiler from constant-folding the division.
   Spelled `unsigned long` rather than the non-standard BSD/SysV `ulong`
   typedef so the file also builds under strict -std modes. */
volatile unsigned long divisors[] = {
    3,
    7,
    31,
    127,
    128,
    2039,
//    32749,
    524287,
//    8388593,
    134217689,
//    4294967291,
//    4294967296,
    4294967296 * 256l - 1,
};

/* Dividends under test, growing from 0 to a full 60-bit value so every
   row of the result table exercises a different magnitude.
   volatile keeps the compiler from constant-folding the division.
   Spelled `unsigned long` rather than the non-standard BSD/SysV `ulong`
   typedef so the file also builds under strict -std modes. */
volatile unsigned long dividends[] = {
    0x0,
    0xf,
    0xfe,
//    0xfedc >> 6,
//    0xfedc >> 5,
    0xfedc >> 4,
//    0xfedc >> 3,
//    0xfedc >> 2,
//    0xfedc >> 1,
    0xfedc,
//    0xfedcb,
    0xfedcba,
//    0xfedcba9,
    0xfedcba98,
//    0xfedcba987,
    0xfedcba9876,
//    0xfedcba98765,
    0xfedcba987654,
//    0xfedcba9876543,
    0xfedcba98765432,
//    0xfedcba987654321 >> 3,
    0xfedcba987654321 >> 2,
//    0xfedcba987654321 >> 1,
    0xfedcba987654321,
};

/* Table dimensions and the number of timing samples taken per
   (dividend, divisor) pair; the median of the N samples is reported. */
#define NUM_DIVISORS (sizeof divisors / sizeof *divisors)
#define NUM_DIVIDENDS (sizeof dividends / sizeof *dividends)
#define N 65536*4

/* qsort() comparison callback for arrays of unsigned long.
 * Returns -1, 0, or 1 as *p1 is less than, equal to, or greater than *p2.
 * Written with explicit compares (not subtraction) so large values cannot
 * overflow the int return.  Uses the standard `unsigned long` spelling and
 * const-qualified casts instead of the non-standard `ulong` typedef so it
 * also builds under strict -std modes. */
int ulongcompare(const void *p1, const void *p2) {
    unsigned long i = *((const unsigned long *) p1);
    unsigned long j = *((const unsigned long *) p2);

    if (i > j)
	return 1;
    if (i < j)
	return -1;
    return 0;
}

/* timings[i][j] collects N raw cycle counts for dividends[i] OP divisors[j];
   counter[i][j] is how many samples have been stored so far.
   NOTE(review): with the default table sizes this is roughly 200 MB of
   zero-initialized data -- fine as BSS, but worth knowing. */
ulong timings[NUM_DIVIDENDS][NUM_DIVISORS][N];
ulong counter[NUM_DIVIDENDS][NUM_DIVISORS];

/* Time `dividend OP divisor` for every pair in the tables and print a
   table of median cycle counts (one row per dividend, one column per
   divisor). */
int main(void) {
    ulong done = 0;	/* only used by the randomized (#else) variant */
    type r;

    //int widths[] = { 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,  };
    /* Per-column print widths, matching the divisor header below. */
    int widths[] = { 3, 4, 4, 4, 4, 5, 6, 8, 11, 10, 9, 10, 11, 12, 13,  };

    ulong i, j, k;
#if 1
    /* Sequential sampling: each pair is timed N times in a row, so the
       branch predictor is hot ("hot branch prediction" tables). */
    for (i = 0; i < NUM_DIVIDENDS; ++i) {
	for (j = 0; j < NUM_DIVISORS; ++j){
	    for (k = 0; k < N; ++k) {
		ulong stop, start;
		/* volatile forces fresh loads so the compiler cannot
		   fold or hoist the operation under test. */
		volatile type x = dividends[i];
		volatile type y = divisors[j];
		volatile type z = x + y;
		/* rpcc's dummy operand makes the counter reads depend on
		   z and r, bracketing exactly the timed operation. */
		start = rpcc(z);
		r = x OP y;
		stop = rpcc(r);
		timings[i][j][counter[i][j]++] = stop - start;
	    }
	}
    }
#else
    /* Randomized sampling: pairs are timed in random order until each
       has N samples, so the branch predictor stays cold ("not so hot"
       tables). */
    while (done < NUM_DIVIDENDS * NUM_DIVISORS) {
	ulong i = rand() % NUM_DIVIDENDS;
	ulong j = rand() % NUM_DIVISORS;
	//ulong x = dividends[i];
	//ulong y = divisors[j];
	volatile type x = dividends[i];
	volatile type y = divisors[j];
	type r;

	if (counter[i][j] < N) {
	    ulong stop, start;
	    volatile type z = x + y;
	    start = rpcc(z);
	    r = x OP y;
	    stop = rpcc(r);
	    timings[i][j][counter[i][j]++] = stop - start;
	    if (counter[i][j] == N)
		++done;
	}
    }
#endif

    /* Header row: one hex column per divisor. */
    printf("                 ");
    for (j = 0; j < NUM_DIVISORS; ++j)
	printf("%*lx", widths[j], divisors[j]);
	/* NOTE(review): misleading indentation -- this newline is NOT
	   part of the loop above; it runs once, which is what the output
	   format needs.  Braces on the loop would make that explicit. */
	printf("\n");
    
    for (i = 0; i < NUM_DIVIDENDS; ++i) {
	printf("%16lx ", dividends[i]);
	
	/* Sort each cell's samples and report the median. */
	for (j = 0; j < NUM_DIVISORS; ++j) {
	    qsort(timings[i][j], N, sizeof *(timings[i][j]), ulongcompare);
	    printf("%*lu", widths[j], timings[i][j][N / 2]);
	}
	printf("\n");
    }

    return 0;
}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]