This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
Re: new alpha division routines
- From: Falk Hueffner <hueffner at informatik dot uni-tuebingen dot de>
- To: libc-alpha at sources dot redhat dot com
- Cc: Richard Henderson <rth at redhat dot com>
- Date: Sun, 28 Mar 2004 15:24:31 +0200
- Subject: Re: new alpha division routines
Hi,
incidentally, I was also playing with the division routines lately, or
more precisely __remqu, since it seems to take most time (a whopping
30% of all libc cycles when compiling stuff with gcc). I do a few
things differently, maybe you can use some ideas:
* Special case power-of-two divisor. People do that quite often.
* Special case small dividends (compared to divisor).
* Never loop, use second divt/c if lacking precision. This avoids data
dependent branches and gives good results for the important case of
huge dividends (nearly all 64 bit set) and medium dividends which
occurs in hash tables.
* Handle dividend cvtqt overflow branchless.
I briefly experimented with clz to determine whether a loop would be
useful, but it was only better in very few cases.
Here's a cycle count comparison on ev6 (not quite fair, since you
don't use itof). I've also attached the test program.
By the way, what do you think about inlining the 32-bit ops from gcc?
DEC C does that unconditionally, even on ev4 at -Os, so it is probably
not such a bad idea at least at -O2 on ev6... (I have a patch for that
too somewhere, I think)
your __remqu, not so hot branch prediction:
3 7 1f 7f 80 7f7 7ffff 7ffffd9 ffffffffff
0 58 58 58 58 58 58 58 58 58
f 58 58 58 58 58 58 58 58 58
fe 58 58 58 58 58 58 58 58 58
fed 58 58 58 58 58 58 58 58 58
fedc 58 58 58 58 58 58 58 58 58
fedcba 58 58 58 58 58 58 58 58 58
fedcba98 58 58 58 58 58 58 58 58 58
fedcba9876 58 58 58 58 58 58 58 58 58
fedcba987654 58 58 58 58 58 58 58 58 58
fedcba98765432 129 156 129 129 129 129 129 129 129
3fb72ea61d950c8 175 195 156 129 129 129 129 129 129
fedcba987654321 207 235 207 156 129 129 129 129 129
my __remqu, not so hot branch prediction:
0 38 38 38 38 29 38 38 38 38
f 60 38 38 38 29 38 38 38 38
fe 60 60 60 38 29 38 38 38 38
fed 60 60 60 60 29 38 38 38 38
fedc 60 60 60 60 29 60 38 38 38
fedcba 60 60 60 60 29 60 60 38 38
fedcba98 60 60 60 60 29 60 60 60 38
fedcba9876 60 60 60 60 29 60 60 60 38
fedcba987654 60 60 60 60 29 60 60 60 60
fedcba98765432 60 113 113 60 29 60 60 60 60
3fb72ea61d950c8 113 113 113 60 29 60 60 60 60
fedcba987654321 113 113 113 113 29 60 60 60 60
your __remqu, hot branch prediction:
3 7 1f 7f 80 7f7 7ffff 7ffffd9 ffffffffff
0 45 45 57 59 45 45 45 45 45
f 45 46 45 50 46 50 50 50 50
fe 50 57 59 57 57 57 45 46 57
fed 46 57 46 57 46 57 57 57 57
fedc 46 45 45 45 57 57 57 46 45
fedcba 45 45 45 59 57 57 50 45 46
fedcba98 46 45 59 59 57 57 46 46 46
fedcba9876 45 57 57 46 57 45 46 45 46
fedcba987654 57 57 46 50 46 46 46 46 57
fedcba98765432 117 111 119 119 119 119 119 119 119
3fb72ea61d950c8 124 128 117 119 116 116 117 117 117
fedcba987654321 137 149 137 117 116 119 119 119 119
my __remqu, hot branch prediction:
3 7 1f 7f 80 7f7 7ffff 7ffffd9 ffffffffff
0 12 12 12 12 5 12 11 11 11
f 60 12 12 12 5 12 12 12 12
fe 60 60 60 12 5 12 12 12 12
fed 60 60 60 60 5 12 12 12 12
fedc 60 60 60 60 5 60 12 12 12
fedcba 60 60 60 60 5 60 60 12 11
fedcba98 60 60 60 60 5 60 60 60 12
fedcba9876 60 60 60 60 6 60 60 60 12
fedcba987654 60 60 60 60 5 60 60 60 60
fedcba98765432 60 104 104 60 5 60 60 60 60
3fb72ea61d950c8 104 107 107 60 5 60 60 60 60
fedcba987654321 104 107 107 104 5 60 60 60 60
#define dividend t10
#define divisor t11
#define result t12
#define retaddr t9
#define tmp0 AT
#define tmp1 t0
#define tmp2 t1
#define tmp3 t2
#define tmp4 t3
.ent __remqu
.globl __remqu
.align 3
__remqu:
lda sp, -32(sp)
.frame sp, 32, retaddr
.prologue 0
subq divisor, 1, tmp0
cmplt dividend, 0, result
stt $f0, 0*8(sp)
stt $f1, 1*8(sp)
and divisor, tmp0, tmp0
# If the dividend has the high bit set, we divide it by 2 to
# avoid cvtqt interpreting it as negative.
# Pseudo-negative divisor does not pose a problem, since it will
# always bail out early.
srl dividend, result, result
itoft result, $f0
itoft divisor, $f1
beq tmp0, $powerof2
stq tmp1, 2*8(sp)
mov dividend, result
# Do two iterations of the standard loop while the FPU is busy.
subq dividend, divisor, tmp0
cmpule tmp0, dividend, tmp1
cmovne tmp1, tmp0, result
subq result, divisor, tmp0
cmpule tmp0, result, tmp1
cmovne tmp1, tmp0, result
cvtqt/c $f0, $f0
cvtqt/c $f1, $f1
cmpule divisor, result, tmp1
beq tmp1, $done
divt/c $f0, $f1, $f0
cvttq/c $f0, $f0
cmplt dividend, 0, tmp0
ftoit $f0, tmp1
sll tmp1, tmp0, tmp1
mulq tmp1, divisor, tmp1
subq dividend, tmp1, result
itoft result, $f0
cmpule divisor, result, tmp0
beq tmp0, $done
# The divisor has lost up to 64-51 = 13 bits. Worst case is
# dividing by 3, remains about 12 bits loss. This difference
# can always be handled precisely with a second divt.
cvtqt/c $f0, $f0
divt/c $f0, $f1, $f0
cvttq/c $f0, $f1
ftoit $f1, tmp1
mulq tmp1, divisor, tmp1
subq result, tmp1, result
.align 4
$done: ldq tmp1, 2*8(sp)
ldt $f0, 0*8(sp)
ldt $f1, 1*8(sp)
lda sp, 32(sp)
ret zero, (retaddr), 1
.align 4
$powerof2:
beq divisor, $div_by_zero
subq divisor, 1, tmp0
and dividend, tmp0, result
lda sp, 32(sp)
ret zero, (retaddr), 1
$div_by_zero:
mov a0, tmp0
lda a0, GEN_INTDIV
call_pal PAL_gentrap
mov tmp0, a0
clr result # return 0 when somebody ignores the signal
lda sp, 32(sp)
ret zero, (retaddr), 1
.end __remqu
--
Falk
#include <stdio.h>
#include <stdlib.h>
#ifdef __GNUC__
#define rpcc(x) ({ unsigned long __r; asm volatile("rpcc %0, %1" : "=r"(__r) : "r"(x)) ; __r; })
#else
#include <c_asm.h>
#define rpcc(a) asm ("rpcc %v0, %a0", a)
#endif
typedef unsigned long type;
#define OP %
volatile ulong divisors[] = {
3,
7,
31,
127,
128,
2039,
// 32749,
524287,
// 8388593,
134217689,
// 4294967291,
// 4294967296,
4294967296 * 256l - 1,
};
volatile ulong dividends[] = {
0x0,
0xf,
0xfe,
// 0xfedc >> 6,
// 0xfedc >> 5,
0xfedc >> 4,
// 0xfedc >> 3,
// 0xfedc >> 2,
// 0xfedc >> 1,
0xfedc,
// 0xfedcb,
0xfedcba,
// 0xfedcba9,
0xfedcba98,
// 0xfedcba987,
0xfedcba9876,
// 0xfedcba98765,
0xfedcba987654,
// 0xfedcba9876543,
0xfedcba98765432,
// 0xfedcba987654321 >> 3,
0xfedcba987654321 >> 2,
// 0xfedcba987654321 >> 1,
0xfedcba987654321,
};
#define NUM_DIVISORS (sizeof divisors / sizeof *divisors)
#define NUM_DIVIDENDS (sizeof dividends / sizeof *dividends)
#define N 65536*4
int ulongcompare(const void *p1, const void *p2) {
ulong i = *((ulong *) p1);
ulong j = *((ulong *) p2);
if (i > j)
return 1;
if (i < j)
return -1;
return 0;
}
ulong timings[NUM_DIVIDENDS][NUM_DIVISORS][N];
ulong counter[NUM_DIVIDENDS][NUM_DIVISORS];
int main(void) {
ulong done = 0;
type r;
//int widths[] = { 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, };
int widths[] = { 3, 4, 4, 4, 4, 5, 6, 8, 11, 10, 9, 10, 11, 12, 13, };
ulong i, j, k;
#if 1
for (i = 0; i < NUM_DIVIDENDS; ++i) {
for (j = 0; j < NUM_DIVISORS; ++j){
for (k = 0; k < N; ++k) {
ulong stop, start;
volatile type x = dividends[i];
volatile type y = divisors[j];
volatile type z = x + y;
start = rpcc(z);
r = x OP y;
stop = rpcc(r);
timings[i][j][counter[i][j]++] = stop - start;
}
}
}
#else
while (done < NUM_DIVIDENDS * NUM_DIVISORS) {
ulong i = rand() % NUM_DIVIDENDS;
ulong j = rand() % NUM_DIVISORS;
//ulong x = dividends[i];
//ulong y = divisors[j];
volatile type x = dividends[i];
volatile type y = divisors[j];
type r;
if (counter[i][j] < N) {
ulong stop, start;
volatile type z = x + y;
start = rpcc(z);
r = x OP y;
stop = rpcc(r);
timings[i][j][counter[i][j]++] = stop - start;
if (counter[i][j] == N)
++done;
}
}
#endif
printf(" ");
for (j = 0; j < NUM_DIVISORS; ++j)
printf("%*lx", widths[j], divisors[j]);
printf("\n");
for (i = 0; i < NUM_DIVIDENDS; ++i) {
printf("%16lx ", dividends[i]);
for (j = 0; j < NUM_DIVISORS; ++j) {
qsort(timings[i][j], N, sizeof *(timings[i][j]), ulongcompare);
printf("%*lu", widths[j], timings[i][j][N / 2]);
}
printf("\n");
}
return 0;
}