This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

new alpha division routines


The new routines use the FPU to either get the complete result
(32-bit routines; 64-bit routines with small arguments), or an
estimate of the result from which we can refine the correct
integral result.

In tests on an ev67 machine, in the case we can use the FPU result
directly, execution time is a constant 43 cycles.  This is down
from approximately 80 cycles for similar inputs with the existing
routine.  Further, for worst-case inputs (largest / small), the
cycle count goes down from ~400 to ~150 cycles.

Now that I've got the basic routines working, I'll probably write
ev6 and ev67 variants of these that make use of the new insns
available to these processors.


r~


        * sysdeps/alpha/Makefile <gnulib> (sysdep_routines): Merge divrem
        variable, add unsigned variants.
        * sysdeps/alpha/divrem.h: Remove file.
        * sysdeps/alpha/div_libc.h: New file.
        * sysdeps/alpha/divl.S: Rewrite from scratch.
        * sysdeps/alpha/reml.S: Likewise.
        * sysdeps/alpha/divq.S: Likewise.
        * sysdeps/alpha/remq.S: Likewise.
        * sysdeps/alpha/divlu.S: New file.
        * sysdeps/alpha/remlu.S: New file.
        * sysdeps/alpha/divqu.S: New file.
        * sysdeps/alpha/remqu.S: New file.

Index: sysdeps/alpha/Makefile
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/alpha/Makefile,v
retrieving revision 1.18
diff -c -p -d -r1.18 Makefile
*** sysdeps/alpha/Makefile	29 Jan 2002 03:53:32 -0000	1.18
--- sysdeps/alpha/Makefile	26 Mar 2004 23:44:04 -0000
*************** sysdep_routines += _mcount
*** 26,32 ****
  endif
  
  ifeq ($(subdir),gnulib)
! sysdep_routines += $(divrem)
  endif
  
  ifeq ($(subdir),string)
--- 26,32 ----
  endif
  
  ifeq ($(subdir),gnulib)
! sysdep_routines += divl divlu divq divqu reml remlu remq remqu
  endif
  
  ifeq ($(subdir),string)
*************** ifeq ($(subdir),elf)
*** 37,44 ****
  # The ld.so startup code cannot use literals until it self-relocates.
  CFLAGS-rtld.c = -mbuild-constants
  endif
- 
- divrem := divl divq reml remq
  
  # For now, build everything with full IEEE math support.
  # TODO: build separate libm and libm-ieee.
--- 37,42 ----
Index: sysdeps/alpha/div_libc.h
===================================================================
RCS file: sysdeps/alpha/div_libc.h
diff -N sysdeps/alpha/div_libc.h
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- sysdeps/alpha/div_libc.h	26 Mar 2004 23:44:04 -0000
***************
*** 0 ****
--- 1,113 ----
+ /* Copyright (C) 2004 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+ 
+    The GNU C Library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+ 
+    You should have received a copy of the GNU Lesser General Public
+    License along with the GNU C Library; if not, write to the Free
+    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+    02111-1307 USA.  */
+ 
+ /* Common bits for implementing software divide.  */
+ 
+ #include <sysdep.h>
+ #ifdef __linux__
+ # include <asm/gentrap.h>
+ # include <asm/pal.h>
+ #else
+ # include <machine/pal.h>
+ #endif
+ 
+ /* These are not normal C functions.  Argument registers are t10 and t11;
+    the result goes in t12; the return address is in t9.  Only t12 and AT
+    may be clobbered.  */
+ #define X	t10
+ #define Y	t11
+ #define RV	t12
+ #define RA	t9
+ 
+ /* None of these functions should use implicit anything.  */
+ 	.set	nomacro
+ 	.set	noat
+ 
+ /* Code fragment to invoke _mcount for profiling.  This should be invoked
+    directly after allocation of the stack frame.  */
+ .macro CALL_MCOUNT
+ #ifdef PROF
+ 	stq	ra, 0(sp)
+ 	stq	pv, 8(sp)
+ 	stq	gp, 16(sp)
+ 	cfi_rel_offset (ra, 0)
+ 	cfi_rel_offset (pv, 8)
+ 	cfi_rel_offset (gp, 16)
+ 	br	AT, 1f
+ 	.set	macro
+ 1:	ldgp	gp, 0(AT)
+ 	mov	RA, ra
+ 	lda	AT, _mcount
+ 	jsr	AT, (AT), _mcount
+ 	.set	nomacro
+ 	ldq	ra, 0(sp)
+ 	ldq	pv, 8(sp)
+ 	ldq	gp, 16(sp)
+ 	cfi_restore (ra)
+ 	cfi_restore (pv)
+ 	cfi_restore (gp)
+ 	/* Realign subsequent code with what we'd have without this
+ 	   macro at all.  This means aligned with one arithmetic insn
+ 	   used within the bundle.  */
+ 	.align	4
+ 	nop
+ #endif
+ .endm
+ 
+ /* In order to make the below work, all top-level divide routines must
+    use the same frame size.  */
+ #define FRAME	48
+ 
+ /* Code fragment to generate an integer divide-by-zero fault.  When
+    building libc.so, we arrange for there to be one copy of this code
+    placed late in the dso, such that all branches are forward.  When
+    building libc.a, we use multiple copies to avoid having an out of
+    range branch.  Users should jump to DIVBYZERO.  */
+ 
+ .macro DO_DIVBYZERO
+ #ifdef PIC
+ #define DIVBYZERO	__divbyzero
+ 	.section .gnu.linkonce.t.divbyzero, "ax", @progbits
+ 	.globl	__divbyzero
+ 	.type	__divbyzero, @function
+ 	.usepv	__divbyzero, no
+ 	.hidden	__divbyzero
+ #else
+ #define DIVBYZERO	$divbyzero
+ #endif
+ 
+ 	.align	4
+ DIVBYZERO:
+ 	cfi_startproc
+ 	cfi_return_column (RA)
+ 	cfi_def_cfa_offset (FRAME)
+ 
+ 	mov	a0, RV
+ 	unop
+ 	lda	a0, GEN_INTDIV
+ 	call_pal PAL_gentrap
+ 
+ 	mov	RV, a0
+ 	clr	RV
+ 	lda	sp, FRAME(sp)
+ 	cfi_def_cfa_offset (0)
+ 	ret	$31, (RA), 1
+ 
+ 	cfi_endproc
+ 	.size	DIVBYZERO, .-DIVBYZERO
+ .endm
Index: sysdeps/alpha/divl.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/alpha/divl.S,v
retrieving revision 1.5
diff -c -p -d -r1.5 divl.S
*** sysdeps/alpha/divl.S	6 Nov 1996 04:23:53 -0000	1.5
--- sysdeps/alpha/divl.S	26 Mar 2004 23:44:04 -0000
***************
*** 1,6 ****
! #define IS_REM		0
! #define SIZE		4
! #define UFUNC_NAME	__divlu
! #define SFUNC_NAME	__divl
  
! #include "divrem.h"
--- 1,75 ----
! /* Copyright (C) 2004 Free Software Foundation, Inc.
!    This file is part of the GNU C Library.
  
!    The GNU C Library is free software; you can redistribute it and/or
!    modify it under the terms of the GNU Lesser General Public
!    License as published by the Free Software Foundation; either
!    version 2.1 of the License, or (at your option) any later version.
! 
!    The GNU C Library is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
!    Lesser General Public License for more details.
! 
!    You should have received a copy of the GNU Lesser General Public
!    License along with the GNU C Library; if not, write to the Free
!    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
!    02111-1307 USA.  */
! 
! #include "div_libc.h"
! 
! /* 32-bit signed int divide.  This is not a normal C function.  Argument
!    registers are t10 and t11, the result goes in t12.  Only t12 and AT may
!    be clobbered.
! 
!    The FPU can handle all input values except zero.  Whee!  */
! 
! #ifndef EXTEND
! #define EXTEND(S,D)	sextl S, D
! #endif
! 
! 	.text
! 	.align	4
! 	.globl	__divl
! 	.type	__divl, @function
! 	.usepv	__divl, no
! 
! 	cfi_startproc
! 	cfi_return_column (RA)
! __divl:
! 	lda	sp, -FRAME(sp)
! 	cfi_def_cfa_offset (FRAME)
! 	CALL_MCOUNT
! 	stt	$f0, 0(sp)
! 	stt	$f1, 8(sp)
! 	beq	Y, DIVBYZERO
! 	cfi_rel_offset ($f0, 0)
! 	cfi_rel_offset ($f1, 8)
! 
! 	EXTEND	(X, RV)
! 	EXTEND	(Y, AT)
! 	stq	RV, 16(sp)
! 	stq	AT, 24(sp)
! 
! 	ldt	$f0, 16(sp)
! 	ldt	$f1, 24(sp)
! 	cvtqt	$f0, $f0
! 	cvtqt	$f1, $f1
! 
! 	divt/c	$f0, $f1, $f0
! 	cvttq/c	$f0, $f0
! 	stt	$f0, 16(sp)
! 	ldt	$f0, 0(sp)
! 
! 	ldt	$f1, 8(sp)
! 	ldl	RV, 16(sp)
! 	lda	sp, FRAME(sp)
! 	cfi_restore ($f0)
! 	cfi_restore ($f1)
! 	cfi_def_cfa_offset (0)
! 	ret	$31, (RA), 1
! 
! 	cfi_endproc
! 	.size	__divl, .-__divl
! 
! 	DO_DIVBYZERO
Index: sysdeps/alpha/divlu.S
===================================================================
RCS file: sysdeps/alpha/divlu.S
diff -N sysdeps/alpha/divlu.S
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- sysdeps/alpha/divlu.S	26 Mar 2004 23:44:04 -0000
***************
*** 0 ****
--- 1,4 ----
+ #define UNSIGNED
+ #define EXTEND(S,D)	zapnot S, 15, D
+ #define __divl		__divlu
+ #include <divl.S>
Index: sysdeps/alpha/divq.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/alpha/divq.S,v
retrieving revision 1.5
diff -c -p -d -r1.5 divq.S
*** sysdeps/alpha/divq.S	6 Nov 1996 04:23:54 -0000	1.5
--- sysdeps/alpha/divq.S	26 Mar 2004 23:44:04 -0000
***************
*** 1,6 ****
! #define IS_REM		0
! #define SIZE		8
! #define UFUNC_NAME	__divqu
! #define SFUNC_NAME	__divq
  
! #include "divrem.h"
--- 1,265 ----
! /* Copyright (C) 2004 Free Software Foundation, Inc.
!    This file is part of the GNU C Library.
  
!    The GNU C Library is free software; you can redistribute it and/or
!    modify it under the terms of the GNU Lesser General Public
!    License as published by the Free Software Foundation; either
!    version 2.1 of the License, or (at your option) any later version.
! 
!    The GNU C Library is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
!    Lesser General Public License for more details.
! 
!    You should have received a copy of the GNU Lesser General Public
!    License along with the GNU C Library; if not, write to the Free
!    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
!    02111-1307 USA.  */
! 
! #include "div_libc.h"
! 
! 
! /* 64-bit signed long divide.  These are not normal C functions.  Argument
!    registers are t10 and t11, the result goes in t12.  Only t12 and AT may
!    be clobbered.
! 
!    Theory of operation here is that we can use the FPU divider for virtually
!    all operands that we see: all dividend values between -2**53 and 2**53-1
!    can be computed directly.  Note that divisor values need not be checked
!    against that range because the rounded fp value will be close enough such
!    that the quotient is < 1, which will properly be truncated to zero when we
!    convert back to integer.
! 
!    When the dividend is outside the range for which we can compute exact
!    results, we use the fp quotient as an estimate from which we begin refining
!    an exact integral value.  This reduces the number of iterations in the
!    shift-and-subtract loop significantly.  */
! 
! 	.text
! 	.align	4
! 	.globl	__divq
! 	.type	__divq, @function
! 	.usepv	__divq, no
! 
! 	cfi_startproc
! 	cfi_return_column (RA)
! __divq:
! 	lda	sp, -FRAME(sp)
! 	cfi_def_cfa_offset (FRAME)
! 	CALL_MCOUNT
! 
! 	/* Get the fp divide insn issued as quickly as possible.  After
! 	   that's done, we have at least 22 cycles until its results are
! 	   ready -- all the time in the world to figure out how we're
! 	   going to use the results.  */
! 	stq	X, 16(sp)
! 	stq	Y, 24(sp)
! 	beq	Y, DIVBYZERO
! 
! 	stt	$f0, 0(sp)
! 	stt	$f1, 8(sp)
! 	cfi_rel_offset ($f0, 0)
! 	cfi_rel_offset ($f1, 8)
! 	ldt	$f0, 16(sp)
! 	ldt	$f1, 24(sp)
! 
! 	cvtqt	$f0, $f0
! 	cvtqt	$f1, $f1
! 	divt/c	$f0, $f1, $f0
! 
! 	/* Check to see if X fit in the double as an exact value.  */
! 	sll	X, (64-53), AT
! 	ldt	$f1, 8(sp)
! 	sra	AT, (64-53), AT
! 	cmpeq	X, AT, AT
! 	beq	AT, $x_big
! 
! 	/* If we get here, we're expecting exact results from the division.
! 	   Do nothing else besides convert and clean up.  */
! 	cvttq/c	$f0, $f0
! 	stt	$f0, 16(sp)
! 
! 	ldq	RV, 16(sp)
! 	ldt	$f0, 0(sp)
! 	cfi_restore ($f1)
! 	cfi_remember_state
! 	cfi_restore ($f0)
! 	cfi_def_cfa_offset (0)
! 	lda	sp, FRAME(sp)
! 	ret	$31, (RA), 1
! 
! 	.align	4
! 	cfi_restore_state
! $x_big:
! 	/* If we get here, X is large enough that we don't expect exact
! 	   results, and neither X nor Y got mis-translated for the fp
! 	   division.  Our task is to take the fp result, figure out how
! 	   far it's off from the correct result and compute a fixup.  */
! 	stq	t0, 16(sp)
! 	stq	t1, 24(sp)
! 	stq	t2, 32(sp)
! 	stq	t5, 40(sp)
! 	cfi_rel_offset (t0, 16)
! 	cfi_rel_offset (t1, 24)
! 	cfi_rel_offset (t2, 32)
! 	cfi_rel_offset (t5, 40)
! 
! #define Q	RV		/* quotient */
! #define R	t0		/* remainder */
! #define SY	t1		/* scaled Y */
! #define S	t2		/* scalar */
! #define QY	t3		/* Q*Y */
! 
! 	/* The fixup code below can only handle unsigned values.  */
! 	or	X, Y, AT
! 	mov	$31, t5
! 	blt	AT, $fix_sign_in
! $fix_sign_in_ret1:
! 	cvttq/c	$f0, $f0
! 
! 	stt	$f0, 8(sp)
! 	ldq	Q, 8(sp)
! $fix_sign_in_ret2:
! 	mulq	Q, Y, QY
! 	stq	t4, 8(sp)
! 
! 	ldt	$f0, 0(sp)
! 	unop
! 	cfi_rel_offset (t4, 8)
! 	cfi_restore ($f0)
! 	stq	t3, 0(sp)
! 	unop
! 	cfi_rel_offset (t3, 0)
! 
! 	subq	QY, X, R
! 	mov	Y, SY
! 	mov	1, S
! 	bgt	R, $q_high
! 
! $q_high_ret:
! 	subq	X, QY, R
! 	mov	Y, SY
! 	mov	1, S
! 	bgt	R, $q_low
! 
! $q_low_ret:
! 	ldq	t0, 16(sp)
! 	ldq	t1, 24(sp)
! 	ldq	t2, 32(sp)
! 	bne	t5, $fix_sign_out
! 
! $fix_sign_out_ret:
! 	ldq	t3, 0(sp)
! 	ldq	t4, 8(sp)
! 	ldq	t5, 40(sp)
! 	lda	sp, FRAME(sp)
! 	cfi_remember_state
! 	cfi_restore (t0)
! 	cfi_restore (t1)
! 	cfi_restore (t2)
! 	cfi_restore (t3)
! 	cfi_restore (t4)
! 	cfi_restore (t5)
! 	cfi_def_cfa_offset (0)
! 	ret	$31, (RA), 1
! 
! 	.align	4
! 	cfi_restore_state
! 	/* The quotient that we computed was too large.  We need to reduce
! 	   it by S such that Y*S >= R.  Obviously the closer we get to the
! 	   correct value the better, but overshooting high is ok, as we'll
! 	   fix that up later.  */
! 0:
! 	addq	SY, SY, SY
! 	addq	S, S, S
! $q_high:
! 	cmpult	SY, R, AT
! 	bne	AT, 0b
! 
! 	subq	Q, S, Q
! 	unop
! 	subq	QY, SY, QY
! 	br	$q_high_ret
! 
! 	.align	4
! 	/* The quotient that we computed was too small.  Divide Y by the 
! 	   current remainder (R) and add that to the existing quotient (Q).
! 	   The expectation, of course, is that R is much smaller than X.  */
! 	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
! 	   already have a copy of Y in SY and the value 1 in S.  */
! 0:
! 	addq	SY, SY, SY
! 	addq	S, S, S
! $q_low:
! 	cmpult	SY, R, AT
! 	bne	AT, 0b
! 
! 	/* Shift-down and subtract loop.  Each iteration compares our scaled
! 	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
! 	   Y's scalar (S) so add it to the quotient (Q).  */
! 2:	addq	Q, S, t3
! 	srl	S, 1, S
! 	cmpule	SY, R, AT
! 	subq	R, SY, t4
! 
! 	cmovne	AT, t3, Q
! 	cmovne	AT, t4, R
! 	srl	SY, 1, SY
! 	bne	S, 2b
! 
! 	br	$q_low_ret
! 
! 	.align	4
! $fix_sign_in:
! 	/* If we got here, then X|Y is negative.  Need to adjust everything
! 	   such that we're doing unsigned division in the fixup loop.  */
! 	/* T5 records the changes we had to make:
! 		bit 0:	set if result should be negative.
! 		bit 2:	set if X was negated.
! 		bit 3:	set if Y was negated.
! 	*/
! 	xor	X, Y, AT
! 	cmplt	AT, 0, t5
! 	cmplt	X, 0, AT
! 	negq	X, t0
! 
! 	s4addq	AT, t5, t5
! 	cmovne	AT, t0, X
! 	cmplt	Y, 0, AT
! 	negq	Y, t0
! 
! 	s8addq	AT, t5, t5
! 	cmovne	AT, t0, Y
! 	unop
! 	blbc	t5, $fix_sign_in_ret1
! 
! 	cvttq/c	$f0, $f0
! 	stt	$f0, 8(sp)
! 	ldq	Q, 8(sp)
! 	unop
! 
! 	negq	Q, Q
! 	br	$fix_sign_in_ret2
! 
! 	.align	4
! $fix_sign_out:
! 	/* Now we get to undo what we did above.  */
! 	/* ??? Is this really faster than just increasing the size of
! 	   the stack frame and storing X and Y in memory?  */
! 	and	t5, 8, AT
! 	negq	Y, t4
! 	cmovne	AT, t4, Y
! 
! 	and	t5, 4, AT
! 	negq	X, t4
! 	cmovne	AT, t4, X
! 
! 	negq	RV, t4
! 	cmovlbs	t5, t4, RV
! 
! 	br	$fix_sign_out_ret
! 
! 	cfi_endproc
! 	.size	__divq, .-__divq
! 
! 	DO_DIVBYZERO
Index: sysdeps/alpha/divqu.S
===================================================================
RCS file: sysdeps/alpha/divqu.S
diff -N sysdeps/alpha/divqu.S
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- sysdeps/alpha/divqu.S	26 Mar 2004 23:44:04 -0000
***************
*** 0 ****
--- 1,244 ----
+ /* Copyright (C) 2004 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+ 
+    The GNU C Library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+ 
+    You should have received a copy of the GNU Lesser General Public
+    License along with the GNU C Library; if not, write to the Free
+    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+    02111-1307 USA.  */
+ 
+ #include "div_libc.h"
+ 
+ 
+ /* 64-bit unsigned long divide.  These are not normal C functions.  Argument
+    registers are t10 and t11, the result goes in t12.  Only t12 and AT may be
+    clobbered.
+ 
+    Theory of operation here is that we can use the FPU divider for virtually
+    all operands that we see: all dividend values between -2**53 and 2**53-1
+    can be computed directly.  Note that divisor values need not be checked
+    against that range because the rounded fp value will be close enough such
+    that the quotient is < 1, which will properly be truncated to zero when we
+    convert back to integer.
+ 
+    When the dividend is outside the range for which we can compute exact
+    results, we use the fp quotient as an estimate from which we begin refining
+    an exact integral value.  This reduces the number of iterations in the
+    shift-and-subtract loop significantly.  */
+ 
+ 	.text
+ 	.align	4
+ 	.globl	__divqu
+ 	.type	__divqu, @function
+ 	.usepv	__divqu, no
+ 
+ 	cfi_startproc
+ 	cfi_return_column (RA)
+ __divqu:
+ 	lda	sp, -FRAME(sp)
+ 	cfi_def_cfa_offset (FRAME)
+ 	CALL_MCOUNT
+ 
+ 	/* Get the fp divide insn issued as quickly as possible.  After
+ 	   that's done, we have at least 22 cycles until its results are
+ 	   ready -- all the time in the world to figure out how we're
+ 	   going to use the results.  */
+ 	stq	X, 16(sp)
+ 	stq	Y, 24(sp)
+ 	beq	Y, DIVBYZERO
+ 
+ 	stt	$f0, 0(sp)
+ 	stt	$f1, 8(sp)
+ 	cfi_rel_offset ($f0, 0)
+ 	cfi_rel_offset ($f1, 8)
+ 	ldt	$f0, 16(sp)
+ 	ldt	$f1, 24(sp)
+ 
+ 	cvtqt	$f0, $f0
+ 	cvtqt	$f1, $f1
+ 	blt	X, $x_is_neg
+ 	divt/c	$f0, $f1, $f0
+ 
+ 	/* Check to see if Y was mis-converted as signed value.  */
+ 	ldt	$f1, 8(sp)
+ 	unop
+ 	nop
+ 	blt	Y, $y_is_neg
+ 
+ 	/* Check to see if X fit in the double as an exact value.  */
+ 	srl	X, 53, AT
+ 	bne	AT, $x_big
+ 
+ 	/* If we get here, we're expecting exact results from the division.
+ 	   Do nothing else besides convert and clean up.  */
+ 	cvttq/c	$f0, $f0
+ 	stt	$f0, 16(sp)
+ 
+ 	ldq	RV, 16(sp)
+ 	ldt	$f0, 0(sp)
+ 	cfi_remember_state
+ 	cfi_restore ($f0)
+ 	cfi_restore ($f1)
+ 	cfi_def_cfa_offset (0)
+ 	lda	sp, FRAME(sp)
+ 	ret	$31, (RA), 1
+ 
+ 	.align	4
+ 	cfi_restore_state
+ $x_is_neg:
+ 	/* If we get here, X is so big that bit 63 is set, which made the
+ 	   conversion come out negative.  Fix it up lest we not even get
+ 	   a good estimate.  */
+ 	ldah	AT, 0x5f80		/* 2**64 as float.  */
+ 	stt	$f2, 24(sp)
+ 	cfi_rel_offset ($f2, 24)
+ 	stl	AT, 16(sp)
+ 	lds	$f2, 16(sp)
+ 
+ 	addt	$f0, $f2, $f0
+ 	unop
+ 	divt/c	$f0, $f1, $f0
+ 	unop
+ 
+ 	/* Ok, we've now the divide issued.  Continue with other checks.  */
+ 	ldt	$f1, 8(sp)
+ 	unop
+ 	ldt	$f2, 24(sp)
+ 	blt	Y, $y_is_neg
+ 	cfi_restore ($f1)
+ 	cfi_restore ($f2)
+ 	cfi_remember_state	/* for y_is_neg */
+ 
+ 	.align	4
+ $x_big:
+ 	/* If we get here, X is large enough that we don't expect exact
+ 	   results, and neither X nor Y got mis-translated for the fp
+ 	   division.  Our task is to take the fp result, figure out how
+ 	   far it's off from the correct result and compute a fixup.  */
+ 	stq	t0, 16(sp)
+ 	stq	t1, 24(sp)
+ 	stq	t2, 32(sp)
+ 	stq	t3, 40(sp)
+ 	cfi_rel_offset (t0, 16)
+ 	cfi_rel_offset (t1, 24)
+ 	cfi_rel_offset (t2, 32)
+ 	cfi_rel_offset (t3, 40)
+ 
+ #define Q	RV		/* quotient */
+ #define R	t0		/* remainder */
+ #define SY	t1		/* scaled Y */
+ #define S	t2		/* scalar */
+ #define QY	t3		/* Q*Y */
+ 
+ 	cvttq/c	$f0, $f0
+ 	stt	$f0, 8(sp)
+ 	ldq	Q, 8(sp)
+ 	mulq	Q, Y, QY
+ 
+ 	stq	t4, 8(sp)
+ 	unop
+ 	ldt	$f0, 0(sp)
+ 	unop
+ 	cfi_rel_offset (t4, 8)
+ 	cfi_restore ($f0)
+ 
+ 	subq	QY, X, R
+ 	mov	Y, SY
+ 	mov	1, S
+ 	bgt	R, $q_high
+ 
+ $q_high_ret:
+ 	subq	X, QY, R
+ 	mov	Y, SY
+ 	mov	1, S
+ 	bgt	R, $q_low
+ 
+ $q_low_ret:
+ 	ldq	t4, 8(sp)
+ 	ldq	t0, 16(sp)
+ 	ldq	t1, 24(sp)
+ 	ldq	t2, 32(sp)
+ 
+ 	ldq	t3, 40(sp)
+ 	lda	sp, FRAME(sp)
+ 	cfi_remember_state
+ 	cfi_restore (t0)
+ 	cfi_restore (t1)
+ 	cfi_restore (t2)
+ 	cfi_restore (t3)
+ 	cfi_restore (t4)
+ 	cfi_def_cfa_offset (0)
+ 	ret	$31, (RA), 1
+ 
+ 	.align	4
+ 	cfi_restore_state
+ 	/* The quotient that we computed was too large.  We need to reduce
+ 	   it by S such that Y*S >= R.  Obviously the closer we get to the
+ 	   correct value the better, but overshooting high is ok, as we'll
+ 	   fix that up later.  */
+ 0:
+ 	addq	SY, SY, SY
+ 	addq	S, S, S
+ $q_high:
+ 	cmpult	SY, R, AT
+ 	bne	AT, 0b
+ 
+ 	subq	Q, S, Q
+ 	unop
+ 	subq	QY, SY, QY
+ 	br	$q_high_ret
+ 
+ 	.align	4
+ 	/* The quotient that we computed was too small.  Divide Y by the 
+ 	   current remainder (R) and add that to the existing quotient (Q).
+ 	   The expectation, of course, is that R is much smaller than X.  */
+ 	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
+ 	   already have a copy of Y in SY and the value 1 in S.  */
+ 0:
+ 	addq	SY, SY, SY
+ 	addq	S, S, S
+ $q_low:
+ 	cmpult	SY, R, AT
+ 	bne	AT, 0b
+ 
+ 	/* Shift-down and subtract loop.  Each iteration compares our scaled
+ 	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ 	   Y's scalar (S) so add it to the quotient (Q).  */
+ 2:	addq	Q, S, t3
+ 	srl	S, 1, S
+ 	cmpule	SY, R, AT
+ 	subq	R, SY, t4
+ 
+ 	cmovne	AT, t3, Q
+ 	cmovne	AT, t4, R
+ 	srl	SY, 1, SY
+ 	bne	S, 2b
+ 
+ 	br	$q_low_ret
+ 
+ 	.align	4
+ 	cfi_restore_state
+ $y_is_neg:
+ 	/* If we get here, Y is so big that bit 63 is set.  The results
+ 	   from the divide will be completely wrong.  Fortunately, the
+ 	   quotient must be either 0 or 1, so just compute it directly.  */
+ 	cmpult	Y, X, RV
+ 	ldt	$f0, 0(sp)
+ 	lda	sp, FRAME(sp)
+ 	cfi_restore ($f0)
+ 	cfi_def_cfa_offset (0)
+ 	ret	$31, (RA), 1
+ 
+ 	cfi_endproc
+ 	.size	__divqu, .-__divqu
+ 
+ 	DO_DIVBYZERO
Index: sysdeps/alpha/divrem.h
===================================================================
RCS file: sysdeps/alpha/divrem.h
diff -N sysdeps/alpha/divrem.h
*** sysdeps/alpha/divrem.h	15 Jun 2002 20:53:37 -0000	1.9
--- /dev/null	1 Jan 1970 00:00:00 -0000
***************
*** 1,225 ****
- /* Copyright (C) 1996,97,2002 Free Software Foundation, Inc.
-    Contributed by David Mosberger (davidm@cs.arizona.edu).
-    This file is part of the GNU C Library.
- 
-    The GNU C Library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
- 
-    The GNU C Library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
- 
-    You should have received a copy of the GNU Lesser General Public
-    License along with the GNU C Library; if not, write to the Free
-    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-    02111-1307 USA.  */
- 
- /* The current Alpha chips don't provide hardware for integer
-    division.  The C compiler expects the functions
- 
- 	__divqu: 64-bit unsigned long divide
- 	__remqu: 64-bit unsigned long remainder
- 	__divqs/__remqs: signed 64-bit
- 	__divlu/__remlu: unsigned 32-bit
- 	__divls/__remls: signed 32-bit
- 
-    These are not normal C functions: instead of the normal calling
-    sequence, these expect their arguments in registers t10 and t11, and
-    return the result in t12 (aka pv).  Register AT may be clobbered
-    (assembly temporary), anything else must be saved.  */
- 
- #include <sysdep.h>
- 
- #ifdef __linux__
- # include <asm/gentrap.h>
- # include <asm/pal.h>
- #else
- # include <machine/pal.h>
- #endif
- 
- #define mask			v0
- #define divisor			t0
- #define compare			AT
- #define tmp1			t2
- #define tmp2			t3
- #define retaddr			t9
- #define arg1			t10
- #define arg2			t11
- #define result			t12
- 
- #if IS_REM
- # define DIV_ONLY(x,y...)
- # define REM_ONLY(x,y...)	x,##y
- # define modulus		result
- # define quotient		t1
- # define GETSIGN(x)		mov arg1, x
- # define STACK			32
- #else
- # define DIV_ONLY(x,y...)	x,##y
- # define REM_ONLY(x,y...)
- # define modulus		t1
- # define quotient		result
- # define GETSIGN(x)		xor arg1, arg2, x
- # define STACK			48
- #endif
- 
- #if SIZE == 8
- # define LONGIFY(x,y)		mov x,y
- # define SLONGIFY(x,y)		mov x,y
- # define _SLONGIFY(x)
- # define NEG(x,y)		negq x,y
- #else
- # define LONGIFY(x,y)		zapnot x,15,y
- # define SLONGIFY(x,y)		sextl x,y
- # define _SLONGIFY(x)		sextl x,x
- # define NEG(x,y)		negl x,y
- #endif
- 
- 	.set noreorder
- 	.set noat
- 
- 	.ent UFUNC_NAME
- 	.globl UFUNC_NAME
- 
- 	.align 3
- UFUNC_NAME:
- $udiv_entry:
- 	lda	sp, -STACK(sp)
- 	.frame	sp, STACK, retaddr, 0
- #ifdef PROF
- 	stq	ra, 0(sp)
- 	stq	pv, 8(sp)
- 	stq	gp, 16(sp)
- 
- 	br	AT, 1f
- 1:	ldgp	gp, 0(AT)
- 
- 	mov	retaddr, ra
- 	lda	AT, _mcount
- 	jsr	AT, (AT), _mcount
- 
- 	ldq	ra, 0(sp)
- 	ldq	pv, 8(sp)
- 	ldq	gp, 16(sp)
- #endif
- 	.prologue 0
- 
- $udiv:
- 	stq	t0, 0(sp)
- 	LONGIFY	(arg2, divisor)
- 	stq	t1, 8(sp)
- 	LONGIFY	(arg1, modulus)
- 	stq	v0, 16(sp)
- 	clr	quotient
- 	stq	tmp1, 24(sp)
- 	ldiq	mask, 1
- 	DIV_ONLY(stq tmp2,32(sp))
- 
- 	beq	divisor, $divbyzero
- 
- 	.align 3
- #if SIZE == 8
- 	/* Shift divisor left.  */
- 1:	cmpult	divisor, modulus, compare
- 	blt	divisor, 2f
- 	addq	divisor, divisor, divisor
- 	addq	mask, mask, mask
- 	bne	compare, 1b
- 	unop
- 2:
- #else
- 	/* Shift divisor left using 3-bit shifts as we can't overflow.
- 	   This results in looping three times less here, but up to
- 	   two more times later.  Thus using a large shift isn't worth it.  */
- 1:	cmpult	divisor, modulus, compare
- 	s8addq	divisor, zero, divisor
- 	s8addq	mask, zero, mask
- 	bne	compare, 1b
- #endif
- 
- 	/* Now go back to the right.  */
- 3:	DIV_ONLY(addq quotient, mask, tmp2)
- 	srl	mask, 1, mask
- 	cmpule	divisor, modulus, compare
- 	subq	modulus, divisor, tmp1
- 	DIV_ONLY(cmovne compare, tmp2, quotient)
- 	srl	divisor, 1, divisor
- 	cmovne	compare, tmp1, modulus
- 	bne	mask, 3b
- 
- $done:	ldq	t0, 0(sp)
- 	ldq	t1, 8(sp)
- 	ldq	v0, 16(sp)
- 	ldq	tmp1, 24(sp)
- 	DIV_ONLY(ldq tmp2, 32(sp))
- 	lda	sp, STACK(sp)
- 	ret	zero, (retaddr), 1
- 
- $divbyzero:
- 	mov	a0, tmp1
- 	ldiq	a0, GEN_INTDIV
- 	call_pal PAL_gentrap
- 	mov	tmp1, a0
- 	clr	result			/* If trap returns, return zero.  */
- 	br	$done
- 
- 	.end UFUNC_NAME
- 
- 	.ent SFUNC_NAME
- 	.globl SFUNC_NAME
- 
- 	.align 3
- SFUNC_NAME:
- 	lda	sp, -STACK(sp)
- 	.frame	sp, STACK, retaddr, 0
- #ifdef PROF
- 	stq	ra, 0(sp)
- 	stq	pv, 8(sp)
- 	stq	gp, 16(sp)
- 
- 	br	AT, 1f
- 1:	ldgp	gp, 0(AT)
- 
- 	mov	retaddr, ra
- 	jsr	AT, _mcount
- 
- 	ldq	ra, 0(sp)
- 	ldq	pv, 8(sp)
- 	ldq	gp, 16(sp)
- #endif
- 	.prologue 0
- 
- 	or	arg1, arg2, AT
- 	_SLONGIFY(AT)
- 	bge	AT, $udiv		/* don't need to mess with signs */
- 
- 	/* Save originals and find absolute values.  */
- 	stq	arg1, 0(sp)
- 	NEG	(arg1, AT)
- 	stq	arg2, 8(sp)
- 	cmovge	AT, AT, arg1
- 	stq	retaddr, 16(sp)
- 	NEG	(arg2, AT)
- 	stq	tmp1, 24(sp)
- 	cmovge	AT, AT, arg2
- 
- 	/* Do the unsigned division.  */
- 	bsr	retaddr, $udiv_entry
- 
- 	/* Restore originals and adjust the sign of the result.  */
- 	ldq	arg1, 0(sp)
- 	ldq	arg2, 8(sp)
- 	GETSIGN	(AT)
- 	NEG	(result, tmp1)
- 	_SLONGIFY(AT)
- 	ldq	retaddr, 16(sp)
- 	cmovlt	AT, tmp1, result
- 	ldq	tmp1, 24(sp)
- 
- 	lda	sp, STACK(sp)
- 	ret	zero, (retaddr), 1
- 
- 	.end	SFUNC_NAME
--- 0 ----
Index: sysdeps/alpha/reml.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/alpha/reml.S,v
retrieving revision 1.5
diff -c -p -d -r1.5 reml.S
*** sysdeps/alpha/reml.S	6 Nov 1996 04:24:00 -0000	1.5
--- sysdeps/alpha/reml.S	26 Mar 2004 23:44:04 -0000
***************
*** 1,6 ****
! #define IS_REM		1
! #define SIZE		4
! #define UFUNC_NAME	__remlu
! #define SFUNC_NAME	__reml
  
! #include "divrem.h"
--- 1,80 ----
! /* Copyright (C) 2004 Free Software Foundation, Inc.
!    Contributed by Richard Henderson  <rth@twiddle.net>
!    This file is part of the GNU C Library.
  
!    The GNU C Library is free software; you can redistribute it and/or
!    modify it under the terms of the GNU Lesser General Public
!    License as published by the Free Software Foundation; either
!    version 2.1 of the License, or (at your option) any later version.
! 
!    The GNU C Library is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
!    Lesser General Public License for more details.
! 
!    You should have received a copy of the GNU Lesser General Public
!    License along with the GNU C Library; if not, write to the Free
!    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
!    02111-1307 USA.  */
! 
! #include "div_libc.h"
! 
! /* 32-bit signed int remainder.  This is not a normal C function.  Argument
!    registers are t10 and t11, the result goes in t12.  Only t12 and AT may
!    be clobbered.
! 
!    The FPU can handle the division for all input values except zero.
!    All we have to do is compute the remainder via multiply-and-subtract.  */
! 
! #ifndef EXTEND
! #define EXTEND(S,D)	sextl S, D
! #endif
! 
! 	.text
! 	.align	4
! 	.globl	__reml
! 	.type	__reml, @function
! 	.usepv	__reml, no
! 
! 	cfi_startproc
! 	cfi_return_column (RA)
! __reml:
! 	lda	sp, -FRAME(sp)
! 	cfi_def_cfa_offset (FRAME)
! 	CALL_MCOUNT
! 	stt	$f0, 0(sp)
! 	stt	$f1, 8(sp)
! 	beq	Y, DIVBYZERO
! 	cfi_rel_offset ($f0, 0)
! 	cfi_rel_offset ($f1, 8)
! 
! 	EXTEND	(X, RV)
! 	EXTEND	(Y, AT)
! 	stq	RV, 16(sp)
! 	stq	AT, 24(sp)
! 
! 	ldt	$f0, 16(sp)
! 	ldt	$f1, 24(sp)
! 	cvtqt	$f0, $f0
! 	cvtqt	$f1, $f1
! 
! 	divt/c	$f0, $f1, $f0
! 	cvttq/c	$f0, $f0
! 	stt	$f0, 16(sp)
! 	ldq	RV, 16(sp)
! 
! 	ldt	$f0, 0(sp)
! 	mull	RV, Y, RV
! 	ldt	$f1, 8(sp)
! 	lda	sp, FRAME(sp)
! 	cfi_restore ($f0)
! 	cfi_restore ($f1)
! 	cfi_def_cfa_offset (0)
! 
! 	subl	X, RV, RV
! 	ret	$31, (RA), 1
! 
! 	cfi_endproc
! 	.size	__reml, .-__reml
! 
! 	DO_DIVBYZERO
Index: sysdeps/alpha/remlu.S
===================================================================
RCS file: sysdeps/alpha/remlu.S
diff -N sysdeps/alpha/remlu.S
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- sysdeps/alpha/remlu.S	26 Mar 2004 23:44:04 -0000
***************
*** 0 ****
--- 1,4 ----
+ #define UNSIGNED
+ #define EXTEND(S,D)	zapnot S, 15, D
+ #define __reml		__remlu
+ #include <reml.S>
Index: sysdeps/alpha/remq.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/alpha/remq.S,v
retrieving revision 1.5
diff -c -p -d -r1.5 remq.S
*** sysdeps/alpha/remq.S	6 Nov 1996 04:24:01 -0000	1.5
--- sysdeps/alpha/remq.S	26 Mar 2004 23:44:04 -0000
***************
*** 1,6 ****
! #define IS_REM		1
! #define SIZE		8
! #define UFUNC_NAME	__remqu
! #define SFUNC_NAME	__remq
  
! #include "divrem.h"
--- 1,261 ----
! /* Copyright (C) 2004 Free Software Foundation, Inc.
!    This file is part of the GNU C Library.
  
!    The GNU C Library is free software; you can redistribute it and/or
!    modify it under the terms of the GNU Lesser General Public
!    License as published by the Free Software Foundation; either
!    version 2.1 of the License, or (at your option) any later version.
! 
!    The GNU C Library is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
!    Lesser General Public License for more details.
! 
!    You should have received a copy of the GNU Lesser General Public
!    License along with the GNU C Library; if not, write to the Free
!    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
!    02111-1307 USA.  */
! 
! #include "div_libc.h"
! 
! 
! /* 64-bit signed long remainder.  These are not normal C functions.  Argument
!    registers are t10 and t11, the result goes in t12.  Only t12 and AT may
!    be clobbered.
! 
!    Theory of operation here is that we can use the FPU divider for virtually
!    all operands that we see: all dividend values between -2**53 and 2**53-1
!    can be computed directly.  Note that divisor values need not be checked
!    against that range because the rounded fp value will be close enough such
!    that the quotient is < 1, which will properly be truncated to zero when we
!    convert back to integer.
! 
!    When the dividend is outside the range for which we can compute exact
!    results, we use the fp quotient as an estimate from which we begin refining
!    an exact integral value.  This reduces the number of iterations in the
!    shift-and-subtract loop significantly.  */
! 
! 	.text
! 	.align	4
! 	.globl	__remq
! 	.type	__remq, @function
! 	.usepv	__remq, no
! 
! 	cfi_startproc
! 	cfi_return_column (RA)
! __remq:
! 	lda	sp, -FRAME(sp)
! 	cfi_def_cfa_offset (FRAME)
! 	CALL_MCOUNT
! 
! 	/* Get the fp divide insn issued as quickly as possible.  After
! 	   that's done, we have at least 22 cycles until its results are
! 	   ready -- all the time in the world to figure out how we're
! 	   going to use the results.  */
! 	stq	X, 16(sp)
! 	stq	Y, 24(sp)
! 	beq	Y, DIVBYZERO
! 
! 	stt	$f0, 0(sp)
! 	stt	$f1, 8(sp)
! 	cfi_rel_offset ($f0, 0)
! 	cfi_rel_offset ($f1, 8)
! 	ldt	$f0, 16(sp)
! 	ldt	$f1, 24(sp)
! 
! 	cvtqt	$f0, $f0
! 	cvtqt	$f1, $f1
! 	divt/c	$f0, $f1, $f0
! 
! 	/* Check to see if X fit in the double as an exact value.  */
! 	sll	X, (64-53), AT
! 	ldt	$f1, 8(sp)
! 	sra	AT, (64-53), AT
! 	cmpeq	X, AT, AT
! 	beq	AT, $x_big
! 
! 	/* If we get here, we're expecting exact results from the division.
! 	   Do nothing else besides convert, compute remainder, clean up.  */
! 	cvttq/c	$f0, $f0
! 	stt	$f0, 16(sp)
! 
! 	ldq	AT, 16(sp)
! 	mulq	AT, Y, AT
! 	ldt	$f0, 0(sp)
! 	cfi_restore ($f1)
! 	cfi_remember_state
! 	cfi_restore ($f0)
! 	cfi_def_cfa_offset (0)
! 	lda	sp, FRAME(sp)
! 
! 	subq	X, AT, RV
! 	ret	$31, (RA), 1
! 
! 	.align	4
! 	cfi_restore_state
! $x_big:
! 	/* If we get here, X is large enough that we don't expect exact
! 	   results, and neither X nor Y got mis-translated for the fp
! 	   division.  Our task is to take the fp result, figure out how
! 	   far it's off from the correct result and compute a fixup.  */
! 	stq	t0, 16(sp)
! 	stq	t1, 24(sp)
! 	stq	t2, 32(sp)
! 	stq	t5, 40(sp)
! 	cfi_rel_offset (t0, 16)
! 	cfi_rel_offset (t1, 24)
! 	cfi_rel_offset (t2, 32)
! 	cfi_rel_offset (t5, 40)
! 
! #define Q	t0		/* quotient */
! #define R	RV		/* remainder */
! #define SY	t1		/* scaled Y */
! #define S	t2		/* scalar */
! #define QY	t3		/* Q*Y */
! 
! 	/* The fixup code below can only handle unsigned values.  */
! 	or	X, Y, AT
! 	mov	$31, t5
! 	blt	AT, $fix_sign_in
! $fix_sign_in_ret1:
! 	cvttq/c	$f0, $f0
! 
! 	stt	$f0, 8(sp)
! 	ldq	Q, 8(sp)
! $fix_sign_in_ret2:
! 	mulq	Q, Y, QY
! 	stq	t4, 8(sp)
! 
! 	ldt	$f0, 0(sp)
! 	unop
! 	cfi_rel_offset (t4, 8)
! 	cfi_restore ($f0)
! 	stq	t3, 0(sp)
! 	unop
! 	cfi_rel_offset (t3, 0)
! 
! 	subq	QY, X, R
! 	mov	Y, SY
! 	mov	1, S
! 	bgt	R, $q_high
! 
! $q_high_ret:
! 	subq	X, QY, R
! 	mov	Y, SY
! 	mov	1, S
! 	bgt	R, $q_low
! 
! $q_low_ret:
! 	ldq	t0, 16(sp)
! 	ldq	t1, 24(sp)
! 	ldq	t2, 32(sp)
! 	bne	t5, $fix_sign_out
! 
! $fix_sign_out_ret:
! 	ldq	t3, 0(sp)
! 	ldq	t4, 8(sp)
! 	ldq	t5, 40(sp)
! 	lda	sp, FRAME(sp)
! 	cfi_remember_state
! 	cfi_restore (t0)
! 	cfi_restore (t1)
! 	cfi_restore (t2)
! 	cfi_restore (t3)
! 	cfi_restore (t4)
! 	cfi_restore (t5)
! 	cfi_def_cfa_offset (0)
! 	ret	$31, (RA), 1
! 
! 	.align	4
! 	cfi_restore_state
! 	/* The quotient that we computed was too large.  We need to reduce
! 	   it by S such that Y*S >= R.  Obviously the closer we get to the
! 	   correct value the better, but overshooting high is ok, as we'll
! 	   fix that up later.  */
! 0:
! 	addq	SY, SY, SY
! 	addq	S, S, S
! $q_high:
! 	cmpult	SY, R, AT
! 	bne	AT, 0b
! 
! 	subq	Q, S, Q
! 	unop
! 	subq	QY, SY, QY
! 	br	$q_high_ret
! 
! 	.align	4
! 	/* The quotient that we computed was too small.  Divide the current
! 	   remainder (R) by Y and add that to the existing quotient (Q).
! 	   The expectation, of course, is that R is much smaller than X.  */
! 	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
! 	   already have a copy of Y in SY and the value 1 in S.  */
! 0:
! 	addq	SY, SY, SY
! 	addq	S, S, S
! $q_low:
! 	cmpult	SY, R, AT
! 	bne	AT, 0b
! 
! 	/* Shift-down and subtract loop.  Each iteration compares our scaled
! 	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
! 	   Y's scalar (S) so add it to the quotient (Q).  */
! 2:	addq	Q, S, t3
! 	srl	S, 1, S
! 	cmpule	SY, R, AT
! 	subq	R, SY, t4
! 
! 	cmovne	AT, t3, Q
! 	cmovne	AT, t4, R
! 	srl	SY, 1, SY
! 	bne	S, 2b
! 
! 	br	$q_low_ret
! 
! 	.align	4
! $fix_sign_in:
! 	/* If we got here, then X|Y is negative.  Need to adjust everything
! 	   such that we're doing unsigned division in the fixup loop.  */
! 	/* T5 records the changes we had to make:
! 		bit 0:	set if X was negated.  Note that the sign of the
! 			remainder follows the sign of the divisor.
! 		bit 2:	set if Y was negated.
! 	*/
! 	xor	X, Y, t1
! 	cmplt	X, 0, t5
! 	negq	X, t0
! 	cmovne	t5, t0, X
! 
! 	cmplt	Y, 0, AT
! 	negq	Y, t0
! 	s4addq	AT, t5, t5
! 	cmovne	AT, t0, Y
! 
! 	bge	t1, $fix_sign_in_ret1
! 	cvttq/c	$f0, $f0
! 	stt	$f0, 8(sp)
! 	ldq	Q, 8(sp)
! 
! 	negq	Q, Q
! 	br	$fix_sign_in_ret2
! 
! 	.align	4
! $fix_sign_out:
! 	/* Now we get to undo what we did above.  */
! 	/* ??? Is this really faster than just increasing the size of
! 	   the stack frame and storing X and Y in memory?  */
! 	and	t5, 4, AT
! 	negq	Y, t4
! 	cmovne	AT, t4, Y
! 
! 	negq	X, t4
! 	cmovlbs	t5, t4, X
! 	negq	RV, t4
! 	cmovlbs	t5, t4, RV
! 
! 	br	$fix_sign_out_ret
! 
! 	cfi_endproc
! 	.size	__remq, .-__remq
! 
! 	DO_DIVBYZERO
Index: sysdeps/alpha/remqu.S
===================================================================
RCS file: sysdeps/alpha/remqu.S
diff -N sysdeps/alpha/remqu.S
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- sysdeps/alpha/remqu.S	26 Mar 2004 23:44:05 -0000
***************
*** 0 ****
--- 1,251 ----
+ /* Copyright (C) 2004 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+ 
+    The GNU C Library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+ 
+    You should have received a copy of the GNU Lesser General Public
+    License along with the GNU C Library; if not, write to the Free
+    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+    02111-1307 USA.  */
+ 
+ #include "div_libc.h"
+ 
+ 
+ /* 64-bit unsigned long remainder.  These are not normal C functions.  Argument
+    registers are t10 and t11, the result goes in t12.  Only t12 and AT may be
+    clobbered.
+ 
+    Theory of operation here is that we can use the FPU divider for virtually
+    all operands that we see: all dividend values between -2**53 and 2**53-1
+    can be computed directly.  Note that divisor values need not be checked
+    against that range because the rounded fp value will be close enough such
+    that the quotient is < 1, which will properly be truncated to zero when we
+    convert back to integer.
+ 
+    When the dividend is outside the range for which we can compute exact
+    results, we use the fp quotient as an estimate from which we begin refining
+    an exact integral value.  This reduces the number of iterations in the
+    shift-and-subtract loop significantly.  */
+ 
+ 	.text
+ 	.align	4
+ 	.globl	__remqu
+ 	.type	__remqu, @function
+ 	.usepv	__remqu, no
+ 
+ 	cfi_startproc
+ 	cfi_return_column (RA)
+ __remqu:
+ 	lda	sp, -FRAME(sp)
+ 	cfi_def_cfa_offset (FRAME)
+ 	CALL_MCOUNT
+ 
+ 	/* Get the fp divide insn issued as quickly as possible.  After
+ 	   that's done, we have at least 22 cycles until its results are
+ 	   ready -- all the time in the world to figure out how we're
+ 	   going to use the results.  */
+ 	stq	X, 16(sp)
+ 	stq	Y, 24(sp)
+ 	beq	Y, DIVBYZERO
+ 
+ 	stt	$f0, 0(sp)
+ 	stt	$f1, 8(sp)
+ 	cfi_rel_offset ($f0, 0)
+ 	cfi_rel_offset ($f1, 8)
+ 	ldt	$f0, 16(sp)
+ 	ldt	$f1, 24(sp)
+ 
+ 	cvtqt	$f0, $f0
+ 	cvtqt	$f1, $f1
+ 	blt	X, $x_is_neg
+ 	divt/c	$f0, $f1, $f0
+ 
+ 	/* Check to see if Y was mis-converted as signed value.  */
+ 	ldt	$f1, 8(sp)
+ 	unop
+ 	nop
+ 	blt	Y, $y_is_neg
+ 
+ 	/* Check to see if X fit in the double as an exact value.  */
+ 	srl	X, 53, AT
+ 	bne	AT, $x_big
+ 
+ 	/* If we get here, we're expecting exact results from the division.
+ 	   Do nothing else besides convert, compute remainder, clean up.  */
+ 	cvttq/c	$f0, $f0
+ 	stt	$f0, 16(sp)
+ 
+ 	ldq	AT, 16(sp)
+ 	mulq	AT, Y, AT
+ 	ldt	$f0, 0(sp)
+ 	lda	sp, FRAME(sp)
+ 	cfi_remember_state
+ 	cfi_restore ($f0)
+ 	cfi_restore ($f1)
+ 	cfi_def_cfa_offset (0)
+ 
+ 	subq	X, AT, RV
+ 	ret	$31, (RA), 1
+ 
+ 	.align	4
+ 	cfi_restore_state
+ $x_is_neg:
+ 	/* If we get here, X is so big that bit 63 is set, which made the
+ 	   conversion come out negative.  Fix it up lest we not even get
+ 	   a good estimate.  */
+ 	ldah	AT, 0x5f80		/* 2**64 as float.  */
+ 	stt	$f2, 24(sp)
+ 	cfi_rel_offset ($f2, 24)
+ 	stl	AT, 16(sp)
+ 	lds	$f2, 16(sp)
+ 
+ 	addt	$f0, $f2, $f0
+ 	unop
+ 	divt/c	$f0, $f1, $f0
+ 	unop
+ 
+ 	/* Ok, we've now got the divide issued.  Continue with other checks.  */
+ 	ldt	$f1, 8(sp)
+ 	unop
+ 	ldt	$f2, 24(sp)
+ 	blt	Y, $y_is_neg
+ 	cfi_restore ($f1)
+ 	cfi_restore ($f2)
+ 	cfi_remember_state	/* for y_is_neg */
+ 
+ 	.align	4
+ $x_big:
+ 	/* If we get here, X is large enough that we don't expect exact
+ 	   results, and neither X nor Y got mis-translated for the fp
+ 	   division.  Our task is to take the fp result, figure out how
+ 	   far it's off from the correct result and compute a fixup.  */
+ 	stq	t0, 16(sp)
+ 	stq	t1, 24(sp)
+ 	stq	t2, 32(sp)
+ 	stq	t3, 40(sp)
+ 	cfi_rel_offset (t0, 16)
+ 	cfi_rel_offset (t1, 24)
+ 	cfi_rel_offset (t2, 32)
+ 	cfi_rel_offset (t3, 40)
+ 
+ #define Q	t0		/* quotient */
+ #define R	RV		/* remainder */
+ #define SY	t1		/* scaled Y */
+ #define S	t2		/* scalar */
+ #define QY	t3		/* Q*Y */
+ 
+ 	cvttq/c	$f0, $f0
+ 	stt	$f0, 8(sp)
+ 	ldq	Q, 8(sp)
+ 	mulq	Q, Y, QY
+ 
+ 	stq	t4, 8(sp)
+ 	unop
+ 	ldt	$f0, 0(sp)
+ 	unop
+ 	cfi_rel_offset (t4, 8)
+ 	cfi_restore ($f0)
+ 
+ 	subq	QY, X, R
+ 	mov	Y, SY
+ 	mov	1, S
+ 	bgt	R, $q_high
+ 
+ $q_high_ret:
+ 	subq	X, QY, R
+ 	mov	Y, SY
+ 	mov	1, S
+ 	bgt	R, $q_low
+ 
+ $q_low_ret:
+ 	ldq	t4, 8(sp)
+ 	ldq	t0, 16(sp)
+ 	ldq	t1, 24(sp)
+ 	ldq	t2, 32(sp)
+ 
+ 	ldq	t3, 40(sp)
+ 	lda	sp, FRAME(sp)
+ 	cfi_remember_state
+ 	cfi_restore (t0)
+ 	cfi_restore (t1)
+ 	cfi_restore (t2)
+ 	cfi_restore (t3)
+ 	cfi_restore (t4)
+ 	cfi_def_cfa_offset (0)
+ 	ret	$31, (RA), 1
+ 
+ 	.align	4
+ 	cfi_restore_state
+ 	/* The quotient that we computed was too large.  We need to reduce
+ 	   it by S such that Y*S >= R.  Obviously the closer we get to the
+ 	   correct value the better, but overshooting high is ok, as we'll
+ 	   fix that up later.  */
+ 0:
+ 	addq	SY, SY, SY
+ 	addq	S, S, S
+ $q_high:
+ 	cmpult	SY, R, AT
+ 	bne	AT, 0b
+ 
+ 	subq	Q, S, Q
+ 	unop
+ 	subq	QY, SY, QY
+ 	br	$q_high_ret
+ 
+ 	.align	4
+ 	/* The quotient that we computed was too small.  Divide the current
+ 	   remainder (R) by Y and add that to the existing quotient (Q).
+ 	   The expectation, of course, is that R is much smaller than X.  */
+ 	/* Begin with a shift-up loop.  Compute S such that Y*S >= R.  We
+ 	   already have a copy of Y in SY and the value 1 in S.  */
+ 0:
+ 	addq	SY, SY, SY
+ 	addq	S, S, S
+ $q_low:
+ 	cmpult	SY, R, AT
+ 	bne	AT, 0b
+ 
+ 	/* Shift-down and subtract loop.  Each iteration compares our scaled
+ 	   Y (SY) with the remainder (R); if SY <= R then X is divisible by
+ 	   Y's scalar (S) so add it to the quotient (Q).  */
+ 2:	addq	Q, S, t3
+ 	srl	S, 1, S
+ 	cmpule	SY, R, AT
+ 	subq	R, SY, t4
+ 
+ 	cmovne	AT, t3, Q
+ 	cmovne	AT, t4, R
+ 	srl	SY, 1, SY
+ 	bne	S, 2b
+ 
+ 	br	$q_low_ret
+ 
+ 	.align	4
+ 	cfi_restore_state
+ $y_is_neg:
+ 	/* If we get here, Y is so big that bit 63 is set.  The results
+ 	   from the divide will be completely wrong.  Fortunately, the
+ 	   quotient must be either 0 or 1, so the remainder must be X
+ 	   or X-Y, so just compute it directly.  */
+ 	cmpult	Y, X, AT
+ 	subq	X, Y, RV
+ 	ldt	$f0, 0(sp)
+ 	cmoveq	AT, X, RV
+ 
+ 	lda	sp, FRAME(sp)
+ 	cfi_restore ($f0)
+ 	cfi_def_cfa_offset (0)
+ 	ret	$31, (RA), 1
+ 
+ 	cfi_endproc
+ 	.size	__remqu, .-__remqu
+ 
+ 	DO_DIVBYZERO


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]