This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


GNU C Library master sources branch azanella/ppc-little-endian created. glibc-2.18-138-gca207b1


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, azanella/ppc-little-endian has been created
        at  ca207b1888d76d70dd1e1c87197871d6aeb0b116 (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ca207b1888d76d70dd1e1c87197871d6aeb0b116

commit ca207b1888d76d70dd1e1c87197871d6aeb0b116
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 9 16:35:26 2013 -0500

    powerpc: LE configure patch
    
    This adds the basic configury bits for powerpc64le and powerpcle.  I
    didn't see a way of adding *.abilist files for powerpc64le that differ
    from the powerpc64 versions without setting up a whole directory
    structure for powerpc64le.
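
Most of the new files in this change are one-line sysdeps Implies files: each
names the directory the sysdeps search falls back to when a file is not found
under the new little-endian directory, so the powerpc64le and powerpc32le
configurations initially reuse the existing powerpc64 and powerpc32 sources.
The bash sketch below is only illustrative: find_sysdep_file is a hypothetical
helper, and glibc's real directory search is performed by configure when it
builds the sysdeps directory list.

    # Hypothetical helper approximating how a one-line Implies file redirects
    # the sysdeps search to a fallback directory (illustrative only).
    find_sysdep_file () {
        local dir=$1 file=$2
        if [ -f "sysdeps/$dir/$file" ]; then
            echo "sysdeps/$dir/$file"
            return 0
        fi
        if [ -f "sysdeps/$dir/Implies" ]; then
            # Try each directory listed in the Implies file, in order.
            while read -r implied; do
                find_sysdep_file "$implied" "$file" && return 0
            done < "sysdeps/$dir/Implies"
        fi
        return 1
    }

    # sysdeps/powerpc/powerpc64le/Implies contains "powerpc/powerpc64", so a
    # lookup that misses under powerpc64le falls back to the powerpc64 code,
    # e.g. (run from the top of a glibc source tree):
    find_sysdep_file powerpc/powerpc64le memcpy.S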

diff --git a/configure b/configure
index 2122583..0a0dac6 100755
--- a/configure
+++ b/configure
@@ -3959,7 +3959,9 @@ fi
 test -n "$base_machine" || case "$machine" in
 i[4567]86)	base_machine=i386 machine=i386/$machine ;;
 powerpc)	base_machine=powerpc machine=powerpc/powerpc32 ;;
+powerpcle)	base_machine=powerpc machine=powerpc/powerpc32le ;;
 powerpc64)	base_machine=powerpc machine=powerpc/powerpc64 ;;
+powerpc64le)	base_machine=powerpc machine=powerpc/powerpc64le ;;
 s390)           base_machine=s390 machine=s390/s390-32 ;;
 s390x)          base_machine=s390 machine=s390/s390-64 ;;
 sh3*)		base_machine=sh machine=sh/sh3 ;;
diff --git a/configure.in b/configure.in
index be13308..8d6b5df 100644
--- a/configure.in
+++ b/configure.in
@@ -583,7 +583,9 @@ changequote(,)dnl
 test -n "$base_machine" || case "$machine" in
 i[4567]86)	base_machine=i386 machine=i386/$machine ;;
 powerpc)	base_machine=powerpc machine=powerpc/powerpc32 ;;
+powerpcle)	base_machine=powerpc machine=powerpc/powerpc32le ;;
 powerpc64)	base_machine=powerpc machine=powerpc/powerpc64 ;;
+powerpc64le)	base_machine=powerpc machine=powerpc/powerpc64le ;;
 s390)           base_machine=s390 machine=s390/s390-32 ;;
 s390x)          base_machine=s390 machine=s390/s390-64 ;;
 sh3*)		base_machine=sh machine=sh/sh3 ;;
diff --git a/nptl/shlib-versions b/nptl/shlib-versions
index e49e7ca..eca9514 100644
--- a/nptl/shlib-versions
+++ b/nptl/shlib-versions
@@ -2,4 +2,5 @@ sparc64.*-.*-linux.*	libpthread=0		GLIBC_2.2
 sh.*-.*-linux.*		libpthread=0		GLIBC_2.2
 s390x-.*-linux.*	libpthread=0		GLIBC_2.2
 powerpc64-.*-linux.*	libpthread=0		GLIBC_2.3
+powerpc.*le-.*-linux.*	libpthread=0		GLIBC_2.19
 .*-.*-linux.*		libpthread=0
diff --git a/shlib-versions b/shlib-versions
index 9344590..8cdf3db 100644
--- a/shlib-versions
+++ b/shlib-versions
@@ -23,6 +23,7 @@
 
 s390x-.*-linux.*        DEFAULT			GLIBC_2.2
 powerpc64-.*-linux.*	DEFAULT			GLIBC_2.3
+powerpc.*le-.*-linux.*	DEFAULT			GLIBC_2.19
 .*-.*-gnu-gnu.*		DEFAULT			GLIBC_2.2.6
 
 # Configuration		ABI			Identifier for ABI data files
diff --git a/sysdeps/powerpc/powerpc32le/970/Implies b/sysdeps/powerpc/powerpc32le/970/Implies
new file mode 100644
index 0000000..424fa51
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/970/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/970
diff --git a/sysdeps/powerpc/powerpc32le/Implies b/sysdeps/powerpc/powerpc32le/Implies
new file mode 100644
index 0000000..5193432
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32
diff --git a/sysdeps/powerpc/powerpc32le/a2/Implies b/sysdeps/powerpc/powerpc32le/a2/Implies
new file mode 100644
index 0000000..36d9aa0
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/a2/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/a2
diff --git a/sysdeps/powerpc/powerpc32le/bits/Implies b/sysdeps/powerpc/powerpc32le/bits/Implies
new file mode 100644
index 0000000..cfb97f7
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/bits/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/bits
diff --git a/sysdeps/powerpc/powerpc32le/cell/Implies b/sysdeps/powerpc/powerpc32le/cell/Implies
new file mode 100644
index 0000000..a172bad
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/cell/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/cell
diff --git a/sysdeps/powerpc/powerpc32le/fpu/Implies b/sysdeps/powerpc/powerpc32le/fpu/Implies
new file mode 100644
index 0000000..a6df48b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/fpu
diff --git a/sysdeps/powerpc/powerpc32le/power4/Implies b/sysdeps/powerpc/powerpc32le/power4/Implies
new file mode 100644
index 0000000..4e3a983
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power4/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power4
diff --git a/sysdeps/powerpc/powerpc32le/power4/fpu/Implies b/sysdeps/powerpc/powerpc32le/power4/fpu/Implies
new file mode 100644
index 0000000..7c16e45
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power4/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power4/fpu
diff --git a/sysdeps/powerpc/powerpc32le/power5+/Implies b/sysdeps/powerpc/powerpc32le/power5+/Implies
new file mode 100644
index 0000000..59fcd3c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power5+/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power5+
diff --git a/sysdeps/powerpc/powerpc32le/power5+/fpu/Implies b/sysdeps/powerpc/powerpc32le/power5+/fpu/Implies
new file mode 100644
index 0000000..aee4f62
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power5+/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power5+/fpu
diff --git a/sysdeps/powerpc/powerpc32le/power5/Implies b/sysdeps/powerpc/powerpc32le/power5/Implies
new file mode 100644
index 0000000..03899d8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power5/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power5
diff --git a/sysdeps/powerpc/powerpc32le/power5/fpu/Implies b/sysdeps/powerpc/powerpc32le/power5/fpu/Implies
new file mode 100644
index 0000000..819a7d7
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power5/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power5/fpu
diff --git a/sysdeps/powerpc/powerpc32le/power6/Implies b/sysdeps/powerpc/powerpc32le/power6/Implies
new file mode 100644
index 0000000..e56df13
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power6/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power6
diff --git a/sysdeps/powerpc/powerpc32le/power6/fpu/Implies b/sysdeps/powerpc/powerpc32le/power6/fpu/Implies
new file mode 100644
index 0000000..d53ce25
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power6/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power6/fpu
diff --git a/sysdeps/powerpc/powerpc32le/power6x/Implies b/sysdeps/powerpc/powerpc32le/power6x/Implies
new file mode 100644
index 0000000..f0edc67
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power6x/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power6x
diff --git a/sysdeps/powerpc/powerpc32le/power6x/fpu/Implies b/sysdeps/powerpc/powerpc32le/power6x/fpu/Implies
new file mode 100644
index 0000000..1c52ba6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power6x/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power6x/fpu
diff --git a/sysdeps/powerpc/powerpc32le/power7/Implies b/sysdeps/powerpc/powerpc32le/power7/Implies
new file mode 100644
index 0000000..30fa752
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power7/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power7
diff --git a/sysdeps/powerpc/powerpc32le/power7/fpu/Implies b/sysdeps/powerpc/powerpc32le/power7/fpu/Implies
new file mode 100644
index 0000000..969e424
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power7/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power7/fpu
diff --git a/sysdeps/powerpc/powerpc32le/power8/Implies b/sysdeps/powerpc/powerpc32le/power8/Implies
new file mode 100644
index 0000000..e3cf748
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32le/power8/Implies
@@ -0,0 +1 @@
+powerpc/powerpc32/power8
diff --git a/sysdeps/powerpc/powerpc64le/970/Implies b/sysdeps/powerpc/powerpc64le/970/Implies
new file mode 100644
index 0000000..b758aa6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/970/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/970
diff --git a/sysdeps/powerpc/powerpc64le/Implies b/sysdeps/powerpc/powerpc64le/Implies
new file mode 100644
index 0000000..a105a32
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64
diff --git a/sysdeps/powerpc/powerpc64le/a2/Implies b/sysdeps/powerpc/powerpc64le/a2/Implies
new file mode 100644
index 0000000..6c02123
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/a2/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/a2
diff --git a/sysdeps/powerpc/powerpc64le/bits/Implies b/sysdeps/powerpc/powerpc64le/bits/Implies
new file mode 100644
index 0000000..ec4465f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/bits/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/bits
diff --git a/sysdeps/powerpc/powerpc64le/cell/Implies b/sysdeps/powerpc/powerpc64le/cell/Implies
new file mode 100644
index 0000000..d6b89b1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/cell/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/cell
diff --git a/sysdeps/powerpc/powerpc64le/fpu/Implies b/sysdeps/powerpc/powerpc64le/fpu/Implies
new file mode 100644
index 0000000..c1f617b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/fpu
diff --git a/sysdeps/powerpc/powerpc64le/power4/Implies b/sysdeps/powerpc/powerpc64le/power4/Implies
new file mode 100644
index 0000000..ac431fa
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power4/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power4
diff --git a/sysdeps/powerpc/powerpc64le/power4/fpu/Implies b/sysdeps/powerpc/powerpc64le/power4/fpu/Implies
new file mode 100644
index 0000000..558a5fb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power4/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power4/fpu
diff --git a/sysdeps/powerpc/powerpc64le/power5+/Implies b/sysdeps/powerpc/powerpc64le/power5+/Implies
new file mode 100644
index 0000000..0f35f37
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power5+/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power5+
diff --git a/sysdeps/powerpc/powerpc64le/power5+/fpu/Implies b/sysdeps/powerpc/powerpc64le/power5+/fpu/Implies
new file mode 100644
index 0000000..f09854e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power5+/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power5+/fpu
diff --git a/sysdeps/powerpc/powerpc64le/power5/Implies b/sysdeps/powerpc/powerpc64le/power5/Implies
new file mode 100644
index 0000000..13b0330
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power5/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power5
diff --git a/sysdeps/powerpc/powerpc64le/power5/fpu/Implies b/sysdeps/powerpc/powerpc64le/power5/fpu/Implies
new file mode 100644
index 0000000..f00c50f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power5/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power5/fpu
diff --git a/sysdeps/powerpc/powerpc64le/power6/Implies b/sysdeps/powerpc/powerpc64le/power6/Implies
new file mode 100644
index 0000000..ccec64d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power6/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6
diff --git a/sysdeps/powerpc/powerpc64le/power6/fpu/Implies b/sysdeps/powerpc/powerpc64le/power6/fpu/Implies
new file mode 100644
index 0000000..30fa176
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power6/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6/fpu
diff --git a/sysdeps/powerpc/powerpc64le/power6x/Implies b/sysdeps/powerpc/powerpc64le/power6x/Implies
new file mode 100644
index 0000000..db8258e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power6x/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6x
diff --git a/sysdeps/powerpc/powerpc64le/power6x/fpu/Implies b/sysdeps/powerpc/powerpc64le/power6x/fpu/Implies
new file mode 100644
index 0000000..8e797d8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power6x/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6x/fpu
diff --git a/sysdeps/powerpc/powerpc64le/power7/Implies b/sysdeps/powerpc/powerpc64le/power7/Implies
new file mode 100644
index 0000000..eedef82
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power7/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power7
diff --git a/sysdeps/powerpc/powerpc64le/power7/fpu/Implies b/sysdeps/powerpc/powerpc64le/power7/fpu/Implies
new file mode 100644
index 0000000..8447198
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power7/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power7/fpu
diff --git a/sysdeps/powerpc/powerpc64le/power8/Implies b/sysdeps/powerpc/powerpc64le/power8/Implies
new file mode 100644
index 0000000..3c37351
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64le/power8/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power8
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/970/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/970/Implies
new file mode 100644
index 0000000..26418ec
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/970/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/970
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/Implies
new file mode 100644
index 0000000..b1b1153
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/a2/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/a2/Implies
new file mode 100644
index 0000000..8ec8c18
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/a2/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/a2
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/cell/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/cell/Implies
new file mode 100644
index 0000000..b65cad4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/cell/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/cell
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/cell/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/cell/fpu/Implies
new file mode 100644
index 0000000..b45b677
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/cell/fpu/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/cell/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/fpu/Implies
new file mode 100644
index 0000000..d2bb927
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/fpu/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/fpu/nptl/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/fpu/nptl/Implies
new file mode 100644
index 0000000..4c31f46
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/fpu/nptl/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/fpu/nptl
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power4/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power4/Implies
new file mode 100644
index 0000000..f9ccc69
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power4/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/power4
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power5+/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power5+/Implies
new file mode 100644
index 0000000..9f8529c
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power5+/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/power5+
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power5/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power5/Implies
new file mode 100644
index 0000000..93f855a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power5/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/power5
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power6/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power6/Implies
new file mode 100644
index 0000000..ec851f3
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power6/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/power6
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power6x/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power6x/Implies
new file mode 100644
index 0000000..1453b95
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power6x/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/power6x
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power7/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power7/Implies
new file mode 100644
index 0000000..03940d3
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power7/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/power7
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power8/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power8/Implies
new file mode 100644
index 0000000..604cb74
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32le/power8/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc32/power8
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/970/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/970/Implies
new file mode 100644
index 0000000..cb5a86c
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/970/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/970
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/Implies
new file mode 100644
index 0000000..4eacd3a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/a2/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/a2/Implies
new file mode 100644
index 0000000..d0a5cc0
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/a2/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/a2
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/cell/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/cell/Implies
new file mode 100644
index 0000000..10e64ee
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/cell/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/cell
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/cell/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/cell/fpu/Implies
new file mode 100644
index 0000000..85e4044
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/cell/fpu/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/cell/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/fpu/Implies
new file mode 100644
index 0000000..5da96db
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/fpu/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/Implies
new file mode 100644
index 0000000..b798c50
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/nptl
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/ld.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/ld.abilist
new file mode 100644
index 0000000..82cdd78
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/ld.abilist
@@ -0,0 +1,11 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ __libc_memalign F
+ __libc_stack_end D 0x8
+ __tls_get_addr F
+ _dl_mcount F
+ _r_debug D 0x28
+ calloc F
+ free F
+ malloc F
+ realloc F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libBrokenLocale.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libBrokenLocale.abilist
new file mode 100644
index 0000000..6b33d95
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libBrokenLocale.abilist
@@ -0,0 +1,3 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ __ctype_get_mb_cur_max F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libanl.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libanl.abilist
new file mode 100644
index 0000000..a3f9e1d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libanl.abilist
@@ -0,0 +1,6 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ gai_cancel F
+ gai_error F
+ gai_suspend F
+ getaddrinfo_a F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libc.abilist
new file mode 100644
index 0000000..e045136
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libc.abilist
@@ -0,0 +1,2178 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ _Exit F
+ _IO_2_1_stderr_ D 0xe0
+ _IO_2_1_stdin_ D 0xe0
+ _IO_2_1_stdout_ D 0xe0
+ _IO_adjust_column F
+ _IO_adjust_wcolumn F
+ _IO_default_doallocate F
+ _IO_default_finish F
+ _IO_default_pbackfail F
+ _IO_default_uflow F
+ _IO_default_xsgetn F
+ _IO_default_xsputn F
+ _IO_do_write F
+ _IO_doallocbuf F
+ _IO_fclose F
+ _IO_fdopen F
+ _IO_feof F
+ _IO_ferror F
+ _IO_fflush F
+ _IO_fgetpos F
+ _IO_fgetpos64 F
+ _IO_fgets F
+ _IO_file_attach F
+ _IO_file_close F
+ _IO_file_close_it F
+ _IO_file_doallocate F
+ _IO_file_finish F
+ _IO_file_fopen F
+ _IO_file_init F
+ _IO_file_jumps D 0xa8
+ _IO_file_open F
+ _IO_file_overflow F
+ _IO_file_read F
+ _IO_file_seek F
+ _IO_file_seekoff F
+ _IO_file_setbuf F
+ _IO_file_stat F
+ _IO_file_sync F
+ _IO_file_underflow F
+ _IO_file_write F
+ _IO_file_xsputn F
+ _IO_flockfile F
+ _IO_flush_all F
+ _IO_flush_all_linebuffered F
+ _IO_fopen F
+ _IO_fprintf F
+ _IO_fputs F
+ _IO_fread F
+ _IO_free_backup_area F
+ _IO_free_wbackup_area F
+ _IO_fsetpos F
+ _IO_fsetpos64 F
+ _IO_ftell F
+ _IO_ftrylockfile F
+ _IO_funlockfile F
+ _IO_fwrite F
+ _IO_getc F
+ _IO_getline F
+ _IO_getline_info F
+ _IO_gets F
+ _IO_init F
+ _IO_init_marker F
+ _IO_init_wmarker F
+ _IO_iter_begin F
+ _IO_iter_end F
+ _IO_iter_file F
+ _IO_iter_next F
+ _IO_least_wmarker F
+ _IO_link_in F
+ _IO_list_all D 0x8
+ _IO_list_lock F
+ _IO_list_resetlock F
+ _IO_list_unlock F
+ _IO_marker_delta F
+ _IO_marker_difference F
+ _IO_padn F
+ _IO_peekc_locked F
+ _IO_popen F
+ _IO_printf F
+ _IO_proc_close F
+ _IO_proc_open F
+ _IO_putc F
+ _IO_puts F
+ _IO_remove_marker F
+ _IO_seekmark F
+ _IO_seekoff F
+ _IO_seekpos F
+ _IO_seekwmark F
+ _IO_setb F
+ _IO_setbuffer F
+ _IO_setvbuf F
+ _IO_sgetn F
+ _IO_sprintf F
+ _IO_sputbackc F
+ _IO_sputbackwc F
+ _IO_sscanf F
+ _IO_str_init_readonly F
+ _IO_str_init_static F
+ _IO_str_overflow F
+ _IO_str_pbackfail F
+ _IO_str_seekoff F
+ _IO_str_underflow F
+ _IO_sungetc F
+ _IO_sungetwc F
+ _IO_switch_to_get_mode F
+ _IO_switch_to_main_wget_area F
+ _IO_switch_to_wbackup_area F
+ _IO_switch_to_wget_mode F
+ _IO_un_link F
+ _IO_ungetc F
+ _IO_unsave_markers F
+ _IO_unsave_wmarkers F
+ _IO_vfprintf F
+ _IO_vfscanf F
+ _IO_vsprintf F
+ _IO_wdefault_doallocate F
+ _IO_wdefault_finish F
+ _IO_wdefault_pbackfail F
+ _IO_wdefault_uflow F
+ _IO_wdefault_xsgetn F
+ _IO_wdefault_xsputn F
+ _IO_wdo_write F
+ _IO_wdoallocbuf F
+ _IO_wfile_jumps D 0xa8
+ _IO_wfile_overflow F
+ _IO_wfile_seekoff F
+ _IO_wfile_sync F
+ _IO_wfile_underflow F
+ _IO_wfile_xsputn F
+ _IO_wmarker_delta F
+ _IO_wsetb F
+ __adjtimex F
+ __after_morecore_hook D 0x8
+ __argz_count F
+ __argz_next F
+ __argz_stringify F
+ __asprintf F
+ __asprintf_chk F
+ __assert F
+ __assert_fail F
+ __assert_perror_fail F
+ __backtrace F
+ __backtrace_symbols F
+ __backtrace_symbols_fd F
+ __bsd_getpgrp F
+ __bzero F
+ __check_rhosts_file D 0x4
+ __chk_fail F
+ __clone F
+ __close F
+ __cmsg_nxthdr F
+ __confstr_chk F
+ __connect F
+ __ctype_b_loc F
+ __ctype_get_mb_cur_max F
+ __ctype_tolower_loc F
+ __ctype_toupper_loc F
+ __curbrk D 0x8
+ __cxa_at_quick_exit F
+ __cxa_atexit F
+ __cxa_finalize F
+ __cxa_thread_atexit_impl F
+ __cyg_profile_func_enter F
+ __cyg_profile_func_exit F
+ __daylight D 0x4
+ __dcgettext F
+ __default_morecore F
+ __dgettext F
+ __dprintf_chk F
+ __dup2 F
+ __duplocale F
+ __endmntent F
+ __environ D 0x8
+ __errno_location F
+ __fbufsize F
+ __fcntl F
+ __fdelt_chk F
+ __fdelt_warn F
+ __ffs F
+ __fgets_chk F
+ __fgets_unlocked_chk F
+ __fgetws_chk F
+ __fgetws_unlocked_chk F
+ __finite F
+ __finitef F
+ __finitel F
+ __flbf F
+ __fork F
+ __fpending F
+ __fprintf_chk F
+ __fpu_control D 0x4
+ __fpurge F
+ __fread_chk F
+ __fread_unlocked_chk F
+ __freadable F
+ __freading F
+ __free_hook D 0x8
+ __freelocale F
+ __fsetlocking F
+ __fwprintf_chk F
+ __fwritable F
+ __fwriting F
+ __fxstat F
+ __fxstat64 F
+ __fxstatat F
+ __fxstatat64 F
+ __getauxval F
+ __getcwd_chk F
+ __getdelim F
+ __getdomainname_chk F
+ __getgroups_chk F
+ __gethostname_chk F
+ __getlogin_r_chk F
+ __getmntent_r F
+ __getpagesize F
+ __getpgid F
+ __getpid F
+ __gets_chk F
+ __gettimeofday F
+ __getwd_chk F
+ __gmtime_r F
+ __h_errno_location F
+ __isalnum_l F
+ __isalpha_l F
+ __isascii_l F
+ __isblank_l F
+ __iscntrl_l F
+ __isctype F
+ __isdigit_l F
+ __isgraph_l F
+ __isinf F
+ __isinff F
+ __isinfl F
+ __islower_l F
+ __isnan F
+ __isnanf F
+ __isnanl F
+ __isoc99_fscanf F
+ __isoc99_fwscanf F
+ __isoc99_scanf F
+ __isoc99_sscanf F
+ __isoc99_swscanf F
+ __isoc99_vfscanf F
+ __isoc99_vfwscanf F
+ __isoc99_vscanf F
+ __isoc99_vsscanf F
+ __isoc99_vswscanf F
+ __isoc99_vwscanf F
+ __isoc99_wscanf F
+ __isprint_l F
+ __ispunct_l F
+ __isspace_l F
+ __isupper_l F
+ __iswalnum_l F
+ __iswalpha_l F
+ __iswblank_l F
+ __iswcntrl_l F
+ __iswctype F
+ __iswctype_l F
+ __iswdigit_l F
+ __iswgraph_l F
+ __iswlower_l F
+ __iswprint_l F
+ __iswpunct_l F
+ __iswspace_l F
+ __iswupper_l F
+ __iswxdigit_l F
+ __isxdigit_l F
+ __ivaliduser F
+ __key_decryptsession_pk_LOCAL D 0x8
+ __key_encryptsession_pk_LOCAL D 0x8
+ __key_gendes_LOCAL D 0x8
+ __libc_allocate_rtsig F
+ __libc_calloc F
+ __libc_current_sigrtmax F
+ __libc_current_sigrtmin F
+ __libc_free F
+ __libc_freeres F
+ __libc_init_first F
+ __libc_mallinfo F
+ __libc_malloc F
+ __libc_mallopt F
+ __libc_memalign F
+ __libc_pvalloc F
+ __libc_realloc F
+ __libc_sa_len F
+ __libc_start_main F
+ __libc_valloc F
+ __longjmp_chk F
+ __lseek F
+ __lxstat F
+ __lxstat64 F
+ __malloc_hook D 0x8
+ __malloc_initialize_hook D 0x8
+ __mbrlen F
+ __mbrtowc F
+ __mbsnrtowcs_chk F
+ __mbsrtowcs_chk F
+ __mbstowcs_chk F
+ __memalign_hook D 0x8
+ __memcpy_chk F
+ __memmove_chk F
+ __mempcpy F
+ __mempcpy_chk F
+ __mempcpy_small F
+ __memset_chk F
+ __monstartup F
+ __morecore D 0x8
+ __nanosleep F
+ __newlocale F
+ __nl_langinfo_l F
+ __nldbl__IO_fprintf F
+ __nldbl__IO_printf F
+ __nldbl__IO_sprintf F
+ __nldbl__IO_sscanf F
+ __nldbl__IO_vfprintf F
+ __nldbl__IO_vfscanf F
+ __nldbl__IO_vsprintf F
+ __nldbl___asprintf F
+ __nldbl___asprintf_chk F
+ __nldbl___dprintf_chk F
+ __nldbl___fprintf_chk F
+ __nldbl___fwprintf_chk F
+ __nldbl___isoc99_fscanf F
+ __nldbl___isoc99_fwscanf F
+ __nldbl___isoc99_scanf F
+ __nldbl___isoc99_sscanf F
+ __nldbl___isoc99_swscanf F
+ __nldbl___isoc99_vfscanf F
+ __nldbl___isoc99_vfwscanf F
+ __nldbl___isoc99_vscanf F
+ __nldbl___isoc99_vsscanf F
+ __nldbl___isoc99_vswscanf F
+ __nldbl___isoc99_vwscanf F
+ __nldbl___isoc99_wscanf F
+ __nldbl___obstack_printf_chk F
+ __nldbl___obstack_vprintf_chk F
+ __nldbl___printf_chk F
+ __nldbl___printf_fp F
+ __nldbl___snprintf_chk F
+ __nldbl___sprintf_chk F
+ __nldbl___strfmon_l F
+ __nldbl___swprintf_chk F
+ __nldbl___syslog_chk F
+ __nldbl___vasprintf_chk F
+ __nldbl___vdprintf_chk F
+ __nldbl___vfprintf_chk F
+ __nldbl___vfscanf F
+ __nldbl___vfwprintf_chk F
+ __nldbl___vprintf_chk F
+ __nldbl___vsnprintf F
+ __nldbl___vsnprintf_chk F
+ __nldbl___vsprintf_chk F
+ __nldbl___vsscanf F
+ __nldbl___vstrfmon F
+ __nldbl___vstrfmon_l F
+ __nldbl___vswprintf_chk F
+ __nldbl___vsyslog_chk F
+ __nldbl___vwprintf_chk F
+ __nldbl___wprintf_chk F
+ __nldbl_asprintf F
+ __nldbl_dprintf F
+ __nldbl_fprintf F
+ __nldbl_fscanf F
+ __nldbl_fwprintf F
+ __nldbl_fwscanf F
+ __nldbl_obstack_printf F
+ __nldbl_obstack_vprintf F
+ __nldbl_printf F
+ __nldbl_printf_size F
+ __nldbl_scanf F
+ __nldbl_snprintf F
+ __nldbl_sprintf F
+ __nldbl_sscanf F
+ __nldbl_strfmon F
+ __nldbl_strfmon_l F
+ __nldbl_swprintf F
+ __nldbl_swscanf F
+ __nldbl_syslog F
+ __nldbl_vasprintf F
+ __nldbl_vdprintf F
+ __nldbl_vfprintf F
+ __nldbl_vfscanf F
+ __nldbl_vfwprintf F
+ __nldbl_vfwscanf F
+ __nldbl_vprintf F
+ __nldbl_vscanf F
+ __nldbl_vsnprintf F
+ __nldbl_vsprintf F
+ __nldbl_vsscanf F
+ __nldbl_vswprintf F
+ __nldbl_vswscanf F
+ __nldbl_vsyslog F
+ __nldbl_vwprintf F
+ __nldbl_vwscanf F
+ __nldbl_wprintf F
+ __nldbl_wscanf F
+ __nss_configure_lookup F
+ __nss_database_lookup F
+ __nss_group_lookup F
+ __nss_hostname_digits_dots F
+ __nss_hosts_lookup F
+ __nss_next F
+ __nss_passwd_lookup F
+ __obstack_printf_chk F
+ __obstack_vprintf_chk F
+ __open F
+ __open64 F
+ __open64_2 F
+ __open_2 F
+ __openat64_2 F
+ __openat_2 F
+ __overflow F
+ __pipe F
+ __poll F
+ __poll_chk F
+ __posix_getopt F
+ __ppc_get_timebase_freq F
+ __ppoll_chk F
+ __pread64 F
+ __pread64_chk F
+ __pread_chk F
+ __printf_chk F
+ __printf_fp F
+ __profile_frequency F
+ __progname D 0x8
+ __progname_full D 0x8
+ __ptsname_r_chk F
+ __pwrite64 F
+ __rawmemchr F
+ __rcmd_errstr D 0x8
+ __read F
+ __read_chk F
+ __readlink_chk F
+ __readlinkat_chk F
+ __realloc_hook D 0x8
+ __realpath_chk F
+ __recv_chk F
+ __recvfrom_chk F
+ __register_atfork F
+ __res_init F
+ __res_nclose F
+ __res_ninit F
+ __res_randomid F
+ __res_state F
+ __rpc_thread_createerr F
+ __rpc_thread_svc_fdset F
+ __rpc_thread_svc_max_pollfd F
+ __rpc_thread_svc_pollfd F
+ __sbrk F
+ __sched_cpualloc F
+ __sched_cpucount F
+ __sched_cpufree F
+ __sched_get_priority_max F
+ __sched_get_priority_min F
+ __sched_getparam F
+ __sched_getscheduler F
+ __sched_setscheduler F
+ __sched_yield F
+ __secure_getenv F
+ __select F
+ __send F
+ __setmntent F
+ __setpgid F
+ __sigaction F
+ __sigaddset F
+ __sigdelset F
+ __sigismember F
+ __signbit F
+ __signbitf F
+ __signbitl F
+ __sigpause F
+ __sigsetjmp F
+ __sigsuspend F
+ __snprintf_chk F
+ __sprintf_chk F
+ __stack_chk_fail F
+ __statfs F
+ __stpcpy F
+ __stpcpy_chk F
+ __stpcpy_small F
+ __stpncpy F
+ __stpncpy_chk F
+ __strcasecmp F
+ __strcasecmp_l F
+ __strcasestr F
+ __strcat_chk F
+ __strcoll_l F
+ __strcpy_chk F
+ __strcpy_small F
+ __strcspn_c1 F
+ __strcspn_c2 F
+ __strcspn_c3 F
+ __strdup F
+ __strerror_r F
+ __strfmon_l F
+ __strftime_l F
+ __strncasecmp_l F
+ __strncat_chk F
+ __strncpy_chk F
+ __strndup F
+ __strpbrk_c2 F
+ __strpbrk_c3 F
+ __strsep_1c F
+ __strsep_2c F
+ __strsep_3c F
+ __strsep_g F
+ __strspn_c1 F
+ __strspn_c2 F
+ __strspn_c3 F
+ __strtod_internal F
+ __strtod_l F
+ __strtof_internal F
+ __strtof_l F
+ __strtok_r F
+ __strtok_r_1c F
+ __strtol_internal F
+ __strtol_l F
+ __strtold_internal F
+ __strtold_l F
+ __strtoll_internal F
+ __strtoll_l F
+ __strtoul_internal F
+ __strtoul_l F
+ __strtoull_internal F
+ __strtoull_l F
+ __strverscmp F
+ __strxfrm_l F
+ __swprintf_chk F
+ __sysconf F
+ __sysctl F
+ __syslog_chk F
+ __sysv_signal F
+ __timezone D 0x8
+ __toascii_l F
+ __tolower_l F
+ __toupper_l F
+ __towctrans F
+ __towctrans_l F
+ __towlower_l F
+ __towupper_l F
+ __ttyname_r_chk F
+ __tzname D 0x10
+ __uflow F
+ __underflow F
+ __uselocale F
+ __vasprintf_chk F
+ __vdprintf_chk F
+ __vfork F
+ __vfprintf_chk F
+ __vfscanf F
+ __vfwprintf_chk F
+ __vprintf_chk F
+ __vsnprintf F
+ __vsnprintf_chk F
+ __vsprintf_chk F
+ __vsscanf F
+ __vswprintf_chk F
+ __vsyslog_chk F
+ __vwprintf_chk F
+ __wait F
+ __waitpid F
+ __wcpcpy_chk F
+ __wcpncpy_chk F
+ __wcrtomb_chk F
+ __wcscasecmp_l F
+ __wcscat_chk F
+ __wcscoll_l F
+ __wcscpy_chk F
+ __wcsftime_l F
+ __wcsncasecmp_l F
+ __wcsncat_chk F
+ __wcsncpy_chk F
+ __wcsnrtombs_chk F
+ __wcsrtombs_chk F
+ __wcstod_internal F
+ __wcstod_l F
+ __wcstof_internal F
+ __wcstof_l F
+ __wcstol_internal F
+ __wcstol_l F
+ __wcstold_internal F
+ __wcstold_l F
+ __wcstoll_internal F
+ __wcstoll_l F
+ __wcstombs_chk F
+ __wcstoul_internal F
+ __wcstoul_l F
+ __wcstoull_internal F
+ __wcstoull_l F
+ __wcsxfrm_l F
+ __wctomb_chk F
+ __wctrans_l F
+ __wctype_l F
+ __wmemcpy_chk F
+ __wmemmove_chk F
+ __wmempcpy_chk F
+ __wmemset_chk F
+ __woverflow F
+ __wprintf_chk F
+ __write F
+ __wuflow F
+ __wunderflow F
+ __xmknod F
+ __xmknodat F
+ __xpg_basename F
+ __xpg_sigpause F
+ __xpg_strerror_r F
+ __xstat F
+ __xstat64 F
+ _authenticate F
+ _dl_mcount_wrapper F
+ _dl_mcount_wrapper_check F
+ _environ D 0x8
+ _exit F
+ _flushlbf F
+ _libc_intl_domainname D 0x5
+ _longjmp F
+ _mcleanup F
+ _mcount F
+ _nl_default_dirname D 0x12
+ _nl_domain_bindings D 0x8
+ _nl_msg_cat_cntr D 0x4
+ _null_auth D 0x18
+ _obstack D 0x8
+ _obstack_allocated_p F
+ _obstack_begin F
+ _obstack_begin_1 F
+ _obstack_free F
+ _obstack_memory_used F
+ _obstack_newchunk F
+ _res D 0x238
+ _res_hconf D 0x48
+ _rpc_dtablesize F
+ _seterr_reply F
+ _setjmp F
+ _sys_errlist D 0x3f0
+ _sys_errlist D 0x420
+ _sys_errlist D 0x438
+ _sys_nerr D 0x4
+ _sys_siglist D 0x200
+ _sys_siglist D 0x208
+ _tolower F
+ _toupper F
+ a64l F
+ abort F
+ abs F
+ accept F
+ accept4 F
+ access F
+ acct F
+ addmntent F
+ addseverity F
+ adjtime F
+ adjtimex F
+ advance F
+ alarm F
+ aligned_alloc F
+ alphasort F
+ alphasort64 F
+ argp_err_exit_status D 0x4
+ argp_error F
+ argp_failure F
+ argp_help F
+ argp_parse F
+ argp_program_bug_address D 0x8
+ argp_program_version D 0x8
+ argp_program_version_hook D 0x8
+ argp_state_help F
+ argp_usage F
+ argz_add F
+ argz_add_sep F
+ argz_append F
+ argz_count F
+ argz_create F
+ argz_create_sep F
+ argz_delete F
+ argz_extract F
+ argz_insert F
+ argz_next F
+ argz_replace F
+ argz_stringify F
+ asctime F
+ asctime_r F
+ asprintf F
+ atof F
+ atoi F
+ atol F
+ atoll F
+ authdes_create F
+ authdes_getucred F
+ authdes_pk_create F
+ authnone_create F
+ authunix_create F
+ authunix_create_default F
+ backtrace F
+ backtrace_symbols F
+ backtrace_symbols_fd F
+ basename F
+ bcmp F
+ bcopy F
+ bdflush F
+ bind F
+ bind_textdomain_codeset F
+ bindresvport F
+ bindtextdomain F
+ brk F
+ bsd_signal F
+ bsearch F
+ btowc F
+ bzero F
+ c16rtomb F
+ c32rtomb F
+ calloc F
+ callrpc F
+ canonicalize_file_name F
+ capget F
+ capset F
+ catclose F
+ catgets F
+ catopen F
+ cbc_crypt F
+ cfgetispeed F
+ cfgetospeed F
+ cfmakeraw F
+ cfree F
+ cfsetispeed F
+ cfsetospeed F
+ cfsetspeed F
+ chdir F
+ chflags F
+ chmod F
+ chown F
+ chroot F
+ clearenv F
+ clearerr F
+ clearerr_unlocked F
+ clnt_broadcast F
+ clnt_create F
+ clnt_pcreateerror F
+ clnt_perrno F
+ clnt_perror F
+ clnt_spcreateerror F
+ clnt_sperrno F
+ clnt_sperror F
+ clntraw_create F
+ clnttcp_create F
+ clntudp_bufcreate F
+ clntudp_create F
+ clntunix_create F
+ clock F
+ clock_adjtime F
+ clock_getcpuclockid F
+ clock_getres F
+ clock_gettime F
+ clock_nanosleep F
+ clock_settime F
+ clone F
+ close F
+ closedir F
+ closelog F
+ confstr F
+ connect F
+ copysign F
+ copysignf F
+ copysignl F
+ creat F
+ creat64 F
+ create_module F
+ ctermid F
+ ctime F
+ ctime_r F
+ cuserid F
+ daemon F
+ daylight D 0x4
+ dcgettext F
+ dcngettext F
+ delete_module F
+ des_setparity F
+ dgettext F
+ difftime F
+ dirfd F
+ dirname F
+ div F
+ dl_iterate_phdr F
+ dngettext F
+ dprintf F
+ drand48 F
+ drand48_r F
+ dup F
+ dup2 F
+ dup3 F
+ duplocale F
+ dysize F
+ eaccess F
+ ecb_crypt F
+ ecvt F
+ ecvt_r F
+ endaliasent F
+ endfsent F
+ endgrent F
+ endhostent F
+ endmntent F
+ endnetent F
+ endnetgrent F
+ endprotoent F
+ endpwent F
+ endrpcent F
+ endservent F
+ endsgent F
+ endspent F
+ endttyent F
+ endusershell F
+ endutent F
+ endutxent F
+ environ D 0x8
+ envz_add F
+ envz_entry F
+ envz_get F
+ envz_merge F
+ envz_remove F
+ envz_strip F
+ epoll_create F
+ epoll_create1 F
+ epoll_ctl F
+ epoll_pwait F
+ epoll_wait F
+ erand48 F
+ erand48_r F
+ err F
+ error F
+ error_at_line F
+ error_message_count D 0x4
+ error_one_per_line D 0x4
+ error_print_progname D 0x8
+ errx F
+ ether_aton F
+ ether_aton_r F
+ ether_hostton F
+ ether_line F
+ ether_ntoa F
+ ether_ntoa_r F
+ ether_ntohost F
+ euidaccess F
+ eventfd F
+ eventfd_read F
+ eventfd_write F
+ execl F
+ execle F
+ execlp F
+ execv F
+ execve F
+ execvp F
+ execvpe F
+ exit F
+ faccessat F
+ fallocate F
+ fallocate64 F
+ fanotify_init F
+ fanotify_mark F
+ fattach F
+ fchdir F
+ fchflags F
+ fchmod F
+ fchmodat F
+ fchown F
+ fchownat F
+ fclose F
+ fcloseall F
+ fcntl F
+ fcvt F
+ fcvt_r F
+ fdatasync F
+ fdetach F
+ fdopen F
+ fdopendir F
+ feof F
+ feof_unlocked F
+ ferror F
+ ferror_unlocked F
+ fexecve F
+ fflush F
+ fflush_unlocked F
+ ffs F
+ ffsl F
+ ffsll F
+ fgetc F
+ fgetc_unlocked F
+ fgetgrent F
+ fgetgrent_r F
+ fgetpos F
+ fgetpos64 F
+ fgetpwent F
+ fgetpwent_r F
+ fgets F
+ fgets_unlocked F
+ fgetsgent F
+ fgetsgent_r F
+ fgetspent F
+ fgetspent_r F
+ fgetwc F
+ fgetwc_unlocked F
+ fgetws F
+ fgetws_unlocked F
+ fgetxattr F
+ fileno F
+ fileno_unlocked F
+ finite F
+ finitef F
+ finitel F
+ flistxattr F
+ flock F
+ flockfile F
+ fmemopen F
+ fmtmsg F
+ fnmatch F
+ fopen F
+ fopen64 F
+ fopencookie F
+ fork F
+ fpathconf F
+ fprintf F
+ fputc F
+ fputc_unlocked F
+ fputs F
+ fputs_unlocked F
+ fputwc F
+ fputwc_unlocked F
+ fputws F
+ fputws_unlocked F
+ fread F
+ fread_unlocked F
+ free F
+ freeaddrinfo F
+ freeifaddrs F
+ freelocale F
+ fremovexattr F
+ freopen F
+ freopen64 F
+ frexp F
+ frexpf F
+ frexpl F
+ fscanf F
+ fseek F
+ fseeko F
+ fseeko64 F
+ fsetpos F
+ fsetpos64 F
+ fsetxattr F
+ fstatfs F
+ fstatfs64 F
+ fstatvfs F
+ fstatvfs64 F
+ fsync F
+ ftell F
+ ftello F
+ ftello64 F
+ ftime F
+ ftok F
+ ftruncate F
+ ftruncate64 F
+ ftrylockfile F
+ fts_children F
+ fts_close F
+ fts_open F
+ fts_read F
+ fts_set F
+ ftw F
+ ftw64 F
+ funlockfile F
+ futimens F
+ futimes F
+ futimesat F
+ fwide F
+ fwprintf F
+ fwrite F
+ fwrite_unlocked F
+ fwscanf F
+ gai_strerror F
+ gcvt F
+ get_avphys_pages F
+ get_current_dir_name F
+ get_kernel_syms F
+ get_myaddress F
+ get_nprocs F
+ get_nprocs_conf F
+ get_phys_pages F
+ getaddrinfo F
+ getaliasbyname F
+ getaliasbyname_r F
+ getaliasent F
+ getaliasent_r F
+ getauxval F
+ getc F
+ getc_unlocked F
+ getchar F
+ getchar_unlocked F
+ getcontext F
+ getcwd F
+ getdate F
+ getdate_err D 0x4
+ getdate_r F
+ getdelim F
+ getdirentries F
+ getdirentries64 F
+ getdomainname F
+ getdtablesize F
+ getegid F
+ getenv F
+ geteuid F
+ getfsent F
+ getfsfile F
+ getfsspec F
+ getgid F
+ getgrent F
+ getgrent_r F
+ getgrgid F
+ getgrgid_r F
+ getgrnam F
+ getgrnam_r F
+ getgrouplist F
+ getgroups F
+ gethostbyaddr F
+ gethostbyaddr_r F
+ gethostbyname F
+ gethostbyname2 F
+ gethostbyname2_r F
+ gethostbyname_r F
+ gethostent F
+ gethostent_r F
+ gethostid F
+ gethostname F
+ getifaddrs F
+ getipv4sourcefilter F
+ getitimer F
+ getline F
+ getloadavg F
+ getlogin F
+ getlogin_r F
+ getmntent F
+ getmntent_r F
+ getmsg F
+ getnameinfo F
+ getnetbyaddr F
+ getnetbyaddr_r F
+ getnetbyname F
+ getnetbyname_r F
+ getnetent F
+ getnetent_r F
+ getnetgrent F
+ getnetgrent_r F
+ getnetname F
+ getopt F
+ getopt_long F
+ getopt_long_only F
+ getpagesize F
+ getpass F
+ getpeername F
+ getpgid F
+ getpgrp F
+ getpid F
+ getpmsg F
+ getppid F
+ getpriority F
+ getprotobyname F
+ getprotobyname_r F
+ getprotobynumber F
+ getprotobynumber_r F
+ getprotoent F
+ getprotoent_r F
+ getpt F
+ getpublickey F
+ getpw F
+ getpwent F
+ getpwent_r F
+ getpwnam F
+ getpwnam_r F
+ getpwuid F
+ getpwuid_r F
+ getresgid F
+ getresuid F
+ getrlimit F
+ getrlimit64 F
+ getrpcbyname F
+ getrpcbyname_r F
+ getrpcbynumber F
+ getrpcbynumber_r F
+ getrpcent F
+ getrpcent_r F
+ getrpcport F
+ getrusage F
+ gets F
+ getsecretkey F
+ getservbyname F
+ getservbyname_r F
+ getservbyport F
+ getservbyport_r F
+ getservent F
+ getservent_r F
+ getsgent F
+ getsgent_r F
+ getsgnam F
+ getsgnam_r F
+ getsid F
+ getsockname F
+ getsockopt F
+ getsourcefilter F
+ getspent F
+ getspent_r F
+ getspnam F
+ getspnam_r F
+ getsubopt F
+ gettext F
+ gettimeofday F
+ getttyent F
+ getttynam F
+ getuid F
+ getusershell F
+ getutent F
+ getutent_r F
+ getutid F
+ getutid_r F
+ getutline F
+ getutline_r F
+ getutmp F
+ getutmpx F
+ getutxent F
+ getutxid F
+ getutxline F
+ getw F
+ getwc F
+ getwc_unlocked F
+ getwchar F
+ getwchar_unlocked F
+ getwd F
+ getxattr F
+ glob F
+ glob64 F
+ glob_pattern_p F
+ globfree F
+ globfree64 F
+ gmtime F
+ gmtime_r F
+ gnu_dev_major F
+ gnu_dev_makedev F
+ gnu_dev_minor F
+ gnu_get_libc_release F
+ gnu_get_libc_version F
+ grantpt F
+ group_member F
+ gsignal F
+ gtty F
+ h_errlist D 0x28
+ h_nerr D 0x4
+ hasmntopt F
+ hcreate F
+ hcreate_r F
+ hdestroy F
+ hdestroy_r F
+ herror F
+ host2netname F
+ hsearch F
+ hsearch_r F
+ hstrerror F
+ htonl F
+ htons F
+ iconv F
+ iconv_close F
+ iconv_open F
+ if_freenameindex F
+ if_indextoname F
+ if_nameindex F
+ if_nametoindex F
+ imaxabs F
+ imaxdiv F
+ in6addr_any D 0x10
+ in6addr_loopback D 0x10
+ index F
+ inet6_opt_append F
+ inet6_opt_find F
+ inet6_opt_finish F
+ inet6_opt_get_val F
+ inet6_opt_init F
+ inet6_opt_next F
+ inet6_opt_set_val F
+ inet6_option_alloc F
+ inet6_option_append F
+ inet6_option_find F
+ inet6_option_init F
+ inet6_option_next F
+ inet6_option_space F
+ inet6_rth_add F
+ inet6_rth_getaddr F
+ inet6_rth_init F
+ inet6_rth_reverse F
+ inet6_rth_segments F
+ inet6_rth_space F
+ inet_addr F
+ inet_aton F
+ inet_lnaof F
+ inet_makeaddr F
+ inet_netof F
+ inet_network F
+ inet_nsap_addr F
+ inet_nsap_ntoa F
+ inet_ntoa F
+ inet_ntop F
+ inet_pton F
+ init_module F
+ initgroups F
+ initstate F
+ initstate_r F
+ innetgr F
+ inotify_add_watch F
+ inotify_init F
+ inotify_init1 F
+ inotify_rm_watch F
+ insque F
+ ioctl F
+ iruserok F
+ iruserok_af F
+ isalnum F
+ isalnum_l F
+ isalpha F
+ isalpha_l F
+ isascii F
+ isastream F
+ isatty F
+ isblank F
+ isblank_l F
+ iscntrl F
+ iscntrl_l F
+ isctype F
+ isdigit F
+ isdigit_l F
+ isfdtype F
+ isgraph F
+ isgraph_l F
+ isinf F
+ isinff F
+ isinfl F
+ islower F
+ islower_l F
+ isnan F
+ isnanf F
+ isnanl F
+ isprint F
+ isprint_l F
+ ispunct F
+ ispunct_l F
+ isspace F
+ isspace_l F
+ isupper F
+ isupper_l F
+ iswalnum F
+ iswalnum_l F
+ iswalpha F
+ iswalpha_l F
+ iswblank F
+ iswblank_l F
+ iswcntrl F
+ iswcntrl_l F
+ iswctype F
+ iswctype_l F
+ iswdigit F
+ iswdigit_l F
+ iswgraph F
+ iswgraph_l F
+ iswlower F
+ iswlower_l F
+ iswprint F
+ iswprint_l F
+ iswpunct F
+ iswpunct_l F
+ iswspace F
+ iswspace_l F
+ iswupper F
+ iswupper_l F
+ iswxdigit F
+ iswxdigit_l F
+ isxdigit F
+ isxdigit_l F
+ jrand48 F
+ jrand48_r F
+ key_decryptsession F
+ key_decryptsession_pk F
+ key_encryptsession F
+ key_encryptsession_pk F
+ key_gendes F
+ key_get_conv F
+ key_secretkey_is_set F
+ key_setnet F
+ key_setsecret F
+ kill F
+ killpg F
+ klogctl F
+ l64a F
+ labs F
+ lchmod F
+ lchown F
+ lckpwdf F
+ lcong48 F
+ lcong48_r F
+ ldexp F
+ ldexpf F
+ ldexpl F
+ ldiv F
+ lfind F
+ lgetxattr F
+ link F
+ linkat F
+ listen F
+ listxattr F
+ llabs F
+ lldiv F
+ llistxattr F
+ llseek F
+ loc1 D 0x8
+ loc2 D 0x8
+ localeconv F
+ localtime F
+ localtime_r F
+ lockf F
+ lockf64 F
+ locs D 0x8
+ longjmp F
+ lrand48 F
+ lrand48_r F
+ lremovexattr F
+ lsearch F
+ lseek F
+ lseek64 F
+ lsetxattr F
+ lutimes F
+ madvise F
+ makecontext F
+ mallinfo F
+ malloc F
+ malloc_get_state F
+ malloc_info F
+ malloc_set_state F
+ malloc_stats F
+ malloc_trim F
+ malloc_usable_size F
+ mallopt F
+ mallwatch D 0x8
+ mblen F
+ mbrlen F
+ mbrtoc16 F
+ mbrtoc32 F
+ mbrtowc F
+ mbsinit F
+ mbsnrtowcs F
+ mbsrtowcs F
+ mbstowcs F
+ mbtowc F
+ mcheck F
+ mcheck_check_all F
+ mcheck_pedantic F
+ memalign F
+ memccpy F
+ memchr F
+ memcmp F
+ memcpy F
+ memfrob F
+ memmem F
+ memmove F
+ mempcpy F
+ memrchr F
+ memset F
+ mincore F
+ mkdir F
+ mkdirat F
+ mkdtemp F
+ mkfifo F
+ mkfifoat F
+ mkostemp F
+ mkostemp64 F
+ mkostemps F
+ mkostemps64 F
+ mkstemp F
+ mkstemp64 F
+ mkstemps F
+ mkstemps64 F
+ mktemp F
+ mktime F
+ mlock F
+ mlockall F
+ mmap F
+ mmap64 F
+ modf F
+ modff F
+ modfl F
+ moncontrol F
+ monstartup F
+ mount F
+ mprobe F
+ mprotect F
+ mrand48 F
+ mrand48_r F
+ mremap F
+ msgctl F
+ msgget F
+ msgrcv F
+ msgsnd F
+ msync F
+ mtrace F
+ munlock F
+ munlockall F
+ munmap F
+ muntrace F
+ name_to_handle_at F
+ nanosleep F
+ netname2host F
+ netname2user F
+ newlocale F
+ nfsservctl F
+ nftw F
+ nftw64 F
+ ngettext F
+ nice F
+ nl_langinfo F
+ nl_langinfo_l F
+ nrand48 F
+ nrand48_r F
+ ntohl F
+ ntohs F
+ ntp_adjtime F
+ ntp_gettime F
+ ntp_gettimex F
+ obstack_alloc_failed_handler D 0x8
+ obstack_exit_failure D 0x4
+ obstack_free F
+ obstack_printf F
+ obstack_vprintf F
+ on_exit F
+ open F
+ open64 F
+ open_by_handle_at F
+ open_memstream F
+ open_wmemstream F
+ openat F
+ openat64 F
+ opendir F
+ openlog F
+ optarg D 0x8
+ opterr D 0x4
+ optind D 0x4
+ optopt D 0x4
+ parse_printf_format F
+ passwd2des F
+ pathconf F
+ pause F
+ pclose F
+ perror F
+ personality F
+ pipe F
+ pipe2 F
+ pivot_root F
+ pmap_getmaps F
+ pmap_getport F
+ pmap_rmtcall F
+ pmap_set F
+ pmap_unset F
+ poll F
+ popen F
+ posix_fadvise F
+ posix_fadvise64 F
+ posix_fallocate F
+ posix_fallocate64 F
+ posix_madvise F
+ posix_memalign F
+ posix_openpt F
+ posix_spawn F
+ posix_spawn_file_actions_addclose F
+ posix_spawn_file_actions_adddup2 F
+ posix_spawn_file_actions_addopen F
+ posix_spawn_file_actions_destroy F
+ posix_spawn_file_actions_init F
+ posix_spawnattr_destroy F
+ posix_spawnattr_getflags F
+ posix_spawnattr_getpgroup F
+ posix_spawnattr_getschedparam F
+ posix_spawnattr_getschedpolicy F
+ posix_spawnattr_getsigdefault F
+ posix_spawnattr_getsigmask F
+ posix_spawnattr_init F
+ posix_spawnattr_setflags F
+ posix_spawnattr_setpgroup F
+ posix_spawnattr_setschedparam F
+ posix_spawnattr_setschedpolicy F
+ posix_spawnattr_setsigdefault F
+ posix_spawnattr_setsigmask F
+ posix_spawnp F
+ ppoll F
+ prctl F
+ pread F
+ pread64 F
+ preadv F
+ preadv64 F
+ printf F
+ printf_size F
+ printf_size_info F
+ prlimit F
+ prlimit64 F
+ process_vm_readv F
+ process_vm_writev F
+ profil F
+ program_invocation_name D 0x8
+ program_invocation_short_name D 0x8
+ pselect F
+ psiginfo F
+ psignal F
+ pthread_attr_destroy F
+ pthread_attr_getdetachstate F
+ pthread_attr_getinheritsched F
+ pthread_attr_getschedparam F
+ pthread_attr_getschedpolicy F
+ pthread_attr_getscope F
+ pthread_attr_init F
+ pthread_attr_setdetachstate F
+ pthread_attr_setinheritsched F
+ pthread_attr_setschedparam F
+ pthread_attr_setschedpolicy F
+ pthread_attr_setscope F
+ pthread_cond_broadcast F
+ pthread_cond_destroy F
+ pthread_cond_init F
+ pthread_cond_signal F
+ pthread_cond_timedwait F
+ pthread_cond_wait F
+ pthread_condattr_destroy F
+ pthread_condattr_init F
+ pthread_equal F
+ pthread_exit F
+ pthread_getschedparam F
+ pthread_mutex_destroy F
+ pthread_mutex_init F
+ pthread_mutex_lock F
+ pthread_mutex_unlock F
+ pthread_self F
+ pthread_setcancelstate F
+ pthread_setcanceltype F
+ pthread_setschedparam F
+ ptrace F
+ ptsname F
+ ptsname_r F
+ putc F
+ putc_unlocked F
+ putchar F
+ putchar_unlocked F
+ putenv F
+ putgrent F
+ putmsg F
+ putpmsg F
+ putpwent F
+ puts F
+ putsgent F
+ putspent F
+ pututline F
+ pututxline F
+ putw F
+ putwc F
+ putwc_unlocked F
+ putwchar F
+ putwchar_unlocked F
+ pvalloc F
+ pwrite F
+ pwrite64 F
+ pwritev F
+ pwritev64 F
+ qecvt F
+ qecvt_r F
+ qfcvt F
+ qfcvt_r F
+ qgcvt F
+ qsort F
+ qsort_r F
+ query_module F
+ quick_exit F
+ quotactl F
+ raise F
+ rand F
+ rand_r F
+ random F
+ random_r F
+ rawmemchr F
+ rcmd F
+ rcmd_af F
+ re_comp F
+ re_compile_fastmap F
+ re_compile_pattern F
+ re_exec F
+ re_match F
+ re_match_2 F
+ re_search F
+ re_search_2 F
+ re_set_registers F
+ re_set_syntax F
+ re_syntax_options D 0x8
+ read F
+ readahead F
+ readdir F
+ readdir64 F
+ readdir64_r F
+ readdir_r F
+ readlink F
+ readlinkat F
+ readv F
+ realloc F
+ realpath F
+ reboot F
+ recv F
+ recvfrom F
+ recvmmsg F
+ recvmsg F
+ regcomp F
+ regerror F
+ regexec F
+ regfree F
+ register_printf_function F
+ register_printf_modifier F
+ register_printf_specifier F
+ register_printf_type F
+ registerrpc F
+ remap_file_pages F
+ remove F
+ removexattr F
+ remque F
+ rename F
+ renameat F
+ revoke F
+ rewind F
+ rewinddir F
+ rexec F
+ rexec_af F
+ rexecoptions D 0x4
+ rindex F
+ rmdir F
+ rpc_createerr D 0x20
+ rpmatch F
+ rresvport F
+ rresvport_af F
+ rtime F
+ ruserok F
+ ruserok_af F
+ ruserpass F
+ sbrk F
+ scalbn F
+ scalbnf F
+ scalbnl F
+ scandir F
+ scandir64 F
+ scandirat F
+ scandirat64 F
+ scanf F
+ sched_get_priority_max F
+ sched_get_priority_min F
+ sched_getaffinity F
+ sched_getcpu F
+ sched_getparam F
+ sched_getscheduler F
+ sched_rr_get_interval F
+ sched_setaffinity F
+ sched_setparam F
+ sched_setscheduler F
+ sched_yield F
+ secure_getenv F
+ seed48 F
+ seed48_r F
+ seekdir F
+ select F
+ semctl F
+ semget F
+ semop F
+ semtimedop F
+ send F
+ sendfile F
+ sendfile64 F
+ sendmmsg F
+ sendmsg F
+ sendto F
+ setaliasent F
+ setbuf F
+ setbuffer F
+ setcontext F
+ setdomainname F
+ setegid F
+ setenv F
+ seteuid F
+ setfsent F
+ setfsgid F
+ setfsuid F
+ setgid F
+ setgrent F
+ setgroups F
+ sethostent F
+ sethostid F
+ sethostname F
+ setipv4sourcefilter F
+ setitimer F
+ setjmp F
+ setlinebuf F
+ setlocale F
+ setlogin F
+ setlogmask F
+ setmntent F
+ setnetent F
+ setnetgrent F
+ setns F
+ setpgid F
+ setpgrp F
+ setpriority F
+ setprotoent F
+ setpwent F
+ setregid F
+ setresgid F
+ setresuid F
+ setreuid F
+ setrlimit F
+ setrlimit64 F
+ setrpcent F
+ setservent F
+ setsgent F
+ setsid F
+ setsockopt F
+ setsourcefilter F
+ setspent F
+ setstate F
+ setstate_r F
+ settimeofday F
+ setttyent F
+ setuid F
+ setusershell F
+ setutent F
+ setutxent F
+ setvbuf F
+ setxattr F
+ sgetsgent F
+ sgetsgent_r F
+ sgetspent F
+ sgetspent_r F
+ shmat F
+ shmctl F
+ shmdt F
+ shmget F
+ shutdown F
+ sigaction F
+ sigaddset F
+ sigaltstack F
+ sigandset F
+ sigblock F
+ sigdelset F
+ sigemptyset F
+ sigfillset F
+ siggetmask F
+ sighold F
+ sigignore F
+ siginterrupt F
+ sigisemptyset F
+ sigismember F
+ siglongjmp F
+ signal F
+ signalfd F
+ sigorset F
+ sigpause F
+ sigpending F
+ sigprocmask F
+ sigqueue F
+ sigrelse F
+ sigreturn F
+ sigset F
+ sigsetmask F
+ sigstack F
+ sigsuspend F
+ sigtimedwait F
+ sigvec F
+ sigwait F
+ sigwaitinfo F
+ sleep F
+ snprintf F
+ sockatmark F
+ socket F
+ socketpair F
+ splice F
+ sprintf F
+ sprofil F
+ srand F
+ srand48 F
+ srand48_r F
+ srandom F
+ srandom_r F
+ sscanf F
+ ssignal F
+ sstk F
+ statfs F
+ statfs64 F
+ statvfs F
+ statvfs64 F
+ stderr D 0x8
+ stdin D 0x8
+ stdout D 0x8
+ step F
+ stime F
+ stpcpy F
+ stpncpy F
+ strcasecmp F
+ strcasecmp_l F
+ strcasestr F
+ strcat F
+ strchr F
+ strchrnul F
+ strcmp F
+ strcoll F
+ strcoll_l F
+ strcpy F
+ strcspn F
+ strdup F
+ strerror F
+ strerror_l F
+ strerror_r F
+ strfmon F
+ strfmon_l F
+ strfry F
+ strftime F
+ strftime_l F
+ strlen F
+ strncasecmp F
+ strncasecmp_l F
+ strncat F
+ strncmp F
+ strncpy F
+ strndup F
+ strnlen F
+ strpbrk F
+ strptime F
+ strptime_l F
+ strrchr F
+ strsep F
+ strsignal F
+ strspn F
+ strstr F
+ strtod F
+ strtod_l F
+ strtof F
+ strtof_l F
+ strtoimax F
+ strtok F
+ strtok_r F
+ strtol F
+ strtol_l F
+ strtold F
+ strtold_l F
+ strtoll F
+ strtoll_l F
+ strtoq F
+ strtoul F
+ strtoul_l F
+ strtoull F
+ strtoull_l F
+ strtoumax F
+ strtouq F
+ strverscmp F
+ strxfrm F
+ strxfrm_l F
+ stty F
+ svc_exit F
+ svc_fdset D 0x80
+ svc_getreq F
+ svc_getreq_common F
+ svc_getreq_poll F
+ svc_getreqset F
+ svc_max_pollfd D 0x4
+ svc_pollfd D 0x8
+ svc_register F
+ svc_run F
+ svc_sendreply F
+ svc_unregister F
+ svcauthdes_stats D 0x18
+ svcerr_auth F
+ svcerr_decode F
+ svcerr_noproc F
+ svcerr_noprog F
+ svcerr_progvers F
+ svcerr_systemerr F
+ svcerr_weakauth F
+ svcfd_create F
+ svcraw_create F
+ svctcp_create F
+ svcudp_bufcreate F
+ svcudp_create F
+ svcudp_enablecache F
+ svcunix_create F
+ svcunixfd_create F
+ swab F
+ swapcontext F
+ swapoff F
+ swapon F
+ swprintf F
+ swscanf F
+ symlink F
+ symlinkat F
+ sync F
+ sync_file_range F
+ syncfs F
+ sys_errlist D 0x3f0
+ sys_errlist D 0x420
+ sys_errlist D 0x438
+ sys_nerr D 0x4
+ sys_sigabbrev D 0x200
+ sys_sigabbrev D 0x208
+ sys_siglist D 0x200
+ sys_siglist D 0x208
+ syscall F
+ sysconf F
+ sysctl F
+ sysinfo F
+ syslog F
+ system F
+ sysv_signal F
+ tcdrain F
+ tcflow F
+ tcflush F
+ tcgetattr F
+ tcgetpgrp F
+ tcgetsid F
+ tcsendbreak F
+ tcsetattr F
+ tcsetpgrp F
+ tdelete F
+ tdestroy F
+ tee F
+ telldir F
+ tempnam F
+ textdomain F
+ tfind F
+ time F
+ timegm F
+ timelocal F
+ timerfd_create F
+ timerfd_gettime F
+ timerfd_settime F
+ times F
+ timespec_get F
+ timezone D 0x8
+ tmpfile F
+ tmpfile64 F
+ tmpnam F
+ tmpnam_r F
+ toascii F
+ tolower F
+ tolower_l F
+ toupper F
+ toupper_l F
+ towctrans F
+ towctrans_l F
+ towlower F
+ towlower_l F
+ towupper F
+ towupper_l F
+ tr_break F
+ truncate F
+ truncate64 F
+ tsearch F
+ ttyname F
+ ttyname_r F
+ ttyslot F
+ twalk F
+ tzname D 0x10
+ tzset F
+ ualarm F
+ ulckpwdf F
+ ulimit F
+ umask F
+ umount F
+ umount2 F
+ uname F
+ ungetc F
+ ungetwc F
+ unlink F
+ unlinkat F
+ unlockpt F
+ unsetenv F
+ unshare F
+ updwtmp F
+ updwtmpx F
+ uselib F
+ uselocale F
+ user2netname F
+ usleep F
+ ustat F
+ utime F
+ utimensat F
+ utimes F
+ utmpname F
+ utmpxname F
+ valloc F
+ vasprintf F
+ vdprintf F
+ verr F
+ verrx F
+ versionsort F
+ versionsort64 F
+ vfork F
+ vfprintf F
+ vfscanf F
+ vfwprintf F
+ vfwscanf F
+ vhangup F
+ vlimit F
+ vmsplice F
+ vprintf F
+ vscanf F
+ vsnprintf F
+ vsprintf F
+ vsscanf F
+ vswprintf F
+ vswscanf F
+ vsyslog F
+ vtimes F
+ vwarn F
+ vwarnx F
+ vwprintf F
+ vwscanf F
+ wait F
+ wait3 F
+ wait4 F
+ waitid F
+ waitpid F
+ warn F
+ warnx F
+ wcpcpy F
+ wcpncpy F
+ wcrtomb F
+ wcscasecmp F
+ wcscasecmp_l F
+ wcscat F
+ wcschr F
+ wcschrnul F
+ wcscmp F
+ wcscoll F
+ wcscoll_l F
+ wcscpy F
+ wcscspn F
+ wcsdup F
+ wcsftime F
+ wcsftime_l F
+ wcslen F
+ wcsncasecmp F
+ wcsncasecmp_l F
+ wcsncat F
+ wcsncmp F
+ wcsncpy F
+ wcsnlen F
+ wcsnrtombs F
+ wcspbrk F
+ wcsrchr F
+ wcsrtombs F
+ wcsspn F
+ wcsstr F
+ wcstod F
+ wcstod_l F
+ wcstof F
+ wcstof_l F
+ wcstoimax F
+ wcstok F
+ wcstol F
+ wcstol_l F
+ wcstold F
+ wcstold_l F
+ wcstoll F
+ wcstoll_l F
+ wcstombs F
+ wcstoq F
+ wcstoul F
+ wcstoul_l F
+ wcstoull F
+ wcstoull_l F
+ wcstoumax F
+ wcstouq F
+ wcswcs F
+ wcswidth F
+ wcsxfrm F
+ wcsxfrm_l F
+ wctob F
+ wctomb F
+ wctrans F
+ wctrans_l F
+ wctype F
+ wctype_l F
+ wcwidth F
+ wmemchr F
+ wmemcmp F
+ wmemcpy F
+ wmemmove F
+ wmempcpy F
+ wmemset F
+ wordexp F
+ wordfree F
+ wprintf F
+ write F
+ writev F
+ wscanf F
+ xdecrypt F
+ xdr_accepted_reply F
+ xdr_array F
+ xdr_authdes_cred F
+ xdr_authdes_verf F
+ xdr_authunix_parms F
+ xdr_bool F
+ xdr_bytes F
+ xdr_callhdr F
+ xdr_callmsg F
+ xdr_char F
+ xdr_cryptkeyarg F
+ xdr_cryptkeyarg2 F
+ xdr_cryptkeyres F
+ xdr_des_block F
+ xdr_double F
+ xdr_enum F
+ xdr_float F
+ xdr_free F
+ xdr_getcredres F
+ xdr_hyper F
+ xdr_int F
+ xdr_int16_t F
+ xdr_int32_t F
+ xdr_int64_t F
+ xdr_int8_t F
+ xdr_key_netstarg F
+ xdr_key_netstres F
+ xdr_keybuf F
+ xdr_keystatus F
+ xdr_long F
+ xdr_longlong_t F
+ xdr_netnamestr F
+ xdr_netobj F
+ xdr_opaque F
+ xdr_opaque_auth F
+ xdr_pmap F
+ xdr_pmaplist F
+ xdr_pointer F
+ xdr_quad_t F
+ xdr_reference F
+ xdr_rejected_reply F
+ xdr_replymsg F
+ xdr_rmtcall_args F
+ xdr_rmtcallres F
+ xdr_short F
+ xdr_sizeof F
+ xdr_string F
+ xdr_u_char F
+ xdr_u_hyper F
+ xdr_u_int F
+ xdr_u_long F
+ xdr_u_longlong_t F
+ xdr_u_quad_t F
+ xdr_u_short F
+ xdr_uint16_t F
+ xdr_uint32_t F
+ xdr_uint64_t F
+ xdr_uint8_t F
+ xdr_union F
+ xdr_unixcred F
+ xdr_vector F
+ xdr_void F
+ xdr_wrapstring F
+ xdrmem_create F
+ xdrrec_create F
+ xdrrec_endofrecord F
+ xdrrec_eof F
+ xdrrec_skiprecord F
+ xdrstdio_create F
+ xencrypt F
+ xprt_register F
+ xprt_unregister F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libcrypt.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libcrypt.abilist
new file mode 100644
index 0000000..09d7dfd
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libcrypt.abilist
@@ -0,0 +1,9 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ crypt F
+ crypt_r F
+ encrypt F
+ encrypt_r F
+ fcrypt F
+ setkey F
+ setkey_r F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libdl.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libdl.abilist
new file mode 100644
index 0000000..d9b0430
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libdl.abilist
@@ -0,0 +1,11 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ dladdr F
+ dladdr1 F
+ dlclose F
+ dlerror F
+ dlinfo F
+ dlmopen F
+ dlopen F
+ dlsym F
+ dlvsym F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libm.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libm.abilist
new file mode 100644
index 0000000..4976d99
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libm.abilist
@@ -0,0 +1,405 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ _LIB_VERSION D 0x4
+ __acos_finite F
+ __acosf_finite F
+ __acosh_finite F
+ __acoshf_finite F
+ __acoshl_finite F
+ __acosl_finite F
+ __asin_finite F
+ __asinf_finite F
+ __asinl_finite F
+ __atan2_finite F
+ __atan2f_finite F
+ __atan2l_finite F
+ __atanh_finite F
+ __atanhf_finite F
+ __atanhl_finite F
+ __clog10 F
+ __clog10f F
+ __clog10l F
+ __cosh_finite F
+ __coshf_finite F
+ __coshl_finite F
+ __exp10_finite F
+ __exp10f_finite F
+ __exp10l_finite F
+ __exp2_finite F
+ __exp2f_finite F
+ __exp2l_finite F
+ __exp_finite F
+ __expf_finite F
+ __expl_finite F
+ __fe_dfl_env D 0x8
+ __fe_enabled_env D 0x8
+ __fe_nomask_env F
+ __fe_nonieee_env D 0x8
+ __finite F
+ __finitef F
+ __finitel F
+ __fmod_finite F
+ __fmodf_finite F
+ __fmodl_finite F
+ __fpclassify F
+ __fpclassifyf F
+ __fpclassifyl F
+ __gamma_r_finite F
+ __gammaf_r_finite F
+ __gammal_r_finite F
+ __hypot_finite F
+ __hypotf_finite F
+ __hypotl_finite F
+ __issignaling F
+ __issignalingf F
+ __issignalingl F
+ __j0_finite F
+ __j0f_finite F
+ __j0l_finite F
+ __j1_finite F
+ __j1f_finite F
+ __j1l_finite F
+ __jn_finite F
+ __jnf_finite F
+ __jnl_finite F
+ __lgamma_r_finite F
+ __lgammaf_r_finite F
+ __lgammal_r_finite F
+ __log10_finite F
+ __log10f_finite F
+ __log10l_finite F
+ __log2_finite F
+ __log2f_finite F
+ __log2l_finite F
+ __log_finite F
+ __logf_finite F
+ __logl_finite F
+ __nldbl_nexttowardf F
+ __pow_finite F
+ __powf_finite F
+ __powl_finite F
+ __remainder_finite F
+ __remainderf_finite F
+ __remainderl_finite F
+ __scalb_finite F
+ __scalbf_finite F
+ __scalbl_finite F
+ __signbit F
+ __signbitf F
+ __signbitl F
+ __sinh_finite F
+ __sinhf_finite F
+ __sinhl_finite F
+ __sqrt_finite F
+ __sqrtf_finite F
+ __sqrtl_finite F
+ __y0_finite F
+ __y0f_finite F
+ __y0l_finite F
+ __y1_finite F
+ __y1f_finite F
+ __y1l_finite F
+ __yn_finite F
+ __ynf_finite F
+ __ynl_finite F
+ acos F
+ acosf F
+ acosh F
+ acoshf F
+ acoshl F
+ acosl F
+ asin F
+ asinf F
+ asinh F
+ asinhf F
+ asinhl F
+ asinl F
+ atan F
+ atan2 F
+ atan2f F
+ atan2l F
+ atanf F
+ atanh F
+ atanhf F
+ atanhl F
+ atanl F
+ cabs F
+ cabsf F
+ cabsl F
+ cacos F
+ cacosf F
+ cacosh F
+ cacoshf F
+ cacoshl F
+ cacosl F
+ carg F
+ cargf F
+ cargl F
+ casin F
+ casinf F
+ casinh F
+ casinhf F
+ casinhl F
+ casinl F
+ catan F
+ catanf F
+ catanh F
+ catanhf F
+ catanhl F
+ catanl F
+ cbrt F
+ cbrtf F
+ cbrtl F
+ ccos F
+ ccosf F
+ ccosh F
+ ccoshf F
+ ccoshl F
+ ccosl F
+ ceil F
+ ceilf F
+ ceill F
+ cexp F
+ cexpf F
+ cexpl F
+ cimag F
+ cimagf F
+ cimagl F
+ clog F
+ clog10 F
+ clog10f F
+ clog10l F
+ clogf F
+ clogl F
+ conj F
+ conjf F
+ conjl F
+ copysign F
+ copysignf F
+ copysignl F
+ cos F
+ cosf F
+ cosh F
+ coshf F
+ coshl F
+ cosl F
+ cpow F
+ cpowf F
+ cpowl F
+ cproj F
+ cprojf F
+ cprojl F
+ creal F
+ crealf F
+ creall F
+ csin F
+ csinf F
+ csinh F
+ csinhf F
+ csinhl F
+ csinl F
+ csqrt F
+ csqrtf F
+ csqrtl F
+ ctan F
+ ctanf F
+ ctanh F
+ ctanhf F
+ ctanhl F
+ ctanl F
+ drem F
+ dremf F
+ dreml F
+ erf F
+ erfc F
+ erfcf F
+ erfcl F
+ erff F
+ erfl F
+ exp F
+ exp10 F
+ exp10f F
+ exp10l F
+ exp2 F
+ exp2f F
+ exp2l F
+ expf F
+ expl F
+ expm1 F
+ expm1f F
+ expm1l F
+ fabs F
+ fabsf F
+ fabsl F
+ fdim F
+ fdimf F
+ fdiml F
+ feclearexcept F
+ fedisableexcept F
+ feenableexcept F
+ fegetenv F
+ fegetexcept F
+ fegetexceptflag F
+ fegetround F
+ feholdexcept F
+ feraiseexcept F
+ fesetenv F
+ fesetexceptflag F
+ fesetround F
+ fetestexcept F
+ feupdateenv F
+ finite F
+ finitef F
+ finitel F
+ floor F
+ floorf F
+ floorl F
+ fma F
+ fmaf F
+ fmal F
+ fmax F
+ fmaxf F
+ fmaxl F
+ fmin F
+ fminf F
+ fminl F
+ fmod F
+ fmodf F
+ fmodl F
+ frexp F
+ frexpf F
+ frexpl F
+ gamma F
+ gammaf F
+ gammal F
+ hypot F
+ hypotf F
+ hypotl F
+ ilogb F
+ ilogbf F
+ ilogbl F
+ j0 F
+ j0f F
+ j0l F
+ j1 F
+ j1f F
+ j1l F
+ jn F
+ jnf F
+ jnl F
+ ldexp F
+ ldexpf F
+ ldexpl F
+ lgamma F
+ lgamma_r F
+ lgammaf F
+ lgammaf_r F
+ lgammal F
+ lgammal_r F
+ llrint F
+ llrintf F
+ llrintl F
+ llround F
+ llroundf F
+ llroundl F
+ log F
+ log10 F
+ log10f F
+ log10l F
+ log1p F
+ log1pf F
+ log1pl F
+ log2 F
+ log2f F
+ log2l F
+ logb F
+ logbf F
+ logbl F
+ logf F
+ logl F
+ lrint F
+ lrintf F
+ lrintl F
+ lround F
+ lroundf F
+ lroundl F
+ matherr F
+ modf F
+ modff F
+ modfl F
+ nan F
+ nanf F
+ nanl F
+ nearbyint F
+ nearbyintf F
+ nearbyintl F
+ nextafter F
+ nextafterf F
+ nextafterl F
+ nexttoward F
+ nexttowardf F
+ nexttowardl F
+ pow F
+ pow10 F
+ pow10f F
+ pow10l F
+ powf F
+ powl F
+ remainder F
+ remainderf F
+ remainderl F
+ remquo F
+ remquof F
+ remquol F
+ rint F
+ rintf F
+ rintl F
+ round F
+ roundf F
+ roundl F
+ scalb F
+ scalbf F
+ scalbl F
+ scalbln F
+ scalblnf F
+ scalblnl F
+ scalbn F
+ scalbnf F
+ scalbnl F
+ signgam D 0x4
+ significand F
+ significandf F
+ significandl F
+ sin F
+ sincos F
+ sincosf F
+ sincosl F
+ sinf F
+ sinh F
+ sinhf F
+ sinhl F
+ sinl F
+ sqrt F
+ sqrtf F
+ sqrtl F
+ tan F
+ tanf F
+ tanh F
+ tanhf F
+ tanhl F
+ tanl F
+ tgamma F
+ tgammaf F
+ tgammal F
+ trunc F
+ truncf F
+ truncl F
+ y0 F
+ y0f F
+ y0l F
+ y1 F
+ y1f F
+ y1l F
+ yn F
+ ynf F
+ ynl F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libnsl.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libnsl.abilist
new file mode 100644
index 0000000..b451ebd
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libnsl.abilist
@@ -0,0 +1,123 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ __free_fdresult F
+ __nis_default_access F
+ __nis_default_group F
+ __nis_default_owner F
+ __nis_default_ttl F
+ __nis_finddirectory F
+ __nis_hash F
+ __nisbind_connect F
+ __nisbind_create F
+ __nisbind_destroy F
+ __nisbind_next F
+ __yp_check F
+ nis_add F
+ nis_add_entry F
+ nis_addmember F
+ nis_checkpoint F
+ nis_clone_directory F
+ nis_clone_object F
+ nis_clone_result F
+ nis_creategroup F
+ nis_destroy_object F
+ nis_destroygroup F
+ nis_dir_cmp F
+ nis_domain_of F
+ nis_domain_of_r F
+ nis_first_entry F
+ nis_free_directory F
+ nis_free_object F
+ nis_free_request F
+ nis_freenames F
+ nis_freeresult F
+ nis_freeservlist F
+ nis_freetags F
+ nis_getnames F
+ nis_getservlist F
+ nis_ismember F
+ nis_leaf_of F
+ nis_leaf_of_r F
+ nis_lerror F
+ nis_list F
+ nis_local_directory F
+ nis_local_group F
+ nis_local_host F
+ nis_local_principal F
+ nis_lookup F
+ nis_mkdir F
+ nis_modify F
+ nis_modify_entry F
+ nis_name_of F
+ nis_name_of_r F
+ nis_next_entry F
+ nis_perror F
+ nis_ping F
+ nis_print_directory F
+ nis_print_entry F
+ nis_print_group F
+ nis_print_group_entry F
+ nis_print_link F
+ nis_print_object F
+ nis_print_result F
+ nis_print_rights F
+ nis_print_table F
+ nis_read_obj F
+ nis_remove F
+ nis_remove_entry F
+ nis_removemember F
+ nis_rmdir F
+ nis_servstate F
+ nis_sperrno F
+ nis_sperror F
+ nis_sperror_r F
+ nis_stats F
+ nis_verifygroup F
+ nis_write_obj F
+ readColdStartFile F
+ writeColdStartFile F
+ xdr_cback_data F
+ xdr_domainname F
+ xdr_keydat F
+ xdr_mapname F
+ xdr_obj_p F
+ xdr_peername F
+ xdr_valdat F
+ xdr_yp_buf F
+ xdr_ypall F
+ xdr_ypbind_binding F
+ xdr_ypbind_resp F
+ xdr_ypbind_resptype F
+ xdr_ypbind_setdom F
+ xdr_ypdelete_args F
+ xdr_ypmap_parms F
+ xdr_ypmaplist F
+ xdr_yppush_status F
+ xdr_yppushresp_xfr F
+ xdr_ypreq_key F
+ xdr_ypreq_nokey F
+ xdr_ypreq_xfr F
+ xdr_ypresp_all F
+ xdr_ypresp_key_val F
+ xdr_ypresp_maplist F
+ xdr_ypresp_master F
+ xdr_ypresp_order F
+ xdr_ypresp_val F
+ xdr_ypresp_xfr F
+ xdr_ypstat F
+ xdr_ypupdate_args F
+ xdr_ypxfrstat F
+ yp_all F
+ yp_bind F
+ yp_first F
+ yp_get_default_domain F
+ yp_maplist F
+ yp_master F
+ yp_match F
+ yp_next F
+ yp_order F
+ yp_unbind F
+ yp_update F
+ ypbinderr_string F
+ yperr_string F
+ ypprot_err F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libpthread.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libpthread.abilist
new file mode 100644
index 0000000..2cca30a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libpthread.abilist
@@ -0,0 +1,226 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ _IO_flockfile F
+ _IO_ftrylockfile F
+ _IO_funlockfile F
+ __close F
+ __connect F
+ __errno_location F
+ __fcntl F
+ __fork F
+ __h_errno_location F
+ __libc_allocate_rtsig F
+ __libc_current_sigrtmax F
+ __libc_current_sigrtmin F
+ __lseek F
+ __nanosleep F
+ __open F
+ __open64 F
+ __pread64 F
+ __pthread_cleanup_routine F
+ __pthread_getspecific F
+ __pthread_key_create F
+ __pthread_mutex_destroy F
+ __pthread_mutex_init F
+ __pthread_mutex_lock F
+ __pthread_mutex_trylock F
+ __pthread_mutex_unlock F
+ __pthread_mutexattr_destroy F
+ __pthread_mutexattr_init F
+ __pthread_mutexattr_settype F
+ __pthread_once F
+ __pthread_register_cancel F
+ __pthread_register_cancel_defer F
+ __pthread_rwlock_destroy F
+ __pthread_rwlock_init F
+ __pthread_rwlock_rdlock F
+ __pthread_rwlock_tryrdlock F
+ __pthread_rwlock_trywrlock F
+ __pthread_rwlock_unlock F
+ __pthread_rwlock_wrlock F
+ __pthread_setspecific F
+ __pthread_unregister_cancel F
+ __pthread_unregister_cancel_restore F
+ __pthread_unwind_next F
+ __pwrite64 F
+ __read F
+ __res_state F
+ __send F
+ __sigaction F
+ __vfork F
+ __wait F
+ __write F
+ _pthread_cleanup_pop F
+ _pthread_cleanup_pop_restore F
+ _pthread_cleanup_push F
+ _pthread_cleanup_push_defer F
+ accept F
+ close F
+ connect F
+ fcntl F
+ flockfile F
+ fork F
+ fsync F
+ ftrylockfile F
+ funlockfile F
+ longjmp F
+ lseek F
+ lseek64 F
+ msync F
+ nanosleep F
+ open F
+ open64 F
+ pause F
+ pread F
+ pread64 F
+ pthread_attr_destroy F
+ pthread_attr_getaffinity_np F
+ pthread_attr_getdetachstate F
+ pthread_attr_getguardsize F
+ pthread_attr_getinheritsched F
+ pthread_attr_getschedparam F
+ pthread_attr_getschedpolicy F
+ pthread_attr_getscope F
+ pthread_attr_getstack F
+ pthread_attr_getstackaddr F
+ pthread_attr_getstacksize F
+ pthread_attr_init F
+ pthread_attr_setaffinity_np F
+ pthread_attr_setdetachstate F
+ pthread_attr_setguardsize F
+ pthread_attr_setinheritsched F
+ pthread_attr_setschedparam F
+ pthread_attr_setschedpolicy F
+ pthread_attr_setscope F
+ pthread_attr_setstack F
+ pthread_attr_setstackaddr F
+ pthread_attr_setstacksize F
+ pthread_barrier_destroy F
+ pthread_barrier_init F
+ pthread_barrier_wait F
+ pthread_barrierattr_destroy F
+ pthread_barrierattr_getpshared F
+ pthread_barrierattr_init F
+ pthread_barrierattr_setpshared F
+ pthread_cancel F
+ pthread_cond_broadcast F
+ pthread_cond_destroy F
+ pthread_cond_init F
+ pthread_cond_signal F
+ pthread_cond_timedwait F
+ pthread_cond_wait F
+ pthread_condattr_destroy F
+ pthread_condattr_getclock F
+ pthread_condattr_getpshared F
+ pthread_condattr_init F
+ pthread_condattr_setclock F
+ pthread_condattr_setpshared F
+ pthread_create F
+ pthread_detach F
+ pthread_equal F
+ pthread_exit F
+ pthread_getaffinity_np F
+ pthread_getattr_default_np F
+ pthread_getattr_np F
+ pthread_getconcurrency F
+ pthread_getcpuclockid F
+ pthread_getname_np F
+ pthread_getschedparam F
+ pthread_getspecific F
+ pthread_join F
+ pthread_key_create F
+ pthread_key_delete F
+ pthread_kill F
+ pthread_kill_other_threads_np F
+ pthread_mutex_consistent F
+ pthread_mutex_consistent_np F
+ pthread_mutex_destroy F
+ pthread_mutex_getprioceiling F
+ pthread_mutex_init F
+ pthread_mutex_lock F
+ pthread_mutex_setprioceiling F
+ pthread_mutex_timedlock F
+ pthread_mutex_trylock F
+ pthread_mutex_unlock F
+ pthread_mutexattr_destroy F
+ pthread_mutexattr_getkind_np F
+ pthread_mutexattr_getprioceiling F
+ pthread_mutexattr_getprotocol F
+ pthread_mutexattr_getpshared F
+ pthread_mutexattr_getrobust F
+ pthread_mutexattr_getrobust_np F
+ pthread_mutexattr_gettype F
+ pthread_mutexattr_init F
+ pthread_mutexattr_setkind_np F
+ pthread_mutexattr_setprioceiling F
+ pthread_mutexattr_setprotocol F
+ pthread_mutexattr_setpshared F
+ pthread_mutexattr_setrobust F
+ pthread_mutexattr_setrobust_np F
+ pthread_mutexattr_settype F
+ pthread_once F
+ pthread_rwlock_destroy F
+ pthread_rwlock_init F
+ pthread_rwlock_rdlock F
+ pthread_rwlock_timedrdlock F
+ pthread_rwlock_timedwrlock F
+ pthread_rwlock_tryrdlock F
+ pthread_rwlock_trywrlock F
+ pthread_rwlock_unlock F
+ pthread_rwlock_wrlock F
+ pthread_rwlockattr_destroy F
+ pthread_rwlockattr_getkind_np F
+ pthread_rwlockattr_getpshared F
+ pthread_rwlockattr_init F
+ pthread_rwlockattr_setkind_np F
+ pthread_rwlockattr_setpshared F
+ pthread_self F
+ pthread_setaffinity_np F
+ pthread_setattr_default_np F
+ pthread_setcancelstate F
+ pthread_setcanceltype F
+ pthread_setconcurrency F
+ pthread_setname_np F
+ pthread_setschedparam F
+ pthread_setschedprio F
+ pthread_setspecific F
+ pthread_sigmask F
+ pthread_sigqueue F
+ pthread_spin_destroy F
+ pthread_spin_init F
+ pthread_spin_lock F
+ pthread_spin_trylock F
+ pthread_spin_unlock F
+ pthread_testcancel F
+ pthread_timedjoin_np F
+ pthread_tryjoin_np F
+ pthread_yield F
+ pwrite F
+ pwrite64 F
+ raise F
+ read F
+ recv F
+ recvfrom F
+ recvmsg F
+ sem_close F
+ sem_destroy F
+ sem_getvalue F
+ sem_init F
+ sem_open F
+ sem_post F
+ sem_timedwait F
+ sem_trywait F
+ sem_unlink F
+ sem_wait F
+ send F
+ sendmsg F
+ sendto F
+ sigaction F
+ siglongjmp F
+ sigwait F
+ system F
+ tcdrain F
+ vfork F
+ wait F
+ waitpid F
+ write F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libresolv.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libresolv.abilist
new file mode 100644
index 0000000..3edaa45
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libresolv.abilist
@@ -0,0 +1,93 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ __b64_ntop F
+ __b64_pton F
+ __dn_comp F
+ __dn_count_labels F
+ __dn_expand F
+ __dn_skipname F
+ __fp_nquery F
+ __fp_query F
+ __fp_resstat F
+ __hostalias F
+ __loc_aton F
+ __loc_ntoa F
+ __p_cdname F
+ __p_cdnname F
+ __p_class F
+ __p_class_syms D 0xa8
+ __p_fqname F
+ __p_fqnname F
+ __p_option F
+ __p_query F
+ __p_rcode F
+ __p_secstodate F
+ __p_time F
+ __p_type F
+ __p_type_syms D 0x450
+ __putlong F
+ __putshort F
+ __res_close F
+ __res_dnok F
+ __res_hnok F
+ __res_hostalias F
+ __res_isourserver F
+ __res_mailok F
+ __res_mkquery F
+ __res_nameinquery F
+ __res_nmkquery F
+ __res_nquery F
+ __res_nquerydomain F
+ __res_nsearch F
+ __res_nsend F
+ __res_ownok F
+ __res_queriesmatch F
+ __res_query F
+ __res_querydomain F
+ __res_search F
+ __res_send F
+ __sym_ntop F
+ __sym_ntos F
+ __sym_ston F
+ _gethtbyaddr F
+ _gethtbyname F
+ _gethtbyname2 F
+ _gethtent F
+ _getlong F
+ _getshort F
+ _res_opcodes D 0x80
+ _sethtent F
+ inet_net_ntop F
+ inet_net_pton F
+ inet_neta F
+ ns_datetosecs F
+ ns_format_ttl F
+ ns_get16 F
+ ns_get32 F
+ ns_initparse F
+ ns_makecanon F
+ ns_msg_getflag F
+ ns_name_compress F
+ ns_name_ntol F
+ ns_name_ntop F
+ ns_name_pack F
+ ns_name_pton F
+ ns_name_rollback F
+ ns_name_skip F
+ ns_name_uncompress F
+ ns_name_unpack F
+ ns_parse_ttl F
+ ns_parserr F
+ ns_put16 F
+ ns_put32 F
+ ns_samedomain F
+ ns_samename F
+ ns_skiprr F
+ ns_sprintrr F
+ ns_sprintrrf F
+ ns_subdomain F
+ res_gethostbyaddr F
+ res_gethostbyname F
+ res_gethostbyname2 F
+ res_send_setqhook F
+ res_send_setrhook F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/librt.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/librt.abilist
new file mode 100644
index 0000000..2d68c86
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/librt.abilist
@@ -0,0 +1,42 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ __mq_open_2 F
+ aio_cancel F
+ aio_cancel64 F
+ aio_error F
+ aio_error64 F
+ aio_fsync F
+ aio_fsync64 F
+ aio_init F
+ aio_read F
+ aio_read64 F
+ aio_return F
+ aio_return64 F
+ aio_suspend F
+ aio_suspend64 F
+ aio_write F
+ aio_write64 F
+ clock_getcpuclockid F
+ clock_getres F
+ clock_gettime F
+ clock_nanosleep F
+ clock_settime F
+ lio_listio F
+ lio_listio64 F
+ mq_close F
+ mq_getattr F
+ mq_notify F
+ mq_open F
+ mq_receive F
+ mq_send F
+ mq_setattr F
+ mq_timedreceive F
+ mq_timedsend F
+ mq_unlink F
+ shm_open F
+ shm_unlink F
+ timer_create F
+ timer_delete F
+ timer_getoverrun F
+ timer_gettime F
+ timer_settime F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libthread_db.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libthread_db.abilist
new file mode 100644
index 0000000..77b1c63
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libthread_db.abilist
@@ -0,0 +1,42 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ td_init F
+ td_log F
+ td_symbol_list F
+ td_ta_clear_event F
+ td_ta_delete F
+ td_ta_enable_stats F
+ td_ta_event_addr F
+ td_ta_event_getmsg F
+ td_ta_get_nthreads F
+ td_ta_get_ph F
+ td_ta_get_stats F
+ td_ta_map_id2thr F
+ td_ta_map_lwp2thr F
+ td_ta_new F
+ td_ta_reset_stats F
+ td_ta_set_event F
+ td_ta_setconcurrency F
+ td_ta_thr_iter F
+ td_ta_tsd_iter F
+ td_thr_clear_event F
+ td_thr_dbresume F
+ td_thr_dbsuspend F
+ td_thr_event_enable F
+ td_thr_event_getmsg F
+ td_thr_get_info F
+ td_thr_getfpregs F
+ td_thr_getgregs F
+ td_thr_getxregs F
+ td_thr_getxregsize F
+ td_thr_set_event F
+ td_thr_setfpregs F
+ td_thr_setgregs F
+ td_thr_setprio F
+ td_thr_setsigpending F
+ td_thr_setxregs F
+ td_thr_sigsetmask F
+ td_thr_tls_get_addr F
+ td_thr_tlsbase F
+ td_thr_tsd F
+ td_thr_validate F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libutil.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libutil.abilist
new file mode 100644
index 0000000..2d36813
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/nptl/libutil.abilist
@@ -0,0 +1,8 @@
+GLIBC_2.19
+ GLIBC_2.19 A
+ forkpty F
+ login F
+ login_tty F
+ logout F
+ logwtmp F
+ openpty F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power4/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power4/Implies
new file mode 100644
index 0000000..a20cbd6
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power4/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/power4
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power5+/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power5+/Implies
new file mode 100644
index 0000000..21a7f12
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power5+/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/power5+
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power5/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power5/Implies
new file mode 100644
index 0000000..bd3117f
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power5/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/power5
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power6/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power6/Implies
new file mode 100644
index 0000000..b95d6c1
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power6/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/power6
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power6x/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power6x/Implies
new file mode 100644
index 0000000..0b759c4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power6x/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/power6x
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power7/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power7/Implies
new file mode 100644
index 0000000..d4f8b87
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power7/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/power7
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power8/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power8/Implies
new file mode 100644
index 0000000..38d17b4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64le/power8/Implies
@@ -0,0 +1 @@
+unix/sysv/linux/powerpc/powerpc64/power8

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a794b5c130783714b040f5c3334e3a34f1614c2c

commit a794b5c130783714b040f5c3334e3a34f1614c2c
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:48:07 2013 -0500

    PowerPC: A little more precise %Lf for IBM long double
    
    I'd like to print IBM long doubles with a few extra bits of precision,
    because IBM long double has variable precision and we often have
    values with 107 bits during the normal course of calculations.  It's
    possible to have an IBM long double with 2k bits of precision, and
    while some might argue we ought to print to the full precision, I'm
    not inclined to try to implement that.
    
    So this patch gives the mpn values returned from ldbl2mpn() an extra
    11 bits of precision.  To do that it's necessary to inform the caller
    of __mpn_extract_long_double() exactly how many bits are returned
    rather than assuming a fixed LDBL_MANT_DIG bits.  I did that
    indirectly by returning the number of zero bits in the last mp_limb,
    which turns out to be convenient both in __mpn_extract_long_double()
    and in its caller.  Given the extra parameter it then becomes possible
    to omit shifting in __mpn_extract_long_double(), tidying the code a
    little, since the caller does shifting anyway.
    
    In the following, note that 1 - IEEE854_LONG_DOUBLE_BIAS is equal to
    LDBL_MIN_EXP - 1.  I prefer the former since we already use
    IEEE854_LONG_DOUBLE_BIAS in the exponent calculation for normalised
    values, and it reinforces the fact that denormals are treated as if
    their unbiased exponent was 1.
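
For illustration, the new calling convention can be sketched in a few lines of
stand-alone C (assumptions: 64-bit limbs and normal values only; the function
name and the exponent handling are simplified stand-ins, not the glibc code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sketch of the new convention: the mantissa is returned unshifted and the
   caller is told how many high bits of the top limb are zero, instead of
   recomputing that from LDBL_MANT_DIG.  Normal doubles only.  */
static int
extract_double (uint64_t *limb, int *expt, int *zero_bits, int *is_neg,
                double value)
{
  uint64_t bits;
  memcpy (&bits, &value, sizeof bits);

  *is_neg = bits >> 63;
  *expt = (int) ((bits >> 52) & 0x7ff);   /* biased; real conversion omitted */
  *limb = (bits & 0xfffffffffffffULL) | ((uint64_t) 1 << 52);
  *zero_bits = 64 - 53;                   /* 11 unused bits in the top limb */
  return 1;                               /* number of limbs used */
}

int
main (void)
{
  uint64_t frac;
  int expt, zero_bits, is_neg;
  int n = extract_double (&frac, &expt, &zero_bits, &is_neg, 1.5);

  /* A printf_fp-style caller now derives its shift from zero_bits alone,
     as in the patch below: to_shift = 1 + zero_bits.  */
  printf ("limbs=%d frac=%#llx biased_exp=%d zero_bits=%d neg=%d\n",
          n, (unsigned long long) frac, expt, zero_bits, is_neg);
  return 0;
}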

diff --git a/include/gmp.h b/include/gmp.h
index b741670..bcc9360 100644
--- a/include/gmp.h
+++ b/include/gmp.h
@@ -8,12 +8,12 @@
 
 /* Now define the internal interfaces.  */
 extern mp_size_t __mpn_extract_double (mp_ptr res_ptr, mp_size_t size,
-				       int *expt, int *is_neg,
-				       double value);
+				       int *expt, int *zero_bits,
+				       int *is_neg, double value);
 
 extern mp_size_t __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
-					    int *expt, int *is_neg,
-					    long double value);
+					    int *expt, int *zero_bits,
+					    int *is_neg, long double value);
 
 extern float __mpn_construct_float (mp_srcptr frac_ptr, int expt, int sign);
 
diff --git a/stdio-common/printf_fp.c b/stdio-common/printf_fp.c
index 2b93e6c..c0a7a82 100644
--- a/stdio-common/printf_fp.c
+++ b/stdio-common/printf_fp.c
@@ -134,11 +134,11 @@
   (u##size > v##size || (u##size == v##size && __mpn_cmp (u, v, u##size) >= 0))
 
 extern mp_size_t __mpn_extract_double (mp_ptr res_ptr, mp_size_t size,
-				       int *expt, int *is_neg,
-				       double value);
+				       int *expt, int *zero_bits,
+				       int *is_neg, double value);
 extern mp_size_t __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
-					    int *expt, int *is_neg,
-					    long double value);
+					    int *expt, int *zero_bits,
+					    int *is_neg, long double value);
 extern unsigned int __guess_grouping (unsigned int intdig_max,
 				      const char *grouping);
 
@@ -360,12 +360,13 @@ ___printf_fp (FILE *fp,
 	}
       else
 	{
+	  int zero_bits;
 	  fracsize = __mpn_extract_long_double (fp_input,
 						(sizeof (fp_input) /
 						 sizeof (fp_input[0])),
-						&exponent, &is_neg,
+						&exponent, &zero_bits, &is_neg,
 						fpnum.ldbl);
-	  to_shift = 1 + fracsize * BITS_PER_MP_LIMB - LDBL_MANT_DIG;
+	  to_shift = 1 + zero_bits;
 	}
     }
   else
@@ -406,11 +407,13 @@ ___printf_fp (FILE *fp,
 	}
       else
 	{
+	  int zero_bits;
 	  fracsize = __mpn_extract_double (fp_input,
 					   (sizeof (fp_input)
 					    / sizeof (fp_input[0])),
-					   &exponent, &is_neg, fpnum.dbl);
-	  to_shift = 1 + fracsize * BITS_PER_MP_LIMB - DBL_MANT_DIG;
+					   &exponent, &zero_bits, &is_neg,
+					   fpnum.dbl);
+	  to_shift = 1 + zero_bits;
 	}
     }
 
diff --git a/stdlib/dbl2mpn.c b/stdlib/dbl2mpn.c
index 429e20a..dd835f3 100644
--- a/stdlib/dbl2mpn.c
+++ b/stdlib/dbl2mpn.c
@@ -24,7 +24,7 @@
 
 mp_size_t
 __mpn_extract_double (mp_ptr res_ptr, mp_size_t size,
-		      int *expt, int *is_neg,
+		      int *expt, int *zero_bits, int *is_neg,
 		      double value)
 {
 #error "__mpn_extract_double is not implemented for this floating point format"
diff --git a/sysdeps/i386/ldbl2mpn.c b/sysdeps/i386/ldbl2mpn.c
index c7b322b..968aea5 100644
--- a/sysdeps/i386/ldbl2mpn.c
+++ b/sysdeps/i386/ldbl2mpn.c
@@ -29,7 +29,7 @@
 
 mp_size_t
 __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
-			   int *expt, int *is_neg,
+			   int *expt, int *zero_bits, int *is_neg,
 			   long double value)
 {
   union ieee854_long_double u;
@@ -51,18 +51,26 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
   #error "mp_limb size " BITS_PER_MP_LIMB "not accounted for"
 #endif
 
+/* The format does not fill the last limb.  There are some zeros.  */
+#define NUM_LEADING_ZEROS (N * BITS_PER_MP_LIMB - LDBL_MANT_DIG)
+  *zero_bits = NUM_LEADING_ZEROS;
+
   if (u.ieee.exponent == 0)
     {
       /* A biased exponent of zero is a special case.
 	 Either it is a zero or it is a denormal number.  */
       if (res_ptr[0] == 0 && res_ptr[N - 1] == 0) /* Assumes N<=2.  */
-	/* It's zero.  */
-	*expt = 0;
+	{
+	  /* It's zero.  */
+	  *expt = 0;
+	  return 1;
+	}
       else
 	{
 	  /* It is a denormal number, meaning it has no implicit leading
 	     one bit, and its exponent is in fact the format minimum.  */
 	  int cnt;
+	  int exp = 1 - IEEE854_LONG_DOUBLE_BIAS;
 
 	  /* One problem with Intel's 80-bit format is that the explicit
 	     leading one in the normalized representation has to be zero
@@ -74,24 +82,17 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
 	  if (res_ptr[N - 1] != 0)
 	    {
 	      count_leading_zeros (cnt, res_ptr[N - 1]);
-	      if (cnt != 0)
-		{
-#if N == 2
-		  res_ptr[N - 1] = res_ptr[N - 1] << cnt
-				   | (res_ptr[0] >> (BITS_PER_MP_LIMB - cnt));
-		  res_ptr[0] <<= cnt;
-#else
-		  res_ptr[N - 1] <<= cnt;
-#endif
-		}
-	      *expt = LDBL_MIN_EXP - 1 - cnt;
+	      *zero_bits = cnt;
+	      *expt = exp + NUM_LEADING_ZEROS - cnt;
+	      return N;
 	    }
 	  else if (res_ptr[0] != 0)
 	    {
 	      count_leading_zeros (cnt, res_ptr[0]);
-	      res_ptr[N - 1] = res_ptr[0] << cnt;
-	      res_ptr[0] = 0;
-	      *expt = LDBL_MIN_EXP - 1 - BITS_PER_MP_LIMB - cnt;
+	      exp -= BITS_PER_MP_LIMB;
+	      *zero_bits = cnt;
+	      *expt = exp + NUM_LEADING_ZEROS - cnt;
+	      return 1;
 	    }
 	  else
 	    {
@@ -104,7 +105,7 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
 #else
 	      res_ptr[0] = 0x8000000000000000ul;
 #endif
-	      *expt = LDBL_MIN_EXP - 1;
+	      *expt = 1 - IEEE854_LONG_DOUBLE_BIAS;
 	    }
 	}
     }
diff --git a/sysdeps/ieee754/dbl-64/dbl2mpn.c b/sysdeps/ieee754/dbl-64/dbl2mpn.c
index f2294de..cae062a 100644
--- a/sysdeps/ieee754/dbl-64/dbl2mpn.c
+++ b/sysdeps/ieee754/dbl-64/dbl2mpn.c
@@ -28,7 +28,7 @@
 
 mp_size_t
 __mpn_extract_double (mp_ptr res_ptr, mp_size_t size,
-		      int *expt, int *is_neg,
+		      int *expt, int *zero_bits, int *is_neg,
 		      double value)
 {
   union ieee754_double u;
@@ -49,9 +49,10 @@ __mpn_extract_double (mp_ptr res_ptr, mp_size_t size,
 #else
   #error "mp_limb size " BITS_PER_MP_LIMB "not accounted for"
 #endif
+
 /* The format does not fill the last limb.  There are some zeros.  */
-#define NUM_LEADING_ZEROS (BITS_PER_MP_LIMB \
-			   - (DBL_MANT_DIG - ((N - 1) * BITS_PER_MP_LIMB)))
+#define NUM_LEADING_ZEROS (N * BITS_PER_MP_LIMB - DBL_MANT_DIG)
+  *zero_bits = NUM_LEADING_ZEROS;
 
   if (u.ieee.exponent == 0)
     {
@@ -65,37 +66,20 @@ __mpn_extract_double (mp_ptr res_ptr, mp_size_t size,
           /* It is a denormal number, meaning it has no implicit leading
 	     one bit, and its exponent is in fact the format minimum.  */
 	  int cnt;
+	  int exp = 1 - IEEE754_DOUBLE_BIAS;
+	  int n = N;
 
 	  if (res_ptr[N - 1] != 0)
-	    {
-	      count_leading_zeros (cnt, res_ptr[N - 1]);
-	      cnt -= NUM_LEADING_ZEROS;
-#if N == 2
-	      res_ptr[N - 1] = res_ptr[1] << cnt
-			       | (N - 1)
-			         * (res_ptr[0] >> (BITS_PER_MP_LIMB - cnt));
-	      res_ptr[0] <<= cnt;
-#else
-	      res_ptr[N - 1] <<= cnt;
-#endif
-	      *expt = DBL_MIN_EXP - 1 - cnt;
-	    }
+	    count_leading_zeros (cnt, res_ptr[N - 1]);
 	  else
 	    {
 	      count_leading_zeros (cnt, res_ptr[0]);
-	      if (cnt >= NUM_LEADING_ZEROS)
-		{
-		  res_ptr[N - 1] = res_ptr[0] << (cnt - NUM_LEADING_ZEROS);
-		  res_ptr[0] = 0;
-		}
-	      else
-		{
-		  res_ptr[N - 1] = res_ptr[0] >> (NUM_LEADING_ZEROS - cnt);
-		  res_ptr[0] <<= BITS_PER_MP_LIMB - (NUM_LEADING_ZEROS - cnt);
-		}
-	      *expt = DBL_MIN_EXP - 1
-		      - (BITS_PER_MP_LIMB - NUM_LEADING_ZEROS) - cnt;
+	      exp -= BITS_PER_MP_LIMB;
+	      n = 1;
 	    }
+	  *zero_bits = cnt;
+	  *expt = exp + NUM_LEADING_ZEROS - cnt;
+	  return n;
 	}
     }
   else
diff --git a/sysdeps/ieee754/ldbl-128/ldbl2mpn.c b/sysdeps/ieee754/ldbl-128/ldbl2mpn.c
index 64f98a5..8e2145b 100644
--- a/sysdeps/ieee754/ldbl-128/ldbl2mpn.c
+++ b/sysdeps/ieee754/ldbl-128/ldbl2mpn.c
@@ -30,7 +30,7 @@
 
 mp_size_t
 __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
-			   int *expt, int *is_neg,
+			   int *expt, int *zero_bits, int *is_neg,
 			   long double value)
 {
   union ieee854_long_double u;
@@ -54,9 +54,10 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
 #else
   #error "mp_limb size " BITS_PER_MP_LIMB "not accounted for"
 #endif
+
 /* The format does not fill the last limb.  There are some zeros.  */
-#define NUM_LEADING_ZEROS (BITS_PER_MP_LIMB \
-			   - (LDBL_MANT_DIG - ((N - 1) * BITS_PER_MP_LIMB)))
+#define NUM_LEADING_ZEROS (N * BITS_PER_MP_LIMB - LDBL_MANT_DIG)
+  *zero_bits = NUM_LEADING_ZEROS;
 
   if (u.ieee.exponent == 0)
     {
@@ -64,70 +65,42 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
 	 Either it is a zero or it is a denormal number.  */
       if (res_ptr[0] == 0 && res_ptr[1] == 0
           && res_ptr[N - 2] == 0 && res_ptr[N - 1] == 0) /* Assumes N<=4.  */
-	/* It's zero.  */
-	*expt = 0;
+	{
+	  /* It's zero.  */
+	  *expt = 0;
+	  return 1;
+	}
       else
 	{
           /* It is a denormal number, meaning it has no implicit leading
 	     one bit, and its exponent is in fact the format minimum.  */
 	  int cnt;
+	  int exp = 1 - IEEE854_LONG_DOUBLE_BIAS;
+	  int n = N;
 
 #if N == 2
 	  if (res_ptr[N - 1] != 0)
-	    {
-	      count_leading_zeros (cnt, res_ptr[N - 1]);
-	      cnt -= NUM_LEADING_ZEROS;
-	      res_ptr[N - 1] = res_ptr[N - 1] << cnt
-			       | (res_ptr[0] >> (BITS_PER_MP_LIMB - cnt));
-	      res_ptr[0] <<= cnt;
-	      *expt = LDBL_MIN_EXP - 1 - cnt;
-	    }
+	    count_leading_zeros (cnt, res_ptr[N - 1]);
 	  else
 	    {
 	      count_leading_zeros (cnt, res_ptr[0]);
-	      if (cnt >= NUM_LEADING_ZEROS)
-		{
-		  res_ptr[N - 1] = res_ptr[0] << (cnt - NUM_LEADING_ZEROS);
-		  res_ptr[0] = 0;
-		}
-	      else
-		{
-		  res_ptr[N - 1] = res_ptr[0] >> (NUM_LEADING_ZEROS - cnt);
-		  res_ptr[0] <<= BITS_PER_MP_LIMB - (NUM_LEADING_ZEROS - cnt);
-		}
-	      *expt = LDBL_MIN_EXP - 1
-		- (BITS_PER_MP_LIMB - NUM_LEADING_ZEROS) - cnt;
+	      exp -= BITS_PER_MP_LIMB;
+	      n = 1;
 	    }
 #else
-	  int j, k, l;
+	  int j;
 
 	  for (j = N - 1; j > 0; j--)
 	    if (res_ptr[j] != 0)
 	      break;
 
 	  count_leading_zeros (cnt, res_ptr[j]);
-	  cnt -= NUM_LEADING_ZEROS;
-	  l = N - 1 - j;
-	  if (cnt < 0)
-	    {
-	      cnt += BITS_PER_MP_LIMB;
-	      l--;
-	    }
-	  if (!cnt)
-	    for (k = N - 1; k >= l; k--)
-	      res_ptr[k] = res_ptr[k-l];
-	  else
-	    {
-	      for (k = N - 1; k > l; k--)
-		res_ptr[k] = res_ptr[k-l] << cnt
-			     | res_ptr[k-l-1] >> (BITS_PER_MP_LIMB - cnt);
-	      res_ptr[k--] = res_ptr[0] << cnt;
-	    }
-
-	  for (; k >= 0; k--)
-	    res_ptr[k] = 0;
-	  *expt = LDBL_MIN_EXP - 1 - l * BITS_PER_MP_LIMB - cnt;
+	  exp -= (N - 1 - j) * BITS_PER_MP_LIMB;
+	  n = j + 1;
 #endif
+	  *zero_bits = cnt;
+	  *expt = exp + NUM_LEADING_ZEROS - cnt;
+	  return n;
 	}
     }
   else
diff --git a/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c b/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
index e46fde7..14f9ed3 100644
--- a/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
+++ b/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
@@ -30,157 +30,121 @@
 
 mp_size_t
 __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
-			   int *expt, int *is_neg,
+			   int *expt, int *zero_bits, int *is_neg,
 			   long double value)
 {
   union ibm_extended_long_double u;
-  unsigned long long hi, lo;
+  uint64_t hi, lo;
   int ediff;
 
   u.ld = value;
 
   *is_neg = u.d[0].ieee.negative;
   *expt = (int) u.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
+#define NUM_LEADING_ZEROS (128 - (LDBL_MANT_DIG + 11))
+  *zero_bits = NUM_LEADING_ZEROS;
 
-  lo = ((long long) u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;
-  hi = ((long long) u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;
+  lo = ((uint64_t) u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;
+  hi = ((uint64_t) u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;
 
-  /* If the lower double is not a denormal or zero then set the hidden
-     53rd bit.  */
-  if (u.d[1].ieee.exponent != 0)
-    lo |= 1ULL << 52;
-  else
-    lo = lo << 1;
-
-  /* The lower double is normalized separately from the upper.  We may
-     need to adjust the lower manitissa to reflect this.  */
-  ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent - 53;
-  if (ediff > 0)
+  if (u.d[0].ieee.exponent != 0)
     {
-      if (ediff < 64)
-	lo = lo >> ediff;
+      /* If the high double is not a denormal or zero then set the hidden
+	 53rd bit.  */
+      hi |= (uint64_t) 1 << 52;
+
+      /* If the lower double is not a denormal or zero then set the hidden
+	 53rd bit.  */
+      if (u.d[1].ieee.exponent != 0)
+	lo |= (uint64_t) 1 << 52;
       else
-	lo = 0;
-    }
-  else if (ediff < 0)
-    lo = lo << -ediff;
-
-  /* The high double may be rounded and the low double reflects the
-     difference between the long double and the rounded high double
-     value.  This is indicated by a differnce between the signs of the
-     high and low doubles.  */
-  if (u.d[0].ieee.negative != u.d[1].ieee.negative
-      && lo != 0)
-    {
-      lo = (1ULL << 53) - lo;
-      if (hi == 0)
+	lo = lo << 1;
+
+      /* We currently only have 53 bits in lo.  Gain a few more bits
+	 of precision.  */
+      lo = lo << 11;
+
+      /* The lower double is normalized separately from the upper.  We may
+	 need to adjust the lower mantissa to reflect this.  */
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent - 53;
+      if (ediff > 0)
 	{
-	  /* we have a borrow from the hidden bit, so shift left 1.  */
-	  hi = 0x0ffffffffffffeLL | (lo >> 51);
-	  lo = 0x1fffffffffffffLL & (lo << 1);
-	  (*expt)--;
+	  if (ediff < 64)
+	    lo = lo >> ediff;
+	  else
+	    lo = 0;
 	}
-      else
-	hi--;
-    }
+      else if (ediff < 0)
+	lo = lo << -ediff;
+
+      /* The high double may be rounded and the low double reflects the
+	 difference between the long double and the rounded high double
+	 value.  This is indicated by a difference between the signs of the
+	 high and low doubles.  */
+      if (u.d[0].ieee.negative != u.d[1].ieee.negative
+	  && lo != 0)
+	{
+	  hi--;
+	  lo = -lo;
+	  if (hi < (uint64_t) 1 << 52)
+	    {
+	      /* We have a borrow from the hidden bit, so shift left 1.  */
+	      hi = (hi << 1) | (lo >> 63);
+	      lo = lo << 1;
+	      (*expt)--;
+	    }
+	}
+
 #if BITS_PER_MP_LIMB == 32
-  /* Combine the mantissas to be contiguous.  */
-  res_ptr[0] = lo;
-  res_ptr[1] = (hi << (53 - 32)) | (lo >> 32);
-  res_ptr[2] = hi >> 11;
-  res_ptr[3] = hi >> (32 + 11);
-  #define N 4
+      res_ptr[0] = lo;
+      res_ptr[1] = lo >> 32;
+      res_ptr[2] = hi;
+      res_ptr[3] = hi >> 32;
+      return 4;
 #elif BITS_PER_MP_LIMB == 64
-  /* Combine the two mantissas to be contiguous.  */
-  res_ptr[0] = (hi << 53) | lo;
-  res_ptr[1] = hi >> 11;
-  #define N 2
+      res_ptr[0] = lo;
+      res_ptr[1] = hi;
+      return 2;
 #else
-  #error "mp_limb size " BITS_PER_MP_LIMB "not accounted for"
+# error "mp_limb size " BITS_PER_MP_LIMB "not accounted for"
 #endif
-/* The format does not fill the last limb.  There are some zeros.  */
-#define NUM_LEADING_ZEROS (BITS_PER_MP_LIMB \
-			   - (LDBL_MANT_DIG - ((N - 1) * BITS_PER_MP_LIMB)))
+    }
 
-  if (u.d[0].ieee.exponent == 0)
+  /* The high double is a denormal or zero.  The low double must
+     be zero.  A denormal is interpreted as having a biased
+     exponent of 1.  */
+  res_ptr[0] = hi;
+#if BITS_PER_MP_LIMB == 32
+  res_ptr[1] = hi >> 32;
+#endif
+  if (hi == 0)
     {
-      /* A biased exponent of zero is a special case.
-	 Either it is a zero or it is a denormal number.  */
-      if (res_ptr[0] == 0 && res_ptr[1] == 0
-	  && res_ptr[N - 2] == 0 && res_ptr[N - 1] == 0) /* Assumes N<=4.  */
-	/* It's zero.  */
-	*expt = 0;
+      /* It's zero.  */
+      *expt = 0;
+      return 1;
+    }
+  else
+    {
+      int cnt;
+      int exp = 1 - IEEE754_DOUBLE_BIAS;
+      int n = 1;
+
+#if BITS_PER_MP_LIMB == 32
+      if (res_ptr[1] != 0)
+	{
+	  count_leading_zeros (cnt, res_ptr[1]);
+	  n = 2;
+	}
       else
 	{
-	  /* It is a denormal number, meaning it has no implicit leading
-	     one bit, and its exponent is in fact the format minimum.  We
-	     use DBL_MIN_EXP instead of LDBL_MIN_EXP below because the
-	     latter describes the properties of both parts together, but
-	     the exponent is computed from the high part only.  */
-	  int cnt;
-
-#if N == 2
-	  if (res_ptr[N - 1] != 0)
-	    {
-	      count_leading_zeros (cnt, res_ptr[N - 1]);
-	      cnt -= NUM_LEADING_ZEROS;
-	      res_ptr[N - 1] = res_ptr[N - 1] << cnt
-			       | (res_ptr[0] >> (BITS_PER_MP_LIMB - cnt));
-	      res_ptr[0] <<= cnt;
-	      *expt = DBL_MIN_EXP - 1 - cnt;
-	    }
-	  else
-	    {
-	      count_leading_zeros (cnt, res_ptr[0]);
-	      if (cnt >= NUM_LEADING_ZEROS)
-		{
-		  res_ptr[N - 1] = res_ptr[0] << (cnt - NUM_LEADING_ZEROS);
-		  res_ptr[0] = 0;
-		}
-	      else
-		{
-		  res_ptr[N - 1] = res_ptr[0] >> (NUM_LEADING_ZEROS - cnt);
-		  res_ptr[0] <<= BITS_PER_MP_LIMB - (NUM_LEADING_ZEROS - cnt);
-		}
-	      *expt = DBL_MIN_EXP - 1
-		- (BITS_PER_MP_LIMB - NUM_LEADING_ZEROS) - cnt;
-	    }
+	  count_leading_zeros (cnt, res_ptr[0]);
+	  exp -= BITS_PER_MP_LIMB;
+	}
 #else
-	  int j, k, l;
-
-	  for (j = N - 1; j > 0; j--)
-	    if (res_ptr[j] != 0)
-	      break;
-
-	  count_leading_zeros (cnt, res_ptr[j]);
-	  cnt -= NUM_LEADING_ZEROS;
-	  l = N - 1 - j;
-	  if (cnt < 0)
-	    {
-	      cnt += BITS_PER_MP_LIMB;
-	      l--;
-	    }
-	  if (!cnt)
-	    for (k = N - 1; k >= l; k--)
-	      res_ptr[k] = res_ptr[k-l];
-	  else
-	    {
-	      for (k = N - 1; k > l; k--)
-		res_ptr[k] = res_ptr[k-l] << cnt
-			     | res_ptr[k-l-1] >> (BITS_PER_MP_LIMB - cnt);
-	      res_ptr[k--] = res_ptr[0] << cnt;
-	    }
-
-	  for (; k >= 0; k--)
-	    res_ptr[k] = 0;
-	  *expt = DBL_MIN_EXP - 1 - l * BITS_PER_MP_LIMB - cnt;
+      count_leading_zeros (cnt, hi);
 #endif
-	}
+      *zero_bits = cnt;
+      *expt = exp + NUM_LEADING_ZEROS - cnt;
+      return n;
     }
-  else
-    /* Add the implicit leading one bit for a normalized number.  */
-    res_ptr[N - 1] |= (mp_limb_t) 1 << (LDBL_MANT_DIG - 1
-					- ((N - 1) * BITS_PER_MP_LIMB));
-
-  return N;
 }
diff --git a/sysdeps/ieee754/ldbl-96/ldbl2mpn.c b/sysdeps/ieee754/ldbl-96/ldbl2mpn.c
index 3f85e0a..e83171b 100644
--- a/sysdeps/ieee754/ldbl-96/ldbl2mpn.c
+++ b/sysdeps/ieee754/ldbl-96/ldbl2mpn.c
@@ -30,7 +30,7 @@
 
 mp_size_t
 __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
-			   int *expt, int *is_neg,
+			   int *expt, int *zero_bits, int *is_neg,
 			   long double value)
 {
   union ieee854_long_double u;
@@ -52,41 +52,38 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
   #error "mp_limb size " BITS_PER_MP_LIMB "not accounted for"
 #endif
 
+/* The format does not fill the last limb.  There are some zeros.  */
+#define NUM_LEADING_ZEROS (N * BITS_PER_MP_LIMB - LDBL_MANT_DIG)
+  *zero_bits = NUM_LEADING_ZEROS;
+
   if (u.ieee.exponent == 0)
     {
       /* A biased exponent of zero is a special case.
 	 Either it is a zero or it is a denormal number.  */
       if (res_ptr[0] == 0 && res_ptr[N - 1] == 0) /* Assumes N<=2.  */
-	/* It's zero.  */
-	*expt = 0;
+	{
+	  /* It's zero.  */
+	  *expt = 0;
+	  return 1;
+	}
       else
 	{
           /* It is a denormal number, meaning it has no implicit leading
 	     one bit, and its exponent is in fact the format minimum.  */
 	  int cnt;
+	  int exp = 1 - IEEE854_LONG_DOUBLE_BIAS;
+	  int n = N;
 
 	  if (res_ptr[N - 1] != 0)
-	    {
-	      count_leading_zeros (cnt, res_ptr[N - 1]);
-	      if (cnt != 0)
-		{
-#if N == 2
-	          res_ptr[N - 1] = res_ptr[N - 1] << cnt
-			           | (res_ptr[0] >> (BITS_PER_MP_LIMB - cnt));
-	          res_ptr[0] <<= cnt;
-#else
-	          res_ptr[N - 1] <<= cnt;
-#endif
-		}
-	      *expt = LDBL_MIN_EXP - 1 - cnt;
-	    }
+	    count_leading_zeros (cnt, res_ptr[N - 1]);
 	  else
 	    {
 	      count_leading_zeros (cnt, res_ptr[0]);
-	      res_ptr[N - 1] = res_ptr[0] << cnt;
-	      res_ptr[0] = 0;
-	      *expt = LDBL_MIN_EXP - 1 - BITS_PER_MP_LIMB - cnt;
+	      exp -= BITS_PER_MP_LIMB;
+	      n = 1;
 	    }
+	  *zero_bits = cnt;
+	  *expt = exp + NUM_LEADING_ZEROS - cnt;
 	}
     }
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d632e56c88d10119fcf262d2d66ccb8602aeaaa7

commit d632e56c88d10119fcf262d2d66ccb8602aeaaa7
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:46:49 2013 -0500

    PowerPC: floating point little endian
    
    The union trick used by HP_TIMING_NOW assumes big-endian word order,
    so it loses when little-endian.
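
A minimal stand-alone C demonstration of the problem (illustrative only: the
SPR values are invented, and unsigned int stands in for the 32-bit long of the
original union so the example behaves the same on 64-bit hosts):

#include <stdio.h>

int
main (void)
{
  /* The old HP_TIMING_NOW stored the two time-base words into ii[0]/ii[1]
     and read back ll.  That only reassembles the intended 64-bit value
     when ii[0] is the most significant word, i.e. on big-endian.  */
  union { unsigned long long ll; unsigned int ii[2]; } v;
  unsigned int hi = 0x00000001;   /* upper time-base word (mfspr 269) */
  unsigned int lo = 0x00000002;   /* lower time-base word (mfspr 268) */

  v.ii[0] = hi;                   /* old code's assumption: ii[0] is the high word */
  v.ii[1] = lo;

  unsigned long long want = ((unsigned long long) hi << 32) | lo;
  printf ("%s\n", v.ll == want ? "ok (big-endian layout)"
                               : "wrong (little-endian layout)");
  return 0;
}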

diff --git a/sysdeps/powerpc/powerpc32/power4/hp-timing.h b/sysdeps/powerpc/powerpc32/power4/hp-timing.h
index 7d6c96e..4e42374 100644
--- a/sysdeps/powerpc/powerpc32/power4/hp-timing.h
+++ b/sysdeps/powerpc/powerpc32/power4/hp-timing.h
@@ -87,18 +87,15 @@ typedef unsigned long long int hp_timing_t;
 
 #define HP_TIMING_NOW(Var)						\
   do {									\
-        union { long long ll; long ii[2]; } _var;			\
-	long tmp;							\
-        __asm__ __volatile__ (						\
-		"1:	mfspr	%0,269;"				\
-		"	mfspr	%1,268;"				\
-		"	mfspr	%2,269;"				\
-		"	cmpw	%0,%2;"					\
-		"	bne	1b;"					\
-		: "=r" (_var.ii[0]), "=r" (_var.ii[1]) , "=r" (tmp)	\
-		: : "cr0"						\
-		);							\
-	Var = _var.ll;							\
+    unsigned int hi, lo, tmp;						\
+    __asm__ __volatile__ ("1:	mfspr	%0,269;"			\
+			  "	mfspr	%1,268;"			\
+			  "	mfspr	%2,269;"			\
+			  "	cmpw	%0,%2;"				\
+			  "	bne	1b;"				\
+			  : "=&r" (hi), "=&r" (lo), "=&r" (tmp)		\
+			  : : "cr0");					\
+    Var = ((hp_timing_t) hi << 32) | lo;				\
   } while (0)
 
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=25541ea8e8bd404d9c987d7dd603e49c5725520a

commit 25541ea8e8bd404d9c987d7dd603e49c5725520a
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:46:19 2013 -0500

    PowerPC: floating point little-endian
    
    These all wrongly specified float constants in a 64-bit word.

diff --git a/sysdeps/powerpc/powerpc64/fpu/s_ceilf.S b/sysdeps/powerpc/powerpc64/fpu/s_ceilf.S
index 801af5d..45f71d7 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_ceilf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_ceilf.S
@@ -19,8 +19,10 @@
 #include <sysdep.h>
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
+	.long 0x0
 	.section	".text"
 
 EALIGN (__ceilf, 4, 0)
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_floorf.S b/sysdeps/powerpc/powerpc64/fpu/s_floorf.S
index a0a22e7..e85b820 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_floorf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_floorf.S
@@ -19,8 +19,10 @@
 #include <sysdep.h>
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
+	.long 0x0
 	.section	".text"
 
 EALIGN (__floorf, 4, 0)
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_nearbyintf.S b/sysdeps/powerpc/powerpc64/fpu/s_nearbyintf.S
index 876707c..b1a2b8c 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_nearbyintf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_nearbyintf.S
@@ -26,8 +26,10 @@
 /* float [fp1] nearbyintf(float [fp1]) */
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
+	.long 0x0
 	.section	".text"
 
 EALIGN (__nearbyintf, 4, 0)
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_rintf.S b/sysdeps/powerpc/powerpc64/fpu/s_rintf.S
index cb28ec7..1887717 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_rintf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_rintf.S
@@ -19,8 +19,10 @@
 #include <sysdep.h>
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
+	.long 0x0
 	.section	".text"
 
 EALIGN (__rintf, 4, 0)
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_roundf.S b/sysdeps/powerpc/powerpc64/fpu/s_roundf.S
index 980a77b..4f2c851 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_roundf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_roundf.S
@@ -19,10 +19,12 @@
 #include <sysdep.h>
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
 .LC1:	/* 0.5 */
-	.tc FD_3f000000_0[TC],0x3f00000000000000
+	.long 0x3f000000
+
 	.section	".text"
 
 /* float [fp1] roundf  (float x [fp1])
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_truncf.S b/sysdeps/powerpc/powerpc64/fpu/s_truncf.S
index 5ea5f3d..b8fd050 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_truncf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_truncf.S
@@ -19,8 +19,10 @@
 #include <sysdep.h>
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
+	.long 0x0
 	.section	".text"
 
 /* float [fp1] truncf (float x [fp1])

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=caa36d4337b8f199af7ab63f4e6a4805ee03dc68

commit caa36d4337b8f199af7ab63f4e6a4805ee03dc68
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:45:48 2013 -0500

    PowerPC: floating point little-endian
    
    Fixes for roundf and llround

diff --git a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
index 2ed9ca7..8cff156 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
@@ -19,7 +19,7 @@
 #include <sysdep.h>
 
 	.section	.rodata.cst8,"aM",@progbits,8
-	.align	2
+	.align	3
 .LC0:	/* 2**23 */
 	.long 0x4b000000
 .LC1:	/* 0.5 */
@@ -60,7 +60,6 @@ ENTRY (__roundf )
 #ifdef SHARED
 	lfs	fp10,.LC1-.LC0(r9)
 #else
-	lis	r9,.LC1@ha
 	lfs	fp10,.LC1@l(r9)
 #endif
 	ble-	cr6,.L4
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
index 631180f..7246ca4 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
@@ -19,12 +19,10 @@
 #include <sysdep.h>
 #include <math_ldbl_opt.h>
 
- .section .rodata.cst12,"aM",@progbits,12
+ .section .rodata.cst8,"aM",@progbits,8
  .align 3
- .LC0:   /* 0x1.0000000000000p+52 == 2^52 */
-	.long 0x43300000
-	.long 0x00000000
-	.long 0x3f000000 /* Use this for 0.5  */
+ .LC0:	.long (52+127)<<23 /* 0x1p+52  */
+	.long (-1+127)<<23 /* 0.5  */
 
 	.section	".text"
 
@@ -57,12 +55,12 @@ ENTRY (__llround)
 	addi	r9,r9,.LC0-got_label@l
 	mtlr	r11
 	cfi_same_value (lr)
-	lfd	fp9,0(r9)
-	lfs	fp10,8(r9)
+	lfs	fp9,0(r9)
+	lfs	fp10,4(r9)
 #else
 	lis r9,.LC0@ha
-	lfd fp9,.LC0@l(r9)	/* Load 2^52 into fpr9.  */
-	lfs fp10,.LC0@l+8(r9)	/* Load 0.5 into fpr10.  */
+	lfs fp9,.LC0@l(r9)	/* Load 2^52 into fpr9.  */
+	lfs fp10,.LC0@l+4(r9)	/* Load 0.5 into fpr10.  */
 #endif
 	fabs	fp2,fp1		/* Get the absolute value of x.  */
 	fsub	fp12,fp10,fp10	/* Compute 0.0 into fpr12.  */
@@ -80,8 +78,8 @@ ENTRY (__llround)
 	nop
 	nop
 	nop
-	lwz	r4,12(r1)	/* Load return as integer.  */
-	lwz	r3,8(r1)
+	lwz	r3,8+HIWORD(r1)	/* Load return as integer.  */
+	lwz	r4,8+LOWORD(r1)
 .Lout:
 	addi	r1,r1,16
 	blr
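
As a sanity check (not part of the patch), the single-precision constants in
the rewritten .LC0 table can be verified from C: a power of two with a zero
mantissa has the IEEE-754 single-precision bit pattern (exponent + 127) << 23.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
float_bits (float f)
{
  uint32_t u;
  memcpy (&u, &f, sizeof u);
  return u;
}

int
main (void)
{
  /* Both lines print matching pairs: 0x59800000 for 2^52, 0x3f000000 for 0.5.  */
  printf ("2^52: %#x == %#x\n", (52 + 127) << 23, float_bits (0x1p52f));
  printf ("0.5 : %#x == %#x\n", (-1 + 127) << 23, float_bits (0.5f));
  return 0;
}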

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a57f52f0a5b436e9b30778ba8005202ad9bbb9b8

commit a57f52f0a5b436e9b30778ba8005202ad9bbb9b8
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:45:24 2013 -0500

    PowerPC: floating point little-endian
    
    Fixes for little-endian in 32-bit assembly.

diff --git a/sysdeps/powerpc/powerpc32/fpu/s_copysign.S b/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
index 840891f..1da24f4 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
@@ -29,7 +29,7 @@ ENTRY(__copysign)
 	stwu	r1,-16(r1)
 	cfi_adjust_cfa_offset (16)
 	stfd	fp2,8(r1)
-	lwz	r3,8(r1)
+	lwz	r3,8+HIWORD(r1)
 	cmpwi   r3,0
 	addi    r1,r1,16
 	cfi_adjust_cfa_offset (-16)
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S b/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
index 4ec8389..2ad6de2 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
@@ -30,7 +30,7 @@ ENTRY(__copysignl)
 	fmr	fp0,fp1
 	fabs	fp1,fp1
 	fcmpu	cr7,fp0,fp1
-	lwz	r3,8(r1)
+	lwz	r3,8+HIWORD(r1)
 	cmpwi	cr6,r3,0
 	addi	r1,r1,16
 	cfi_adjust_cfa_offset (-16)
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_lrint.S b/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
index 27881f8..249fda5 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
@@ -24,10 +24,10 @@ ENTRY (__lrint)
 	stwu	r1,-16(r1)
 	fctiw	fp13,fp1
 	stfd	fp13,8(r1)
-	nop	/* Insure the following load is in a different dispatch group */
+	nop	/* Ensure the following load is in a different dispatch group */
 	nop	/* to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r3,12(r1)
+	lwz	r3,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__lrint)
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_lround.S b/sysdeps/powerpc/powerpc32/fpu/s_lround.S
index 92dc378..6309f86 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_lround.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_lround.S
@@ -67,7 +67,7 @@ ENTRY (__lround)
 	nop	/* Ensure the following load is in a different dispatch  */
 	nop	/* group to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r3,12(r1)	/* Load return as integer.  */
+	lwz	r3,8+LOWORD(r1)	/* Load return as integer.  */
 .Lout:
 	addi	r1,r1,16
 	blr
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S b/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
index 55b2850..e7a88fe 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
@@ -29,8 +29,8 @@ ENTRY (__llrint)
 	nop	/* Insure the following load is in a different dispatch group */
 	nop	/* to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r3,8(r1)
-	lwz	r4,12(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llrint)
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S b/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
index cc80fcb..da24ad3 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
@@ -28,8 +28,8 @@ ENTRY (__llrintf)
 	nop	/* Insure the following load is in a different dispatch group */
 	nop	/* to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r3,8(r1)
-	lwz	r4,12(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llrintf)
diff --git a/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S b/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
index ecd37c3..49c8a08 100644
--- a/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
@@ -39,8 +39,8 @@ ENTRY (__llround)
 	nop	/* Ensure the following load is in a different dispatch  */
 	nop	/* group to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r4,12(r1)
-	lwz	r3,8(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llround)
diff --git a/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S b/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
index d4da625..780dd9c 100644
--- a/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
+++ b/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
@@ -38,7 +38,7 @@ ENTRY (__lround)
 	nop	/* Ensure the following load is in a different dispatch  */
 	nop	/* group to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r3,12(r1)
+	lwz	r3,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__lround)
diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
index f2417fd..5f7ba43 100644
--- a/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
@@ -27,8 +27,8 @@ EALIGN (__isnan, 4, 0)
 	ori	r1,r1,0
 	stfd	fp1,24(r1)	/* copy FPR to GPR */
 	ori	r1,r1,0
-	lwz	r4,24(r1)
-	lwz	r5,28(r1)
+	lwz	r4,24+HIWORD(r1)
+	lwz	r5,24+LOWORD(r1)
 	lis	r0,0x7ff0	/* const long r0 0x7ff00000 00000000 */
 	clrlwi	r4,r4,1		/* x = fabs(x) */
 	cmpw	cr7,r4,r0	/* if (fabs(x) =< inf) */
diff --git a/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
index 2c095db..3ea1858 100644
--- a/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
@@ -27,8 +27,8 @@ EALIGN (__isnan, 4, 0)
 	ori	r1,r1,0
 	stfd	fp1,24(r1)	/* copy FPR to GPR */
 	ori	r1,r1,0
-	lwz	r4,24(r1)
-	lwz	r5,28(r1)
+	lwz	r4,24+HIWORD(r1)
+	lwz	r5,24+LOWORD(r1)
 	lis	r0,0x7ff0	/* const long r0 0x7ff00000 00000000 */
 	clrlwi	r4,r4,1		/* x = fabs(x) */
 	cmpw	cr7,r4,r0	/* if (fabs(x) =< inf) */
diff --git a/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S b/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
index 3344b31..c0660cf 100644
--- a/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
+++ b/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
@@ -29,8 +29,8 @@ ENTRY (__llrint)
 /* Insure the following load is in a different dispatch group by
    inserting "group ending nop".  */
 	ori	r1,r1,0
-	lwz	r3,8(r1)
-	lwz	r4,12(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llrint)
diff --git a/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S b/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
index 7f64f8d..ce29890 100644
--- a/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
+++ b/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
@@ -28,8 +28,8 @@ ENTRY (__llrintf)
 /* Insure the following load is in a different dispatch group by
    inserting "group ending nop".  */
 	ori	r1,r1,0
-	lwz	r3,8(r1)
-	lwz	r4,12(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llrintf)
diff --git a/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S b/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
index 0ff04cb..abb0840 100644
--- a/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
@@ -39,8 +39,8 @@ ENTRY (__llround)
 /* Insure the following load is in a different dispatch group by
    inserting "group ending nop".  */
 	ori	r1,r1,0
-	lwz	r4,12(r1)
-	lwz	r3,8(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llround)
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
index b2ab5bf..095c155 100644
--- a/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
@@ -54,9 +54,8 @@ ENTRY (__finite)
 	stfd    fp1,8(r1)     /* Transfer FP to GPR's.  */
 
 	ori	2,2,0	      /* Force a new dispatch group.  */
-	lhz     r0,8(r1)      /* Fetch the upper portion of the high word of
-			      the FP value (where the exponent and sign bits
-			      are).  */
+	lhz	r0,8+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
 	clrlwi	r0,r0,17      /* r0 = abs(r0).  */
 	addi	r1,r1,16      /* Reset the stack pointer.  */
 	cmpwi	cr7,r0,0x7ff0 /* r4 == 0x7ff0?.  */
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
index 3f8af60..0101c8f 100644
--- a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
@@ -48,14 +48,13 @@ ENTRY (__isinf)
 	li	r3,0
 	bflr    29	      /* If not INF, return.  */
 
-	/* Either we have -INF/+INF or a denormal.  */
+	/* Either we have +INF or -INF.  */
 
 	stwu    r1,-16(r1)    /* Allocate stack space.  */
 	stfd    fp1,8(r1)     /* Transfer FP to GPR's.  */
 	ori	2,2,0	      /* Force a new dispatch group.  */
-	lhz	r4,8(r1)      /* Fetch the upper portion of the high word of
-			      the FP value (where the exponent and sign bits
-			      are).  */
+	lhz	r4,8+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
 	addi	r1,r1,16      /* Reset the stack pointer.  */
 	cmpwi	cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
 	li	r3,1
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
index 99ff126..0ad1dcf 100644
--- a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
@@ -53,8 +53,8 @@ ENTRY (__isnan)
 	stwu	r1,-16(r1)    /* Allocate stack space.  */
 	stfd	fp1,8(r1)     /* Transfer FP to GPR's.  */
 	ori	2,2,0	      /* Force a new dispatch group.  */
-	lwz     r4,8(r1)      /* Load the upper half of the FP value.  */
-	lwz     r5,12(r1)     /* Load the lower half of the FP value.  */
+	lwz     r4,8+HIWORD(r1) /* Load the upper half of the FP value.  */
+	lwz     r5,8+LOWORD(r1) /* Load the lower half of the FP value.  */
 	addi	r1,r1,16      /* Reset the stack pointer.  */
 	lis     r0,0x7ff0     /* Load the upper portion for an INF/NaN.  */
 	clrlwi  r4,r4,1	      /* r4 = abs(r4).  */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
index d0071c7..ebec0e0 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
@@ -39,10 +39,8 @@ EALIGN (__finite, 4, 0)
 
 	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
 	ori	2,2,0	      /* Force a new dispatch group.  */
-
-	lhz     r4,-16(r1)    /* Fetch the upper portion of the high word of
-			      the FP value (where the exponent and sign bits
-			      are).  */
+	lhz     r4,-16+HISHORT(r1)  /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
 	clrlwi  r4,r4,17      /* r4 = abs(r4).  */
 	cmpwi   cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
 	bltlr   cr7	      /* LT means finite, other non-finite.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
index 1aea123..8d088db 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
@@ -38,9 +38,8 @@ EALIGN (__isinf, 4, 0)
 
 	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
 	ori	2,2,0	      /* Force a new dispatch group.  */
-	lhz	r4,-16(r1)    /* Fetch the upper portion of the high word of
-			      the FP value (where the exponent and sign bits
-			      are).  */
+	lhz	r4,-16+HISHORT(r1)  /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
 	cmpwi	cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
 	li	r3,1
 	beqlr   cr7	      /* EQ means INF, otherwise -INF.  */
diff --git a/sysdeps/powerpc/sysdep.h b/sysdeps/powerpc/sysdep.h
index 1b5334a..616f3ac 100644
--- a/sysdeps/powerpc/sysdep.h
+++ b/sysdeps/powerpc/sysdep.h
@@ -144,6 +144,21 @@
 
 #define VRSAVE	256
 
+/* The 32-bit words of a 64-bit dword are at these offsets in memory.  */
+# if defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN
+#define LOWORD 0
+#define HIWORD 4
+#else
+#define LOWORD 4
+#define HIWORD 0
+#endif
+
+/* The high 16-bit word of a 64-bit dword is at this offset in memory.  */
+# if defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN
+#define HISHORT 6
+#else
+#define HISHORT 0
+#endif
 
 /* This seems to always be the case on PPC.  */
 #define ALIGNARG(log2) log2
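
The LOWORD/HIWORD/HISHORT macros above give the assembly named offsets for the
halves of a 64-bit value sitting in memory, so the same "lwz"/"lhz" sequences
work on either endianness.  A minimal standalone C sketch, not part of the
commit (it keys off GCC's generic __BYTE_ORDER__ test rather than the
PowerPC-specific macros), of what the offsets pick out:

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  int main (void)
  {
  #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    enum { LOWORD = 0, HIWORD = 4, HISHORT = 6 };
  #else
    enum { LOWORD = 4, HIWORD = 0, HISHORT = 0 };
  #endif
    uint64_t dword = 0x7ff0000012345678ULL;  /* high 32 bits are 0x7ff00000 */
    unsigned char mem[8];
    uint32_t hi, lo;
    uint16_t hs;

    memcpy (mem, &dword, 8);         /* as stfd stores an FP register */
    memcpy (&hi, mem + HIWORD, 4);   /* like "lwz rX,8+HIWORD(r1)" */
    memcpy (&lo, mem + LOWORD, 4);   /* like "lwz rY,8+LOWORD(r1)" */
    memcpy (&hs, mem + HISHORT, 2);  /* like "lhz rZ,8+HISHORT(r1)" */
    printf ("hi=%08x lo=%08x hishort=%04x\n",
            (unsigned) hi, (unsigned) lo, (unsigned) hs);
    /* Prints hi=7ff00000 lo=12345678 hishort=7ff0 on either endianness.  */
    return 0;
  }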

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=66a5f224b37e91de885b95dff47ceaf20ee3b5f3

commit 66a5f224b37e91de885b95dff47ceaf20ee3b5f3
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:44:42 2013 -0500

    PowerPC: floating point little-endian
    
    Another little-endian fix.

diff --git a/sysdeps/powerpc/fpu/tst-setcontext-fpscr.c b/sysdeps/powerpc/fpu/tst-setcontext-fpscr.c
index feffa6b..cc9b320 100644
--- a/sysdeps/powerpc/fpu/tst-setcontext-fpscr.c
+++ b/sysdeps/powerpc/fpu/tst-setcontext-fpscr.c
@@ -83,7 +83,7 @@ ElfW(Addr) query_auxv(int type)
   return 0;
 }
 
-typedef unsigned long long di_fpscr_t __attribute__ ((__mode__ (__DI__)));
+typedef unsigned int di_fpscr_t __attribute__ ((__mode__ (__DI__)));
 typedef unsigned int si_fpscr_t __attribute__ ((__mode__ (__SI__)));
 
 #define _FPSCR_RESERVED 0xfffffff8ffffff04ULL
@@ -95,50 +95,51 @@ typedef unsigned int si_fpscr_t __attribute__ ((__mode__ (__SI__)));
 #define _FPSCR_TEST1_RN  0x0000000000000002ULL
 
 /* Macros for accessing the hardware control word on Power6[x].  */
-# define _GET_DI_FPSCR(__fpscr) ({					     \
-   union { double d;							     \
-           di_fpscr_t fpscr; }						     \
-     tmp __attribute__ ((__aligned__(8)));				     \
-   __asm__ ("mffs 0; stfd%U0 0,%0" : "=m" (tmp.d) : : "fr0");		     \
-   (__fpscr)=tmp.fpscr;							     \
-   tmp.fpscr; })
-
-/* We make sure to zero fp0 after we use it in order to prevent stale data
+#define _GET_DI_FPSCR(__fpscr)						\
+  ({union { double d; di_fpscr_t fpscr; } u;				\
+    register double fr;							\
+    __asm__ ("mffs %0" : "=f" (fr));					\
+    u.d = fr;								\
+    (__fpscr) = u.fpscr;						\
+    u.fpscr;								\
+  })
+
+/* We make sure to zero fp after we use it in order to prevent stale data
    in an fp register from making a test-case pass erroneously.  */
-# define _SET_DI_FPSCR(__fpscr) {					     \
-  union { double d; di_fpscr_t fpscr; }					     \
-    tmp __attribute__ ((__aligned__(8)));				     \
-  tmp.fpscr = __fpscr;							     \
-  /* Set the entire 64-bit FPSCR.  */					     \
-  __asm__ ("lfd%U0 0,%0; "						     \
-	   ".machine push; "						     \
-	   ".machine \"power6\"; "					     \
-	   "mtfsf 255,0,1,0; "						     \
-	   ".machine pop" : : "m" (tmp.d) : "fr0");			     \
-  tmp.d = 0;								     \
-  __asm__("lfd%U0 0,%0" : : "m" (tmp.d) : "fr0");			     \
-}
-
-# define _GET_SI_FPSCR(__fpscr) ({					     \
-   union { double d;							     \
-           si_fpscr_t cw[2]; }						     \
-     tmp __attribute__ ((__aligned__(8)));				     \
-   __asm__ ("mffs 0; stfd%U0 0,%0" : "=m" (tmp.d) : : "fr0");		     \
-   (__fpscr)=tmp.cw[1];							     \
-   tmp.cw[0]; })
-
-/* We make sure to zero fp0 after we use it in order to prevent stale data
+# define _SET_DI_FPSCR(__fpscr)						\
+  { union { double d; di_fpscr_t fpscr; } u;				\
+    register double fr;							\
+    u.fpscr = __fpscr;							\
+    fr = u.d;								\
+    /* Set the entire 64-bit FPSCR.  */					\
+    __asm__ (".machine push; "						\
+	     ".machine \"power6\"; "					\
+	     "mtfsf 255,%0,1,0; "					\
+	     ".machine pop" : : "f" (fr));				\
+    fr = 0.0;								\
+  }
+
+# define _GET_SI_FPSCR(__fpscr)						\
+  ({union { double d; di_fpscr_t fpscr; } u;				\
+    register double fr;							\
+    __asm__ ("mffs %0" : "=f" (fr));					\
+    u.d = fr;								\
+    (__fpscr) = (si_fpscr_t) u.fpscr;					\
+    (si_fpscr_t) u.fpscr;						\
+  })
+
+/* We make sure to zero fp after we use it in order to prevent stale data
    in an fp register from making a test-case pass erroneously.  */
-# define _SET_SI_FPSCR(__fpscr) {					     \
-  union { double d; si_fpscr_t fpscr[2]; }				     \
-    tmp __attribute__ ((__aligned__(8)));				     \
-  /* More-or-less arbitrary; this is a QNaN. */				     \
-  tmp.fpscr[0] = 0xFFF80000;						     \
-  tmp.fpscr[1] = __fpscr;						     \
-  __asm__ ("lfd%U0 0,%0; mtfsf 255,0" : : "m" (tmp.d) : "fr0");		     \
-  tmp.d = 0;								     \
-  __asm__("lfd%U0 0,%0" : : "m" (tmp.d) : "fr0");			     \
-}
+# define _SET_SI_FPSCR(__fpscr)						\
+  { union { double d; di_fpscr_t fpscr; } u;				\
+    register double fr;							\
+    /* More-or-less arbitrary; this is a QNaN. */			\
+    u.fpscr = 0xfff80000ULL << 32;					\
+    u.fpscr |= __fpscr & 0xffffffffULL;					\
+    fr = u.d;								\
+    __asm__ ("mtfsf 255,%0" : : "f" (fr));				\
+    fr = 0.0;								\
+  }
 
 void prime_special_regs(int which)
 {
diff --git a/sysdeps/powerpc/fpu_control.h b/sysdeps/powerpc/fpu_control.h
index 159543b..b70bb4a 100644
--- a/sysdeps/powerpc/fpu_control.h
+++ b/sysdeps/powerpc/fpu_control.h
@@ -56,22 +56,26 @@ extern fpu_control_t __fpu_control;
 # define _FPU_IEEE     0x000000f0
 
 /* Type of the control word.  */
-typedef unsigned int fpu_control_t __attribute__ ((__mode__ (__SI__)));
+typedef unsigned int fpu_control_t;
 
 /* Macros for accessing the hardware control word.  */
-# define _FPU_GETCW(__cw) ( { \
-  union { double d; fpu_control_t cw[2]; } \
-    tmp __attribute__ ((__aligned__(8))); \
-  __asm__ ("mffs 0; stfd%U0 0,%0" : "=m" (tmp.d) : : "fr0"); \
-  (__cw)=tmp.cw[1]; \
-  tmp.cw[1]; } )
-# define _FPU_SETCW(__cw) { \
-  union { double d; fpu_control_t cw[2]; } \
-    tmp __attribute__ ((__aligned__(8))); \
-  tmp.cw[0] = 0xFFF80000; /* More-or-less arbitrary; this is a QNaN. */ \
-  tmp.cw[1] = __cw; \
-  __asm__ ("lfd%U0 0,%0; mtfsf 255,0" : : "m" (tmp.d) : "fr0"); \
-}
+# define _FPU_GETCW(__cw)				\
+  ({union { double d; unsigned long long cw; } u;	\
+    register double fr;					\
+    __asm__ ("mffs %0" : "=f" (fr));			\
+    u.d = fr;						\
+    (__cw) = (fpu_control_t) u.cw;			\
+    (fpu_control_t) u.cw;				\
+  })
+
+# define _FPU_SETCW(__cw)				\
+  { union { double d; unsigned long long cw; } u;	\
+    register double fr;					\
+    u.cw = 0xfff80000LL << 32; /* This is a QNaN.  */	\
+    u.cw |= __cw & 0xffffffffLL;			\
+    fr = u.d;						\
+    __asm__ ("mtfsf 255,%0" : : "f" (fr));		\
+  }
 
 /* Default control word set at startup.  */
 extern fpu_control_t __fpu_control;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=902fbff852b9fc31bd5759a874e3cfc74a19282f

commit 902fbff852b9fc31bd5759a874e3cfc74a19282f
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:44:17 2013 -0500

    PowerPC: floating point little-endian
    
    These two functions oddly test x+1>0 when a double x is >= 0.0, and
    similarly when x is negative.  I don't see the point of that since the
    test should always be true.  I also don't see any need to convert x+1
    to integer rather than simply using xr+1.  Note that the standard
    allows these functions to return any value when the input is outside
    the range of long long, but it's not too hard to prevent xr+1
    overflowing so that's what I've done.
    
    (With rounding mode FE_UPWARD, x+1 can be a lot more than what you
    might naively expect, but perhaps that situation was covered by the
    x - xrf < 1.0 test.)
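
The overflow guard described above works because the increment is computed in
unsigned arithmetic and only applied when the wrapped result is still
positive.  A tiny standalone sketch, not the glibc code itself, showing the
idea (the unsigned-to-signed conversion is implementation-defined, but it
wraps on the two's-complement targets glibc supports):

  #include <stdio.h>
  #include <limits.h>

  static long long
  bump_if_no_overflow (long long xr)
  {
    /* Add 1 only if xr + 1 does not wrap past LLONG_MAX.  */
    xr += (long long) ((unsigned long long) xr + 1) > 0;
    return xr;
  }

  int main (void)
  {
    printf ("%lld\n", bump_if_no_overflow (41));         /* 42 */
    printf ("%lld\n", bump_if_no_overflow (LLONG_MAX));  /* unchanged */
    return 0;
  }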

diff --git a/sysdeps/powerpc/fpu/s_llround.c b/sysdeps/powerpc/fpu/s_llround.c
index 9a01826..995d0a7 100644
--- a/sysdeps/powerpc/fpu/s_llround.c
+++ b/sysdeps/powerpc/fpu/s_llround.c
@@ -19,29 +19,28 @@
 #include <math.h>
 #include <math_ldbl_opt.h>
 
-/* I think that what this routine is supposed to do is round a value
-   to the nearest integer, with values exactly on the boundary rounded
-   away from zero.  */
-/* This routine relies on (long long)x, when x is out of range of a long long,
-   clipping to MAX_LLONG or MIN_LLONG.  */
+/* Round to the nearest integer, with values exactly on a 0.5 boundary
+   rounded away from zero, regardless of the current rounding mode.
+   If (long long)x, when x is out of range of a long long, clips at
+   LLONG_MAX or LLONG_MIN, then this implementation also clips.  */
 
 long long int
 __llround (double x)
 {
-  double xrf;
-  long long int xr;
-  xr = (long long int) x;
-  xrf = (double) xr;
+  long long xr = (long long) x;
+  double xrf = (double) xr;
+
   if (x >= 0.0)
-    if (x - xrf >= 0.5 && x - xrf < 1.0 && x+1 > 0)
-      return x+1;
-    else
-      return x;
+    {
+      if (x - xrf >= 0.5)
+	xr += (long long) ((unsigned long long) xr + 1) > 0;
+    }
   else
-    if (xrf - x >= 0.5 && xrf - x < 1.0 && x-1 < 0)
-      return x-1;
-    else
-      return x;
+    {
+      if (xrf - x >= 0.5)
+	xr -= (long long) ((unsigned long long) xr - 1) < 0;
+    }
+  return xr;
 }
 weak_alias (__llround, llround)
 #ifdef NO_LONG_DOUBLE
diff --git a/sysdeps/powerpc/fpu/s_llroundf.c b/sysdeps/powerpc/fpu/s_llroundf.c
index 07d12ad..0935de6 100644
--- a/sysdeps/powerpc/fpu/s_llroundf.c
+++ b/sysdeps/powerpc/fpu/s_llroundf.c
@@ -18,28 +18,27 @@
 
 #include <math.h>
 
-/* I think that what this routine is supposed to do is round a value
-   to the nearest integer, with values exactly on the boundary rounded
-   away from zero.  */
-/* This routine relies on (long long)x, when x is out of range of a long long,
-   clipping to MAX_LLONG or MIN_LLONG.  */
+/* Round to the nearest integer, with values exactly on a 0.5 boundary
+   rounded away from zero, regardless of the current rounding mode.
+   If (long long)x, when x is out of range of a long long, clips at
+   LLONG_MAX or LLONG_MIN, then this implementation also clips.  */
 
 long long int
 __llroundf (float x)
 {
-  float xrf;
-  long long int xr;
-  xr = (long long int) x;
-  xrf = (float) xr;
+  long long xr = (long long) x;
+  float xrf = (float) xr;
+
   if (x >= 0.0)
-    if (x - xrf >= 0.5 && x - xrf < 1.0 && x+1 > 0)
-      return x+1;
-    else
-      return x;
+    {
+      if (x - xrf >= 0.5)
+	xr += (long long) ((unsigned long long) xr + 1) > 0;
+    }
   else
-    if (xrf - x >= 0.5 && xrf - x < 1.0 && x-1 < 0)
-      return x-1;
-    else
-      return x;
+    {
+      if (xrf - x >= 0.5)
+	xr -= (long long) ((unsigned long long) xr - 1) < 0;
+    }
+  return xr;
 }
 weak_alias (__llroundf, llroundf)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b6e54c67ab3850f4298c562182ef3ab03dbde19d

commit b6e54c67ab3850f4298c562182ef3ab03dbde19d
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:44:02 2013 -0500

    PowerPC: floating point little-endian
    
    This works around the fact that vsx is disabled in current
    little-endian gcc.  Also, float constants take 4 bytes in memory
    vs. 16 bytes for vector constants, and we don't need to write one lot
    of masks for double (register format) and another for float (mem
    format).
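
The masking idea can be seen without VSX: a 4-byte constant with bit pattern
0x7f800000, viewed through a union as a float, selects exactly the binary32
exponent field when ANDed bit-for-bit with another float, which is all the
old 16-byte vector constants were used for.  A small standalone sketch
(hypothetical, not from the patch) of the bit-level effect:

  #include <stdio.h>
  #include <stdint.h>

  int main (void)
  {
    /* mask.f is the float-register view an "f" constraint would hand to
       xxland; mask.i is the same 32 bits for this host-side demonstration.  */
    union { uint32_t i; float f; } mask = { .i = 0x7f800000 };
    union { float f; uint32_t i; } num  = { .f = -1234.5f };

    uint32_t exp_bits = num.i & mask.i;   /* what xxland computes on the bits */
    printf ("exponent bits:   0x%08x\n", (unsigned) exp_bits);
    printf ("biased exponent: %u\n", (unsigned) (exp_bits >> 23));
    return 0;
  }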

diff --git a/sysdeps/powerpc/fpu/s_float_bitwise.h b/sysdeps/powerpc/fpu/s_float_bitwise.h
index 8e4adca..c0a4e56 100644
--- a/sysdeps/powerpc/fpu/s_float_bitwise.h
+++ b/sysdeps/powerpc/fpu/s_float_bitwise.h
@@ -23,18 +23,19 @@
 #include <math_private.h>
 
 /* Returns (int)(num & 0x7FFFFFF0 == value) */
-static inline
-int __float_and_test28 (float num, float value)
+static inline int
+__float_and_test28 (float num, float value)
 {
   float ret;
 #ifdef _ARCH_PWR7
-  vector int mask = (vector int) {
-    0x7ffffffe, 0x00000000, 0x00000000, 0x0000000
-  };
+  union {
+    int i;
+    float f;
+  } mask = { .i = 0x7ffffff0 };
   __asm__ (
-  /* the 'f' constrain is use on mask because we just need
+  /* the 'f' constraint is used on mask because we just need
    * to compare floats, not full vector */
-    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask)
+    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask.f)
   );
 #else
   int32_t inum;
@@ -46,16 +47,17 @@ int __float_and_test28 (float num, float value)
 }
 
 /* Returns (int)(num & 0x7FFFFF00 == value) */
-static inline
-int __float_and_test24 (float num, float value)
+static inline int
+__float_and_test24 (float num, float value)
 {
   float ret;
 #ifdef _ARCH_PWR7
-  vector int mask = (vector int) {
-    0x7fffffe0, 0x00000000, 0x00000000, 0x0000000
-  };
+  union {
+    int i;
+    float f;
+  } mask = { .i = 0x7fffff00 };
   __asm__ (
-    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask)
+    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask.f)
   );
 #else
   int32_t inum;
@@ -67,16 +69,17 @@ int __float_and_test24 (float num, float value)
 }
 
 /* Returns (float)(num & 0x7F800000) */
-static inline
-float __float_and8 (float num)
+static inline float
+__float_and8 (float num)
 {
   float ret;
 #ifdef _ARCH_PWR7
-  vector int mask = (vector int) {
-    0x7ff00000, 0x00000000, 0x00000000, 0x00000000
-  };
+  union {
+    int i;
+    float f;
+  } mask = { .i = 0x7f800000 };
   __asm__ (
-    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask)
+    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask.f)
   );
 #else
   int32_t inum;
@@ -88,17 +91,18 @@ float __float_and8 (float num)
 }
 
 /* Returns ((int32_t)(num & 0x7F800000) >> 23) */
-static inline
-int32_t __float_get_exp (float num)
+static inline int32_t
+__float_get_exp (float num)
 {
   int32_t inum;
 #ifdef _ARCH_PWR7
   float ret;
-  vector int mask = (vector int) {
-    0x7ff00000, 0x00000000, 0x00000000, 0x00000000
-  };
+  union {
+    int i;
+    float f;
+  } mask = { .i = 0x7f800000 };
   __asm__ (
-    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask)
+    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask.f)
   );
   GET_FLOAT_WORD(inum, ret);
 #else

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cd9f4394705e03bdbfdae446da713fa1ab1080f4

commit cd9f4394705e03bdbfdae446da713fa1ab1080f4
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:37:51 2013 -0500

    PowerPC: floating point little-endian
    
    Corrects floating-point environment code for little-endian.

diff --git a/sysdeps/powerpc/fpu/e_sqrt.c b/sysdeps/powerpc/fpu/e_sqrt.c
index 3efe277..2d50fb5 100644
--- a/sysdeps/powerpc/fpu/e_sqrt.c
+++ b/sysdeps/powerpc/fpu/e_sqrt.c
@@ -145,7 +145,7 @@ __slow_ieee754_sqrt (double x)
       feraiseexcept (FE_INVALID_SQRT);
 
       fenv_union_t u = { .fenv = fegetenv_register () };
-      if ((u.l[1] & FE_INVALID) == 0)
+      if ((u.l & FE_INVALID) == 0)
 #endif
 	feraiseexcept (FE_INVALID);
       x = a_nan.value;
diff --git a/sysdeps/powerpc/fpu/e_sqrtf.c b/sysdeps/powerpc/fpu/e_sqrtf.c
index 6e50a3c..91d2d37 100644
--- a/sysdeps/powerpc/fpu/e_sqrtf.c
+++ b/sysdeps/powerpc/fpu/e_sqrtf.c
@@ -121,7 +121,7 @@ __slow_ieee754_sqrtf (float x)
       feraiseexcept (FE_INVALID_SQRT);
 
       fenv_union_t u = { .fenv = fegetenv_register () };
-      if ((u.l[1] & FE_INVALID) == 0)
+      if ((u.l & FE_INVALID) == 0)
 #endif
 	feraiseexcept (FE_INVALID);
       x = a_nan.value;
diff --git a/sysdeps/powerpc/fpu/fclrexcpt.c b/sysdeps/powerpc/fpu/fclrexcpt.c
index 86575db..7f66e21 100644
--- a/sysdeps/powerpc/fpu/fclrexcpt.c
+++ b/sysdeps/powerpc/fpu/fclrexcpt.c
@@ -28,8 +28,8 @@ __feclearexcept (int excepts)
   u.fenv = fegetenv_register ();
 
   /* Clear the relevant bits.  */
-  u.l[1] = u.l[1] & ~((-(excepts >> (31 - FPSCR_VX) & 1) & FE_ALL_INVALID)
-		      | (excepts & FPSCR_STICKY_BITS));
+  u.l = u.l & ~((-(excepts >> (31 - FPSCR_VX) & 1) & FE_ALL_INVALID)
+		| (excepts & FPSCR_STICKY_BITS));
 
   /* Put the new state in effect.  */
   fesetenv_register (u.fenv);
diff --git a/sysdeps/powerpc/fpu/fedisblxcpt.c b/sysdeps/powerpc/fpu/fedisblxcpt.c
index 659566b..f2c45a6 100644
--- a/sysdeps/powerpc/fpu/fedisblxcpt.c
+++ b/sysdeps/powerpc/fpu/fedisblxcpt.c
@@ -32,15 +32,15 @@ fedisableexcept (int excepts)
 
   fe.fenv = fegetenv_register ();
   if (excepts & FE_INEXACT)
-    fe.l[1] &= ~(1 << (31 - FPSCR_XE));
+    fe.l &= ~(1 << (31 - FPSCR_XE));
   if (excepts & FE_DIVBYZERO)
-    fe.l[1] &= ~(1 << (31 - FPSCR_ZE));
+    fe.l &= ~(1 << (31 - FPSCR_ZE));
   if (excepts & FE_UNDERFLOW)
-    fe.l[1] &= ~(1 << (31 - FPSCR_UE));
+    fe.l &= ~(1 << (31 - FPSCR_UE));
   if (excepts & FE_OVERFLOW)
-    fe.l[1] &= ~(1 << (31 - FPSCR_OE));
+    fe.l &= ~(1 << (31 - FPSCR_OE));
   if (excepts & FE_INVALID)
-    fe.l[1] &= ~(1 << (31 - FPSCR_VE));
+    fe.l &= ~(1 << (31 - FPSCR_VE));
   fesetenv_register (fe.fenv);
 
   new = __fegetexcept ();
diff --git a/sysdeps/powerpc/fpu/feenablxcpt.c b/sysdeps/powerpc/fpu/feenablxcpt.c
index fc4bfff..472796d 100644
--- a/sysdeps/powerpc/fpu/feenablxcpt.c
+++ b/sysdeps/powerpc/fpu/feenablxcpt.c
@@ -32,15 +32,15 @@ feenableexcept (int excepts)
 
   fe.fenv = fegetenv_register ();
   if (excepts & FE_INEXACT)
-    fe.l[1] |= (1 << (31 - FPSCR_XE));
+    fe.l |= (1 << (31 - FPSCR_XE));
   if (excepts & FE_DIVBYZERO)
-    fe.l[1] |= (1 << (31 - FPSCR_ZE));
+    fe.l |= (1 << (31 - FPSCR_ZE));
   if (excepts & FE_UNDERFLOW)
-    fe.l[1] |= (1 << (31 - FPSCR_UE));
+    fe.l |= (1 << (31 - FPSCR_UE));
   if (excepts & FE_OVERFLOW)
-    fe.l[1] |= (1 << (31 - FPSCR_OE));
+    fe.l |= (1 << (31 - FPSCR_OE));
   if (excepts & FE_INVALID)
-    fe.l[1] |= (1 << (31 - FPSCR_VE));
+    fe.l |= (1 << (31 - FPSCR_VE));
   fesetenv_register (fe.fenv);
 
   new = __fegetexcept ();
diff --git a/sysdeps/powerpc/fpu/fegetexcept.c b/sysdeps/powerpc/fpu/fegetexcept.c
index f3d5724..23d47a2 100644
--- a/sysdeps/powerpc/fpu/fegetexcept.c
+++ b/sysdeps/powerpc/fpu/fegetexcept.c
@@ -27,15 +27,15 @@ __fegetexcept (void)
 
   fe.fenv = fegetenv_register ();
 
-  if (fe.l[1] & (1 << (31 - FPSCR_XE)))
+  if (fe.l & (1 << (31 - FPSCR_XE)))
       result |= FE_INEXACT;
-  if (fe.l[1] & (1 << (31 - FPSCR_ZE)))
+  if (fe.l & (1 << (31 - FPSCR_ZE)))
       result |= FE_DIVBYZERO;
-  if (fe.l[1] & (1 << (31 - FPSCR_UE)))
+  if (fe.l & (1 << (31 - FPSCR_UE)))
       result |= FE_UNDERFLOW;
-  if (fe.l[1] & (1 << (31 - FPSCR_OE)))
+  if (fe.l & (1 << (31 - FPSCR_OE)))
       result |= FE_OVERFLOW;
-  if (fe.l[1] & (1 << (31 - FPSCR_VE)))
+  if (fe.l & (1 << (31 - FPSCR_VE)))
       result |= FE_INVALID;
 
   return result;
diff --git a/sysdeps/powerpc/fpu/feholdexcpt.c b/sysdeps/powerpc/fpu/feholdexcpt.c
index 013d2bf..0ecf0f7 100644
--- a/sysdeps/powerpc/fpu/feholdexcpt.c
+++ b/sysdeps/powerpc/fpu/feholdexcpt.c
@@ -30,13 +30,12 @@ feholdexcept (fenv_t *envp)
 
   /* Clear everything except for the rounding modes and non-IEEE arithmetic
      flag.  */
-  new.l[1] = old.l[1] & 7;
-  new.l[0] = old.l[0];
+  new.l = old.l & 0xffffffff00000007LL;
 
   /* If the old env had any enabled exceptions, then mask SIGFPE in the
      MSR FE0/FE1 bits.  This may allow the FPU to run faster because it
      always takes the default action and can not generate SIGFPE. */
-  if ((old.l[1] & _FPU_MASK_ALL) != 0)
+  if ((old.l & _FPU_MASK_ALL) != 0)
     (void)__fe_mask_env ();
 
   /* Put the new state in effect.  */
diff --git a/sysdeps/powerpc/fpu/fenv_libc.h b/sysdeps/powerpc/fpu/fenv_libc.h
index 1910951..baa2a7d 100644
--- a/sysdeps/powerpc/fpu/fenv_libc.h
+++ b/sysdeps/powerpc/fpu/fenv_libc.h
@@ -69,7 +69,7 @@ libm_hidden_proto (__fe_nomask_env)
 typedef union
 {
   fenv_t fenv;
-  unsigned int l[2];
+  unsigned long long l;
 } fenv_union_t;
 
 
diff --git a/sysdeps/powerpc/fpu/fesetenv.c b/sysdeps/powerpc/fpu/fesetenv.c
index e92adb4..6c00b26 100644
--- a/sysdeps/powerpc/fpu/fesetenv.c
+++ b/sysdeps/powerpc/fpu/fesetenv.c
@@ -34,14 +34,14 @@ __fesetenv (const fenv_t *envp)
      exceptions, then unmask SIGFPE in the MSR FE0/FE1 bits.  This will put the
      hardware into "precise mode" and may cause the FPU to run slower on some
      hardware.  */
-  if ((old.l[1] & _FPU_MASK_ALL) == 0 && (new.l[1] & _FPU_MASK_ALL) != 0)
+  if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0)
     (void)__fe_nomask_env ();
 
   /* If the old env had any enabled exceptions and the new env has no enabled
      exceptions, then mask SIGFPE in the MSR FE0/FE1 bits.  This may allow the
      FPU to run faster because it always takes the default action and can not
      generate SIGFPE. */
-  if ((old.l[1] & _FPU_MASK_ALL) != 0 && (new.l[1] & _FPU_MASK_ALL) == 0)
+  if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0)
     (void)__fe_mask_env ();
 
   fesetenv_register (*envp);
diff --git a/sysdeps/powerpc/fpu/feupdateenv.c b/sysdeps/powerpc/fpu/feupdateenv.c
index 6500ea1..6775044 100644
--- a/sysdeps/powerpc/fpu/feupdateenv.c
+++ b/sysdeps/powerpc/fpu/feupdateenv.c
@@ -34,20 +34,20 @@ __feupdateenv (const fenv_t *envp)
   /* Restore rounding mode and exception enable from *envp and merge
      exceptions.  Leave fraction rounded/inexact and FP result/CC bits
      unchanged.  */
-  new.l[1] = (old.l[1] & 0x1FFFFF00) | (new.l[1] & 0x1FF80FFF);
+  new.l = (old.l & 0xffffffff1fffff00LL) | (new.l & 0x1ff80fff);
 
   /* If the old env has no enabled exceptions and the new env has any enabled
      exceptions, then unmask SIGFPE in the MSR FE0/FE1 bits.  This will put
      the hardware into "precise mode" and may cause the FPU to run slower on
      some hardware.  */
-  if ((old.l[1] & _FPU_MASK_ALL) == 0 && (new.l[1] & _FPU_MASK_ALL) != 0)
+  if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0)
     (void)__fe_nomask_env ();
 
   /* If the old env had any enabled exceptions and the new env has no enabled
      exceptions, then mask SIGFPE in the MSR FE0/FE1 bits.  This may allow the
      FPU to run faster because it always takes the default action and can not
      generate SIGFPE. */
-  if ((old.l[1] & _FPU_MASK_ALL) != 0 && (new.l[1] & _FPU_MASK_ALL) == 0)
+  if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0)
     (void)__fe_mask_env ();
 
   /* Atomically enable and raise (if appropriate) exceptions set in `new'. */
diff --git a/sysdeps/powerpc/fpu/fgetexcptflg.c b/sysdeps/powerpc/fpu/fgetexcptflg.c
index f6327ce..1395bed 100644
--- a/sysdeps/powerpc/fpu/fgetexcptflg.c
+++ b/sysdeps/powerpc/fpu/fgetexcptflg.c
@@ -27,7 +27,7 @@ __fegetexceptflag (fexcept_t *flagp, int excepts)
   u.fenv = fegetenv_register ();
 
   /* Return (all of) it.  */
-  *flagp = u.l[1] & excepts & FE_ALL_EXCEPT;
+  *flagp = u.l & excepts & FE_ALL_EXCEPT;
 
   /* Success.  */
   return 0;
diff --git a/sysdeps/powerpc/fpu/fraiseexcpt.c b/sysdeps/powerpc/fpu/fraiseexcpt.c
index 9118c19..6193071 100644
--- a/sysdeps/powerpc/fpu/fraiseexcpt.c
+++ b/sysdeps/powerpc/fpu/fraiseexcpt.c
@@ -33,11 +33,11 @@ __feraiseexcept (int excepts)
   u.fenv = fegetenv_register ();
 
   /* Add the exceptions */
-  u.l[1] = (u.l[1]
-	    | (excepts & FPSCR_STICKY_BITS)
-	    /* Turn FE_INVALID into FE_INVALID_SOFTWARE.  */
-	    | (excepts >> ((31 - FPSCR_VX) - (31 - FPSCR_VXSOFT))
-	       & FE_INVALID_SOFTWARE));
+  u.l = (u.l
+	 | (excepts & FPSCR_STICKY_BITS)
+	 /* Turn FE_INVALID into FE_INVALID_SOFTWARE.  */
+	 | (excepts >> ((31 - FPSCR_VX) - (31 - FPSCR_VXSOFT))
+	    & FE_INVALID_SOFTWARE));
 
   /* Store the new status word (along with the rest of the environment),
      triggering any appropriate exceptions.  */
@@ -49,7 +49,7 @@ __feraiseexcept (int excepts)
 	 don't have FE_INVALID_SOFTWARE implemented.  Detect this
 	 case and raise FE_INVALID_SNAN instead.  */
       u.fenv = fegetenv_register ();
-      if ((u.l[1] & FE_INVALID) == 0)
+      if ((u.l & FE_INVALID) == 0)
 	set_fpscr_bit (FPSCR_VXSNAN);
     }
 
diff --git a/sysdeps/powerpc/fpu/fsetexcptflg.c b/sysdeps/powerpc/fpu/fsetexcptflg.c
index c050d40..0d309c8 100644
--- a/sysdeps/powerpc/fpu/fsetexcptflg.c
+++ b/sysdeps/powerpc/fpu/fsetexcptflg.c
@@ -31,10 +31,10 @@ __fesetexceptflag (const fexcept_t *flagp, int excepts)
   flag = *flagp & excepts;
 
   /* Replace the exception status */
-  u.l[1] = ((u.l[1] & ~(FPSCR_STICKY_BITS & excepts))
-	    | (flag & FPSCR_STICKY_BITS)
-	    | (flag >> ((31 - FPSCR_VX) - (31 - FPSCR_VXSOFT))
-	       & FE_INVALID_SOFTWARE));
+  u.l = ((u.l & ~(FPSCR_STICKY_BITS & excepts))
+	 | (flag & FPSCR_STICKY_BITS)
+	 | (flag >> ((31 - FPSCR_VX) - (31 - FPSCR_VXSOFT))
+	    & FE_INVALID_SOFTWARE));
 
   /* Store the new status word (along with the rest of the environment).
      This may cause floating-point exceptions if the restored state
diff --git a/sysdeps/powerpc/fpu/ftestexcept.c b/sysdeps/powerpc/fpu/ftestexcept.c
index 0dbc3be..86eea0f 100644
--- a/sysdeps/powerpc/fpu/ftestexcept.c
+++ b/sysdeps/powerpc/fpu/ftestexcept.c
@@ -28,6 +28,6 @@ fetestexcept (int excepts)
 
   /* The FE_INVALID bit is dealt with correctly by the hardware, so we can
      just:  */
-  return u.l[1] & excepts;
+  return u.l & excepts;
 }
 libm_hidden_def (fetestexcept)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4668c955eb57378029134de6b6387c3eba08d78f

commit 4668c955eb57378029134de6b6387c3eba08d78f
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:37:34 2013 -0500

    PowerPC: floating point little-endian
    
    More little-endian fixes.

diff --git a/sysdeps/powerpc/bits/mathinline.h b/sysdeps/powerpc/bits/mathinline.h
index 140fff0..cef5b29 100644
--- a/sysdeps/powerpc/bits/mathinline.h
+++ b/sysdeps/powerpc/bits/mathinline.h
@@ -61,21 +61,28 @@
 __MATH_INLINE int
 __NTH (__signbitf (float __x))
 {
+#if __GNUC_PREREQ (4, 0)
+  return __builtin_signbitf (__x);
+#else
   __extension__ union { float __f; int __i; } __u = { __f: __x };
   return __u.__i < 0;
+#endif
 }
 __MATH_INLINE int
 __NTH (__signbit (double __x))
 {
-  __extension__ union { double __d; int __i[2]; } __u = { __d: __x };
-  return __u.__i[0] < 0;
+#if __GNUC_PREREQ (4, 0)
+  return __builtin_signbit (__x);
+#else
+  __extension__ union { double __d; long long __i; } __u = { __d: __x };
+  return __u.__i < 0;
+#endif
 }
 #  ifdef __LONG_DOUBLE_128__
 __MATH_INLINE int
 __NTH (__signbitl (long double __x))
 {
-  __extension__ union { long double __d; int __i[4]; } __u = { __d: __x };
-  return __u.__i[0] < 0;
+  return __signbit ((double) __x);
 }
 #  endif
 # endif
@@ -92,22 +99,17 @@ __NTH (lrint (double __x))
 {
   union {
     double __d;
-    int __ll[2];
+    long long __ll;
   } __u;
   __asm__ ("fctiw %0,%1" : "=f"(__u.__d) : "f"(__x));
-  return __u.__ll[1];
+  return __u.__ll;
 }
 
 __MATH_INLINE long int lrintf (float __x) __THROW;
 __MATH_INLINE long int
 __NTH (lrintf (float __x))
 {
-  union {
-    double __d;
-    int __ll[2];
-  } __u;
-  __asm__ ("fctiw %0,%1" : "=f"(__u.__d) : "f"(__x));
-  return __u.__ll[1];
+  return lrint ((double) __x);
 }
 # endif
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9d6db3f1927c8f7c9f47da54fa90d83195cbfd53

commit 9d6db3f1927c8f7c9f47da54fa90d83195cbfd53
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:37:15 2013 -0500

    PowerPC: floating point little-endian
    
    A rewrite to make this code correct for little-endian.

diff --git a/sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c b/sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c
index 2b0f7c6..61feb36 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c
@@ -34,15 +34,13 @@
 
 #include <math_private.h>
 
-typedef unsigned int int4;
-typedef union {int4 i[4]; long double x; double d[2]; } mynumber;
+typedef union {int64_t i[2]; long double x; double d[2]; } mynumber;
 
-static const  mynumber
-  t512 = {{0x5ff00000, 0x00000000, 0x00000000, 0x00000000 }},  /* 2^512  */
-  tm256 = {{0x2ff00000, 0x00000000, 0x00000000, 0x00000000 }};  /* 2^-256 */
 static const double
-two54 = 1.80143985094819840000e+16, /* 0x4350000000000000 */
-twom54 = 5.55111512312578270212e-17; /* 0x3C90000000000000 */
+  t512 = 0x1p512,
+  tm256 = 0x1p-256,
+  two54 = 0x1p54,	/* 0x4350000000000000 */
+  twom54 = 0x1p-54;	/* 0x3C90000000000000 */
 
 /*********************************************************************/
 /* An ultimate sqrt routine. Given an IEEE double machine number x   */
@@ -54,56 +52,53 @@ long double __ieee754_sqrtl(long double x)
   static const long double big = 134217728.0, big1 = 134217729.0;
   long double t,s,i;
   mynumber a,c;
-  int4 k, l, m;
-  int n;
+  uint64_t k, l;
+  int64_t m, n;
   double d;
 
   a.x=x;
-  k=a.i[0] & 0x7fffffff;
+  k=a.i[0] & INT64_C(0x7fffffffffffffff);
   /*----------------- 2^-1022  <= | x |< 2^1024  -----------------*/
-  if (k>0x000fffff && k<0x7ff00000) {
+  if (k>INT64_C(0x000fffff00000000) && k<INT64_C(0x7ff0000000000000)) {
     if (x < 0) return (big1-big1)/(big-big);
-    l = (k&0x001fffff)|0x3fe00000;
-    if (((a.i[2] & 0x7fffffff) | a.i[3]) != 0) {
-      n = (int) ((l - k) * 2) >> 21;
-      m = (a.i[2] >> 20) & 0x7ff;
+    l = (k&INT64_C(0x001fffffffffffff))|INT64_C(0x3fe0000000000000);
+    if ((a.i[1] & INT64_C(0x7fffffffffffffff)) != 0) {
+      n = (int64_t) ((l - k) * 2) >> 53;
+      m = (a.i[1] >> 52) & 0x7ff;
       if (m == 0) {
 	a.d[1] *= two54;
-	m = ((a.i[2] >> 20) & 0x7ff) - 54;
+	m = ((a.i[1] >> 52) & 0x7ff) - 54;
       }
       m += n;
-      if ((int) m > 0)
-	a.i[2] = (a.i[2] & 0x800fffff) | (m << 20);
-      else if ((int) m <= -54) {
-	a.i[2] &= 0x80000000;
-	a.i[3] = 0;
+      if (m > 0)
+	a.i[1] = (a.i[1] & INT64_C(0x800fffffffffffff)) | (m << 52);
+      else if (m <= -54) {
+	a.i[1] &= INT64_C(0x8000000000000000);
       } else {
 	m += 54;
-	a.i[2] = (a.i[2] & 0x800fffff) | (m << 20);
+	a.i[1] = (a.i[1] & INT64_C(0x800fffffffffffff)) | (m << 52);
 	a.d[1] *= twom54;
       }
     }
     a.i[0] = l;
     s = a.x;
     d = __ieee754_sqrt (a.d[0]);
-    c.i[0] = 0x20000000+((k&0x7fe00000)>>1);
+    c.i[0] = INT64_C(0x2000000000000000)+((k&INT64_C(0x7fe0000000000000))>>1);
     c.i[1] = 0;
-    c.i[2] = 0;
-    c.i[3] = 0;
     i = d;
     t = 0.5L * (i + s / i);
     i = 0.5L * (t + s / t);
     return c.x * i;
   }
   else {
-    if (k>=0x7ff00000) {
-      if (a.i[0] == 0xfff00000 && a.i[1] == 0)
+    if (k>=INT64_C(0x7ff0000000000000)) {
+      if (a.i[0] == INT64_C(0xfff0000000000000))
 	return (big1-big1)/(big-big); /* sqrt (-Inf) = NaN.  */
       return x; /* sqrt (NaN) = NaN, sqrt (+Inf) = +Inf.  */
     }
     if (x == 0) return x;
     if (x < 0) return (big1-big1)/(big-big);
-    return tm256.x*__ieee754_sqrtl(x*t512.x);
+    return tm256*__ieee754_sqrtl(x*t512);
   }
 }
 strong_alias (__ieee754_sqrtl, __sqrtl_finite)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cf141d9c91d34030b14138c0061966fc7417bb99

commit cf141d9c91d34030b14138c0061966fc7417bb99
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:37:02 2013 -0500

    PowerPC: floating point little-endian
    
    Rid ourselves of ieee854.

diff --git a/sysdeps/ieee754/ldbl-128ibm/ieee754.h b/sysdeps/ieee754/ldbl-128ibm/ieee754.h
index 0778b1f..0c97a99 100644
--- a/sysdeps/ieee754/ldbl-128ibm/ieee754.h
+++ b/sysdeps/ieee754/ldbl-128ibm/ieee754.h
@@ -111,61 +111,6 @@ union ieee754_double
 #define IEEE754_DOUBLE_BIAS	0x3ff /* Added to exponent.  */
 
 
-union ieee854_long_double
-  {
-    long double d;
-
-    /* This is the IEEE 854 quad-precision format.  */
-    struct
-      {
-#if	__BYTE_ORDER == __BIG_ENDIAN
-	unsigned int negative:1;
-	unsigned int exponent:15;
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa0:16;
-	unsigned int mantissa1:32;
-	unsigned int mantissa2:32;
-	unsigned int mantissa3:32;
-#endif				/* Big endian.  */
-#if	__BYTE_ORDER == __LITTLE_ENDIAN
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa3:32;
-	unsigned int mantissa2:32;
-	unsigned int mantissa1:32;
-	unsigned int mantissa0:16;
-	unsigned int exponent:15;
-	unsigned int negative:1;
-#endif				/* Little endian.  */
-      } ieee;
-
-    /* This format makes it easier to see if a NaN is a signalling NaN.  */
-    struct
-      {
-#if	__BYTE_ORDER == __BIG_ENDIAN
-	unsigned int negative:1;
-	unsigned int exponent:15;
-	unsigned int quiet_nan:1;
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa0:15;
-	unsigned int mantissa1:32;
-	unsigned int mantissa2:32;
-	unsigned int mantissa3:32;
-#else
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa3:32;
-	unsigned int mantissa2:32;
-	unsigned int mantissa1:32;
-	unsigned int mantissa0:15;
-	unsigned int quiet_nan:1;
-	unsigned int exponent:15;
-	unsigned int negative:1;
-#endif
-      } ieee_nan;
-  };
-
-#define IEEE854_LONG_DOUBLE_BIAS 0x3fff /* Added to exponent.  */
-
-
 /* IBM extended format for long double.
 
    Each long double is made up of two IEEE doubles.  The value of the
diff --git a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
index d69d344..d3911c8 100644
--- a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
+++ b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
@@ -2,7 +2,6 @@
 #error "Never use <math_ldbl.h> directly; include <math_private.h> instead."
 #endif
 
-#include <sysdeps/ieee754/ldbl-128/math_ldbl.h>
 #include <ieee754.h>
 #include <stdint.h>
 
@@ -11,7 +10,7 @@
    implicit bit) and 64 bits in *lo64.  */
 
 static inline void
-ldbl_extract_mantissa (uint64_t *hi64, uint64_t *lo64, int *exp, long double x)
+ldbl_extract_mantissa (int64_t *hi64, uint64_t *lo64, int *exp, long double x)
 {
   /* We have 105 bits of mantissa plus one implicit digit.  Since
      106 bits are representable we use the first implicit digit for
@@ -89,7 +88,7 @@ ldbl_extract_mantissa (uint64_t *hi64, uint64_t *lo64, int *exp, long double x)
 }
 
 static inline long double
-ldbl_insert_mantissa (int sign, int exp, uint64_t hi64, uint64_t lo64)
+ldbl_insert_mantissa (int sign, int exp, int64_t hi64, uint64_t lo64)
 {
   union ibm_extended_long_double u;
   int expnt2;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=35d8bdfa6d9e8cd82795d0e3befabe2284d25dda

commit 35d8bdfa6d9e8cd82795d0e3befabe2284d25dda
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:33:30 2013 -0500

    PowerPC: floating point little-endian
    
    Another batch of ieee854 macros and union replacement.  These four
    files also have bugs fixed with this patch.  The fact that the two
    doubles in an IBM long double may have different signs means that
    negation and absolute value operations can't just twiddle one sign bit
    as you can with ieee854 style extended double.  fmodl, remainderl,
    erfl and erfcl all had errors of this type.  erfl also returned +1 for
    large magnitude negative input where it should return -1.  The hypotl
    error is innocuous since the value adjusted twice is only used as a
    flag.
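
A concrete instance of the sign problem described above, using two plain
doubles to stand for the (hi, lo) pair of an IBM long double (a hypothetical
sketch, not glibc code): the value 2^53 - 1 is stored as the pair (2^53, -1),
so flipping only hi's sign yields a pair whose exact sum is -(2^53 + 1), a
different magnitude; negation has to flip both signs.

  #include <stdio.h>

  int main (void)
  {
    double hi = 0x1p53, lo = -1.0;   /* pair value: 2^53 - 1 */

    printf ("value        = %.1f\n", hi + lo);    /*  9007199254740991.0        */
    printf ("flip hi only = %.1f\n", -hi + lo);   /* -9007199254740992.0: wrong */
    printf ("flip both    = %.1f\n", -hi + -lo);  /* -9007199254740991.0        */
    return 0;
  }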

diff --git a/math/libm-test.inc b/math/libm-test.inc
index fe85bb9..167992b 100644
--- a/math/libm-test.inc
+++ b/math/libm-test.inc
@@ -7821,6 +7821,11 @@ static const struct test_f_f_data erf_test_data[] =
     TEST_f_f (erf, 2.0L, 0.995322265018952734162069256367252929L),
     TEST_f_f (erf, 4.125L, 0.999999994576599200434933994687765914L),
     TEST_f_f (erf, 27.0L, 1.0L),
+    TEST_f_f (erf, -27.0L, -1.0L),
+#if defined TEST_LDOUBLE && LDBL_MANT_DIG >= 54
+    /* The input is not exactly representable as a double.  */
+    TEST_f_f (erf, -0x1.fffffffffffff8p-2L, -0.5204998778130465132916303345518417673509L),
+#endif
   };
 
 static void
@@ -7849,6 +7854,10 @@ static const struct test_f_f_data erfc_test_data[] =
     TEST_f_f (erfc, 0x1.ffa002p+2L, 1.233585992097580296336099501489175967033e-29L),
     TEST_f_f (erfc, 0x1.ffffc8p+2L, 1.122671365033056305522366683719541099329e-29L),
 #ifdef TEST_LDOUBLE
+# if LDBL_MANT_DIG >= 54
+    /* The input is not exactly representable as a double.  */
+    TEST_f_f (erfc, -0x1.fffffffffffff8p-2L, 1.52049987781304651329163033455184176735L),
+# endif
     /* The result can only be represented in long double.  */
 # if LDBL_MIN_10_EXP < -319
     TEST_f_f (erfc, 27.0L, 0.523704892378925568501606768284954709e-318L),
@@ -9347,6 +9356,13 @@ static const struct test_ff_f_data fmod_test_data[] =
 #if defined TEST_LDOUBLE && LDBL_MIN_EXP <= -16381
     TEST_ff_f (fmod, 0x0.fffffffffffffffep-16382L, 0x1p-16445L, plus_zero, NO_INEXACT_EXCEPTION),
 #endif
+#if defined TEST_LDOUBLE && LDBL_MANT_DIG >= 56
+    TEST_ff_f (fmod, -0x1.00000000000004p+0L, 0x1.fffffffffffff8p-1L, -0x1p-53L, NO_INEXACT_EXCEPTION),
+    TEST_ff_f (fmod, 0x1.fffffffffffffap-1L, 0x1.fffffffffffff8p-1L, 0x1p-56L, NO_INEXACT_EXCEPTION),
+    TEST_ff_f (fmod, -0x1.fffffffffffffap-1L, 0x1.fffffffffffff8p-1L, -0x1p-56L, NO_INEXACT_EXCEPTION),
+    TEST_ff_f (fmod, 0x1.fffffffffffffap-1L, -0x1.fffffffffffff8p-1L, 0x1p-56L, NO_INEXACT_EXCEPTION),
+    TEST_ff_f (fmod, -0x1.fffffffffffffap-1L, -0x1.fffffffffffff8p-1L, -0x1p-56L, NO_INEXACT_EXCEPTION),
+#endif
   };
 
 static void
@@ -12308,6 +12324,9 @@ static const struct test_ff_f_data remainder_test_data[] =
     TEST_ff_f (remainder, -1.625, -1.0, 0.375, NO_INEXACT_EXCEPTION),
     TEST_ff_f (remainder, 5.0, 2.0, 1.0, NO_INEXACT_EXCEPTION),
     TEST_ff_f (remainder, 3.0, 2.0, -1.0, NO_INEXACT_EXCEPTION),
+#if defined TEST_LDOUBLE && LDBL_MANT_DIG >= 56
+    TEST_ff_f (remainder, -0x1.80000000000002p1L, 2.0, 0x1.fffffffffffff8p-1L, NO_INEXACT_EXCEPTION),
+#endif
   };
 
 static void
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_fmodl.c b/sysdeps/ieee754/ldbl-128ibm/e_fmodl.c
index a60963c..ffb3bec 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_fmodl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_fmodl.c
@@ -27,76 +27,65 @@ static const long double one = 1.0, Zero[] = {0.0, -0.0,};
 long double
 __ieee754_fmodl (long double x, long double y)
 {
-	int64_t n,hx,hy,hz,ix,iy,sx, i;
-	u_int64_t lx,ly,lz;
-	int temp;
+	int64_t hx, hy, hz, sx, sy;
+	uint64_t lx, ly, lz;
+	int n, ix, iy;
+	double xhi, xlo, yhi, ylo;
 
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	GET_LDOUBLE_WORDS64(hy,ly,y);
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
+	ldbl_unpack (y, &yhi, &ylo);
+	EXTRACT_WORDS64 (hy, yhi);
+	EXTRACT_WORDS64 (ly, ylo);
 	sx = hx&0x8000000000000000ULL;		/* sign of x */
-	hx ^=sx;				/* |x| */
-	hy &= 0x7fffffffffffffffLL;		/* |y| */
+	hx ^= sx;				/* |x| */
+	sy = hy&0x8000000000000000ULL;		/* sign of y */
+	hy ^= sy;				/* |y| */
 
     /* purge off exception values */
-	if(__builtin_expect((hy|(ly&0x7fffffffffffffff))==0 ||
+	if(__builtin_expect(hy==0 ||
 			    (hx>=0x7ff0000000000000LL)|| /* y=0,or x not finite */
 			    (hy>0x7ff0000000000000LL),0))	/* or y is NaN */
 	    return (x*y)/(x*y);
 	if(__builtin_expect(hx<=hy,0)) {
-	    if((hx<hy)||(lx<ly)) return x;	/* |x|<|y| return x */
-	    if(lx==ly)
-		return Zero[(u_int64_t)sx>>63];	/* |x|=|y| return x*0*/
+	    if (hx < hy
+		|| (((ly ^ sy) & 0x8000000000000000LL) == 0
+		    && (int64_t) (lx ^ sx) < (int64_t) (ly ^ sy))
+		|| (((lx ^ sx) & 0x8000000000000000LL) != 0
+		    && (int64_t) (lx ^ sx) > (int64_t) (ly ^ sy)))
+		return x;			/* |x|<|y| return x */
+	    if ((lx ^ sx) == (ly ^ sy))
+		return Zero[(uint64_t)sx>>63];	/* |x|=|y| return x*0*/
 	}
 
-    /* determine ix = ilogb(x) */
-	if(__builtin_expect(hx<0x0010000000000000LL,0)) {	/* subnormal x */
-	    if(hx==0) {
-		for (ix = -1043, i=lx; i>0; i<<=1) ix -=1;
-	    } else {
-		for (ix = -1022, i=(hx<<11); i>0; i<<=1) ix -=1;
-	    }
-	} else ix = (hx>>52)-0x3ff;
-
-    /* determine iy = ilogb(y) */
-	if(__builtin_expect(hy<0x0010000000000000LL,0)) {	/* subnormal y */
-	    if(hy==0) {
-		for (iy = -1043, i=ly; i>0; i<<=1) iy -=1;
-	    } else {
-		for (iy = -1022, i=(hy<<11); i>0; i<<=1) iy -=1;
-	    }
-	} else iy = (hy>>52)-0x3ff;
-
     /* Make the IBM extended format 105 bit mantissa look like the ieee854 112
        bit mantissa so the following operations will give the correct
        result.  */
-	ldbl_extract_mantissa(&hx, &lx, &temp, x);
-	ldbl_extract_mantissa(&hy, &ly, &temp, y);
+	ldbl_extract_mantissa(&hx, &lx, &ix, x);
+	ldbl_extract_mantissa(&hy, &ly, &iy, y);
 
-    /* set up {hx,lx}, {hy,ly} and align y to x */
-	if(__builtin_expect(ix >= -1022, 1))
-	    hx = 0x0001000000000000LL|(0x0000ffffffffffffLL&hx);
-	else {		/* subnormal x, shift x to normal */
-	    n = -1022-ix;
-	    if(n<=63) {
-		hx = (hx<<n)|(lx>>(64-n));
-		lx <<= n;
-	    } else {
-		hx = lx<<(n-64);
-		lx = 0;
-	    }
-	}
-	if(__builtin_expect(iy >= -1022, 1))
-	    hy = 0x0001000000000000LL|(0x0000ffffffffffffLL&hy);
-	else {		/* subnormal y, shift y to normal */
-	    n = -1022-iy;
-	    if(n<=63) {
-		hy = (hy<<n)|(ly>>(64-n));
-		ly <<= n;
-	    } else {
-		hy = ly<<(n-64);
-		ly = 0;
-	    }
-	}
+	if (__builtin_expect (ix == -IEEE754_DOUBLE_BIAS, 0))
+	  {
+	    /* subnormal x, shift x to normal.  */
+	    while ((hx & (1LL << 48)) == 0)
+	      {
+		hx = (hx << 1) | (lx >> 63);
+		lx = lx << 1;
+		ix -= 1;
+	      }
+	  }
+
+	if (__builtin_expect (iy == -IEEE754_DOUBLE_BIAS, 0))
+	  {
+	    /* subnormal y, shift y to normal.  */
+	    while ((hy & (1LL << 48)) == 0)
+	      {
+		hy = (hy << 1) | (ly >> 63);
+		ly = ly << 1;
+		iy -= 1;
+	      }
+	  }
 
     /* fix point fmod */
 	n = ix - iy;
@@ -104,7 +93,7 @@ __ieee754_fmodl (long double x, long double y)
 	    hz=hx-hy;lz=lx-ly; if(lx<ly) hz -= 1;
 	    if(hz<0){hx = hx+hx+(lx>>63); lx = lx+lx;}
 	    else {
-		if((hz|(lz&0x7fffffffffffffff))==0)		/* return sign(x)*0 */
+		if((hz|lz)==0)		/* return sign(x)*0 */
 		    return Zero[(u_int64_t)sx>>63];
 		hx = hz+hz+(lz>>63); lx = lz+lz;
 	    }
@@ -113,7 +102,7 @@ __ieee754_fmodl (long double x, long double y)
 	if(hz>=0) {hx=hz;lx=lz;}
 
     /* convert back to floating value and restore the sign */
-	if((hx|(lx&0x7fffffffffffffff))==0)			/* return sign(x)*0 */
+	if((hx|lx)==0)			/* return sign(x)*0 */
 	    return Zero[(u_int64_t)sx>>63];
 	while(hx<0x0001000000000000LL) {	/* normalize x */
 	    hx = hx+hx+(lx>>63); lx = lx+lx;
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_hypotl.c b/sysdeps/ieee754/ldbl-128ibm/e_hypotl.c
index 768bd3b..3b07a47 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_hypotl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_hypotl.c
@@ -45,76 +45,84 @@
 #include <math.h>
 #include <math_private.h>
 
-static const long double two600 = 0x1.0p+600L;
-static const long double two1022 = 0x1.0p+1022L;
-
 long double
 __ieee754_hypotl(long double x, long double y)
 {
-	long double a,b,t1,t2,y1,y2,w,kld;
+	long double a,b,a1,a2,b1,b2,w,kld;
 	int64_t j,k,ha,hb;
+	double xhi, yhi, hi, lo;
 
-	GET_LDOUBLE_MSW64(ha,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (ha, xhi);
+	yhi = ldbl_high (y);
+	EXTRACT_WORDS64 (hb, yhi);
 	ha &= 0x7fffffffffffffffLL;
-	GET_LDOUBLE_MSW64(hb,y);
 	hb &= 0x7fffffffffffffffLL;
 	if(hb > ha) {a=y;b=x;j=ha; ha=hb;hb=j;} else {a=x;b=y;}
 	a = fabsl(a);	/* a <- |a| */
 	b = fabsl(b);	/* b <- |b| */
-	if((ha-hb)>0x780000000000000LL) {return a+b;} /* x/y > 2**120 */
+	if((ha-hb)>0x0780000000000000LL) {return a+b;} /* x/y > 2**120 */
 	k=0;
 	kld = 1.0L;
 	if(ha > 0x5f30000000000000LL) {	/* a>2**500 */
 	   if(ha >= 0x7ff0000000000000LL) {	/* Inf or NaN */
-	       u_int64_t low;
 	       w = a+b;			/* for sNaN */
-	       GET_LDOUBLE_LSW64(low,a);
-	       if(((ha&0xfffffffffffffLL)|(low&0x7fffffffffffffffLL))==0)
+	       if(ha == 0x7ff0000000000000LL)
 		 w = a;
-	       GET_LDOUBLE_LSW64(low,b);
-	       if(((hb^0x7ff0000000000000LL)|(low&0x7fffffffffffffffLL))==0)
+	       if(hb == 0x7ff0000000000000LL)
 		 w = b;
 	       return w;
 	   }
 	   /* scale a and b by 2**-600 */
-	   ha -= 0x2580000000000000LL; hb -= 0x2580000000000000LL; k += 600;
-	   a /= two600;
-	   b /= two600;
-	   k += 600;
-	   kld = two600;
+	   a *= 0x1p-600L;
+	   b *= 0x1p-600L;
+	   k = 600;
+	   kld = 0x1p+600L;
 	}
-	if(hb < 0x23d0000000000000LL) {	/* b < 2**-450 */
+	else if(hb < 0x23d0000000000000LL) {	/* b < 2**-450 */
 	    if(hb <= 0x000fffffffffffffLL) {	/* subnormal b or 0 */
-		u_int64_t low;
-		GET_LDOUBLE_LSW64(low,b);
-		if((hb|(low&0x7fffffffffffffffLL))==0) return a;
-		t1=two1022;	/* t1=2^1022 */
-		b *= t1;
-		a *= t1;
-		k -= 1022;
-		kld = kld / two1022;
+		if(hb==0) return a;
+		a *= 0x1p+1022L;
+		b *= 0x1p+1022L;
+		k = -1022;
+		kld = 0x1p-1022L;
 	    } else {		/* scale a and b by 2^600 */
-		ha += 0x2580000000000000LL;	/* a *= 2^600 */
-		hb += 0x2580000000000000LL;	/* b *= 2^600 */
-		k -= 600;
-		a *= two600;
-		b *= two600;
-		kld = kld / two600;
+		a *= 0x1p+600L;
+		b *= 0x1p+600L;
+		k = -600;
+		kld = 0x1p-600L;
 	    }
 	}
     /* medium size a and b */
 	w = a-b;
 	if (w>b) {
-	    SET_LDOUBLE_WORDS64(t1,ha,0);
-	    t2 = a-t1;
-	    w  = __ieee754_sqrtl(t1*t1-(b*(-b)-t2*(a+t1)));
+	    ldbl_unpack (a, &hi, &lo);
+	    a1 = hi;
+	    a2 = lo;
+	    /* a*a + b*b
+	       = (a1+a2)*a + b*b
+	       = a1*a + a2*a + b*b
+	       = a1*(a1+a2) + a2*a + b*b
+	       = a1*a1 + a1*a2 + a2*a + b*b
+	       = a1*a1 + a2*(a+a1) + b*b  */
+	    w  = __ieee754_sqrtl(a1*a1-(b*(-b)-a2*(a+a1)));
 	} else {
 	    a  = a+a;
-	    SET_LDOUBLE_WORDS64(y1,hb,0);
-	    y2 = b - y1;
-	    SET_LDOUBLE_WORDS64(t1,ha+0x0010000000000000LL,0);
-	    t2 = a - t1;
-	    w  = __ieee754_sqrtl(t1*y1-(w*(-w)-(t1*y2+t2*b)));
+	    ldbl_unpack (b, &hi, &lo);
+	    b1 = hi;
+	    b2 = lo;
+	    ldbl_unpack (a, &hi, &lo);
+	    a1 = hi;
+	    a2 = lo;
+	    /* a*a + b*b
+	       = a*a + (a-b)*(a-b) - (a-b)*(a-b) + b*b
+	       = a*a + w*w  - (a*a - 2*a*b + b*b) + b*b
+	       = w*w + 2*a*b
+	       = w*w + (a1+a2)*b
+	       = w*w + a1*b + a2*b
+	       = w*w + a1*(b1+b2) + a2*b
+	       = w*w + a1*b1 + a1*b2 + a2*b  */
+	    w  = __ieee754_sqrtl(a1*b1-(w*(-w)-(a1*b2+a2*b)));
 	}
 	if(k!=0)
 	    return w*kld;
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_remainderl.c b/sysdeps/ieee754/ldbl-128ibm/e_remainderl.c
index 67d7db7..800416f 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_remainderl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_remainderl.c
@@ -33,18 +33,22 @@ __ieee754_remainderl(long double x, long double p)
 	int64_t hx,hp;
 	u_int64_t sx,lx,lp;
 	long double p_half;
+	double xhi, xlo, phi, plo;
 
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	GET_LDOUBLE_WORDS64(hp,lp,p);
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
+	ldbl_unpack (p, &phi, &plo);
+	EXTRACT_WORDS64 (hp, phi);
+	EXTRACT_WORDS64 (lp, plo);
 	sx = hx&0x8000000000000000ULL;
 	hp &= 0x7fffffffffffffffLL;
 	hx &= 0x7fffffffffffffffLL;
 
     /* purge off exception values */
-	if((hp|(lp&0x7fffffffffffffff))==0) return (x*p)/(x*p);	/* p = 0 */
+	if(hp==0) return (x*p)/(x*p);	/* p = 0 */
 	if((hx>=0x7ff0000000000000LL)||			/* x not finite */
-	  ((hp>=0x7ff0000000000000LL)&&			/* p is NaN */
-	  (((hp-0x7ff0000000000000LL)|lp)!=0)))
+	   (hp>0x7ff0000000000000LL))			/* p is NaN */
 	    return (x*p)/(x*p);
 
 
@@ -64,8 +68,8 @@ __ieee754_remainderl(long double x, long double p)
 		if(x>=p_half) x -= p;
 	    }
 	}
-	GET_LDOUBLE_MSW64(hx,x);
-	SET_LDOUBLE_MSW64(x,hx^sx);
+	if (sx)
+	  x = -x;
 	return x;
 }
 strong_alias (__ieee754_remainderl, __remainderl_finite)
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_erfl.c b/sysdeps/ieee754/ldbl-128ibm/s_erfl.c
index 6a4475e..c861c65 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_erfl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_erfl.c
@@ -760,16 +760,16 @@ long double
 __erfl (long double x)
 {
   long double a, y, z;
-  int32_t i, ix, sign;
-  ieee854_long_double_shape_type u;
+  int32_t i, ix, hx;
+  double xhi;
 
-  u.value = x;
-  sign = u.parts32.w0;
-  ix = sign & 0x7fffffff;
+  xhi = ldbl_high (x);
+  GET_HIGH_WORD (hx, xhi);
+  ix = hx & 0x7fffffff;
 
   if (ix >= 0x7ff00000)
     {				/* erf(nan)=nan */
-      i = ((sign & 0xfff00000) >> 31) << 1;
+      i = ((uint32_t) hx >> 31) << 1;
       return (long double) (1 - i) + one / x;	/* erf(+-inf)=+-1 */
     }
 
@@ -778,7 +778,7 @@ __erfl (long double x)
       if (ix >= 0x4039A0DE)
 	{
 	/* __erfcl (x) underflows if x > 25.6283 */
-	  if (sign)
+	  if ((hx & 0x80000000) == 0)
 	    return one-tiny;
 	  else
 	    return tiny-one;
@@ -789,8 +789,9 @@ __erfl (long double x)
 	  return (one - y);
 	}
     }
-  u.parts32.w0 = ix;
-  a = u.value;
+  a = x;
+  if ((hx & 0x80000000) != 0)
+    a = -a;
   z = x * x;
   if (ix < 0x3fec0000)  /* a < 0.875 */
     {
@@ -814,7 +815,7 @@ __erfl (long double x)
       y = erf_const + neval (a, TN2, NTN2) / deval (a, TD2, NTD2);
     }
 
-  if (sign & 0x80000000) /* x < 0 */
+  if (hx & 0x80000000) /* x < 0 */
     y = -y;
   return( y );
 }
@@ -824,18 +825,18 @@ long double
 __erfcl (long double x)
 {
   long double y, z, p, r;
-  int32_t i, ix, sign;
-  ieee854_long_double_shape_type u;
+  int32_t i, ix;
+  uint32_t hx;
+  double xhi;
 
-  u.value = x;
-  sign = u.parts32.w0;
-  ix = sign & 0x7fffffff;
-  u.parts32.w0 = ix;
+  xhi = ldbl_high (x);
+  GET_HIGH_WORD (hx, xhi);
+  ix = hx & 0x7fffffff;
 
   if (ix >= 0x7ff00000)
     {				/* erfc(nan)=nan */
       /* erfc(+-inf)=0,2 */
-      return (long double) (((u_int32_t) sign >> 31) << 1) + one / x;
+      return (long double) ((hx >> 31) << 1) + one / x;
     }
 
   if (ix < 0x3fd00000) /* |x| <1/4 */
@@ -846,7 +847,8 @@ __erfcl (long double x)
     }
   if (ix < 0x3ff40000) /* 1.25 */
     {
-      x = u.value;
+      if ((hx & 0x80000000) != 0)
+	x = -x;
       i = 8.0 * x;
       switch (i)
 	{
@@ -891,7 +893,7 @@ __erfcl (long double x)
 	  y += C20a;
 	  break;
 	}
-      if (sign & 0x80000000)
+      if (hx & 0x80000000)
 	y = 2.0L - y;
       return y;
     }
@@ -899,10 +901,11 @@ __erfcl (long double x)
   if (ix < 0x405ac000)
     {
       /* x < -9 */
-      if ((ix >= 0x40220000) && (sign & 0x80000000))
+      if (hx >= 0xc0220000)
 	return two - tiny;
 
-      x = fabsl (x);
+      if ((hx & 0x80000000) != 0)
+	x = -x;
       z = one / (x * x);
       i = 8.0 / x;
       switch (i)
@@ -933,21 +936,17 @@ __erfcl (long double x)
 	  p = neval (z, RNr8, NRNr8) / deval (z, RDr8, NRDr8);
 	  break;
 	}
-      u.value = x;
-      u.parts32.w3 = 0;
-      u.parts32.w2 = 0;
-      u.parts32.w1 &= 0xf8000000;
-      z = u.value;
+      z = (float) x;
       r = __ieee754_expl (-z * z - 0.5625) *
 	__ieee754_expl ((z - x) * (z + x) + p);
-      if ((sign & 0x80000000) == 0)
+      if ((hx & 0x80000000) == 0)
 	return r / x;
       else
 	return two - r / x;
     }
   else
     {
-      if ((sign & 0x80000000) == 0)
+      if ((hx & 0x80000000) == 0)
 	return tiny * tiny;
       else
 	return two - tiny;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e85cffdde0679dfa882532196462a381fff11ca9

commit e85cffdde0679dfa882532196462a381fff11ca9
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:33:16 2013 -0500

    PowerPC: floating point little-endian
    
    Further replacement of ieee854 macros and unions.  These files also
    have some optimizations for comparison against 0.0L, infinity and nan.
    Since the ABI specifies that the high double of an IBM long double
    pair is the value rounded to double, a high double of 0.0 means the
    low double must also be 0.0.  The ABI also says that infinity and
    nan are encoded in the high double, with the low double unspecified.
    This means that tests for 0.0L, ±Infinity and ±NaN need only check
    the high double.
    
    The k_tanl.c change introduces a regression in test-ldouble and
    test-ildoubl.
    +Failure: Test: tan_towardzero (2)
    +Result:
    + is:         -2.18503986326151899164e+00  -0x1.17af62e0950f83b50990p+1
    + should be:  -2.18503986326151899164e+00  -0x1.17af62e0950f83b50990p+1
    + difference:  4.93038065763132378382e-32
    0x1.00000000000000000000p-104
    + ulp       :  1.0000
    + max.ulp   :  0.0000
    
    This is not due to an error in the patch but because addition of IBM
    long double is not commutative.  The final addition in the following
    line happens to be passed to __gcc_qadd with the two long double values
    swapped compared to the previous version.
         return z1 + z * (s + z1 * v);
    gcc also swaps other addition and multiplication operands too, but
    this particular one happens to introduce the 1ulp rounding error.
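
Because infinity and NaN live entirely in the high double, and a zero high
double forces a zero low double, the classification tests in these files
reduce to examining one 64-bit pattern.  A standalone sketch, not glibc code,
using a plain double in place of the high double of the pair:

  #include <math.h>
  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  static void
  classify_high (double xhi)
  {
    uint64_t hx;
    memcpy (&hx, &xhi, sizeof hx);
    hx &= 0x7fffffffffffffffULL;            /* drop the sign bit */

    if (hx == 0)
      puts ("zero");
    else if (hx == 0x7ff0000000000000ULL)
      puts ("infinity");
    else if (hx > 0x7ff0000000000000ULL)
      puts ("nan");
    else
      puts ("finite nonzero");
  }

  int main (void)
  {
    classify_high (0.0);
    classify_high (-INFINITY);
    classify_high (NAN);
    classify_high (2.5);
    return 0;
  }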

diff --git a/sysdeps/ieee754/ldbl-128ibm/e_atan2l.c b/sysdeps/ieee754/ldbl-128ibm/e_atan2l.c
index 3e05355..b625323 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_atan2l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_atan2l.c
@@ -56,11 +56,15 @@ __ieee754_atan2l(long double y, long double x)
 {
 	long double z;
 	int64_t k,m,hx,hy,ix,iy;
-	u_int64_t lx,ly;
+	uint64_t lx;
+	double xhi, xlo, yhi;
 
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	ix = hx&0x7fffffffffffffffLL;
-	GET_LDOUBLE_WORDS64(hy,ly,y);
+	yhi = ldbl_high (y);
+	EXTRACT_WORDS64 (hy, yhi);
 	iy = hy&0x7fffffffffffffffLL;
 	if(((ix)>0x7ff0000000000000LL)||
 	   ((iy)>0x7ff0000000000000LL))	/* x or y is NaN */
@@ -70,7 +74,7 @@ __ieee754_atan2l(long double y, long double x)
 	m = ((hy>>63)&1)|((hx>>62)&2);	/* 2*sign(x)+sign(y) */
 
     /* when y = 0 */
-	if((iy|(ly&0x7fffffffffffffffLL))==0) {
+	if(iy==0) {
 	    switch(m) {
 		case 0:
 		case 1: return y;	/* atan(+-0,+anything)=+-0 */
@@ -79,7 +83,7 @@ __ieee754_atan2l(long double y, long double x)
 	    }
 	}
     /* when x = 0 */
-	if((ix|(lx&0x7fffffffffffffff))==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
+	if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
 
     /* when x is INF */
 	if(ix==0x7ff0000000000000LL) {
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_gammal_r.c b/sysdeps/ieee754/ldbl-128ibm/e_gammal_r.c
index 90d8e3f..84c13de 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_gammal_r.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_gammal_r.c
@@ -122,11 +122,12 @@ long double
 __ieee754_gammal_r (long double x, int *signgamp)
 {
   int64_t hx;
-  u_int64_t lx;
+  double xhi;
 
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
 
-  if (((hx | lx) & 0x7fffffffffffffffLL) == 0)
+  if ((hx & 0x7fffffffffffffffLL) == 0)
     {
       /* Return value for x == 0 is Inf with divide by zero exception.  */
       *signgamp = 0;
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_ilogbl.c b/sysdeps/ieee754/ldbl-128ibm/e_ilogbl.c
index 55f87ed..aeace7c 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_ilogbl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_ilogbl.c
@@ -31,26 +31,24 @@ static char rcsid[] = "$NetBSD: $";
 
 int __ieee754_ilogbl(long double x)
 {
-	int64_t hx,lx;
+	int64_t hx;
 	int ix;
+	double xhi;
 
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (hx, xhi);
 	hx &= 0x7fffffffffffffffLL;
 	if(hx <= 0x0010000000000000LL) {
-	    if((hx|(lx&0x7fffffffffffffffLL))==0)
+	    if(hx==0)
 		return FP_ILOGB0;	/* ilogbl(0) = FP_ILOGB0 */
 	    else			/* subnormal x */
-		if(hx==0) {
-		    for (ix = -1043; lx>0; lx<<=1) ix -=1;
-		} else {
-		    for (ix = -1022, hx<<=11; hx>0; hx<<=1) ix -=1;
-		}
+		for (ix = -1022, hx<<=11; hx>0; hx<<=1) ix -=1;
 	    return ix;
 	}
 	else if (hx<0x7ff0000000000000LL) return (hx>>52)-0x3ff;
 	else if (FP_ILOGBNAN != INT_MAX) {
 	    /* ISO C99 requires ilogbl(+-Inf) == INT_MAX.  */
-	    if (((hx^0x7ff0000000000000LL)|lx) == 0)
+	    if (hx==0x7ff0000000000000LL)
 		return INT_MAX;
 	}
 	return FP_ILOGBNAN;
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_jnl.c b/sysdeps/ieee754/ldbl-128ibm/e_jnl.c
index 40012e4..817977d 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_jnl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_jnl.c
@@ -70,26 +70,25 @@ static const long double
 long double
 __ieee754_jnl (int n, long double x)
 {
-  u_int32_t se;
+  uint32_t se, lx;
   int32_t i, ix, sgn;
   long double a, b, temp, di;
   long double z, w;
-  ieee854_long_double_shape_type u;
+  double xhi;
 
 
   /* J(-n,x) = (-1)^n * J(n, x), J(n, -x) = (-1)^n * J(n, x)
    * Thus, J(-n,x) = J(n,-x)
    */
 
-  u.value = x;
-  se = u.parts32.w0;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (se, lx, xhi);
   ix = se & 0x7fffffff;
 
   /* if J(n,NaN) is NaN */
   if (ix >= 0x7ff00000)
     {
-      if ((u.parts32.w0 & 0xfffff) | u.parts32.w1
-	  | (u.parts32.w2 & 0x7fffffff) | u.parts32.w3)
+      if (((ix - 0x7ff00000) | lx) != 0)
 	return x + x;
     }
 
@@ -298,21 +297,20 @@ strong_alias (__ieee754_jnl, __jnl_finite)
 long double
 __ieee754_ynl (int n, long double x)
 {
-  u_int32_t se;
+  uint32_t se, lx;
   int32_t i, ix;
   int32_t sign;
   long double a, b, temp;
-  ieee854_long_double_shape_type u;
+  double xhi;
 
-  u.value = x;
-  se = u.parts32.w0;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (se, lx, xhi);
   ix = se & 0x7fffffff;
 
   /* if Y(n,NaN) is NaN */
   if (ix >= 0x7ff00000)
     {
-      if ((u.parts32.w0 & 0xfffff) | u.parts32.w1
-	  | (u.parts32.w2 & 0x7fffffff) | u.parts32.w3)
+      if (((ix - 0x7ff00000) | lx) != 0)
 	return x + x;
     }
   if (x <= 0.0L)
@@ -377,14 +375,16 @@ __ieee754_ynl (int n, long double x)
       a = __ieee754_y0l (x);
       b = __ieee754_y1l (x);
       /* quit if b is -inf */
-      u.value = b;
-      se = u.parts32.w0 & 0xfff00000;
+      xhi = ldbl_high (b);
+      GET_HIGH_WORD (se, xhi);
+      se &= 0xfff00000;
       for (i = 1; i < n && se != 0xfff00000; i++)
 	{
 	  temp = b;
 	  b = ((long double) (i + i) / x) * b - a;
-	  u.value = b;
-	  se = u.parts32.w0 & 0xfff00000;
+	  xhi = ldbl_high (b);
+	  GET_HIGH_WORD (se, xhi);
+	  se &= 0xfff00000;
 	  a = temp;
 	}
     }
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_log10l.c b/sysdeps/ieee754/ldbl-128ibm/e_log10l.c
index fae774c..1a6a4a0 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_log10l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_log10l.c
@@ -182,11 +182,13 @@ __ieee754_log10l (long double x)
   long double z;
   long double y;
   int e;
-  int64_t hx, lx;
+  int64_t hx;
+  double xhi;
 
 /* Test for domain */
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
-  if (((hx & 0x7fffffffffffffffLL) | (lx & 0x7fffffffffffffffLL)) == 0)
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+  if ((hx & 0x7fffffffffffffffLL) == 0)
     return (-1.0L / (x - x));
   if (hx < 0)
     return (x - x) / (x - x);
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_logl.c b/sysdeps/ieee754/ldbl-128ibm/e_logl.c
index 15b5edf..b7db2b9 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_logl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_logl.c
@@ -188,18 +188,20 @@ static const long double
 long double
 __ieee754_logl(long double x)
 {
-  long double z, y, w;
-  ieee854_long_double_shape_type u, t;
+  long double z, y, w, t;
   unsigned int m;
   int k, e;
+  double xhi;
+  uint32_t hx, lx;
 
-  u.value = x;
-  m = u.parts32.w0;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (hx, lx, xhi);
+  m = hx;
 
   /* Check for IEEE special cases.  */
   k = m & 0x7fffffff;
   /* log(0) = -infinity. */
-  if ((k | u.parts32.w1 | (u.parts32.w2 & 0x7fffffff) | u.parts32.w3) == 0)
+  if ((k | lx) == 0)
     {
       return -0.5L / ZERO;
     }
@@ -219,7 +221,7 @@ __ieee754_logl(long double x)
     {
       z = x - 1.0L;
       k = 64;
-      t.value  = 1.0L;
+      t = 1.0L;
       e = 0;
     }
   else
@@ -236,10 +238,8 @@ __ieee754_logl(long double x)
 	  k = (m - 0xff000) >> 13;
 	  /* t is the argument 0.5 + (k+26)/128
 	     of the nearest item to u in the lookup table.  */
-	  t.parts32.w0 = 0x3ff00000 + (k << 13);
-	  t.parts32.w1 = 0;
-	  t.parts32.w2 = 0;
-	  t.parts32.w3 = 0;
+	  INSERT_WORDS (xhi, 0x3ff00000 + (k << 13), 0);
+	  t = xhi;
 	  w0 += 0x100000;
 	  e -= 1;
 	  k += 64;
@@ -247,17 +247,15 @@ __ieee754_logl(long double x)
       else
 	{
 	  k = (m - 0xfe000) >> 14;
-	  t.parts32.w0 = 0x3fe00000 + (k << 14);
-	  t.parts32.w1 = 0;
-	  t.parts32.w2 = 0;
-	  t.parts32.w3 = 0;
+	  INSERT_WORDS (xhi, 0x3fe00000 + (k << 14), 0);
+	  t = xhi;
 	}
-      u.value = __scalbnl (u.value, ((int) ((w0 - u.parts32.w0) * 2)) >> 21);
+      x = __scalbnl (x, ((int) ((w0 - hx) * 2)) >> 21);
       /* log(u) = log( t u/t ) = log(t) + log(u/t)
 	 log(t) is tabulated in the lookup table.
 	 Express log(u/t) = log(1+z),  where z = u/t - 1 = (u-t)/t.
 	 cf. Cody & Waite. */
-      z = (u.value - t.value) / t.value;
+      z = (x - t) / t;
     }
   /* Series expansion of log(1+z).  */
   w = z * z;
@@ -284,7 +282,7 @@ __ieee754_logl(long double x)
   y += e * ln2b;  /* Base 2 exponent offset times ln(2).  */
   y += z;
   y += logtbl[k-26]; /* log(t) - (t-1) */
-  y += (t.value - 1.0L);
+  y += (t - 1.0L);
   y += e * ln2a;
   return y;
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_powl.c b/sysdeps/ieee754/ldbl-128ibm/e_powl.c
index 8bd35d0..c942f2f 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_powl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_powl.c
@@ -151,37 +151,32 @@ __ieee754_powl (long double x, long double y)
   long double y1, t1, t2, r, s, t, u, v, w;
   long double s2, s_h, s_l, t_h, t_l, ay;
   int32_t i, j, k, yisint, n;
-  u_int32_t ix, iy;
-  int32_t hx, hy;
-  ieee854_long_double_shape_type o, p, q;
+  uint32_t ix, iy;
+  int32_t hx, hy, hax;
+  double ohi, xhi, xlo, yhi, ylo;
+  uint32_t lx, ly, lj;
 
-  p.value = x;
-  hx = p.parts32.w0;
+  ldbl_unpack (x, &xhi, &xlo);
+  EXTRACT_WORDS (hx, lx, xhi);
   ix = hx & 0x7fffffff;
 
-  q.value = y;
-  hy = q.parts32.w0;
+  ldbl_unpack (y, &yhi, &ylo);
+  EXTRACT_WORDS (hy, ly, yhi);
   iy = hy & 0x7fffffff;
 
-
   /* y==zero: x**0 = 1 */
-  if ((iy | q.parts32.w1 | (q.parts32.w2 & 0x7fffffff) | q.parts32.w3) == 0)
+  if ((iy | ly) == 0)
     return one;
 
   /* 1.0**y = 1; -1.0**+-Inf = 1 */
   if (x == one)
     return one;
-  if (x == -1.0L && iy == 0x7ff00000
-      && (q.parts32.w1 | (q.parts32.w2 & 0x7fffffff) | q.parts32.w3) == 0)
+  if (x == -1.0L && ((iy - 0x7ff00000) | ly) == 0)
     return one;
 
   /* +-NaN return x+y */
-  if ((ix > 0x7ff00000)
-      || ((ix == 0x7ff00000)
-	  && ((p.parts32.w1 | (p.parts32.w2 & 0x7fffffff) | p.parts32.w3) != 0))
-      || (iy > 0x7ff00000)
-      || ((iy == 0x7ff00000)
-	  && ((q.parts32.w1 | (q.parts32.w2 & 0x7fffffff) | q.parts32.w3) != 0)))
+  if ((ix >= 0x7ff00000 && ((ix - 0x7ff00000) | lx) != 0)
+      || (iy >= 0x7ff00000 && ((iy - 0x7ff00000) | ly) != 0))
     return x + y;
 
   /* determine if y is an odd int when x < 0
@@ -192,7 +187,10 @@ __ieee754_powl (long double x, long double y)
   yisint = 0;
   if (hx < 0)
     {
-      if ((q.parts32.w2 & 0x7fffffff) >= 0x43400000)	/* Low part >= 2^53 */
+      uint32_t low_ye;
+
+      GET_HIGH_WORD (low_ye, ylo);
+      if ((low_ye & 0x7fffffff) >= 0x43400000)	/* Low part >= 2^53 */
 	yisint = 2;		/* even integer y */
       else if (iy >= 0x3ff00000)	/* 1.0 */
 	{
@@ -207,42 +205,43 @@ __ieee754_powl (long double x, long double y)
 	}
     }
 
+  ax = fabsl (x);
+
   /* special value of y */
-  if ((q.parts32.w1 | (q.parts32.w2 & 0x7fffffff) | q.parts32.w3) == 0)
+  if (ly == 0)
     {
-      if (iy == 0x7ff00000 && q.parts32.w1 == 0)	/* y is +-inf */
+      if (iy == 0x7ff00000)	/* y is +-inf */
 	{
-	  if (((ix - 0x3ff00000) | p.parts32.w1
-	       | (p.parts32.w2 & 0x7fffffff) | p.parts32.w3) == 0)
-	    return y - y;	/* inf**+-1 is NaN */
-	  else if (ix > 0x3ff00000 || fabsl (x) > 1.0L)
+	  if (ax > one)
 	    /* (|x|>1)**+-inf = inf,0 */
 	    return (hy >= 0) ? y : zero;
 	  else
 	    /* (|x|<1)**-,+inf = inf,0 */
 	    return (hy < 0) ? -y : zero;
 	}
-      if (iy == 0x3ff00000)
-	{			/* y is  +-1 */
-	  if (hy < 0)
-	    return one / x;
-	  else
-	    return x;
-	}
-      if (hy == 0x40000000)
-	return x * x;		/* y is  2 */
-      if (hy == 0x3fe00000)
-	{			/* y is  0.5 */
-	  if (hx >= 0)		/* x >= +0 */
-	    return __ieee754_sqrtl (x);
+      if (ylo == 0.0)
+	{
+	  if (iy == 0x3ff00000)
+	    {			/* y is  +-1 */
+	      if (hy < 0)
+		return one / x;
+	      else
+		return x;
+	    }
+	  if (hy == 0x40000000)
+	    return x * x;		/* y is  2 */
+	  if (hy == 0x3fe00000)
+	    {			/* y is  0.5 */
+	      if (hx >= 0)		/* x >= +0 */
+		return __ieee754_sqrtl (x);
+	    }
 	}
     }
 
-  ax = fabsl (x);
   /* special value of x */
-  if ((p.parts32.w1 | (p.parts32.w2 & 0x7fffffff) | p.parts32.w3) == 0)
+  if (lx == 0)
     {
-      if (ix == 0x7ff00000 || ix == 0 || ix == 0x3ff00000)
+      if (ix == 0x7ff00000 || ix == 0 || (ix == 0x3ff00000 && xlo == 0.0))
 	{
 	  z = ax;		/*x is +-0,+-inf,+-1 */
 	  if (hy < 0)
@@ -294,8 +293,8 @@ __ieee754_powl (long double x, long double y)
     {
       ax *= two113;
       n -= 113;
-      o.value = ax;
-      ix = o.parts32.w0;
+      ohi = ldbl_high (ax);
+      GET_HIGH_WORD (ix, ohi);
     }
   n += ((ix) >> 20) - 0x3ff;
   j = ix & 0x000fffff;
@@ -312,26 +311,19 @@ __ieee754_powl (long double x, long double y)
       ix -= 0x00100000;
     }
 
-  o.value = ax;
-  o.value = __scalbnl (o.value, ((int) ((ix - o.parts32.w0) * 2)) >> 21);
-  ax = o.value;
+  ohi = ldbl_high (ax);
+  GET_HIGH_WORD (hax, ohi);
+  ax = __scalbnl (ax, ((int) ((ix - hax) * 2)) >> 21);
 
   /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
   u = ax - bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
   v = one / (ax + bp[k]);
   s = u * v;
-  s_h = s;
+  s_h = ldbl_high (s);
 
-  o.value = s_h;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  s_h = o.value;
   /* t_h=ax+bp[k] High */
   t_h = ax + bp[k];
-  o.value = t_h;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  t_h = o.value;
+  t_h = ldbl_high (t_h);
   t_l = ax - (t_h - bp[k]);
   s_l = v * ((u - s_h * t_h) - s_h * t_l);
   /* compute log(ax) */
@@ -342,30 +334,21 @@ __ieee754_powl (long double x, long double y)
   r += s_l * (s_h + s);
   s2 = s_h * s_h;
   t_h = 3.0 + s2 + r;
-  o.value = t_h;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  t_h = o.value;
+  t_h = ldbl_high (t_h);
   t_l = r - ((t_h - 3.0) - s2);
   /* u+v = s*(1+...) */
   u = s_h * t_h;
   v = s_l * t_h + t_l * s;
   /* 2/(3log2)*(s+...) */
   p_h = u + v;
-  o.value = p_h;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  p_h = o.value;
+  p_h = ldbl_high (p_h);
   p_l = v - (p_h - u);
   z_h = cp_h * p_h;		/* cp_h+cp_l = 2/(3*log2) */
   z_l = cp_l * p_h + p_l * cp + dp_l[k];
   /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
   t = (long double) n;
   t1 = (((z_h + z_l) + dp_h[k]) + t);
-  o.value = t1;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  t1 = o.value;
+  t1 = ldbl_high (t1);
   t2 = z_l - (((t1 - t) - dp_h[k]) - z_h);
 
   /* s (sign of result -ve**odd) = -1 else = 1 */
@@ -374,21 +357,16 @@ __ieee754_powl (long double x, long double y)
     s = -one;			/* (-ve)**(odd int) */
 
   /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
-  y1 = y;
-  o.value = y1;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  y1 = o.value;
+  y1 = ldbl_high (y);
   p_l = (y - y1) * t1 + y * t2;
   p_h = y1 * t1;
   z = p_l + p_h;
-  o.value = z;
-  j = o.parts32.w0;
+  ohi = ldbl_high (z);
+  EXTRACT_WORDS (j, lj, ohi);
   if (j >= 0x40d00000) /* z >= 16384 */
     {
       /* if z > 16384 */
-      if (((j - 0x40d00000) | o.parts32.w1
-	| (o.parts32.w2 & 0x7fffffff) | o.parts32.w3) != 0)
+      if (((j - 0x40d00000) | lj) != 0)
 	return s * huge * huge;	/* overflow */
       else
 	{
@@ -399,8 +377,7 @@ __ieee754_powl (long double x, long double y)
   else if ((j & 0x7fffffff) >= 0x40d01b90)	/* z <= -16495 */
     {
       /* z < -16495 */
-      if (((j - 0xc0d01bc0) | o.parts32.w1
-	 | (o.parts32.w2 & 0x7fffffff) | o.parts32.w3) != 0)
+      if (((j - 0xc0d01bc0) | lj) != 0)
 	return s * tiny * tiny;	/* underflow */
       else
 	{
@@ -419,10 +396,7 @@ __ieee754_powl (long double x, long double y)
       p_h -= t;
     }
   t = p_l + p_h;
-  o.value = t;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  t = o.value;
+  t = ldbl_high (t);
   u = t * lg2_h;
   v = (p_l - (t - p_h)) * lg2 + t * lg2_l;
   z = u + v;
diff --git a/sysdeps/ieee754/ldbl-128ibm/k_tanl.c b/sysdeps/ieee754/ldbl-128ibm/k_tanl.c
index 1f6bad2..bcf8b5e 100644
--- a/sysdeps/ieee754/ldbl-128ibm/k_tanl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/k_tanl.c
@@ -85,17 +85,17 @@ long double
 __kernel_tanl (long double x, long double y, int iy)
 {
   long double z, r, v, w, s;
-  int32_t ix, sign;
-  ieee854_long_double_shape_type u, u1;
+  int32_t ix, sign, hx, lx;
+  double xhi;
 
-  u.value = x;
-  ix = u.parts32.w0 & 0x7fffffff;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (hx, lx, xhi);
+  ix = hx & 0x7fffffff;
   if (ix < 0x3c600000)		/* x < 2**-57 */
     {
-      if ((int) x == 0)
-	{			/* generate inexact */
-	  if ((ix | u.parts32.w1 | (u.parts32.w2 & 0x7fffffff) | u.parts32.w3
-	       | (iy + 1)) == 0)
+      if ((int) x == 0)		/* generate inexact */
+	{
+	  if ((ix | lx | (iy + 1)) == 0)
 	    return one / fabs (x);
 	  else
 	    return (iy == 1) ? x : -one / x;
@@ -103,7 +103,7 @@ __kernel_tanl (long double x, long double y, int iy)
     }
   if (ix >= 0x3fe59420) /* |x| >= 0.6743316650390625 */
     {
-      if ((u.parts32.w0 & 0x80000000) != 0)
+      if ((hx & 0x80000000) != 0)
 	{
 	  x = -x;
 	  y = -y;
@@ -139,15 +139,13 @@ __kernel_tanl (long double x, long double y, int iy)
     {				/* if allow error up to 2 ulp,
 				   simply return -1.0/(x+r) here */
       /*  compute -1.0/(x+r) accurately */
-      u1.value = w;
-      u1.parts32.w2 = 0;
-      u1.parts32.w3 = 0;
-      v = r - (u1.value - x);		/* u1+v = r+x */
+      long double u1, z1;
+
+      u1 = ldbl_high (w);
+      v = r - (u1 - x);		/* u1+v = r+x */
       z = -1.0 / w;
-      u.value = z;
-      u.parts32.w2 = 0;
-      u.parts32.w3 = 0;
-      s = 1.0 + u.value * u1.value;
-      return u.value + z * (s + u.value * v);
+      z1 = ldbl_high (z);
+      s = 1.0 + z1 * u1;
+      return z1 + z * (s + z1 * v);
     }
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_expm1l.c b/sysdeps/ieee754/ldbl-128ibm/s_expm1l.c
index 8808dcd..007e785 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_expm1l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_expm1l.c
@@ -92,19 +92,19 @@ long double
 __expm1l (long double x)
 {
   long double px, qx, xx;
-  int32_t ix, sign;
-  ieee854_long_double_shape_type u;
+  int32_t ix, lx, sign;
   int k;
+  double xhi;
 
   /* Detect infinity and NaN.  */
-  u.value = x;
-  ix = u.parts32.w0;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (ix, lx, xhi);
   sign = ix & 0x80000000;
   ix &= 0x7fffffff;
   if (ix >= 0x7ff00000)
     {
       /* Infinity. */
-      if (((ix & 0xfffff) | u.parts32.w1 | (u.parts32.w2&0x7fffffff) | u.parts32.w3) == 0)
+      if (((ix - 0x7ff00000) | lx) == 0)
 	{
 	  if (sign)
 	    return -1.0L;
@@ -116,7 +116,7 @@ __expm1l (long double x)
     }
 
   /* expm1(+- 0) = +- 0.  */
-  if ((ix == 0) && (u.parts32.w1 | (u.parts32.w2&0x7fffffff) | u.parts32.w3) == 0)
+  if ((ix | lx) == 0)
     return x;
 
   /* Overflow.  */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_frexpl.c b/sysdeps/ieee754/ldbl-128ibm/s_frexpl.c
index 3ac5374..7e40663 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_frexpl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_frexpl.c
@@ -36,16 +36,21 @@ two107 = 162259276829213363391578010288128.0; /* 0x4670000000000000, 0 */
 
 long double __frexpl(long double x, int *eptr)
 {
-	u_int64_t hx, lx, ix, ixl;
+	uint64_t hx, lx, ix, ixl;
 	int64_t explo;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	ixl = 0x7fffffffffffffffULL&lx;
 	ix =  0x7fffffffffffffffULL&hx;
 	*eptr = 0;
-	if(ix>=0x7ff0000000000000ULL||((ix|ixl)==0)) return x;	/* 0,inf,nan */
+	if(ix>=0x7ff0000000000000ULL||ix==0) return x;	/* 0,inf,nan */
 	if (ix<0x0010000000000000ULL) {		/* subnormal */
 	    x *= two107;
-	    GET_LDOUBLE_MSW64(hx,x);
+	    xhi = ldbl_high (x);
+	    EXTRACT_WORDS64 (hx, xhi);
 	    ix = hx&0x7fffffffffffffffULL;
 	    *eptr = -107;
 	}
@@ -54,7 +59,7 @@ long double __frexpl(long double x, int *eptr)
 	if (ixl != 0ULL) {
 	  explo = (ixl>>52) - (ix>>52) + 0x3fe;
 	  if ((ixl&0x7ff0000000000000ULL) == 0LL) {
-	    /* the lower double is a denomal so we need to correct its
+	    /* the lower double is a denormal so we need to correct its
 	       mantissa and perhaps its exponent.  */
 	    int cnt;
 
@@ -73,7 +78,9 @@ long double __frexpl(long double x, int *eptr)
 	  lx = 0ULL;
 
 	hx = (hx&0x800fffffffffffffULL) | 0x3fe0000000000000ULL;
-	SET_LDOUBLE_WORDS64(x,hx,lx);
+	INSERT_WORDS64 (xhi, hx);
+	INSERT_WORDS64 (xlo, lx);
+	x = ldbl_pack (xhi, xlo);
 	return x;
 }
 #ifdef IS_IN_libm
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_isinf_nsl.c b/sysdeps/ieee754/ldbl-128ibm/s_isinf_nsl.c
index c8dd9ff..54e72c9 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_isinf_nsl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_isinf_nsl.c
@@ -1,6 +1,7 @@
 /*
  * __isinf_nsl(x) returns != 0 if x is ±inf, else 0;
  * no branching!
+ * slightly dodgy in relying on signed shift right copying sign bit
  */
 
 #include <math.h>
@@ -9,8 +10,14 @@
 int
 __isinf_nsl (long double x)
 {
-	int64_t hx,lx;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	return !((lx & 0x7fffffffffffffffLL)
-		 | ((hx & 0x7fffffffffffffffLL) ^ 0x7ff0000000000000LL));
+  double xhi;
+  int64_t hx, mask;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+
+  mask = (hx & 0x7fffffffffffffffLL) ^ 0x7ff0000000000000LL;
+  mask |= -mask;
+  mask >>= 63;
+  return ~mask;
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_isinfl.c b/sysdeps/ieee754/ldbl-128ibm/s_isinfl.c
index 5f5b014..6a72822 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_isinfl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_isinfl.c
@@ -11,6 +11,7 @@ static char rcsid[] = "$NetBSD: $";
 /*
  * isinfl(x) returns 1 if x is inf, -1 if x is -inf, else 0;
  * no branching!
+ * slightly dodgy in relying on signed shift right copying sign bit
  */
 
 #include <math.h>
@@ -20,12 +21,16 @@ static char rcsid[] = "$NetBSD: $";
 int
 ___isinfl (long double x)
 {
-	int64_t hx,lx;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	lx = (lx & 0x7fffffffffffffffLL);
-	lx |= (hx & 0x7fffffffffffffffLL) ^ 0x7ff0000000000000LL;
-	lx |= -lx;
-	return ~(lx >> 63) & (hx >> 62);
+  double xhi;
+  int64_t hx, mask;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+
+  mask = (hx & 0x7fffffffffffffffLL) ^ 0x7ff0000000000000LL;
+  mask |= -mask;
+  mask >>= 63;
+  return ~mask & (hx >> 62);
 }
 hidden_ver (___isinfl, __isinfl)
 #ifndef IS_IN_libm
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_log1pl.c b/sysdeps/ieee754/ldbl-128ibm/s_log1pl.c
index 77c4fde..a346383 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_log1pl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_log1pl.c
@@ -126,19 +126,18 @@ long double
 __log1pl (long double xm1)
 {
   long double x, y, z, r, s;
-  ieee854_long_double_shape_type u;
-  int32_t hx;
+  double xhi;
+  int32_t hx, lx;
   int e;
 
   /* Test for NaN or infinity input. */
-  u.value = xm1;
-  hx = u.parts32.w0;
+  xhi = ldbl_high (xm1);
+  EXTRACT_WORDS (hx, lx, xhi);
   if (hx >= 0x7ff00000)
     return xm1;
 
   /* log1p(+- 0) = +- 0.  */
-  if (((hx & 0x7fffffff) == 0)
-      && (u.parts32.w1 | (u.parts32.w2 & 0x7fffffff) | u.parts32.w3) == 0)
+  if (((hx & 0x7fffffff) | lx) == 0)
     return xm1;
 
   x = xm1 + 1.0L;
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_modfl.c b/sysdeps/ieee754/ldbl-128ibm/s_modfl.c
index 39de9d4..ed03ce2 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_modfl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_modfl.c
@@ -37,43 +37,54 @@ long double __modfl(long double x, long double *iptr)
 {
 	int64_t i0,i1,j0;
 	u_int64_t i;
-	GET_LDOUBLE_WORDS64(i0,i1,x);
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (i0, xhi);
+	EXTRACT_WORDS64 (i1, xlo);
 	i1 &= 0x000fffffffffffffLL;
 	j0 = ((i0>>52)&0x7ff)-0x3ff;	/* exponent of x */
 	if(j0<52) {			/* integer part in high x */
 	    if(j0<0) {			/* |x|<1 */
 		/* *iptr = +-0 */
-	        SET_LDOUBLE_WORDS64(*iptr,i0&0x8000000000000000ULL,0);
+		INSERT_WORDS64 (xhi, i0&0x8000000000000000ULL);
+		*iptr = xhi;
 		return x;
 	    } else {
 		i = (0x000fffffffffffffLL)>>j0;
 		if(((i0&i)|(i1&0x7fffffffffffffffLL))==0) {		/* x is integral */
 		    *iptr = x;
 		    /* return +-0 */
-		    SET_LDOUBLE_WORDS64(x,i0&0x8000000000000000ULL,0);
+		    INSERT_WORDS64 (xhi, i0&0x8000000000000000ULL);
+		    x = xhi;
 		    return x;
 		} else {
-		    SET_LDOUBLE_WORDS64(*iptr,i0&(~i),0);
+		    INSERT_WORDS64 (xhi, i0&(~i));
+		    *iptr = xhi;
 		    return x - *iptr;
 		}
 	    }
 	} else if (j0>103) {		/* no fraction part */
 	    *iptr = x*one;
 	    /* We must handle NaNs separately.  */
-	    if (j0 == 0x400 && ((i0 & 0x000fffffffffffffLL) | i1))
+	    if ((i0 & 0x7fffffffffffffffLL) > 0x7ff0000000000000LL)
 	      return x*one;
 	    /* return +-0 */
-	    SET_LDOUBLE_WORDS64(x,i0&0x8000000000000000ULL,0);
+	    INSERT_WORDS64 (xhi, i0&0x8000000000000000ULL);
+	    x = xhi;
 	    return x;
 	} else {			/* fraction part in low x */
 	    i = -1ULL>>(j0-52);
 	    if((i1&i)==0) { 		/* x is integral */
 		*iptr = x;
 		/* return +-0 */
-		SET_LDOUBLE_WORDS64(x,i0&0x8000000000000000ULL,0);
+		INSERT_WORDS64 (xhi, i0&0x8000000000000000ULL);
+		x = xhi;
 		return x;
 	    } else {
-		SET_LDOUBLE_WORDS64(*iptr,i0,i1&(~i));
+		INSERT_WORDS64 (xhi, i0);
+		INSERT_WORDS64 (xlo, i1&(~i));
+		*iptr = ldbl_pack (xhi, xlo);
 		return x - *iptr;
 	    }
 	}
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_nextafterl.c b/sysdeps/ieee754/ldbl-128ibm/s_nextafterl.c
index 7e58127..c050944 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_nextafterl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_nextafterl.c
@@ -30,27 +30,28 @@ static char rcsid[] = "$NetBSD: $";
 
 long double __nextafterl(long double x, long double y)
 {
-	int64_t hx,hy,ihx,ihy,ilx;
-	u_int64_t lx;
-	u_int64_t ly __attribute__ ((unused));
+	int64_t hx,hy,ihx,ihy;
+	uint64_t lx;
+	double xhi, xlo, yhi;
 
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	GET_LDOUBLE_WORDS64(hy,ly,y);
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
+	yhi = ldbl_high (y);
+	EXTRACT_WORDS64 (hy, yhi);
 	ihx = hx&0x7fffffffffffffffLL;		/* |hx| */
-	ilx = lx&0x7fffffffffffffffLL;		/* |lx| */
 	ihy = hy&0x7fffffffffffffffLL;		/* |hy| */
 
-	if((((ihx&0x7ff0000000000000LL)==0x7ff0000000000000LL)&&
-	    ((ihx&0x000fffffffffffffLL)!=0)) ||   /* x is nan */
-	   (((ihy&0x7ff0000000000000LL)==0x7ff0000000000000LL)&&
-	    ((ihy&0x000fffffffffffffLL)!=0)))     /* y is nan */
+	if((ihx>0x7ff0000000000000LL) ||	/* x is nan */
+	   (ihy>0x7ff0000000000000LL))		/* y is nan */
 	    return x+y; /* signal the nan */
 	if(x==y)
 	    return y;		/* x=y, return y */
-	if(ihx == 0 && ilx == 0) {			/* x == 0 */
-	    long double u;
+	if(ihx == 0) {				/* x == 0 */
+	    long double u;			/* return +-minsubnormal */
 	    hy = (hy & 0x8000000000000000ULL) | 1;
-	    SET_LDOUBLE_WORDS64(x,hy,0ULL);/* return +-minsubnormal */
+	    INSERT_WORDS64 (yhi, hy);
+	    x = yhi;
 	    u = math_opt_barrier (x);
 	    u = u * u;
 	    math_force_eval (u);		/* raise underflow flag */
@@ -59,10 +60,16 @@ long double __nextafterl(long double x, long double y)
 
 	long double u;
 	if(x > y) {	/* x > y, x -= ulp */
+	    /* This isn't the largest magnitude correctly rounded
+	       long double as you can see from the lowest mantissa
+	       bit being zero.  It is however the largest magnitude
+	       long double with a 106 bit mantissa, and nextafterl
+	       is insane with variable precision.  So to make
+	       nextafterl sane we assume 106 bit precision.  */
 	    if((hx==0xffefffffffffffffLL)&&(lx==0xfc8ffffffffffffeLL))
 	      return x+x;	/* overflow, return -inf */
 	    if (hx >= 0x7ff0000000000000LL) {
-	      SET_LDOUBLE_WORDS64(u,0x7fefffffffffffffLL,0x7c8ffffffffffffeLL);
+	      u = 0x1.fffffffffffff7ffffffffffff8p+1023L;
 	      return u;
 	    }
 	    if(ihx <= 0x0360000000000000LL) {  /* x <= LDBL_MIN */
@@ -77,16 +84,19 @@ long double __nextafterl(long double x, long double y)
 	      return x;
 	    }
 	    if (ihx < 0x06a0000000000000LL) { /* ulp will denormal */
-	      SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL),0ULL);
+	      INSERT_WORDS64 (yhi, hx & (0x7ffLL<<52));
+	      u = yhi;
 	      u *= 0x1.0000000000000p-105L;
-	    } else
-	      SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL)-0x0690000000000000LL,0ULL);
+	    } else {
+	      INSERT_WORDS64 (yhi, (hx & (0x7ffLL<<52))-(0x069LL<<52));
+	      u = yhi;
+	    }
 	    return x - u;
 	} else {				/* x < y, x += ulp */
 	    if((hx==0x7fefffffffffffffLL)&&(lx==0x7c8ffffffffffffeLL))
 	      return x+x;	/* overflow, return +inf */
-	    if ((u_int64_t) hx >= 0xfff0000000000000ULL) {
-	      SET_LDOUBLE_WORDS64(u,0xffefffffffffffffLL,0xfc8ffffffffffffeLL);
+	    if ((uint64_t) hx >= 0xfff0000000000000ULL) {
+	      u = -0x1.fffffffffffff7ffffffffffff8p+1023L;
 	      return u;
 	    }
 	    if(ihx <= 0x0360000000000000LL) {  /* x <= LDBL_MIN */
@@ -103,10 +113,13 @@ long double __nextafterl(long double x, long double y)
 	      return x;
 	    }
 	    if (ihx < 0x06a0000000000000LL) { /* ulp will denormal */
-	      SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL),0ULL);
+	      INSERT_WORDS64 (yhi, hx & (0x7ffLL<<52));
+	      u = yhi;
 	      u *= 0x1.0000000000000p-105L;
-	    } else
-	      SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL)-0x0690000000000000LL,0ULL);
+	    } else {
+	      INSERT_WORDS64 (yhi, (hx & (0x7ffLL<<52))-(0x069LL<<52));
+	      u = yhi;
+	    }
 	    return x + u;
 	}
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_nexttoward.c b/sysdeps/ieee754/ldbl-128ibm/s_nexttoward.c
index 7e288a4..b40cf16 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_nexttoward.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_nexttoward.c
@@ -34,23 +34,22 @@ double __nexttoward(double x, long double y)
 {
 	int32_t hx,ix;
 	int64_t hy,iy;
-	u_int32_t lx;
-	u_int64_t ly,uly;
+	uint32_t lx;
+	double yhi;
 
 	EXTRACT_WORDS(hx,lx,x);
-	GET_LDOUBLE_WORDS64(hy,ly,y);
+	yhi = ldbl_high (y);
+	EXTRACT_WORDS64(hy,yhi);
 	ix = hx&0x7fffffff;		/* |x| */
 	iy = hy&0x7fffffffffffffffLL;	/* |y| */
-	uly = ly&0x7fffffffffffffffLL;	/* |y| */
 
 	if(((ix>=0x7ff00000)&&((ix-0x7ff00000)|lx)!=0) ||   /* x is nan */
-	   ((iy>=0x7ff0000000000000LL)&&((iy-0x7ff0000000000000LL)|uly)!=0))
-							    /* y is nan */
+	   iy>0x7ff0000000000000LL)			    /* y is nan */
 	   return x+y;
 	if((long double) x==y) return y;	/* x=y, return y */
 	if((ix|lx)==0) {			/* x == 0 */
 	    double u;
-	    INSERT_WORDS(x,(u_int32_t)((hy>>32)&0x80000000),1);/* return +-minsub */
+	    INSERT_WORDS(x,(uint32_t)((hy>>32)&0x80000000),1);/* return +-minsub */
 	    u = math_opt_barrier (x);
 	    u = u * u;
 	    math_force_eval (u);		/* raise underflow flag */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c b/sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c
index b387a91..19522f4 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c
@@ -27,16 +27,16 @@ float __nexttowardf(float x, long double y)
 {
 	int32_t hx,ix;
 	int64_t hy,iy;
-	u_int64_t ly, uly;
+	double yhi;
 
 	GET_FLOAT_WORD(hx,x);
-	GET_LDOUBLE_WORDS64(hy,ly,y);
+	yhi = ldbl_high (y);
+	EXTRACT_WORDS64 (hy, yhi);
 	ix = hx&0x7fffffff;		/* |x| */
 	iy = hy&0x7fffffffffffffffLL;	/* |y| */
-	uly = ly&0x7fffffffffffffffLL;	/* |y| */
 
 	if((ix>0x7f800000) ||   /* x is nan */
-	   ((iy>=0x7ff0000000000000LL)&&((iy-0x7ff0000000000000LL)|uly)!=0))
+	   (iy>0x7ff0000000000000LL))
 				/* y is nan */
 	   return x+y;
 	if((long double) x==y) return y;	/* x=y, return y */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_remquol.c b/sysdeps/ieee754/ldbl-128ibm/s_remquol.c
index f4777a0..195e108 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_remquol.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_remquol.c
@@ -33,20 +33,24 @@ __remquol (long double x, long double y, int *quo)
   int64_t hx,hy;
   u_int64_t sx,lx,ly,qs;
   int cquo;
-
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
-  GET_LDOUBLE_WORDS64 (hy, ly, y);
+  double xhi, xlo, yhi, ylo;
+
+  ldbl_unpack (x, &xhi, &xlo);
+  EXTRACT_WORDS64 (hx, xhi);
+  EXTRACT_WORDS64 (lx, xlo);
+  ldbl_unpack (y, &yhi, &ylo);
+  EXTRACT_WORDS64 (hy, yhi);
+  EXTRACT_WORDS64 (ly, ylo);
   sx = hx & 0x8000000000000000ULL;
   qs = sx ^ (hy & 0x8000000000000000ULL);
   hy &= 0x7fffffffffffffffLL;
   hx &= 0x7fffffffffffffffLL;
 
   /* Purge off exception values.  */
-  if ((hy | (ly & 0x7fffffffffffffff)) == 0)
+  if (hy == 0)
     return (x * y) / (x * y); 			/* y = 0 */
   if ((hx >= 0x7ff0000000000000LL)		/* x not finite */
-      || ((hy >= 0x7ff0000000000000LL)		/* y is NaN */
-	  && (((hy - 0x7ff0000000000000LL) | ly) != 0)))
+      || (hy > 0x7ff0000000000000LL))		/* y is NaN */
     return (x * y) / (x * y);
 
   if (hy <= 0x7fbfffffffffffffLL)
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_scalblnl.c b/sysdeps/ieee754/ldbl-128ibm/s_scalblnl.c
index d752568..03d4597 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_scalblnl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_scalblnl.c
@@ -41,11 +41,15 @@ long double __scalblnl (long double x, long int n)
 {
 	int64_t k,l,hx,lx;
 	union { int64_t i; double d; } u;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	k = (hx>>52)&0x7ff;		/* extract exponent */
 	l = (lx>>52)&0x7ff;
 	if (k==0) {				/* 0 or subnormal x */
-	    if (((hx|lx)&0x7fffffffffffffffULL)==0) return x; /* +-0 */
+	    if ((hx&0x7fffffffffffffffULL)==0) return x; /* +-0 */
 	    u.i = hx;
 	    u.d *= two54;
 	    hx = u.i;
@@ -61,7 +65,9 @@ long double __scalblnl (long double x, long int n)
 	if (k > 0) {				/* normal result */
 	    hx = (hx&0x800fffffffffffffULL)|(k<<52);
 	    if ((lx & 0x7fffffffffffffffULL) == 0) { /* low part +-0 */
-		SET_LDOUBLE_WORDS64(x,hx,lx);
+		INSERT_WORDS64 (xhi, hx);
+		INSERT_WORDS64 (xlo, lx);
+		x = ldbl_pack (xhi, xlo);
 		return x;
 	    }
 	    if (l == 0) { /* low part subnormal */
@@ -81,14 +87,19 @@ long double __scalblnl (long double x, long int n)
 		u.d *= twom54;
 		lx = u.i;
 	    }
-	    SET_LDOUBLE_WORDS64(x,hx,lx);
+	    INSERT_WORDS64 (xhi, hx);
+	    INSERT_WORDS64 (xlo, lx);
+	    x = ldbl_pack (xhi, xlo);
 	    return x;
 	}
 	if (k <= -54)
 	  return tiny*__copysignl(tiny,x); 	/*underflow*/
 	k += 54;				/* subnormal result */
 	lx &= 0x8000000000000000ULL;
-	SET_LDOUBLE_WORDS64(x,(hx&0x800fffffffffffffULL)|(k<<52),lx);
+	hx &= 0x800fffffffffffffULL;
+	INSERT_WORDS64 (xhi, hx|(k<<52));
+	INSERT_WORDS64 (xlo, lx);
+	x = ldbl_pack (xhi, xlo);
 	return x*twolm54;
 }
 long_double_symbol (libm, __scalblnl, scalblnl);
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_scalbnl.c b/sysdeps/ieee754/ldbl-128ibm/s_scalbnl.c
index bcdb23b..161172d 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_scalbnl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_scalbnl.c
@@ -41,11 +41,15 @@ long double __scalbnl (long double x, int n)
 {
 	int64_t k,l,hx,lx;
 	union { int64_t i; double d; } u;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	k = (hx>>52)&0x7ff;		/* extract exponent */
 	l = (lx>>52)&0x7ff;
 	if (k==0) {				/* 0 or subnormal x */
-	    if (((hx|lx)&0x7fffffffffffffffULL)==0) return x; /* +-0 */
+	    if ((hx&0x7fffffffffffffffULL)==0) return x; /* +-0 */
 	    u.i = hx;
 	    u.d *= two54;
 	    hx = u.i;
@@ -61,7 +65,9 @@ long double __scalbnl (long double x, int n)
 	if (k > 0) {				/* normal result */
 	    hx = (hx&0x800fffffffffffffULL)|(k<<52);
 	    if ((lx & 0x7fffffffffffffffULL) == 0) { /* low part +-0 */
-		SET_LDOUBLE_WORDS64(x,hx,lx);
+		INSERT_WORDS64 (xhi, hx);
+		INSERT_WORDS64 (xlo, lx);
+		x = ldbl_pack (xhi, xlo);
 		return x;
 	    }
 	    if (l == 0) { /* low part subnormal */
@@ -81,14 +87,19 @@ long double __scalbnl (long double x, int n)
 		u.d *= twom54;
 		lx = u.i;
 	    }
-	    SET_LDOUBLE_WORDS64(x,hx,lx);
+	    INSERT_WORDS64 (xhi, hx);
+	    INSERT_WORDS64 (xlo, lx);
+	    x = ldbl_pack (xhi, xlo);
 	    return x;
 	}
 	if (k <= -54)
 	  return tiny*__copysignl(tiny,x); 	/*underflow*/
 	k += 54;				/* subnormal result */
 	lx &= 0x8000000000000000ULL;
-	SET_LDOUBLE_WORDS64(x,(hx&0x800fffffffffffffULL)|(k<<52),lx);
+	hx &= 0x800fffffffffffffULL;
+	INSERT_WORDS64 (xhi, hx|(k<<52));
+	INSERT_WORDS64 (xlo, lx);
+	x = ldbl_pack (xhi, xlo);
 	return x*twolm54;
 }
 #ifdef IS_IN_libm
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_tanhl.c b/sysdeps/ieee754/ldbl-128ibm/s_tanhl.c
index 138b63c..c63e253 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_tanhl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_tanhl.c
@@ -47,10 +47,12 @@ static const long double one=1.0L, two=2.0L, tiny = 1.0e-300L;
 long double __tanhl(long double x)
 {
 	long double t,z;
-	int64_t jx,ix,lx;
+	int64_t jx,ix;
+	double xhi;
 
     /* High word of |x|. */
-	GET_LDOUBLE_WORDS64(jx,lx,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (jx, xhi);
 	ix = jx&0x7fffffffffffffffLL;
 
     /* x is INF or NaN */
@@ -61,7 +63,7 @@ long double __tanhl(long double x)
 
     /* |x| < 22 */
 	if (ix < 0x4036000000000000LL) {		/* |x|<22 */
-	    if ((ix | (lx&0x7fffffffffffffffLL)) == 0)
+	    if (ix == 0)
 		return x;		/* x == +-0 */
 	    if (ix<0x3c60000000000000LL) 	/* |x|<2**-57 */
 		return x*(one+x);    	/* tanh(small) = small */
diff --git a/sysdeps/powerpc/fpu/libm-test-ulps b/sysdeps/powerpc/fpu/libm-test-ulps
index 74365f0..37b2ca1 100644
--- a/sysdeps/powerpc/fpu/libm-test-ulps
+++ b/sysdeps/powerpc/fpu/libm-test-ulps
@@ -6640,6 +6640,9 @@ float: 1
 ifloat: 1
 ildouble: 2
 ldouble: 2
+Test "tan_towardzero (2)":
+ildouble: 1
+ldouble: 1
 Test "tan_towardzero (3)":
 float: 1
 ifloat: 1

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c3a280ad2b872926de77f0ada0740ed819374ba1

commit c3a280ad2b872926de77f0ada0740ed819374ba1
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:33:02 2013 -0500

    PowerPC: floating point little-endian
    
    This patch replaces occurrences of GET_LDOUBLE_* and SET_LDOUBLE_*
    macros, and union ieee854_long_double_shape_type in ldbl-128ibm/,
    and a stray one in the 32-bit fpu support.  These files have no
    significant changes apart from rewriting the long double bit access.
    
    In the following I make use of the fact that casting a long double to
    double retrieves the high double, rather than calling ldbl_unpack
    everywhere.
    
    I generally resisted the urge to make formatting fixes in this series
    of patches, thinking that would be better as a separate patch.
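
    A minimal sketch of that idea (assuming an IBM double-double long
    double target; the ldbl_high macro added to math_ldbl.h below is
    exactly such a cast):

        #include <stdio.h>

        int
        main (void)
        {
          long double x = 1.0L + 0x1p-80L;  /* high half 1.0, low half 2^-80 */
          double hi = (double) x;           /* high double: no unpack needed */
          double lo = (double) (x - hi);    /* low double, the second half of the pair */
          printf ("hi=%a lo=%a\n", hi, lo);
          return 0;
        }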

diff --git a/sysdeps/ieee754/ldbl-128ibm/e_acoshl.c b/sysdeps/ieee754/ldbl-128ibm/e_acoshl.c
index abc78a3..8a4a5bb 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_acoshl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_acoshl.c
@@ -36,8 +36,12 @@ __ieee754_acoshl(long double x)
 {
 	long double t;
 	int64_t hx;
-	u_int64_t lx;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	uint64_t lx;
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	if(hx<0x3ff0000000000000LL) {		/* x < 1 */
 	    return (x-x)/(x-x);
 	} else if(hx >=0x41b0000000000000LL) {	/* x > 2**28 */
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_acosl.c b/sysdeps/ieee754/ldbl-128ibm/e_acosl.c
index 5d2af30..8663993 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_acosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_acosl.c
@@ -151,26 +151,25 @@ static const long double
 long double
 __ieee754_acosl (long double x)
 {
-  long double z, r, w, p, q, s, t, f2;
-  ieee854_long_double_shape_type u;
+  long double a, z, r, w, p, q, s, t, f2;
 
-  u.value = __builtin_fabsl (x);
-  if (u.value == 1.0L)
+  a = __builtin_fabsl (x);
+  if (a == 1.0L)
     {
       if (x > 0.0L)
 	return 0.0;		/* acos(1) = 0  */
       else
 	return (2.0 * pio2_hi) + (2.0 * pio2_lo);	/* acos(-1)= pi */
     }
-  else if (u.value > 1.0L)
+  else if (a > 1.0L)
     {
       return (x - x) / (x - x);	/* acos(|x| > 1) is NaN */
     }
-  if (u.value < 0.5L)
+  if (a < 0.5L)
     {
-      if (u.value < 6.938893903907228e-18L)	/* |x| < 2**-57 */
+      if (a < 6.938893903907228e-18L)	/* |x| < 2**-57 */
 	return pio2_hi + pio2_lo;
-      if (u.value < 0.4375L)
+      if (a < 0.4375L)
 	{
 	  /* Arcsine of x.  */
 	  z = x * x;
@@ -199,7 +198,7 @@ __ieee754_acosl (long double x)
 	  return z;
 	}
       /* .4375 <= |x| < .5 */
-      t = u.value - 0.4375L;
+      t = a - 0.4375L;
       p = ((((((((((P10 * t
 		    + P9) * t
 		   + P8) * t
@@ -230,9 +229,9 @@ __ieee754_acosl (long double x)
 	r = acosr4375 + r;
       return r;
     }
-  else if (u.value < 0.625L)
+  else if (a < 0.625L)
     {
-      t = u.value - 0.5625L;
+      t = a - 0.5625L;
       p = ((((((((((rS10 * t
 		    + rS9) * t
 		   + rS8) * t
@@ -264,7 +263,9 @@ __ieee754_acosl (long double x)
     }
   else
     {				/* |x| >= .625 */
-      z = (one - u.value) * 0.5;
+      double shi, slo;
+
+      z = (one - a) * 0.5;
       s = __ieee754_sqrtl (z);
       /* Compute an extended precision square root from
 	 the Newton iteration  s -> 0.5 * (s + z / s).
@@ -273,12 +274,11 @@ __ieee754_acosl (long double x)
 	  Express s = f1 + f2 where f1 * f1 is exactly representable.
 	  w = (z - s^2)/2s = (z - f1^2 - 2 f1 f2 - f2^2)/2s .
 	  s + w has extended precision.  */
-      u.value = s;
-      u.parts32.w2 = 0;
-      u.parts32.w3 = 0;
-      f2 = s - u.value;
-      w = z - u.value * u.value;
-      w = w - 2.0 * u.value * f2;
+      ldbl_unpack (s, &shi, &slo);
+      a = shi;
+      f2 = slo;
+      w = z - a * a;
+      w = w - 2.0 * a * f2;
       w = w - f2 * f2;
       w = w / (2.0 * s);
       /* Arcsine of s.  */
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_asinl.c b/sysdeps/ieee754/ldbl-128ibm/e_asinl.c
index b395439..99a5b85 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_asinl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_asinl.c
@@ -131,19 +131,18 @@ static const long double
 long double
 __ieee754_asinl (long double x)
 {
-  long double t, w, p, q, c, r, s;
+  long double a, t, w, p, q, c, r, s;
   int flag;
-  ieee854_long_double_shape_type u;
 
   flag = 0;
-  u.value = __builtin_fabsl (x);
-  if (u.value == 1.0L)	/* |x|>= 1 */
+  a = __builtin_fabsl (x);
+  if (a == 1.0L)	/* |x|>= 1 */
     return x * pio2_hi + x * pio2_lo;	/* asin(1)=+-pi/2 with inexact */
-  else if (u.value >= 1.0L)
+  else if (a >= 1.0L)
     return (x - x) / (x - x);	/* asin(|x|>1) is NaN */
-  else if (u.value < 0.5L)
+  else if (a < 0.5L)
     {
-      if (u.value < 6.938893903907228e-18L) /* |x| < 2**-57 */
+      if (a < 6.938893903907228e-18L) /* |x| < 2**-57 */
 	{
 	  if (huge + x > one)
 	    return x;		/* return x with inexact if x!=0 */
@@ -155,9 +154,9 @@ __ieee754_asinl (long double x)
 	  flag = 1;
 	}
     }
-  else if (u.value < 0.625L)
+  else if (a < 0.625L)
     {
-      t = u.value - 0.5625;
+      t = a - 0.5625;
       p = ((((((((((rS10 * t
 		    + rS9) * t
 		   + rS8) * t
@@ -190,7 +189,7 @@ __ieee754_asinl (long double x)
   else
     {
       /* 1 > |x| >= 0.625 */
-      w = one - u.value;
+      w = one - a;
       t = w * 0.5;
     }
 
@@ -223,17 +222,14 @@ __ieee754_asinl (long double x)
     }
 
   s = __ieee754_sqrtl (t);
-  if (u.value > 0.975L)
+  if (a > 0.975L)
     {
       w = p / q;
       t = pio2_hi - (2.0 * (s + s * w) - pio2_lo);
     }
   else
     {
-      u.value = s;
-      u.parts32.w3 = 0;
-      u.parts32.w2 = 0;
-      w = u.value;
+      w = ldbl_high (s);
       c = (t - w * w) / (s + w);
       r = p / q;
       p = 2.0 * s * r - (pio2_lo - 2.0 * c);
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_atanhl.c b/sysdeps/ieee754/ldbl-128ibm/e_atanhl.c
index f35182f..29f2e92 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_atanhl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_atanhl.c
@@ -40,8 +40,10 @@ __ieee754_atanhl(long double x)
 {
 	long double t;
 	int64_t hx,ix;
-	u_int64_t lx __attribute__ ((unused));
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	double xhi;
+
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (hx, xhi);
 	ix = hx&0x7fffffffffffffffLL;
 	if (ix >= 0x3ff0000000000000LL) { /* |x|>=1 */
 	    if (ix > 0x3ff0000000000000LL)
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_coshl.c b/sysdeps/ieee754/ldbl-128ibm/e_coshl.c
index 3e8e187..05683bc 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_coshl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_coshl.c
@@ -41,9 +41,11 @@ __ieee754_coshl (long double x)
 {
 	long double t,w;
 	int64_t ix;
+	double xhi;
 
     /* High word of |x|. */
-	GET_LDOUBLE_MSW64(ix,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (ix, xhi);
 	ix &= 0x7fffffffffffffffLL;
 
     /* x is INF or NaN */
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_log2l.c b/sysdeps/ieee754/ldbl-128ibm/e_log2l.c
index f0098f6..323ded0 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_log2l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_log2l.c
@@ -177,11 +177,13 @@ __ieee754_log2l (x)
   long double z;
   long double y;
   int e;
-  int64_t hx, lx;
+  int64_t hx;
+  double xhi;
 
 /* Test for domain */
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
-  if (((hx & 0x7fffffffffffffffLL) | (lx & 0x7fffffffffffffffLL)) == 0)
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+  if ((hx & 0x7fffffffffffffffLL) == 0)
     return (-1.0L / (x - x));
   if (hx < 0)
     return (x - x) / (x - x);
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c b/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
index 8885def..36bc032 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
@@ -200,10 +200,11 @@ int32_t __ieee754_rem_pio2l(long double x, long double *y)
   double tx[8];
   int exp;
   int64_t n, ix, hx, ixd;
-  u_int64_t lx __attribute__ ((unused));
   u_int64_t lxd;
+  double xhi;
 
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
   ix = hx & 0x7fffffffffffffffLL;
   if (ix <= 0x3fe921fb54442d10LL)	/* x in <-pi/4, pi/4> */
     {
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_sinhl.c b/sysdeps/ieee754/ldbl-128ibm/e_sinhl.c
index 4e8481c..1790bef 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_sinhl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_sinhl.c
@@ -38,9 +38,11 @@ __ieee754_sinhl(long double x)
 {
 	long double t,w,h;
 	int64_t ix,jx;
+	double xhi;
 
     /* High word of |x|. */
-	GET_LDOUBLE_MSW64(jx,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (jx, xhi);
 	ix = jx&0x7fffffffffffffffLL;
 
     /* x is INF or NaN */
diff --git a/sysdeps/ieee754/ldbl-128ibm/k_cosl.c b/sysdeps/ieee754/ldbl-128ibm/k_cosl.c
index 0b81782..046f3b5 100644
--- a/sysdeps/ieee754/ldbl-128ibm/k_cosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/k_cosl.c
@@ -81,8 +81,11 @@ __kernel_cosl(long double x, long double y)
 {
   long double h, l, z, sin_l, cos_l_m1;
   int64_t ix;
-  u_int32_t tix, hix, index;
-  GET_LDOUBLE_MSW64 (ix, x);
+  uint32_t tix, hix, index;
+  double xhi, hhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (ix, xhi);
   tix = ((u_int64_t)ix) >> 32;
   tix &= ~0x80000000;			/* tix = |x|'s high 32 bits */
   if (tix < 0x3fc30000)			/* |x| < 0.1484375 */
@@ -136,7 +139,8 @@ __kernel_cosl(long double x, long double y)
 	case 2: index = (hix - 0x3fc30000) >> 14; break;
 	}
 */
-      SET_LDOUBLE_WORDS64(h, ((u_int64_t)hix) << 32, 0);
+      INSERT_WORDS64 (hhi, ((uint64_t)hix) << 32);
+      h = hhi;
       l = y - (h - x);
       z = l * l;
       sin_l = l*(ONE+z*(SSIN1+z*(SSIN2+z*(SSIN3+z*(SSIN4+z*SSIN5)))));
diff --git a/sysdeps/ieee754/ldbl-128ibm/k_sincosl.c b/sysdeps/ieee754/ldbl-128ibm/k_sincosl.c
index fc1ead6..3ba9d7e 100644
--- a/sysdeps/ieee754/ldbl-128ibm/k_sincosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/k_sincosl.c
@@ -100,9 +100,12 @@ __kernel_sincosl(long double x, long double y, long double *sinx, long double *c
 {
   long double h, l, z, sin_l, cos_l_m1;
   int64_t ix;
-  u_int32_t tix, hix, index;
-  GET_LDOUBLE_MSW64 (ix, x);
-  tix = ((u_int64_t)ix) >> 32;
+  uint32_t tix, hix, index;
+  double xhi, hhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (ix, xhi);
+  tix = ((uint64_t)ix) >> 32;
   tix &= ~0x80000000;			/* tix = |x|'s high 32 bits */
   if (tix < 0x3fc30000)			/* |x| < 0.1484375 */
     {
@@ -164,7 +167,8 @@ __kernel_sincosl(long double x, long double y, long double *sinx, long double *c
 	case 2: index = (hix - 0x3fc30000) >> 14; break;
 	}
 */
-      SET_LDOUBLE_WORDS64(h, ((u_int64_t)hix) << 32, 0);
+      INSERT_WORDS64 (hhi, ((uint64_t)hix) << 32);
+      h = hhi;
       if (iy)
 	l = y - (h - x);
       else
diff --git a/sysdeps/ieee754/ldbl-128ibm/k_sinl.c b/sysdeps/ieee754/ldbl-128ibm/k_sinl.c
index f17c0ae..b12ea13 100644
--- a/sysdeps/ieee754/ldbl-128ibm/k_sinl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/k_sinl.c
@@ -82,7 +82,10 @@ __kernel_sinl(long double x, long double y, int iy)
   long double h, l, z, sin_l, cos_l_m1;
   int64_t ix;
   u_int32_t tix, hix, index;
-  GET_LDOUBLE_MSW64 (ix, x);
+  double xhi, hhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (ix, xhi);
   tix = ((u_int64_t)ix) >> 32;
   tix &= ~0x80000000;			/* tix = |x|'s high 32 bits */
   if (tix < 0x3fc30000)			/* |x| < 0.1484375 */
@@ -132,7 +135,8 @@ __kernel_sinl(long double x, long double y, int iy)
 	case 2: index = (hix - 0x3fc30000) >> 14; break;
 	}
 */
-      SET_LDOUBLE_WORDS64(h, ((u_int64_t)hix) << 32, 0);
+      INSERT_WORDS64 (hhi, ((uint64_t)hix) << 32);
+      h = hhi;
       if (iy)
 	l = (ix < 0 ? -y : y) - (h - x);
       else
diff --git a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
index 69bb2e6..d69d344 100644
--- a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
+++ b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
@@ -192,6 +192,9 @@ default_ldbl_unpack (long double l, double *a, double *aa)
 # define ldbl_unpack default_ldbl_unpack
 #endif
 
+/* Extract high double.  */
+#define ldbl_high(x) ((double) x)
+
 /* Convert a finite long double to canonical form.
    Does not handle +/-Inf properly.  */
 static inline void
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_asinhl.c b/sysdeps/ieee754/ldbl-128ibm/s_asinhl.c
index a833457..63c6edb 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_asinhl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_asinhl.c
@@ -38,7 +38,10 @@ long double __asinhl(long double x)
 {
 	long double t,w;
 	int64_t hx,ix;
-	GET_LDOUBLE_MSW64(hx,x);
+	double xhi;
+
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (hx, xhi);
 	ix = hx&0x7fffffffffffffffLL;
 	if(ix>=0x7ff0000000000000LL) return x+x;	/* x is inf or NaN */
 	if(ix< 0x3e20000000000000LL) {	/* |x|<2**-29 */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_atanl.c b/sysdeps/ieee754/ldbl-128ibm/s_atanl.c
index 2a36d16..41dde23 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_atanl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_atanl.c
@@ -173,23 +173,20 @@ static const long double
 long double
 __atanl (long double x)
 {
-  int k, sign;
+  int32_t k, sign, lx;
   long double t, u, p, q;
-  ieee854_long_double_shape_type s;
+  double xhi;
 
-  s.value = x;
-  k = s.parts32.w0;
-  if (k & 0x80000000)
-    sign = 1;
-  else
-    sign = 0;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (k, lx, xhi);
+  sign = k & 0x80000000;
 
   /* Check for IEEE special cases.  */
   k &= 0x7fffffff;
   if (k >= 0x7ff00000)
     {
       /* NaN. */
-      if ((k & 0xfffff) | s.parts32.w1 )
+      if (((k - 0x7ff00000) | lx) != 0)
 	return (x + x);
 
       /* Infinity. */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_cosl.c b/sysdeps/ieee754/ldbl-128ibm/s_cosl.c
index 2314839..54c6cc7 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_cosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_cosl.c
@@ -53,9 +53,11 @@ long double __cosl(long double x)
 {
 	long double y[2],z=0.0L;
 	int64_t n, ix;
+	double xhi;
 
     /* High word of x. */
-	GET_LDOUBLE_MSW64(ix,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (ix, xhi);
 
     /* |x| ~< pi/4 */
 	ix &= 0x7fffffffffffffffLL;
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_fabsl.c b/sysdeps/ieee754/ldbl-128ibm/s_fabsl.c
index 99146d8..c801c97 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_fabsl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_fabsl.c
@@ -29,10 +29,16 @@ static char rcsid[] = "$NetBSD: $";
 long double __fabsl(long double x)
 {
 	u_int64_t hx, lx;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	lx = lx ^ ( hx & 0x8000000000000000LL );
 	hx = hx & 0x7fffffffffffffffLL;
-	SET_LDOUBLE_WORDS64(x,hx,lx);
+	INSERT_WORDS64 (xhi, hx);
+	INSERT_WORDS64 (xlo, lx);
+	x = ldbl_pack (xhi, xlo);
 	return x;
 }
 long_double_symbol (libm, __fabsl, fabsl);
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_finitel.c b/sysdeps/ieee754/ldbl-128ibm/s_finitel.c
index 8edb341..7b4655f 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_finitel.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_finitel.c
@@ -29,10 +29,14 @@ static char rcsid[] = "$NetBSD: $";
 int
 ___finitel (long double x)
 {
-	int64_t hx;
-	GET_LDOUBLE_MSW64(hx,x);
-	return (int)((u_int64_t)((hx&0x7fffffffffffffffLL)
-				 -0x7ff0000000000000LL)>>63);
+  uint64_t hx;
+  double xhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+  hx &= 0x7fffffffffffffffLL;
+  hx -= 0x7ff0000000000000LL;
+  return hx >> 63;
 }
 hidden_ver (___finitel, __finitel)
 weak_alias (___finitel, ____finitel)
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_fpclassifyl.c b/sysdeps/ieee754/ldbl-128ibm/s_fpclassifyl.c
index f4a90b0..90586e8 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_fpclassifyl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_fpclassifyl.c
@@ -46,8 +46,10 @@ ___fpclassifyl (long double x)
 {
   u_int64_t hx, lx;
   int retval = FP_NORMAL;
+  double xhi, xlo;
 
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
+  ldbl_unpack (x, &xhi, &xlo);
+  EXTRACT_WORDS64 (hx, xhi);
   if ((hx & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) {
       /* +/-NaN or +/-Inf */
       if (hx & 0x000fffffffffffffULL) {
@@ -65,6 +67,7 @@ ___fpclassifyl (long double x)
 	      retval = FP_NORMAL;
 	  } else {
 	      if ((hx & 0x7ff0000000000000ULL) == 0x0360000000000000ULL) {
+		  EXTRACT_WORDS64 (lx, xlo);
 		  if ((lx & 0x7fffffffffffffff)	/* lower is non-zero */
 		  && ((lx^hx) & 0x8000000000000000ULL)) { /* and sign differs */
 		      /* +/- denormal */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_isnanl.c b/sysdeps/ieee754/ldbl-128ibm/s_isnanl.c
index 264dec7..d12f1d3 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_isnanl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_isnanl.c
@@ -29,12 +29,14 @@ static char rcsid[] = "$NetBSD: $";
 int
 ___isnanl (long double x)
 {
-	int64_t hx;
-	int64_t lx __attribute__ ((unused));
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	hx &= 0x7fffffffffffffffLL;
-	hx = 0x7ff0000000000000LL - hx;
-	return (int)((u_int64_t)hx>>63);
+  uint64_t hx;
+  double xhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+  hx &= 0x7fffffffffffffffLL;
+  hx = 0x7ff0000000000000LL - hx;
+  return (int) (hx >> 63);
 }
 hidden_ver (___isnanl, __isnanl)
 #ifndef IS_IN_libm
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_issignalingl.c b/sysdeps/ieee754/ldbl-128ibm/s_issignalingl.c
index 96fab1a..bdd58f8 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_issignalingl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_issignalingl.c
@@ -22,10 +22,13 @@
 int
 __issignalingl (long double x)
 {
-  u_int64_t xi;
+  uint64_t xi;
   /* For inspecting NaN status, we only have to look at the first of the pair
      of IEEE 754 64-bit precision numbers.  */
-  GET_LDOUBLE_MSW64 (xi, x);
+  double xhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (xi, xhi);
 #ifdef HIGH_ORDER_BIT_IS_SET_FOR_SNAN
 # error untested
   /* We only have to care about the high-order bit of x's significand, because
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_logbl.c b/sysdeps/ieee754/ldbl-128ibm/s_logbl.c
index 6cbfcfa..e140288 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_logbl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_logbl.c
@@ -27,9 +27,10 @@ long double
 __logbl (long double x)
 {
   int64_t hx, rhx;
-  int64_t lx __attribute__ ((unused));
+  double xhi;
 
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
   hx &= 0x7fffffffffffffffLL;	/* high |x| */
   if (hx == 0)
     return -1.0 / fabs (x);
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_signbitl.c b/sysdeps/ieee754/ldbl-128ibm/s_signbitl.c
index ee4aea6..aecb1fd 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_signbitl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_signbitl.c
@@ -25,8 +25,10 @@ int
 ___signbitl (long double x)
 {
   int64_t e;
+  double xhi;
 
-  GET_LDOUBLE_MSW64 (e, x);
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (e, xhi);
   return e < 0;
 }
 #ifdef IS_IN_libm
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_sincosl.c b/sysdeps/ieee754/ldbl-128ibm/s_sincosl.c
index 3b1e547..a9e2f3d 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_sincosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_sincosl.c
@@ -27,9 +27,11 @@ void
 __sincosl (long double x, long double *sinx, long double *cosx)
 {
   int64_t ix;
+  double xhi;
 
   /* High word of x. */
-  GET_LDOUBLE_MSW64 (ix, x);
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (ix, xhi);
 
   /* |x| ~< pi/4 */
   ix &= 0x7fffffffffffffffLL;
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_sinl.c b/sysdeps/ieee754/ldbl-128ibm/s_sinl.c
index 6fec16f..087921a 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_sinl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_sinl.c
@@ -53,9 +53,11 @@ long double __sinl(long double x)
 {
 	long double y[2],z=0.0L;
 	int64_t n, ix;
+	double xhi;
 
     /* High word of x. */
-	GET_LDOUBLE_MSW64(ix,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (ix, xhi);
 
     /* |x| ~< pi/4 */
 	ix &= 0x7fffffffffffffffLL;
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_tanl.c b/sysdeps/ieee754/ldbl-128ibm/s_tanl.c
index 9967d0c..66b8a06 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_tanl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_tanl.c
@@ -53,9 +53,11 @@ long double __tanl(long double x)
 {
 	long double y[2],z=0.0L;
 	int64_t n, ix;
+	double xhi;
 
     /* High word of x. */
-	GET_LDOUBLE_MSW64(ix,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (ix, xhi);
 
     /* |x| ~< pi/4 */
 	ix &= 0x7fffffffffffffffLL;
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c b/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
index e008ed0..1c82577 100644
--- a/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
@@ -35,14 +35,14 @@ static const union {
 long double
 __logbl (long double x)
 {
-  double xh, xl;
+  double xh;
   double ret;
 
   if (__builtin_expect (x == 0.0L, 0))
     /* Raise FE_DIVBYZERO and return -HUGE_VAL[LF].  */
     return -1.0L / __builtin_fabsl (x);
 
-  ldbl_unpack (x, &xh, &xl);
+  xh = ldbl_high (x);
   /* ret = x & 0x7ff0000000000000;  */
   asm (
     "xxland %x0,%x1,%x2\n"
@@ -58,9 +58,9 @@ __logbl (long double x)
     {
       /* POSIX specifies that denormal number is treated as
          though it were normalized.  */
-      int64_t lx, hx;
+      int64_t hx;
 
-      GET_LDOUBLE_WORDS64 (hx, lx, x);
+      EXTRACT_WORDS64 (hx, xh);
       return (long double) (-1023 - (__builtin_clzll (hx) - 12));
     }
   /* Test to avoid logb_downward (0.0) == -0.0.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=407c444872942dc7232c09d0c3c73618807ade11

commit 407c444872942dc7232c09d0c3c73618807ade11
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:22:08 2013 -0500

    PowerPC: floating point little-endian
    
    This is the first of a series of patches to ban ieee854_long_double
    and the ieee854_long_double macros when using IBM long double.  union
    ieee854_long_double just isn't correct for IBM long double, especially
    when little-endian, and pretending it is OK has allowed a number of
    bugs to remain undetected in sysdeps/ieee754/ldbl-128ibm/.
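
    For illustration only (a sketch assuming an IBM double-double long
    double; the type below is hypothetical, not a glibc type): the value
    is a pair of IEEE doubles rather than a single 113-bit significand,
    so the natural overlay is two doubles or two 64-bit words, and the
    sign/exponent/mantissa bitfields of union ieee854_long_double describe
    bits that are not there; the mismatch gets worse once each 64-bit
    half is byte-swapped on little-endian:

        #include <stdint.h>

        /* Hypothetical overlay: an IBM long double is two doubles stored
           back to back, with the more significant double first on powerpc,
           so these two views line up with the format, while a binary128
           style bitfield layout does not.  */
        union ibm128_view
        {
          long double ld;
          double d[2];        /* d[0] = high double, d[1] = low double */
          uint64_t w[2];      /* raw words of each half, endian-dependent */
        };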

diff --git a/math/test-misc.c b/math/test-misc.c
index 27d673b..f5276eb 100644
--- a/math/test-misc.c
+++ b/math/test-misc.c
@@ -722,300 +722,161 @@ main (void)
 
 #ifndef NO_LONG_DOUBLE
   {
-    union ieee854_long_double v1;
-    union ieee854_long_double v2;
-    long double ld;
+    long double v1, v2;
 
-    v1.d = ld = LDBL_MIN;
-    if (fpclassify (ld) != FP_NORMAL)
+    v1 = LDBL_MIN;
+    if (fpclassify (v1) != FP_NORMAL)
       {
-	printf ("fpclassify (LDBL_MIN) failed: %d\n", fpclassify (ld));
+	printf ("fpclassify (LDBL_MIN) failed: %d (%La)\n",
+		fpclassify (v1), v1);
 	result = 1;
       }
-    ld = nextafterl (ld, LDBL_MIN / 2.0);
-    if (fpclassify (ld) != FP_SUBNORMAL)
+    v2 = nextafterl (v1, LDBL_MIN / 2.0);
+    if (fpclassify (v2) != FP_SUBNORMAL)
       {
 	printf ("fpclassify (LDBL_MIN-epsilon) failed: %d (%La)\n",
-		fpclassify (ld), ld);
+		fpclassify (v2), v2);
 	result = 1;
       }
-    v2.d = ld = nextafterl (ld, LDBL_MIN);
-    if (fpclassify (ld) != FP_NORMAL)
+    v2 = nextafterl (v2, LDBL_MIN);
+    if (fpclassify (v2) != FP_NORMAL)
       {
 	printf ("fpclassify (LDBL_MIN-epsilon+epsilon) failed: %d (%La)\n",
-		fpclassify (ld), ld);
+		fpclassify (v2), v2);
 	result = 1;
       }
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
+    if (v1 != v2)
       {
-	printf ("LDBL_MIN: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("LDBL_MIN: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
-      {
-	printf ("LDBL_MIN: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
-      {
-	printf ("LDBL_MIN: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("LDBL_MIN-epsilon+epsilon != LDBL_MIN: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    v1.d = ld = -LDBL_MIN;
-    if (fpclassify (ld) != FP_NORMAL)
+    v1 = -LDBL_MIN;
+    if (fpclassify (v1) != FP_NORMAL)
       {
-	printf ("fpclassify (-LDBL_MIN) failed: %d\n", fpclassify (ld));
+	printf ("fpclassify (-LDBL_MIN) failed: %d (%La)\n",
+		fpclassify (v1), v1);
 	result = 1;
       }
-    ld = nextafterl (ld, -LDBL_MIN / 2.0);
-    if (fpclassify (ld) != FP_SUBNORMAL)
+    v2 = nextafterl (v1, -LDBL_MIN / 2.0);
+    if (fpclassify (v2) != FP_SUBNORMAL)
       {
 	printf ("fpclassify (-LDBL_MIN-epsilon) failed: %d (%La)\n",
-		fpclassify (ld), ld);
+		fpclassify (v2), v2);
 	result = 1;
       }
-    v2.d = ld = nextafterl (ld, -LDBL_MIN);
-    if (fpclassify (ld) != FP_NORMAL)
+    v2 = nextafterl (v2, -LDBL_MIN);
+    if (fpclassify (v2) != FP_NORMAL)
       {
 	printf ("fpclassify (-LDBL_MIN-epsilon+epsilon) failed: %d (%La)\n",
-		fpclassify (ld), ld);
+		fpclassify (v2), v2);
 	result = 1;
       }
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("-LDBL_MIN: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("-LDBL_MIN: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
+    if (v1 != v2)
       {
-	printf ("-LDBL_MIN: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
-      {
-	printf ("-LDBL_MIN: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("-LDBL_MIN-epsilon+epsilon != -LDBL_MIN: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    ld = LDBL_MAX;
-    if (fpclassify (ld) != FP_NORMAL)
+    v1 = LDBL_MAX;
+    if (fpclassify (v1) != FP_NORMAL)
       {
-	printf ("fpclassify (LDBL_MAX) failed: %d\n", fpclassify (ld));
+	printf ("fpclassify (LDBL_MAX) failed: %d (%La)\n",
+		fpclassify (v1), v1);
 	result = 1;
       }
-    ld = nextafterl (ld, INFINITY);
-    if (fpclassify (ld) != FP_INFINITE)
+    v2 = nextafterl (v1, INFINITY);
+    if (fpclassify (v2) != FP_INFINITE)
       {
-	printf ("fpclassify (LDBL_MAX+epsilon) failed: %d\n", fpclassify (ld));
+	printf ("fpclassify (LDBL_MAX+epsilon) failed: %d (%La)\n",
+		fpclassify (v2), v2);
 	result = 1;
       }
 
-    ld = -LDBL_MAX;
-    if (fpclassify (ld) != FP_NORMAL)
+    v1 = -LDBL_MAX;
+    if (fpclassify (v1) != FP_NORMAL)
       {
-	printf ("fpclassify (-LDBL_MAX) failed: %d\n", fpclassify (ld));
+	printf ("fpclassify (-LDBL_MAX) failed: %d (%La)\n",
+		fpclassify (v1), v1);
 	result = 1;
       }
-    ld = nextafterl (ld, -INFINITY);
-    if (fpclassify (ld) != FP_INFINITE)
+    v2 = nextafterl (v1, -INFINITY);
+    if (fpclassify (v2) != FP_INFINITE)
       {
-	printf ("fpclassify (-LDBL_MAX-epsilon) failed: %d\n",
-		fpclassify (ld));
+	printf ("fpclassify (-LDBL_MAX-epsilon) failed: %d (%La)\n",
+		fpclassify (v2), v2);
 	result = 1;
       }
 
-    v1.d = ld = 0.0625;
-    ld = nextafterl (ld, 0.0);
-    v2.d = ld = nextafterl (ld, 1.0);
+    v1 = 0.0625;
+    v2 = nextafterl (v1, 0.0);
+    v2 = nextafterl (v2, 1.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("0.0625L down: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("0.0625L down: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
-      {
-	printf ("0.0625L down: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
+    if (v1 != v2)
       {
-	printf ("0.0625L down: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("0.0625L-epsilon+epsilon != 0.0625L: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    v1.d = ld = 0.0625;
-    ld = nextafterl (ld, 1.0);
-    v2.d = ld = nextafterl (ld, 0.0);
+    v1 = 0.0625;
+    v2 = nextafterl (v1, 1.0);
+    v2 = nextafterl (v2, 0.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("0.0625L up: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("0.0625L up: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
-      {
-	printf ("0.0625L up: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
+    if (v1 != v2)
       {
-	printf ("0.0625L up: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("0.0625L+epsilon-epsilon != 0.0625L: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    v1.d = ld = -0.0625;
-    ld = nextafterl (ld, 0.0);
-    v2.d = ld = nextafterl (ld, -1.0);
+    v1 = -0.0625;
+    v2 = nextafterl (v1, 0.0);
+    v2 = nextafterl (v2, -1.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
+    if (v1 != v2)
       {
-	printf ("-0.0625L up: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("-0.0625L up: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
-      {
-	printf ("-0.0625L up: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
-      {
-	printf ("-0.0625L up: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("-0.0625L+epsilon-epsilon != -0.0625L: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    v1.d = ld = -0.0625;
-    ld = nextafterl (ld, -1.0);
-    v2.d = ld = nextafterl (ld, 0.0);
+    v1 = -0.0625;
+    v2 = nextafterl (v1, -1.0);
+    v2 = nextafterl (v2, 0.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("-0.0625L down: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("-0.0625L down: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
+    if (v1 != v2)
       {
-	printf ("-0.0625L down: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
-      {
-	printf ("-0.0625L down: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("-0.0625L-epsilon+epsilon != -0.0625L: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    v1.d = ld = 0.0;
-    ld = nextafterl (ld, 1.0);
-    v2.d = nextafterl (ld, -1.0);
+    v1 = 0.0;
+    v2 = nextafterl (v1, 1.0);
+    v2 = nextafterl (v2, -1.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("0.0L up: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("0.0L up: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
+    if (v1 != v2)
       {
-	printf ("0.0L up: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
+	printf ("0.0+epsilon-epsilon != 0.0L: %La vs %La\n", v2, v1);
 	result = 1;
       }
-    if (0 != v2.ieee.negative)
+    if (signbit (v2))
       {
-	printf ("0.0L up: negative differs: 0 vs %d\n",
-		v2.ieee.negative);
+	printf ("0.0+epsilon-epsilon is negative\n");
 	result = 1;
       }
 
-    v1.d = ld = 0.0;
-    ld = nextafterl (ld, -1.0);
-    v2.d = nextafterl (ld, 1.0);
+    v1 = 0.0;
+    v2 = nextafterl (v1, -1.0);
+    v2 = nextafterl (v2, 1.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("0.0L down: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
+    if (v1 != v2)
       {
-	printf ("0.0L down: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
-      {
-	printf ("0.0L down: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
+	printf ("0.0-epsilon+epsilon != 0.0L: %La vs %La\n", v2, v1);
 	result = 1;
       }
-    if (1 != v2.ieee.negative)
+    if (!signbit (v2))
       {
-	printf ("0.0L down: negative differs: 1 vs %d\n",
-		v2.ieee.negative);
+	printf ("0.0-epsilon+epsilon is positive\n");
 	result = 1;
       }
 
diff --git a/ports/sysdeps/ia64/fpu/printf_fphex.c b/ports/sysdeps/ia64/fpu/printf_fphex.c
index ca02142..0698cda 100644
--- a/ports/sysdeps/ia64/fpu/printf_fphex.c
+++ b/ports/sysdeps/ia64/fpu/printf_fphex.c
@@ -25,9 +25,11 @@ do {									      \
       /* The "strange" 80 bit format on ia64 has an explicit		      \
 	 leading digit in the 64 bit mantissa.  */			      \
       unsigned long long int num;					      \
+      union ieee854_long_double u;					      \
+      u.d = fpnum.ldbl;							      \
 									      \
-      num = (((unsigned long long int) fpnum.ldbl.ieee.mantissa0) << 32	      \
-	     | fpnum.ldbl.ieee.mantissa1);				      \
+      num = (((unsigned long long int) u.ieee.mantissa0) << 32		      \
+	     | u.ieee.mantissa1);					      \
 									      \
       zero_mantissa = num == 0;						      \
 									      \
@@ -49,8 +51,8 @@ do {									      \
 									      \
       /* We have 3 bits from the mantissa in the leading nibble.	      \
 	 Therefore we are here using `IEEE854_LONG_DOUBLE_BIAS + 3'.  */      \
-      exponent = fpnum.ldbl.ieee.exponent;				      \
-									     \
+      exponent = u.ieee.exponent;					      \
+									      \
       if (exponent == 0)						      \
 	{								      \
 	  if (zero_mantissa)						      \
diff --git a/stdio-common/printf_fp.c b/stdio-common/printf_fp.c
index e20eab6..2b93e6c 100644
--- a/stdio-common/printf_fp.c
+++ b/stdio-common/printf_fp.c
@@ -332,8 +332,7 @@ ___printf_fp (FILE *fp,
       int res;
       if (__isnanl (fpnum.ldbl))
 	{
-	  union ieee854_long_double u = { .d = fpnum.ldbl };
-	  is_neg = u.ieee.negative != 0;
+	  is_neg = signbit (fpnum.ldbl);
 	  if (isupper (info->spec))
 	    {
 	      special = "NAN";
diff --git a/stdio-common/printf_fphex.c b/stdio-common/printf_fphex.c
index 3da2eec..50b6fbf 100644
--- a/stdio-common/printf_fphex.c
+++ b/stdio-common/printf_fphex.c
@@ -93,7 +93,7 @@ __printf_fphex (FILE *fp,
   union
     {
       union ieee754_double dbl;
-      union ieee854_long_double ldbl;
+      long double ldbl;
     }
   fpnum;
 
@@ -162,12 +162,11 @@ __printf_fphex (FILE *fp,
 #ifndef __NO_LONG_DOUBLE_MATH
   if (info->is_long_double && sizeof (long double) > sizeof (double))
     {
-      fpnum.ldbl.d = *(const long double *) args[0];
+      fpnum.ldbl = *(const long double *) args[0];
 
       /* Check for special values: not a number or infinity.  */
-      if (__isnanl (fpnum.ldbl.d))
+      if (__isnanl (fpnum.ldbl))
 	{
-	  negative = fpnum.ldbl.ieee.negative != 0;
 	  if (isupper (info->spec))
 	    {
 	      special = "NAN";
@@ -181,8 +180,7 @@ __printf_fphex (FILE *fp,
 	}
       else
 	{
-	  int res = __isinfl (fpnum.ldbl.d);
-	  if (res)
+	  if (__isinfl (fpnum.ldbl))
 	    {
 	      if (isupper (info->spec))
 		{
@@ -194,11 +192,9 @@ __printf_fphex (FILE *fp,
 		  special = "inf";
 		  wspecial = L"inf";
 		}
-	      negative = res < 0;
 	    }
-	  else
-	    negative = signbit (fpnum.ldbl.d);
 	}
+      negative = signbit (fpnum.ldbl);
     }
   else
 #endif	/* no long double */
diff --git a/stdio-common/printf_size.c b/stdio-common/printf_size.c
index 2c496e5..dfb3a53 100644
--- a/stdio-common/printf_size.c
+++ b/stdio-common/printf_size.c
@@ -103,7 +103,7 @@ __printf_size (FILE *fp, const struct printf_info *info,
   union
     {
       union ieee754_double dbl;
-      union ieee854_long_double ldbl;
+      long double ldbl;
     }
   fpnum;
   const void *ptr = &fpnum;
@@ -123,25 +123,25 @@ __printf_size (FILE *fp, const struct printf_info *info,
 #ifndef __NO_LONG_DOUBLE_MATH
   if (info->is_long_double && sizeof (long double) > sizeof (double))
     {
-      fpnum.ldbl.d = *(const long double *) args[0];
+      fpnum.ldbl = *(const long double *) args[0];
 
       /* Check for special values: not a number or infinity.  */
-      if (__isnanl (fpnum.ldbl.d))
+      if (__isnanl (fpnum.ldbl))
 	{
 	  special = "nan";
 	  wspecial = L"nan";
 	  // fpnum_sign = 0;	Already zero
 	}
-      else if ((res = __isinfl (fpnum.ldbl.d)))
+      else if ((res = __isinfl (fpnum.ldbl)))
 	{
 	  fpnum_sign = res;
 	  special = "inf";
 	  wspecial = L"inf";
 	}
       else
-	while (fpnum.ldbl.d >= divisor && tag[1] != '\0')
+	while (fpnum.ldbl >= divisor && tag[1] != '\0')
 	  {
-	    fpnum.ldbl.d /= divisor;
+	    fpnum.ldbl /= divisor;
 	    ++tag;
 	  }
     }
diff --git a/sysdeps/ieee754/ldbl-128/printf_fphex.c b/sysdeps/ieee754/ldbl-128/printf_fphex.c
index c9e09a4..e82228a 100644
--- a/sysdeps/ieee754/ldbl-128/printf_fphex.c
+++ b/sysdeps/ieee754/ldbl-128/printf_fphex.c
@@ -24,13 +24,15 @@ do {									      \
 	 digits we use only the implicit digits for the number before	      \
 	 the decimal point.  */						      \
       unsigned long long int num0, num1;				      \
+      union ieee854_long_double u;					      \
+      u.d = fpnum.ldbl;							      \
 									      \
       assert (sizeof (long double) == 16);				      \
 									      \
-      num0 = (((unsigned long long int) fpnum.ldbl.ieee.mantissa0) << 32      \
-	     | fpnum.ldbl.ieee.mantissa1);				      \
-      num1 = (((unsigned long long int) fpnum.ldbl.ieee.mantissa2) << 32      \
-	     | fpnum.ldbl.ieee.mantissa3);				      \
+      num0 = (((unsigned long long int) u.ieee.mantissa0) << 32		      \
+	     | u.ieee.mantissa1);					      \
+      num1 = (((unsigned long long int) u.ieee.mantissa2) << 32		      \
+	     | u.ieee.mantissa3);					      \
 									      \
       zero_mantissa = (num0|num1) == 0;					      \
 									      \
@@ -75,9 +77,9 @@ do {									      \
 	  *--wnumstr = L'0';						      \
 	}								      \
 									      \
-      leading = fpnum.ldbl.ieee.exponent == 0 ? '0' : '1';		      \
+      leading = u.ieee.exponent == 0 ? '0' : '1';			      \
 									      \
-      exponent = fpnum.ldbl.ieee.exponent;				      \
+      exponent = u.ieee.exponent;					      \
 									      \
       if (exponent == 0)						      \
 	{								      \
diff --git a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
index 453c2be..3a7a308 100644
--- a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
+++ b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
@@ -27,7 +27,7 @@ do {									      \
       unsigned long long hi, lo;					      \
       int ediff;							      \
       union ibm_extended_long_double u;					      \
-      u.ld = fpnum.ldbl.d;						      \
+      u.ld = fpnum.ldbl;						      \
 									      \
       assert (sizeof (long double) == 16);				      \
 									      \
diff --git a/sysdeps/ieee754/ldbl-96/printf_fphex.c b/sysdeps/ieee754/ldbl-96/printf_fphex.c
index f356a48..715c93b 100644
--- a/sysdeps/ieee754/ldbl-96/printf_fphex.c
+++ b/sysdeps/ieee754/ldbl-96/printf_fphex.c
@@ -25,11 +25,13 @@ do {									      \
       /* The "strange" 80 bit format on ix86 and m68k has an explicit	      \
 	 leading digit in the 64 bit mantissa.  */			      \
       unsigned long long int num;					      \
+      union ieee854_long_double u;					      \
+      u.d = fpnum.ldbl;							      \
 									      \
       assert (sizeof (long double) == 12);				      \
 									      \
-      num = (((unsigned long long int) fpnum.ldbl.ieee.mantissa0) << 32	      \
-	     | fpnum.ldbl.ieee.mantissa1);				      \
+      num = (((unsigned long long int) u.ieee.mantissa0) << 32		      \
+	     | u.ieee.mantissa1);					      \
 									      \
       zero_mantissa = num == 0;						      \
 									      \
@@ -62,7 +64,7 @@ do {									      \
 									      \
       /* We have 3 bits from the mantissa in the leading nibble.	      \
 	 Therefore we are here using `IEEE854_LONG_DOUBLE_BIAS + 3'.  */      \
-      exponent = fpnum.ldbl.ieee.exponent;				      \
+      exponent = u.ieee.exponent;					      \
 									      \
       if (exponent == 0)						      \
 	{								      \
diff --git a/sysdeps/x86_64/fpu/printf_fphex.c b/sysdeps/x86_64/fpu/printf_fphex.c
index c85d1f7..be55f9c 100644
--- a/sysdeps/x86_64/fpu/printf_fphex.c
+++ b/sysdeps/x86_64/fpu/printf_fphex.c
@@ -25,10 +25,11 @@ do {									      \
       /* The "strange" 80 bit format on ix86 and m68k has an explicit	      \
 	 leading digit in the 64 bit mantissa.  */			      \
       unsigned long long int num;					      \
+      union ieee854_long_double u;					      \
+      u.d = fpnum.ldbl;							      \
 									      \
-									      \
-      num = (((unsigned long long int) fpnum.ldbl.ieee.mantissa0) << 32	      \
-	     | fpnum.ldbl.ieee.mantissa1);				      \
+      num = (((unsigned long long int) u.ieee.mantissa0) << 32		      \
+	     | u.ieee.mantissa1);					      \
 									      \
       zero_mantissa = num == 0;						      \
 									      \
@@ -61,7 +62,7 @@ do {									      \
 									      \
       /* We have 3 bits from the mantissa in the leading nibble.	      \
 	 Therefore we are here using `IEEE854_LONG_DOUBLE_BIAS + 3'.  */      \
-      exponent = fpnum.ldbl.ieee.exponent;				      \
+      exponent = u.ieee.exponent;					      \
 									      \
       if (exponent == 0)						      \
 	{								      \

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e786250bef1b999044148cb1b5a67d2e71e2a42a

commit e786250bef1b999044148cb1b5a67d2e71e2a42a
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:17:01 2013 -0500

    PowerPC: LE memchr/memrchr
    
    Like strnlen, memchr and memrchr had a number of defects fixed by this
    patch, in addition to gaining little-endian support.  The first one I
    noticed was that the entry to the main loop needlessly checked for
    "are we done yet?" when we know the size is large enough that we can't
    be done.  The second defect I noticed was that the main loop count was
    wrong, which in turn meant that the small loop needed to handle an
    extra word.  Thirdly, there is nothing to say that the string can't
    wrap around zero, except of course that we'd normally hit a segfault
    on trying to read from address zero.  Fixing that simplified a number
    of places:
    
    -	/* Are we done already?  */
    -	addi    r9,r8,8
    -	cmpld	r9,r7
    -	bge	L(null)
    
    becomes
    
    +	cmpld	r8,r7
    +	beqlr
    
    However, the exit gets an extra test, because I test for being on the
    last word and, if so, whether the byte offset is before the end.
    Overall, the change is a win.
    
    Lastly, memrchr used the wrong cache hint.  Note that a number of
    other power7 functions need their cache hints updating.
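
The little-endian part of the memchr change is concentrated in how the result
of cmpb (a mask with 0xff in every matching byte lane) is turned into a byte
offset: the big-endian path counts leading zeros with cntlz, while the
little-endian path counts trailing zeros using the addi/andc/popcnt sequence
seen in the diff.  A hedged C sketch of the same calculation (it assumes a
nonzero mask, which is what the assembly guarantees when it reaches L(done)):

  #include <stdint.h>

  /* Byte offset in memory of the first matching byte, given a cmpb-style
     mask loaded with an ordinary (non-byte-reversed) load.  */
  static inline unsigned int
  first_match_offset (uint64_t mask)
  {
  #ifdef __LITTLE_ENDIAN__
    /* Ones below the lowest set bit; their popcount is the number of
       trailing zero bits -- the addi/andc/popcntd idiom.  */
    uint64_t t = (mask - 1) & ~mask;
    return __builtin_popcountll (t) / 8;
  #else
    /* First match in memory sits in the most significant byte lane.  */
    return __builtin_clzll (mask) / 8;
  #endif
  }

The powerpc32 version does the same with 32-bit counts (popcntw/cntlzw).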

diff --git a/sysdeps/powerpc/powerpc32/power7/memchr.S b/sysdeps/powerpc/powerpc32/power7/memchr.S
index 369e5e0..85754f3 100644
--- a/sysdeps/powerpc/powerpc32/power7/memchr.S
+++ b/sysdeps/powerpc/powerpc32/power7/memchr.S
@@ -25,107 +25,111 @@ ENTRY (__memchr)
 	CALL_MCOUNT
 	dcbt	0,r3
 	clrrwi  r8,r3,2
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
 	add	r7,r3,r5      /* Calculate the last acceptable address.  */
+	insrdi	r4,r4,16,32
 	cmplwi	r5,16
+	li	r9, -1
+	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
+	addi	r7,r7,-1
+#ifdef __LITTLE_ENDIAN__
+	slw	r9,r9,r6
+#else
+	srw	r9,r9,r6
+#endif
 	ble	L(small_range)
 
-	cmplw	cr7,r3,r7     /* Compare the starting address (r3) with the
-				 ending address (r7).  If (r3 >= r7), the size
-				 passed in is zero or negative.  */
-	ble	cr7,L(proceed)
-
-	li	r7,-1	      /* Artificially set our ending address (r7)
-				 such that we will exit early. */
-L(proceed):
-	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
-	cmpli	cr6,r6,0      /* cr6 == Do we have padding?  */
 	lwz	r12,0(r8)     /* Load word from memory.  */
-	cmpb	r10,r12,r4    /* Check for BYTEs in WORD1.  */
-	beq	cr6,L(proceed_no_padding)
-	slw	r10,r10,r6
-	srw	r10,r10,r6
-L(proceed_no_padding):
-	cmplwi	cr7,r10,0     /* If r10 == 0, no BYTEs have been found.  */
+	cmpb	r3,r12,r4     /* Check for BYTEs in WORD1.  */
+	and	r3,r3,r9
+	clrlwi	r5,r7,30      /* Byte count - 1 in last word.  */
+	clrrwi	r7,r7,2       /* Address of last word.  */
+	cmplwi	cr7,r3,0      /* If r3 == 0, no BYTEs have been found.  */
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
-	bge	cr6,L(null)
-
 	mtcrf   0x01,r8
 	/* Are we now aligned to a doubleword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
-
 	bt	29,L(loop_setup)
 
 	/* Handle WORD2 of pair.  */
 	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	cmplwi	cr7,r10,0
+	cmpb	r3,r12,r4
+	cmplwi	cr7,r3,0
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
-	bge	cr6,L(null)
-
 L(loop_setup):
-	sub	r5,r7,r9
-	srwi	r6,r5,3	      /* Number of loop iterations.  */
+	/* The last word we want to read in the loop below is the one
+	   containing the last byte of the string, ie. the word at
+	   (s + size - 1) & ~3, or r7.  The first word read is at
+	   r8 + 4, we read 2 * cnt words, so the last word read will
+	   be at r8 + 4 + 8 * cnt - 4.  Solving for cnt gives
+	   cnt = (r7 - r8) / 8  */
+	sub	r6,r7,r8
+	srwi	r6,r6,3	      /* Number of loop iterations.  */
 	mtctr	r6            /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for BYTE backwards in the string.  Since
-	   it's a small loop (< 8 instructions), align it to 32-bytes.  */
-	.p2align  5
+
+	/* Main loop to look for BYTE in the string.  Since
+	   it's a small loop (8 instructions), align it to 32-bytes.  */
+	.align	5
 L(loop):
 	/* Load two words, compare and merge in a
 	   single register for speed.  This is an attempt
 	   to speed up the byte-checking process for bigger strings.  */
 	lwz	r12,4(r8)
 	lwzu	r11,8(r8)
-	cmpb	r10,r12,r4
+	cmpb	r3,r12,r4
 	cmpb	r9,r11,r4
-	or	r5,r9,r10     /* Merge everything in one word.  */
-	cmplwi	cr7,r5,0
+	or	r6,r9,r3      /* Merge everything in one word.  */
+	cmplwi	cr7,r6,0
 	bne	cr7,L(found)
 	bdnz	L(loop)
 
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for BYTE in the whole range.  */
-	subi	r11,r7,4
-	cmplw	cr6,r8,r11
-	blt	cr6,L(loop_small)
-	b	L(null)
+	/* We may have one more dword to read.  */
+	cmplw	r8,r7
+	beqlr
+
+	lwzu	r12,4(r8)
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	bne	cr6,L(done)
+	blr
 
+	.align	4
+L(found):
 	/* OK, one (or both) of the words contains BYTE.  Check
 	   the first word and decrement the address in case the first
 	   word really contains BYTE.  */
-	.align	4
-L(found):
-	cmplwi	cr6,r10,0
+	cmplwi	cr6,r3,0
 	addi	r8,r8,-4
 	bne	cr6,L(done)
 
 	/* BYTE must be in the second word.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
+	   again and move the result of cmpb to r3 so we can calculate the
 	   pointer.  */
 
-	mr	r10,r9
+	mr	r3,r9
 	addi	r8,r8,4
 
-	/* r10 has the output of the cmpb instruction, that is, it contains
+	/* r3 has the output of the cmpb instruction, that is, it contains
 	   0xff in the same position as BYTE in the original
 	   word from the string.  Use that to calculate the pointer.
 	   We need to make sure BYTE is *before* the end of the range.  */
 L(done):
-	cntlzw	r0,r10	      /* Count leading zeroes before the match.  */
-	srwi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r3,-1
+	andc    r0,r0,r3
+	popcntw	r0,r0	      /* Count trailing zeros.  */
+#else
+	cntlzw	r0,r3	      /* Count leading zeros before the match.  */
+#endif
+	cmplw	r8,r7         /* Are we on the last word?  */
+	srwi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
 	add	r3,r8,r0
-	cmplw	r3,r7
-	bge	L(null)
+	cmplw	cr7,r0,r5     /* If on the last dword, check byte offset.  */
+	bnelr
+	blelr	cr7
+	li	r3,0
 	blr
 
 	.align	4
@@ -137,67 +141,42 @@ L(null):
 	.align	4
 L(small_range):
 	cmplwi	r5,0
-	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
-	beq	L(null)       /* This branch is for the cmplwi r5,0 above */
+	beq	L(null)
 	lwz	r12,0(r8)     /* Load word from memory.  */
-	cmplwi	cr6,r6,0      /* cr6 == Do we have padding?  */
-	cmpb	r10,r12,r4    /* Check for BYTE in DWORD1.  */
-	beq	cr6,L(small_no_padding)
-	slw	r10,r10,r6
-	srw	r10,r10,r6
-L(small_no_padding):
-	cmplwi	cr7,r10,0
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmplwi	cr7,r3,0
+	clrlwi	r5,r7,30      /* Byte count - 1 in last word.  */
+	clrrwi	r7,r7,2       /* Address of last word.  */
+	cmplw	r8,r7         /* Are we done already?  */
 	bne	cr7,L(done)
+	beqlr
 
-	/* Are we done already?  */
-	addi    r9,r8,4
-	cmplw	r9,r7
-	bge	L(null)
-
-L(loop_small):                /* loop_small has been unrolled.  */
 	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,4
-	cmplwi	cr6,r10,0
-	cmplw	r9,r7
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	cmplw	r8,r7
 	bne	cr6,L(done)
-	bge	L(null)
+	beqlr
 
 	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,4
-	cmplwi	cr6,r10,0
-	cmplw	r9,r7
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	cmplw	r8,r7
 	bne	cr6,L(done)
-	bge	L(null)
+	beqlr
 
 	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,4
-	cmplwi	cr6,r10,0
-	cmplw	r9,r7
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	cmplw	r8,r7
 	bne	cr6,L(done)
-	bge	L(null)
+	beqlr
 
 	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,4
-	cmplwi	cr6,r10,0
-	cmplw	r9,r7
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
 	bne	cr6,L(done)
-	bge	L(null)
-
-	/* For most cases we will never get here.  Under some combinations of
-	   padding + length there is a leftover word that still needs to be
-	   checked.  */
-	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,4
-	cmplwi	cr6,r10,0
-	bne	cr6,L(done)
-
-	/* save a branch and exit directly */
-	li	r3,0
 	blr
 
 END (__memchr)
diff --git a/sysdeps/powerpc/powerpc32/power7/memrchr.S b/sysdeps/powerpc/powerpc32/power7/memrchr.S
index defd832..8e0485a 100644
--- a/sysdeps/powerpc/powerpc32/power7/memrchr.S
+++ b/sysdeps/powerpc/powerpc32/power7/memrchr.S
@@ -23,117 +23,129 @@
 	.machine  power7
 ENTRY (__memrchr)
 	CALL_MCOUNT
-	dcbt	0,r3
-	mr	r7,r3
-	add	r3,r7,r5      /* Calculate the last acceptable address.  */
-	cmplw	cr7,r3,r7     /* Is the address equal or less than r3?  */
+	add	r7,r3,r5      /* Calculate the last acceptable address.  */
+	neg	r0,r7
+	addi	r7,r7,-1
+	mr	r10,r3
+	clrrwi	r6,r7,7
+	li	r9,3<<5
+	dcbt	r9,r6,16      /* Stream hint, decreasing addresses.  */
 
 	/* Replicate BYTE to word.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
-	bge	cr7,L(proceed)
-
-	li	r3,-1	      /* Make r11 the biggest if r4 <= 0.  */
-L(proceed):
+	rldimi	r4,r4,8,48
+	rldimi	r4,r4,16,32
 	li	r6,-4
-	addi	r9,r3,-1
-	clrrwi  r8,r9,2
-	addi	r8,r8,4
-	neg	r0,r3
+	li	r9,-1
 	rlwinm	r0,r0,3,27,28 /* Calculate padding.  */
-
+	clrrwi	r8,r7,2
+	srw	r9,r9,r0
 	cmplwi	r5,16
+	clrrwi	r0,r10,2
 	ble	L(small_range)
 
-	lwbrx	r12,r8,r6     /* Load reversed word from memory.  */
-	cmpb	r10,r12,r4    /* Check for BYTE in WORD1.  */
-	slw	r10,r10,r0
-	srw	r10,r10,r0
-	cmplwi	cr7,r10,0     /* If r10 == 0, no BYTEs have been found.  */
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8      /* Load reversed word from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in WORD1.  */
+	and	r3,r3,r9
+	cmplwi	cr7,r3,0      /* If r3 == 0, no BYTEs have been found.  */
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,-4
-	cmplw	cr6,r9,r7
-	ble	cr6,L(null)
-
 	mtcrf   0x01,r8
 	/* Are we now aligned to a doubleword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
-	mr	r8,r9
-	bt	29,L(loop_setup)
+	bf	29,L(loop_setup)
 
 	/* Handle WORD2 of pair.  */
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,r8,r6
+#else
 	lwbrx	r12,r8,r6
-	cmpb	r10,r12,r4
-	cmplwi	cr7,r10,0
-	bne	cr7,L(done)
-
-	/* Are we done already?  */
+#endif
 	addi	r8,r8,-4
-	cmplw	cr6,r8,r7
-	ble	cr6,L(null)
+	cmpb	r3,r12,r4
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
 
 L(loop_setup):
-	li	r0,-8
-	sub	r5,r8,r7
-	srwi	r9,r5,3	      /* Number of loop iterations.  */
+	/* The last word we want to read in the loop below is the one
+	   containing the first byte of the string, ie. the word at
+	   s & ~3, or r0.  The first word read is at r8 - 4, we
+	   read 2 * cnt words, so the last word read will be at
+	   r8 - 4 - 8 * cnt + 4.  Solving for cnt gives
+	   cnt = (r8 - r0) / 8  */
+	sub	r5,r8,r0
+	addi	r8,r8,-4
+	srwi	r9,r5,3       /* Number of loop iterations.  */
 	mtctr	r9	      /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for BYTE backwards in the string.  Since it's a
-	   small loop (< 8 instructions), align it to 32-bytes.  */
-	.p2align  5
+
+	/* Main loop to look for BYTE backwards in the string.  */
+	.align	4
 L(loop):
 	/* Load two words, compare and merge in a
 	   single register for speed.  This is an attempt
 	   to speed up the byte-checking process for bigger strings.  */
 
-	lwbrx	r12,r8,r6
-	lwbrx	r11,r8,r0
-	addi	r8,r8,-4
-	cmpb	r10,r12,r4
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+	lwzx	r11,r8,r6
+#else
+	lwbrx	r12,0,r8
+	lwbrx	r11,r8,r6
+#endif
+	cmpb	r3,r12,r4
 	cmpb	r9,r11,r4
-	or	r5,r9,r10     /* Merge everything in one word.  */
+	or	r5,r9,r3      /* Merge everything in one word.  */
 	cmplwi	cr7,r5,0
 	bne	cr7,L(found)
-	addi	r8,r8,-4
+	addi	r8,r8,-8
 	bdnz	L(loop)
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for BYTE in the whole range.  Just return
-	   the original range.  */
-	addi	r8,r8,4
-	cmplw	cr6,r8,r7
-	bgt	cr6,L(loop_small)
-	b	L(null)
 
-	/* OK, one (or both) of the words contains BYTE.  Check
-	   the first word and decrement the address in case the first
-	   word really contains BYTE.  */
+	/* We may have one more word to read.  */
+	cmplw	r8,r0
+	bnelr
+
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
+	blr
+
 	.align	4
 L(found):
-	cmplwi	cr6,r10,0
-	addi	r8,r8,4
+	/* OK, one (or both) of the words contains BYTE.  Check
+	   the first word.  */
+	cmplwi	cr6,r3,0
 	bne	cr6,L(done)
 
 	/* BYTE must be in the second word.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
+	   again and move the result of cmpb to r3 so we can calculate the
 	   pointer.  */
 
-	mr	r10,r9
+	mr	r3,r9
 	addi	r8,r8,-4
 
-	/* r10 has the output of the cmpb instruction, that is, it contains
+	/* r3 has the output of the cmpb instruction, that is, it contains
 	   0xff in the same position as BYTE in the original
 	   word from the string.  Use that to calculate the pointer.
 	   We need to make sure BYTE is *before* the end of the
 	   range.  */
 L(done):
-	cntlzw	r0,r10	      /* Count leading zeroes before the match.  */
-	srwi	r6,r0,3	      /* Convert leading zeroes to bytes.  */
-	addi	r0,r6,1
+	cntlzw	r9,r3	      /* Count leading zeros before the match.  */
+	cmplw	r8,r0         /* Are we on the last word?  */
+	srwi	r6,r9,3	      /* Convert leading zeros to bytes.  */
+	addi	r0,r6,-3
 	sub	r3,r8,r0
-	cmplw	r3,r7
-	blt	L(null)
+	cmplw	cr7,r3,r10
+	bnelr
+	bgelr	cr7
+	li	r3,0
 	blr
 
 	.align	4
@@ -147,28 +159,35 @@ L(small_range):
 	cmplwi	r5,0
 	beq	L(null)
 
-	lwbrx	r12,r8,r6     /* Load reversed word from memory.  */
-	cmpb	r10,r12,r4    /* Check for null bytes in WORD1.  */
-	slw	r10,r10,r0
-	srw	r10,r10,r0
-	cmplwi	cr7,r10,0
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8      /* Load reversed word from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in WORD1.  */
+	and	r3,r3,r9
+	cmplwi	cr7,r3,0
 	bne	cr7,L(done)
 
+	/* Are we done already?  */
+	cmplw	r8,r0
 	addi	r8,r8,-4
-	cmplw	r8,r7
-	ble	L(null)
-	b	L(loop_small)
+	beqlr
 
-	.p2align  5
+	.align	5
 L(loop_small):
-	lwbrx	r12,r8,r6
-	cmpb	r10,r12,r4
-	cmplwi	cr6,r10,0
-	bne	cr6,L(done)
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmplw	r8,r0
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
 	addi	r8,r8,-4
-	cmplw	r8,r7
-	ble	L(null)
-	b	L(loop_small)
+	bne	L(loop_small)
+	blr
 
 END (__memrchr)
 weak_alias (__memrchr, memrchr)
diff --git a/sysdeps/powerpc/powerpc32/power7/rawmemchr.S b/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
index a80c74a..c2d8c4b 100644
--- a/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
+++ b/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
@@ -27,16 +27,21 @@ ENTRY (__rawmemchr)
 	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
 
 	/* Replicate byte to word.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	rldimi	r4,r4,8,48
+	rldimi	r4,r4,16,32
 
 	/* Now r4 has a word of c bytes.  */
 
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
 	lwz	r12,0(r8)     /* Load word from memory.  */
 	cmpb	r5,r12,r4     /* Compare each byte against c byte.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r5,r5,r6
+	slw	r5,r5,r6
+#else
 	slw	r5,r5,r6      /* Move left to discard ignored bits.  */
 	srw	r5,r5,r6      /* Bring the bits back as zeros.  */
+#endif
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c bytes have been found.  */
 	bne	cr7,L(done)
 
@@ -90,8 +95,14 @@ L(loop):
 	   word from the string.  Use that fact to find out what is
 	   the position of the byte inside the string.  */
 L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntw	r0,r0
+#else
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
-	srwi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+#endif
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching char.  */
 	blr
 END (__rawmemchr)
diff --git a/sysdeps/powerpc/powerpc64/power7/memchr.S b/sysdeps/powerpc/powerpc64/power7/memchr.S
index 3416897..5076dd0 100644
--- a/sysdeps/powerpc/powerpc64/power7/memchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/memchr.S
@@ -25,109 +25,112 @@ ENTRY (__memchr)
 	CALL_MCOUNT 2
 	dcbt	0,r3
 	clrrdi  r8,r3,3
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
 	add	r7,r3,r5      /* Calculate the last acceptable address.  */
+	insrdi	r4,r4,16,32
 	cmpldi	r5,32
+	li	r9, -1
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
 	insrdi  r4,r4,32,0
+	addi	r7,r7,-1
+#ifdef __LITTLE_ENDIAN__
+	sld	r9,r9,r6
+#else
+	srd	r9,r9,r6
+#endif
 	ble	L(small_range)
 
-	cmpld	cr7,r3,r7     /* Compare the starting address (r3) with the
-				 ending address (r7).  If (r3 >= r7),
-				 the size passed in was zero or negative.  */
-	ble	cr7,L(proceed)
-
-	li	r7,-1         /* Artificially set our ending address (r7)
-				 such that we will exit early.  */
-
-L(proceed):
-	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
-	cmpldi	cr6,r6,0      /* cr6 == Do we have padding?  */
 	ld	r12,0(r8)     /* Load doubleword from memory.  */
-	cmpb	r10,r12,r4    /* Check for BYTEs in DWORD1.  */
-	beq	cr6,L(proceed_no_padding)
-	sld	r10,r10,r6
-	srd	r10,r10,r6
-L(proceed_no_padding):
-	cmpldi	cr7,r10,0     /* Does r10 indicate we got a hit?  */
+	cmpb	r3,r12,r4     /* Check for BYTEs in DWORD1.  */
+	and	r3,r3,r9
+	clrldi	r5,r7,61      /* Byte count - 1 in last dword.  */
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
+	cmpldi	cr7,r3,0      /* Does r3 indicate we got a hit?  */
 	bne	cr7,L(done)
 
-	/* See if we are at the last acceptable address yet.  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
-	bge	cr6,L(null)
-
 	mtcrf   0x01,r8
 	/* Are we now aligned to a quadword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
-
 	bt	28,L(loop_setup)
 
 	/* Handle DWORD2 of pair.  */
 	ldu	r12,8(r8)
-	cmpb	r10,r12,r4
-	cmpldi	cr7,r10,0
+	cmpb	r3,r12,r4
+	cmpldi	cr7,r3,0
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
-	bge	cr6,L(null)
-
 L(loop_setup):
-	sub	r5,r7,r9
-	srdi	r6,r5,4	      /* Number of loop iterations.  */
+	/* The last dword we want to read in the loop below is the one
+	   containing the last byte of the string, ie. the dword at
+	   (s + size - 1) & ~7, or r7.  The first dword read is at
+	   r8 + 8, we read 2 * cnt dwords, so the last dword read will
+	   be at r8 + 8 + 16 * cnt - 8.  Solving for cnt gives
+	   cnt = (r7 - r8) / 16  */
+	sub	r6,r7,r8
+	srdi	r6,r6,4	      /* Number of loop iterations.  */
 	mtctr	r6            /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for BYTE backwards in the string.  Since
-	   it's a small loop (< 8 instructions), align it to 32-bytes.  */
-	.p2align  5
+
+	/* Main loop to look for BYTE in the string.  Since
+	   it's a small loop (8 instructions), align it to 32-bytes.  */
+	.align	5
 L(loop):
 	/* Load two doublewords, compare and merge in a
 	   single register for speed.  This is an attempt
 	   to speed up the byte-checking process for bigger strings.  */
 	ld	r12,8(r8)
 	ldu	r11,16(r8)
-	cmpb	r10,r12,r4
+	cmpb	r3,r12,r4
 	cmpb	r9,r11,r4
-	or	r5,r9,r10     /* Merge everything in one doubleword.  */
-	cmpldi	cr7,r5,0
+	or	r6,r9,r3      /* Merge everything in one doubleword.  */
+	cmpldi	cr7,r6,0
 	bne	cr7,L(found)
 	bdnz	L(loop)
 
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for BYTE in the whole range.  */
-	subi	r11,r7,8
-	cmpld	cr6,r8,r11
-	blt	cr6,L(loop_small)
-	b	L(null)
+	/* We may have one more dword to read.  */
+	cmpld	r8,r7
+	beqlr
 
+	ldu	r12,8(r8)
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	bne	cr6,L(done)
+	blr
+
+	.align	4
+L(found):
 	/* OK, one (or both) of the doublewords contains BYTE.  Check
 	   the first doubleword and decrement the address in case the first
 	   doubleword really contains BYTE.  */
-	.align	4
-L(found):
-	cmpldi	cr6,r10,0
+	cmpldi	cr6,r3,0
 	addi	r8,r8,-8
 	bne	cr6,L(done)
 
 	/* BYTE must be in the second doubleword.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
+	   again and move the result of cmpb to r3 so we can calculate the
 	   pointer.  */
 
-	mr	r10,r9
+	mr	r3,r9
 	addi	r8,r8,8
 
-	/* r10 has the output of the cmpb instruction, that is, it contains
+	/* r3 has the output of the cmpb instruction, that is, it contains
 	   0xff in the same position as BYTE in the original
 	   doubleword from the string.  Use that to calculate the pointer.
 	   We need to make sure BYTE is *before* the end of the range.  */
 L(done):
-	cntlzd	r0,r10	      /* Count leading zeroes before the match.  */
-	srdi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r3,-1
+	andc    r0,r0,r3
+	popcntd	r0,r0	      /* Count trailing zeros.  */
+#else
+	cntlzd	r0,r3	      /* Count leading zeros before the match.  */
+#endif
+	cmpld	r8,r7         /* Are we on the last dword?  */
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
 	add	r3,r8,r0
-	cmpld	r3,r7
-	bge	L(null)
+	cmpld	cr7,r0,r5     /* If on the last dword, check byte offset.  */
+	bnelr
+	blelr	cr7
+	li	r3,0
 	blr
 
 	.align	4
@@ -139,67 +142,44 @@ L(null):
 	.align	4
 L(small_range):
 	cmpldi	r5,0
-	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
-	beq	L(null)       /* This branch is for the cmpldi r5,0 above.  */
+	beq	L(null)
 	ld	r12,0(r8)     /* Load word from memory.  */
-	cmpldi	cr6,r6,0      /* cr6 == Do we have padding?  */
-	cmpb	r10,r12,r4    /* Check for BYTE in DWORD1.  */
-			      /* If no padding, skip the shifts.  */
-	beq	cr6,L(small_no_padding)
-	sld	r10,r10,r6
-	srd	r10,r10,r6
-L(small_no_padding):
-	cmpldi	cr7,r10,0
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmpldi	cr7,r3,0
+	clrldi	r5,r7,61      /* Byte count - 1 in last dword.  */
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
+	cmpld	r8,r7         /* Are we done already?  */
 	bne	cr7,L(done)
+	beqlr
 
-	/* Are we done already?  */
-	addi    r9,r8,8
-	cmpld	r9,r7
-	bge	L(null)
-	/* If we're not done, drop through into loop_small.  */
-
-L(loop_small):                /* loop_small has been unrolled.  */
 	ldu	r12,8(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,8
-	cmpldi	cr6,r10,0
-	cmpld	r9,r7
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	cmpld	r8,r7
 	bne	cr6,L(done)   /* Found something.  */
-	bge	L(null)       /* Hit end of string (length).  */
+	beqlr		      /* Hit end of string (length).  */
 
 	ldu	r12,8(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,8
-	cmpldi	cr6,r10,0
-	cmpld	r9,r7
-	bne	cr6,L(done)   /* Found something.  */
-	bge	L(null)
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	cmpld	r8,r7
+	bne	cr6,L(done)
+	beqlr
 
 	ldu	r12,8(r8)
-	subi	r11,r7,8
-	cmpb	r10,r12,r4
-	cmpldi	cr6,r10,0
-	ori	r2,r2,0       /* Force a dispatch group.  */
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	cmpld	r8,r7
 	bne	cr6,L(done)
+	beqlr
 
-	cmpld	r8,r11        /* At end of range?  */
-	bge	L(null)
-
-	/* For most cases we will never get here.  Under some combinations of
-	   padding + length there is a leftover double that still needs to be
-	   checked.  */
 	ldu	r12,8(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,8
-	cmpldi	cr6,r10,0
-	cmpld	r9,r7
-	bne	cr6,L(done)   /* Found something.  */
-
-	/* Save a branch and exit directly.  */
-	li	r3,0
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	bne	cr6,L(done)
 	blr
 
-
 END (__memchr)
 weak_alias (__memchr, memchr)
 libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/powerpc/powerpc64/power7/memrchr.S b/sysdeps/powerpc/powerpc64/power7/memrchr.S
index c499952..7a0050e 100644
--- a/sysdeps/powerpc/powerpc64/power7/memrchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/memrchr.S
@@ -23,118 +23,130 @@
 	.machine  power7
 ENTRY (__memrchr)
 	CALL_MCOUNT
-	dcbt	0,r3
-	mr	r7,r3
-	add	r3,r7,r5      /* Calculate the last acceptable address.  */
-	cmpld	cr7,r3,r7     /* Is the address equal or less than r3?  */
+	add	r7,r3,r5      /* Calculate the last acceptable address.  */
+	neg	r0,r7
+	addi	r7,r7,-1
+	mr	r10,r3
+	clrrdi	r6,r7,7
+	li	r9,3<<5
+	dcbt	r9,r6,16      /* Stream hint, decreasing addresses.  */
 
 	/* Replicate BYTE to doubleword.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 	insrdi  r4,r4,32,0
-	bge	cr7,L(proceed)
-
-	li	r3,-1	      /* Make r11 the biggest if r4 <= 0.  */
-L(proceed):
 	li	r6,-8
-	addi	r9,r3,-1
-	clrrdi  r8,r9,3
-	addi	r8,r8,8
-	neg	r0,r3
+	li	r9,-1
 	rlwinm	r0,r0,3,26,28 /* Calculate padding.  */
-
+	clrrdi	r8,r7,3
+	srd	r9,r9,r0
 	cmpldi	r5,32
+	clrrdi	r0,r10,3
 	ble	L(small_range)
 
-	ldbrx	r12,r8,r6     /* Load reversed doubleword from memory.  */
-	cmpb	r10,r12,r4    /* Check for BYTE in DWORD1.  */
-	sld	r10,r10,r0
-	srd	r10,r10,r0
-	cmpldi	cr7,r10,0     /* If r10 == 0, no BYTEs have been found.  */
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8      /* Load reversed doubleword from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmpldi	cr7,r3,0      /* If r3 == 0, no BYTEs have been found.  */
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,-8
-	cmpld	cr6,r9,r7
-	ble	cr6,L(null)
-
 	mtcrf   0x01,r8
-	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	/* Are we now aligned to a quadword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
-	mr	r8,r9
-	bt	28,L(loop_setup)
+	bf	28,L(loop_setup)
 
 	/* Handle DWORD2 of pair.  */
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,r8,r6
+#else
 	ldbrx	r12,r8,r6
-	cmpb	r10,r12,r4
-	cmpldi	cr7,r10,0
-	bne	cr7,L(done)
-
-	/* Are we done already.  */
+#endif
 	addi	r8,r8,-8
-	cmpld	cr6,r8,r7
-	ble	cr6,L(null)
+	cmpb	r3,r12,r4
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
 
 L(loop_setup):
-	li	r0,-16
-	sub	r5,r8,r7
-	srdi	r9,r5,4	      /* Number of loop iterations.  */
+	/* The last dword we want to read in the loop below is the one
+	   containing the first byte of the string, ie. the dword at
+	   s & ~7, or r0.  The first dword read is at r8 - 8, we
+	   read 2 * cnt dwords, so the last dword read will be at
+	   r8 - 8 - 16 * cnt + 8.  Solving for cnt gives
+	   cnt = (r8 - r0) / 16  */
+	sub	r5,r8,r0
+	addi	r8,r8,-8
+	srdi	r9,r5,4       /* Number of loop iterations.  */
 	mtctr	r9	      /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for BYTE backwards in the string.  Since it's a
-	   small loop (< 8 instructions), align it to 32-bytes.  */
-	.p2align  5
+
+	/* Main loop to look for BYTE backwards in the string.  */
+	.align	4
 L(loop):
 	/* Load two doublewords, compare and merge in a
 	   single register for speed.  This is an attempt
 	   to speed up the byte-checking process for bigger strings.  */
 
-	ldbrx	r12,r8,r6
-	ldbrx	r11,r8,r0
-	addi	r8,r8,-8
-	cmpb	r10,r12,r4
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+	ldx	r11,r8,r6
+#else
+	ldbrx	r12,0,r8
+	ldbrx	r11,r8,r6
+#endif
+	cmpb	r3,r12,r4
 	cmpb	r9,r11,r4
-	or	r5,r9,r10     /* Merge everything in one doubleword.  */
+	or	r5,r9,r3      /* Merge everything in one doubleword.  */
 	cmpldi	cr7,r5,0
 	bne	cr7,L(found)
-	addi	r8,r8,-8
+	addi	r8,r8,-16
 	bdnz	L(loop)
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for BYTE in the whole range.  Just return
-	   the original range.  */
-	addi	r8,r8,8
-	cmpld	cr6,r8,r7
-	bgt	cr6,L(loop_small)
-	b	L(null)
-
-	/* OK, one (or both) of the words contains BYTE.  Check
-	   the first word and decrement the address in case the first
-	   word really contains BYTE.  */
+
+	/* We may have one more word to read.  */
+	cmpld	r8,r0
+	bnelr
+
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
+	blr
+
 	.align	4
 L(found):
-	cmpldi	cr6,r10,0
-	addi	r8,r8,8
+	/* OK, one (or both) of the dwords contains BYTE.  Check
+	   the first dword.  */
+	cmpldi	cr6,r3,0
 	bne	cr6,L(done)
 
 	/* BYTE must be in the second word.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
+	   again and move the result of cmpb to r3 so we can calculate the
 	   pointer.  */
 
-	mr	r10,r9
+	mr	r3,r9
 	addi	r8,r8,-8
 
-	/* r10 has the output of the cmpb instruction, that is, it contains
-	   0xff in the same position as the BYTE in the original
+	/* r3 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as BYTE in the original
 	   word from the string.  Use that to calculate the pointer.
 	   We need to make sure BYTE is *before* the end of the
 	   range.  */
 L(done):
-	cntlzd	r0,r10	      /* Count leading zeroes before the match.  */
-	srdi	r6,r0,3	      /* Convert leading zeroes to bytes.  */
-	addi	r0,r6,1
+	cntlzd	r9,r3	      /* Count leading zeros before the match.  */
+	cmpld	r8,r0         /* Are we on the last word?  */
+	srdi	r6,r9,3	      /* Convert leading zeros to bytes.  */
+	addi	r0,r6,-7
 	sub	r3,r8,r0
-	cmpld	r3,r7
-	blt	L(null)
+	cmpld	cr7,r3,r10
+	bnelr
+	bgelr	cr7
+	li	r3,0
 	blr
 
 	.align	4
@@ -148,29 +160,35 @@ L(small_range):
 	cmpldi	r5,0
 	beq	L(null)
 
-	ldbrx	r12,r8,r6     /* Load reversed doubleword from memory.  */
-	cmpb	r10,r12,r4    /* Check for BYTE in DWORD1.  */
-	sld	r10,r10,r0
-	srd	r10,r10,r0
-	cmpldi	cr7,r10,0
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8      /* Load reversed doubleword from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmpldi	cr7,r3,0
 	bne	cr7,L(done)
 
 	/* Are we done already?  */
+	cmpld	r8,r0
 	addi	r8,r8,-8
-	cmpld	r8,r7
-	ble	L(null)
-	b	L(loop_small)
+	beqlr
 
-	.p2align  5
+	.align	5
 L(loop_small):
-	ldbrx	r12,r8,r6
-	cmpb	r10,r12,r4
-	cmpldi	cr6,r10,0
-	bne	cr6,L(done)
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmpld	r8,r0
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
 	addi	r8,r8,-8
-	cmpld	r8,r7
-	ble	L(null)
-	b	L(loop_small)
+	bne	L(loop_small)
+	blr
 
 END (__memrchr)
 weak_alias (__memrchr, memrchr)
diff --git a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S b/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
index 50a33d8..547aed7 100644
--- a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
@@ -27,8 +27,8 @@ ENTRY (__rawmemchr)
 	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
 
 	/* Replicate byte to doubleword.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 	insrdi	r4,r4,32,0
 
 	/* Now r4 has a doubleword of c bytes.  */
@@ -36,8 +36,13 @@ ENTRY (__rawmemchr)
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
 	ld	r12,0(r8)     /* Load doubleword from memory.  */
 	cmpb	r5,r12,r4     /* Compare each byte against c byte.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
 	sld	r5,r5,r6      /* Move left to discard ignored bits.  */
 	srd	r5,r5,r6      /* Bring the bits back as zeros.  */
+#endif
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c bytes have been found.  */
 	bne	cr7,L(done)
 
@@ -91,8 +96,14 @@ L(loop):
 	   doubleword from the string.  Use that fact to find out what is
 	   the position of the byte inside the string.  */
 L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntd	r0,r0	      /* Count trailing zeros.  */
+#else
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
-	srdi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+#endif
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching char.  */
 	blr
 END (__rawmemchr)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e07a300b6b28182601632e939ca0933d76545c23

commit e07a300b6b28182601632e939ca0933d76545c23
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:16:41 2013 -0500

    PowerPC: LE memset
    
    One of the things I noticed when looking at power7 timing is that rlwimi
    is cracked and the two resulting insns have a register dependency.
    That makes it a little slower than the equivalent rldimi.
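
The rlwimi/rldimi pairs being touched simply replicate the fill byte across
the register before the wide stores: byte to halfword, halfword to word, and
on the 64-bit paths word to doubleword.  A small C sketch of the same
replication, for reference only:

  #include <stdint.h>

  static inline uint64_t
  replicate_fill_byte (uint8_t c)
  {
    uint64_t v = c;
    v |= v << 8;    /* byte     -> halfword   (insrdi rCHR,rCHR,8,48)  */
    v |= v << 16;   /* halfword -> word       (insrdi rCHR,rCHR,16,32) */
    v |= v << 32;   /* word     -> doubleword (insrdi rCHR,rCHR,32,0)  */
    return v;
  }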

diff --git a/sysdeps/powerpc/powerpc32/power4/memset.S b/sysdeps/powerpc/powerpc32/power4/memset.S
index c2d288b..4fd9d8c 100644
--- a/sysdeps/powerpc/powerpc32/power4/memset.S
+++ b/sysdeps/powerpc/powerpc32/power4/memset.S
@@ -50,7 +50,7 @@ L(_memset):
 
 /* Align to word boundary.  */
 	cmplwi	cr5, rLEN, 31
-	rlwimi	rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
+	insrdi	rCHR, rCHR, 8, 48     /* Replicate byte to halfword.  */
 	beq+	L(aligned)
 	mtcrf	0x01, rMEMP0
 	subfic	rALIGN, rALIGN, 4
@@ -65,7 +65,7 @@ L(g0):
 /* Handle the case of size < 31.  */
 L(aligned):
 	mtcrf	0x01, rLEN
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32    /* Replicate halfword to word.  */
 	ble	cr5, L(medium)
 /* Align to 32-byte boundary.  */
 	andi.	rALIGN, rMEMP, 0x1C
diff --git a/sysdeps/powerpc/powerpc32/power6/memset.S b/sysdeps/powerpc/powerpc32/power6/memset.S
index 8c23c8d..a4b002a 100644
--- a/sysdeps/powerpc/powerpc32/power6/memset.S
+++ b/sysdeps/powerpc/powerpc32/power6/memset.S
@@ -48,7 +48,7 @@ L(_memset):
 	ble-	cr1, L(small)
 /* Align to word boundary.  */
 	cmplwi	cr5, rLEN, 31
-	rlwimi	rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
+	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
 	beq+	L(aligned)
 	mtcrf	0x01, rMEMP0
 	subfic	rALIGN, rALIGN, 4
@@ -64,7 +64,7 @@ L(g0):
 /* Handle the case of size < 31.  */
 L(aligned):
 	mtcrf	0x01, rLEN
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 	ble	cr5, L(medium)
 /* Align to 32-byte boundary.  */
 	andi.	rALIGN, rMEMP, 0x1C
diff --git a/sysdeps/powerpc/powerpc32/power7/memset.S b/sysdeps/powerpc/powerpc32/power7/memset.S
index 360ea71..aadda25 100644
--- a/sysdeps/powerpc/powerpc32/power7/memset.S
+++ b/sysdeps/powerpc/powerpc32/power7/memset.S
@@ -35,8 +35,8 @@ L(_memset):
 	cfi_offset(31,-8)
 
 	/* Replicate byte to word.  */
-	rlwimi	4,4,8,16,23
-	rlwimi	4,4,16,0,15
+	insrdi	4,4,8,48
+	insrdi	4,4,16,32
 
 	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
 
diff --git a/sysdeps/powerpc/powerpc64/memset.S b/sysdeps/powerpc/powerpc64/memset.S
index 6acf149..1027a59 100644
--- a/sysdeps/powerpc/powerpc64/memset.S
+++ b/sysdeps/powerpc/powerpc64/memset.S
@@ -55,14 +55,14 @@ L(_memset):
 
 /* Align to doubleword boundary.  */
 	cmpldi	cr5, rLEN, 31
-	rlwimi	rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
+	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
 	beq+	L(aligned2)
 	mtcrf	0x01, rMEMP0
 	subfic	rALIGN, rALIGN, 8
 	cror	28,30,31		/* Detect odd word aligned.  */
 	add	rMEMP, rMEMP, rALIGN
 	sub	rLEN, rLEN, rALIGN
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 	bt	29, L(g4)
 /* Process the even word of doubleword.  */
 	bf+	31, L(g2)
@@ -84,14 +84,14 @@ L(g0):
 
 /* Handle the case of size < 31.  */
 L(aligned2):
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 L(aligned):
 	mtcrf	0x01, rLEN
 	ble	cr5, L(medium)
 /* Align to 32-byte boundary.  */
 	andi.	rALIGN, rMEMP, 0x18
 	subfic	rALIGN, rALIGN, 0x20
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word. */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
 	beq	L(caligned)
 	mtcrf	0x01, rALIGN
 	add	rMEMP, rMEMP, rALIGN
@@ -212,7 +212,7 @@ L(le4):
 /* Memset of 0-31 bytes.  */
 	.align 5
 L(medium):
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word.  */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
 	cmpldi	cr1, rLEN, 16
 L(medium_tail2):
 	add	rMEMP, rMEMP, rLEN
diff --git a/sysdeps/powerpc/powerpc64/power4/memset.S b/sysdeps/powerpc/powerpc64/power4/memset.S
index dbecee8..ad0d381 100644
--- a/sysdeps/powerpc/powerpc64/power4/memset.S
+++ b/sysdeps/powerpc/powerpc64/power4/memset.S
@@ -50,14 +50,14 @@ L(_memset):
 
 /* Align to doubleword boundary.  */
 	cmpldi	cr5, rLEN, 31
-	rlwimi	rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
+	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
 	beq+	L(aligned2)
 	mtcrf	0x01, rMEMP0
 	subfic	rALIGN, rALIGN, 8
 	cror	28,30,31		/* Detect odd word aligned.  */
 	add	rMEMP, rMEMP, rALIGN
 	sub	rLEN, rLEN, rALIGN
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 	bt	29, L(g4)
 /* Process the even word of doubleword.  */
 	bf+	31, L(g2)
@@ -79,14 +79,14 @@ L(g0):
 
 /* Handle the case of size < 31.  */
 L(aligned2):
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 L(aligned):
 	mtcrf	0x01, rLEN
 	ble	cr5, L(medium)
 /* Align to 32-byte boundary.  */
 	andi.	rALIGN, rMEMP, 0x18
 	subfic	rALIGN, rALIGN, 0x20
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word. */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
 	beq	L(caligned)
 	mtcrf	0x01, rALIGN
 	add	rMEMP, rMEMP, rALIGN
@@ -146,24 +146,24 @@ L(zloopstart):
 L(getCacheAligned):
 	cmpldi	cr1,rLEN,32
 	andi.	rTMP,rMEMP,127
-	blt		cr1,L(handletail32)
-	beq		L(cacheAligned)
+	blt	cr1,L(handletail32)
+	beq	L(cacheAligned)
 	addi	rMEMP,rMEMP,32
 	addi	rLEN,rLEN,-32
-	std		rCHR,-32(rMEMP)
-	std		rCHR,-24(rMEMP)
-	std		rCHR,-16(rMEMP)
-	std		rCHR,-8(rMEMP)
-	b		L(getCacheAligned)
+	std	rCHR,-32(rMEMP)
+	std	rCHR,-24(rMEMP)
+	std	rCHR,-16(rMEMP)
+	std	rCHR,-8(rMEMP)
+	b	L(getCacheAligned)
 
 /* Now we are aligned to the cache line and can use dcbz.  */
 L(cacheAligned):
 	cmpld	cr1,rLEN,rCLS
-	blt		cr1,L(handletail32)
+	blt	cr1,L(handletail32)
 	dcbz	0,rMEMP
 	subf	rLEN,rCLS,rLEN
-	add		rMEMP,rMEMP,rCLS
-	b		L(cacheAligned)
+	add	rMEMP,rMEMP,rCLS
+	b	L(cacheAligned)
 
 /* We are here because the cache line size was set and was not 32-bytes
    and the remainder (rLEN) is less than the actual cache line size.
@@ -200,7 +200,7 @@ L(le4):
 /* Memset of 0-31 bytes.  */
 	.align 5
 L(medium):
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word.  */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
 	cmpldi	cr1, rLEN, 16
 L(medium_tail2):
 	add	rMEMP, rMEMP, rLEN
diff --git a/sysdeps/powerpc/powerpc64/power6/memset.S b/sysdeps/powerpc/powerpc64/power6/memset.S
index 3e8ae2d..d61988a 100644
--- a/sysdeps/powerpc/powerpc64/power6/memset.S
+++ b/sysdeps/powerpc/powerpc64/power6/memset.S
@@ -47,14 +47,14 @@ L(_memset):
 
 /* Align to doubleword boundary.  */
 	cmpldi	cr5, rLEN, 31
-	rlwimi	rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
+	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
 	beq+	L(aligned2)
 	mtcrf	0x01, rMEMP0
 	subfic	rALIGN, rALIGN, 8
 	cror	28,30,31		/* Detect odd word aligned.  */
 	add	rMEMP, rMEMP, rALIGN
 	sub	rLEN, rLEN, rALIGN
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 	bt	29, L(g4)
 /* Process the even word of doubleword.  */
 	bf+	31, L(g2)
@@ -76,14 +76,14 @@ L(g0):
 
 /* Handle the case of size < 31.  */
 L(aligned2):
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 L(aligned):
 	mtcrf	0x01, rLEN
 	ble	cr5, L(medium)
 /* Align to 32-byte boundary.  */
 	andi.	rALIGN, rMEMP, 0x18
 	subfic	rALIGN, rALIGN, 0x20
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word. */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
 	beq	L(caligned)
 	mtcrf	0x01, rALIGN
 	add	rMEMP, rMEMP, rALIGN
@@ -344,7 +344,7 @@ L(le4):
 /* Memset of 0-31 bytes.  */
 	.align 5
 L(medium):
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word.  */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
 	cmpldi	cr1, rLEN, 16
 L(medium_tail2):
 	add	rMEMP, rMEMP, rLEN
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
index b24cfa1..8b081e8 100644
--- a/sysdeps/powerpc/powerpc64/power7/memset.S
+++ b/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -32,8 +32,8 @@ L(_memset):
 	mr	10,3
 
 	/* Replicate byte to word.  */
-	rlwimi	4,4,8,16,23
-	rlwimi	4,4,16,0,15
+	insrdi	4,4,8,48
+	insrdi	4,4,16,32
 	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
 
 	neg	0,3
@@ -321,7 +321,7 @@ L(medium):
 	clrldi	0,0,62
 	beq	L(medium_aligned)
 
-	/* Force 4-bytes alignment for SRC.  */
+	/* Force 4-bytes alignment for DST.  */
 	mtocrf	0x01,0
 	subf	5,0,5
 1:	/* Copy 1 byte.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=353f8c8865c510762346c7172420ecefc8a1c762

commit 353f8c8865c510762346c7172420ecefc8a1c762
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:16:22 2013 -0500

    PowerPC: LE memcpy
    
    Little-endian support for memcpy.  I spent some time cleaning up the
    64-bit power7 memcpy in order to avoid the extra alignment traps
    power7 takes for little-endian.
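
    The essential change in the word-unaligned copy paths is that the
    shift-and-merge trick must shift the other way once byte significance
    is reversed.  A rough C sketch of the idea (the helper name and layout
    are illustrative and not glibc code; off is assumed to be 1..3, as in
    the unaligned path, so neither shift count is 0 or 32):

        #include <stdint.h>

        /* Merge one aligned word's worth of data from a source that is
           'off' bytes past a word boundary.  w0 and w1 are the two
           consecutive aligned words straddling the wanted bytes.  */
        static inline uint32_t
        merge_unaligned_word (uint32_t w0, uint32_t w1, unsigned int off)
        {
          unsigned int sh = 8 * off;    /* bits to discard from w0 */
        #ifdef __LITTLE_ENDIAN__
          /* The first byte in string order is the least-significant byte
             of w0: drop its low bits, pull the rest from the low end of w1.  */
          return (w0 >> sh) | (w1 << (32 - sh));
        #else
          /* The first byte in string order is the most-significant byte
             of w0: drop its high bits, pull the rest from the high end of w1.  */
          return (w0 << sh) | (w1 >> (32 - sh));
        #endif
        }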

diff --git a/sysdeps/powerpc/powerpc32/power4/memcpy.S b/sysdeps/powerpc/powerpc32/power4/memcpy.S
index d914663..338d3cc 100644
--- a/sysdeps/powerpc/powerpc32/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power4/memcpy.S
@@ -203,15 +203,28 @@ EALIGN (memcpy, 5, 0)
     blt   cr6,5f
     srwi  7,6,16
     bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmplwi	cr1,10,16
@@ -339,13 +352,23 @@ EALIGN (memcpy, 5, 0)
     bf      30,1f
 
     /* there are at least two words to copy, so copy them */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10  /* shift 1st src word to left align it in R0 */
     srw   8,7,9   /* shift 2nd src word to right align it in R8 */
+#endif
     or    0,0,8   /* or them to get word to store */
     lwz   6,8(5)  /* load the 3rd src word */
     stw   0,0(4)  /* store the 1st dst word */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
     slw   0,7,10  /* now left align 2nd src word into R0 */
     srw   8,6,9   /* shift 3rd src word to right align it in R8 */
+#endif
     or    0,0,8   /* or them to get word to store */
     lwz   7,12(5)
     stw   0,4(4)  /* store the 2nd dst word */
@@ -353,8 +376,13 @@ EALIGN (memcpy, 5, 0)
     addi  5,5,16
     bf    31,4f
     /* there is a third word to copy, so copy it */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10  /* shift 3rd src word to left align it in R0 */
     srw   8,7,9   /* shift 4th src word to right align it in R8 */
+#endif
     or    0,0,8   /* or them to get word to store */
     stw   0,0(4)  /* store 3rd dst word */
     mr    6,7
@@ -364,8 +392,13 @@ EALIGN (memcpy, 5, 0)
     b     4f
     .align 4
 1:
+#ifdef __LITTLE_ENDIAN__
+    srw     0,6,10
+    slw     8,7,9
+#else
     slw     0,6,10  /* shift 1st src word to left align it in R0 */
     srw     8,7,9   /* shift 2nd src word to right align it in R8 */
+#endif
     addi  5,5,8
     or    0,0,8   /* or them to get word to store */
     bf    31,4f
@@ -378,23 +411,43 @@ EALIGN (memcpy, 5, 0)
     .align  4
 4:
     /* copy 16 bytes at a time */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10
     srw   8,7,9
+#endif
     or    0,0,8
     lwz   6,0(5)
     stw   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
     slw   0,7,10
     srw   8,6,9
+#endif
     or    0,0,8
     lwz   7,4(5)
     stw   0,4(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10
     srw   8,7,9
+#endif
     or    0,0,8
     lwz   6,8(5)
     stw   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
     slw   0,7,10
     srw   8,6,9
+#endif
     or    0,0,8
     lwz   7,12(5)
     stw   0,12(4)
@@ -403,8 +456,13 @@ EALIGN (memcpy, 5, 0)
     bdnz+ 4b
 8:
     /* calculate and store the final word */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10
     srw   8,7,9
+#endif
     or    0,0,8
     stw   0,0(4)
 3:
diff --git a/sysdeps/powerpc/powerpc32/power6/memcpy.S b/sysdeps/powerpc/powerpc32/power6/memcpy.S
index a76f71e..f58114a 100644
--- a/sysdeps/powerpc/powerpc32/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power6/memcpy.S
@@ -219,15 +219,28 @@ L(word_unaligned_short):
     blt   cr6,5f
     srwi  7,6,16
     bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmplwi	cr1,10,16
@@ -577,7 +590,11 @@ L(wdu1_32):
     lwz     6,-1(4)
     cmplwi  cr6,31,4
     srwi    8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi    6,6,8
+#else
     slwi    6,6,8
+#endif
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
     blt     cr5,L(wdu1_32tail)
     mtctr   8
@@ -585,8 +602,12 @@ L(wdu1_32):
 
     lwz   8,3(4)
     lwz   7,4(4)
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
     rlwimi 6,8,8,(32-8),31
+#endif
     b      L(wdu1_loop32x)
     .align  4
 L(wdu1_loop32):
@@ -595,8 +616,12 @@ L(wdu1_loop32):
     lwz   7,4(4)
     stw   10,-8(3)
     stw   11,-4(3)
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
 /*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
     rlwimi 6,8,8,(32-8),31
+#endif
 L(wdu1_loop32x):
     lwz   10,8(4)
     lwz   11,12(4)
@@ -613,7 +638,11 @@ L(wdu1_loop32x):
     stw   6,16(3)
     stw   7,20(3)
     addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,8
+#else
     slwi  6,8,8
+#endif
     bdnz+ L(wdu1_loop32)
     stw   10,-8(3)
     stw   11,-4(3)
@@ -624,8 +653,12 @@ L(wdu1_32tail):
     blt     cr6,L(wdu_4tail)
     /* calculate and store the final word */
     lwz   8,3(4)
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
+/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8  */
     rlwimi 6,8,8,(32-8),31
+#endif
     b     L(wdu_32tailx)
 
 L(wdu2_32):
@@ -633,7 +666,11 @@ L(wdu2_32):
     lwz     6,-2(4)
     cmplwi  cr6,31,4
     srwi    8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi    6,6,16
+#else
     slwi    6,6,16
+#endif
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
     blt     cr5,L(wdu2_32tail)
     mtctr   8
@@ -641,8 +678,11 @@ L(wdu2_32):
 
     lwz   8,2(4)
     lwz   7,4(4)
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
     rlwimi 6,8,16,(32-16),31
+#endif
     b      L(wdu2_loop32x)
     .align  4
 L(wdu2_loop32):
@@ -651,8 +691,11 @@ L(wdu2_loop32):
     lwz   7,4(4)
     stw   10,-8(3)
     stw   11,-4(3)
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
     rlwimi 6,8,16,(32-16),31
+#endif
 L(wdu2_loop32x):
     lwz   10,8(4)
     lwz   11,12(4)
@@ -670,7 +713,11 @@ L(wdu2_loop32x):
     stw   6,16(3)
     stw   7,20(3)
     addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,16
+#else
     slwi  6,8,16
+#endif
     bdnz+ L(wdu2_loop32)
     stw   10,-8(3)
     stw   11,-4(3)
@@ -681,8 +728,11 @@ L(wdu2_32tail):
     blt     cr6,L(wdu_4tail)
     /* calculate and store the final word */
     lwz   8,2(4)
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
     rlwimi 6,8,16,(32-16),31
+#endif
     b     L(wdu_32tailx)
 
 L(wdu3_32):
@@ -690,7 +740,11 @@ L(wdu3_32):
     lwz     6,-3(4)
     cmplwi  cr6,31,4
     srwi    8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi    6,6,24
+#else
     slwi    6,6,24
+#endif
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
     blt     cr5,L(wdu3_32tail)
     mtctr   8
@@ -698,8 +752,11 @@ L(wdu3_32):
 
     lwz   8,1(4)
     lwz   7,4(4)
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
     rlwimi 6,8,24,(32-24),31
+#endif
     b      L(wdu3_loop32x)
     .align  4
 L(wdu3_loop32):
@@ -708,8 +765,11 @@ L(wdu3_loop32):
     lwz   7,4(4)
     stw   10,-8(3)
     stw   11,-4(3)
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
     rlwimi 6,8,24,(32-24),31
+#endif
 L(wdu3_loop32x):
     lwz   10,8(4)
     lwz   11,12(4)
@@ -726,7 +786,11 @@ L(wdu3_loop32x):
     stw   6,16(3)
     stw   7,20(3)
     addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,24
+#else
     slwi  6,8,24
+#endif
     bdnz+ L(wdu3_loop32)
     stw   10,-8(3)
     stw   11,-4(3)
@@ -737,8 +801,11 @@ L(wdu3_32tail):
     blt     cr6,L(wdu_4tail)
     /* calculate and store the final word */
     lwz   8,1(4)
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
     rlwimi 6,8,24,(32-24),31
+#endif
     b     L(wdu_32tailx)
     .align  4
 L(wdu_32tailx):
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index 7f00778..acf3c10 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -383,7 +383,7 @@ L(copy_GE_32_unaligned):
 
 	beq    L(copy_GE_32_unaligned_cont)
 
-	/* SRC is not quadword aligned, get it aligned.  */
+	/* DST is not quadword aligned, get it aligned.  */
 
 	mtcrf   0x01,0
 	subf    31,0,5
@@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont):
 	mr      11,12
 	mtcrf   0x01,9
 	cmplwi  cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
 	lvsl    5,0,12
+#endif
 	lvx     3,0,12
 	bf      31,L(setup_unaligned_loop)
 
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 	lvx     4,12,6
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
 	vperm   6,3,4,5
+#endif
 	addi    11,12,16
 	addi    10,3,16
 	stvx    6,0,3
@@ -461,11 +469,17 @@ L(unaligned_loop):
 	vector instructions though.  */
 
 	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
 	addi    11,11,32
 	stvx    6,0,10
 	stvx    10,10,6
diff --git a/sysdeps/powerpc/powerpc32/power7/mempcpy.S b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
index 5ad4edb..4610ec5 100644
--- a/sysdeps/powerpc/powerpc32/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
@@ -325,7 +325,7 @@ L(copy_GE_32_unaligned):
 
 	beq	L(copy_GE_32_unaligned_cont)
 
-	/* SRC is not quadword aligned, get it aligned.  */
+	/* DST is not quadword aligned, get it aligned.  */
 
 	mtcrf	0x01,0
 	subf	31,0,5
@@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont):
 	mr	11,12
 	mtcrf	0x01,9
 	cmplwi	cr6,9,1
-	lvsl	5,0,12
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
+	lvsl    5,0,12
+#endif
 	lvx	3,0,12
 	bf	31,L(setup_unaligned_loop)
 
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 	lvx	4,12,6
-	vperm	6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	addi	11,12,16
 	addi	10,3,16
 	stvx	6,0,3
@@ -403,11 +411,17 @@ L(unaligned_loop):
 	vector instructions though.  */
 
 	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
 	addi	11,11,32
 	stvx	6,0,10
 	stvx	10,10,6
diff --git a/sysdeps/powerpc/powerpc64/memcpy.S b/sysdeps/powerpc/powerpc64/memcpy.S
index b8c4cc8..5fc7401 100644
--- a/sysdeps/powerpc/powerpc64/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/memcpy.S
@@ -212,15 +212,28 @@ EALIGN (memcpy, 5, 0)
     blt   cr6,5f
     srdi  7,6,16
     bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmpldi	cr1,10,16
@@ -328,7 +341,11 @@ EALIGN (memcpy, 5, 0)
     ld    7,8(5)
     subfic  9,10,64
     beq   2f
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+#else
     sld   0,6,10
+#endif
     cmpldi  11,1
     mr    6,7
     addi  4,4,-8
@@ -336,15 +353,25 @@ EALIGN (memcpy, 5, 0)
     b     1f
 2:  addi  5,5,8
     .align  4
+#ifdef __LITTLE_ENDIAN__
+0:  srd   0,6,10
+    sld   8,7,9
+#else
 0:  sld   0,6,10
     srd   8,7,9
+#endif
     cmpldi  11,2
     ld    6,8(5)
     or    0,0,8
     addi  11,11,-2
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,7,10
+1:  sld   8,6,9
+#else
     sld   0,7,10
 1:  srd   8,6,9
+#endif
     or    0,0,8
     beq   8f
     ld    7,16(5)
diff --git a/sysdeps/powerpc/powerpc64/power4/memcpy.S b/sysdeps/powerpc/powerpc64/power4/memcpy.S
index 4317c7e..f9a7260 100644
--- a/sysdeps/powerpc/powerpc64/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power4/memcpy.S
@@ -214,15 +214,28 @@ EALIGN (memcpy, 5, 0)
     blt   cr6,5f
     srdi  7,6,16
     bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmpldi	cr1,10,16
@@ -334,13 +347,23 @@ EALIGN (memcpy, 5, 0)
     bf      30,1f
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srd     0,6,10
+    sld     8,7,9
+#else
     sld     0,6,10
     srd     8,7,9
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srd     0,7,10
+    sld     8,6,9
+#else
     sld     0,7,10
     srd     8,6,9
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -349,8 +372,13 @@ EALIGN (memcpy, 5, 0)
     blt     cr6,8f  /* if total DWs = 3, then bypass loop */
     bf      31,4f
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srd     0,6,10
+    sld     8,7,9
+#else
     sld     0,6,10
     srd     8,7,9
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -361,8 +389,13 @@ EALIGN (memcpy, 5, 0)
     b       4f
     .align 4
 1:
+#ifdef __LITTLE_ENDIAN__
+    srd     0,6,10
+    sld     8,7,9
+#else
     sld     0,6,10
     srd     8,7,9
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,4f
@@ -373,23 +406,44 @@ EALIGN (memcpy, 5, 0)
     addi    4,4,8
     .align 4
 /* copy 32 bytes at a time */
-4:  sld   0,6,10
+4:
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+    sld   8,7,9
+#else
+    sld   0,6,10
     srd   8,7,9
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,7,10
+    sld   8,6,9
+#else
     sld   0,7,10
     srd   8,6,9
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+    sld   8,7,9
+#else
     sld   0,6,10
     srd   8,7,9
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,7,10
+    sld   8,6,9
+#else
     sld   0,7,10
     srd   8,6,9
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -399,8 +453,13 @@ EALIGN (memcpy, 5, 0)
     .align 4
 8:
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+    sld   8,7,9
+#else
     sld   0,6,10
     srd   8,7,9
+#endif
     or    0,0,8
     std   0,0(4)
 3:
diff --git a/sysdeps/powerpc/powerpc64/power6/memcpy.S b/sysdeps/powerpc/powerpc64/power6/memcpy.S
index d6d242d..e3f3d8a 100644
--- a/sysdeps/powerpc/powerpc64/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power6/memcpy.S
@@ -400,15 +400,28 @@ L(das_tail2):
     blt   cr6,5f
     srdi  7,6,16
     bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmpldi	cr1,10,16
@@ -595,13 +608,24 @@ L(du1_do):
     bf      30,L(du1_1dw)
 
     /* there are at least two DWs to copy */
+    /* FIXME: can combine last shift and "or" into "rldimi" */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
     sldi     0,6, 8
     srdi     8,7, 64-8
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 8
+    sldi     8,6, 64-8
+#else
     sldi     0,7, 8
     srdi     8,6, 64-8
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -610,8 +634,13 @@ L(du1_do):
     blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du1_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
     sldi     0,6, 8
     srdi     8,7, 64-8
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -622,8 +651,13 @@ L(du1_do):
     b       L(du1_loop)
     .align 4
 L(du1_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
     sldi     0,6, 8
     srdi     8,7, 64-8
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du1_loop)
@@ -635,23 +669,43 @@ L(du1_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du1_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
     sldi   0,6, 8
     srdi   8,7, 64-8
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 8
+    sldi   8,6, 64-8
+#else
     sldi   0,7, 8
     srdi   8,6, 64-8
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
     sldi   0,6, 8
     srdi   8,7, 64-8
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 8
+    sldi   8,6, 64-8
+#else
     sldi   0,7, 8
     srdi   8,6, 64-8
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -661,8 +715,13 @@ L(du1_loop):
     .align 4
 L(du1_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
     sldi   0,6, 8
     srdi   8,7, 64-8
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -672,13 +731,23 @@ L(du2_do):
     bf      30,L(du2_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
     sldi     0,6, 16
     srdi     8,7, 64-16
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 16
+    sldi     8,6, 64-16
+#else
     sldi     0,7, 16
     srdi     8,6, 64-16
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -687,8 +756,13 @@ L(du2_do):
     blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du2_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
     sldi     0,6, 16
     srdi     8,7, 64-16
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -699,8 +773,13 @@ L(du2_do):
     b       L(du2_loop)
     .align 4
 L(du2_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
     sldi     0,6, 16
     srdi     8,7, 64-16
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du2_loop)
@@ -712,23 +791,43 @@ L(du2_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du2_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
     sldi   0,6, 16
     srdi   8,7, 64-16
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 16
+    sldi   8,6, 64-16
+#else
     sldi   0,7, 16
     srdi   8,6, 64-16
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
     sldi   0,6, 16
     srdi   8,7, 64-16
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 16
+    sldi   8,6, 64-16
+#else
     sldi   0,7, 16
     srdi   8,6, 64-16
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -738,8 +837,13 @@ L(du2_loop):
     .align 4
 L(du2_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
     sldi   0,6, 16
     srdi   8,7, 64-16
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -749,13 +853,23 @@ L(du3_do):
     bf      30,L(du3_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
     sldi     0,6, 24
     srdi     8,7, 64-24
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 24
+    sldi     8,6, 64-24
+#else
     sldi     0,7, 24
     srdi     8,6, 64-24
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -764,8 +878,13 @@ L(du3_do):
     blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du3_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
     sldi     0,6, 24
     srdi     8,7, 64-24
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -776,8 +895,13 @@ L(du3_do):
     b       L(du3_loop)
     .align 4
 L(du3_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
     sldi     0,6, 24
     srdi     8,7, 64-24
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du3_loop)
@@ -789,23 +913,43 @@ L(du3_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du3_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
     sldi   0,6, 24
     srdi   8,7, 64-24
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 24
+    sldi   8,6, 64-24
+#else
     sldi   0,7, 24
     srdi   8,6, 64-24
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
     sldi   0,6, 24
     srdi   8,7, 64-24
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 24
+    sldi   8,6, 64-24
+#else
     sldi   0,7, 24
     srdi   8,6, 64-24
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -815,8 +959,13 @@ L(du3_loop):
     .align 4
 L(du3_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
     sldi   0,6, 24
     srdi   8,7, 64-24
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -832,13 +981,23 @@ L(du4_dox):
     bf      30,L(du4_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
     sldi     0,6, 32
     srdi     8,7, 64-32
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 32
+    sldi     8,6, 64-32
+#else
     sldi     0,7, 32
     srdi     8,6, 64-32
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -847,8 +1006,13 @@ L(du4_dox):
     blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du4_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
     sldi     0,6, 32
     srdi     8,7, 64-32
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -859,8 +1023,13 @@ L(du4_dox):
     b       L(du4_loop)
     .align 4
 L(du4_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
     sldi     0,6, 32
     srdi     8,7, 64-32
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du4_loop)
@@ -872,23 +1041,43 @@ L(du4_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du4_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
     sldi   0,6, 32
     srdi   8,7, 64-32
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 32
+    sldi   8,6, 64-32
+#else
     sldi   0,7, 32
     srdi   8,6, 64-32
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
     sldi   0,6, 32
     srdi   8,7, 64-32
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 32
+    sldi   8,6, 64-32
+#else
     sldi   0,7, 32
     srdi   8,6, 64-32
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -898,8 +1087,13 @@ L(du4_loop):
     .align 4
 L(du4_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
     sldi   0,6, 32
     srdi   8,7, 64-32
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -909,13 +1103,23 @@ L(du5_do):
     bf      30,L(du5_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
     sldi     0,6, 40
     srdi     8,7, 64-40
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 40
+    sldi     8,6, 64-40
+#else
     sldi     0,7, 40
     srdi     8,6, 64-40
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -924,8 +1128,13 @@ L(du5_do):
     blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du5_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
     sldi     0,6, 40
     srdi     8,7, 64-40
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -936,8 +1145,13 @@ L(du5_do):
     b       L(du5_loop)
     .align 4
 L(du5_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
     sldi     0,6, 40
     srdi     8,7, 64-40
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du5_loop)
@@ -949,23 +1163,43 @@ L(du5_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du5_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
     sldi   0,6, 40
     srdi   8,7, 64-40
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 40
+    sldi   8,6, 64-40
+#else
     sldi   0,7, 40
     srdi   8,6, 64-40
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
     sldi   0,6, 40
     srdi   8,7, 64-40
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 40
+    sldi   8,6, 64-40
+#else
     sldi   0,7, 40
     srdi   8,6, 64-40
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -975,8 +1209,13 @@ L(du5_loop):
     .align 4
 L(du5_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
     sldi   0,6, 40
     srdi   8,7, 64-40
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -986,13 +1225,23 @@ L(du6_do):
     bf      30,L(du6_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
     sldi     0,6, 48
     srdi     8,7, 64-48
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 48
+    sldi     8,6, 64-48
+#else
     sldi     0,7, 48
     srdi     8,6, 64-48
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -1001,8 +1250,13 @@ L(du6_do):
     blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du6_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
     sldi     0,6, 48
     srdi     8,7, 64-48
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -1013,8 +1267,13 @@ L(du6_do):
     b       L(du6_loop)
     .align 4
 L(du6_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
     sldi     0,6, 48
     srdi     8,7, 64-48
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du6_loop)
@@ -1026,23 +1285,43 @@ L(du6_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du6_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
     sldi   0,6, 48
     srdi   8,7, 64-48
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 48
+    sldi   8,6, 64-48
+#else
     sldi   0,7, 48
     srdi   8,6, 64-48
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
     sldi   0,6, 48
     srdi   8,7, 64-48
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 48
+    sldi   8,6, 64-48
+#else
     sldi   0,7, 48
     srdi   8,6, 64-48
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -1052,8 +1331,13 @@ L(du6_loop):
     .align 4
 L(du6_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
     sldi   0,6, 48
     srdi   8,7, 64-48
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -1063,13 +1347,23 @@ L(du7_do):
     bf      30,L(du7_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
     sldi     0,6, 56
     srdi     8,7, 64-56
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 56
+    sldi     8,6, 64-56
+#else
     sldi     0,7, 56
     srdi     8,6, 64-56
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -1078,8 +1372,13 @@ L(du7_do):
     blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du7_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
     sldi     0,6, 56
     srdi     8,7, 64-56
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -1090,8 +1389,13 @@ L(du7_do):
     b       L(du7_loop)
     .align 4
 L(du7_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
     sldi     0,6, 56
     srdi     8,7, 64-56
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du7_loop)
@@ -1103,23 +1407,43 @@ L(du7_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du7_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
     sldi   0,6, 56
     srdi   8,7, 64-56
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 56
+    sldi   8,6, 64-56
+#else
     sldi   0,7, 56
     srdi   8,6, 64-56
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
     sldi   0,6, 56
     srdi   8,7, 64-56
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 56
+    sldi   8,6, 64-56
+#else
     sldi   0,7, 56
     srdi   8,6, 64-56
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -1129,8 +1453,13 @@ L(du7_loop):
     .align 4
 L(du7_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
     sldi   0,6, 56
     srdi   8,7, 64-56
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 800a9f1..e8df75f 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -23,418 +23,361 @@
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
    Returns 'dst'.  */
 
+#define dst 11		/* Use r11 so r3 kept unchanged.  */
+#define src 4
+#define cnt 5
+
 	.machine power7
 EALIGN (memcpy, 5, 0)
 	CALL_MCOUNT 3
 
-	cmpldi  cr1,5,31
+	cmpldi	cr1,cnt,31
 	neg	0,3
-	std	3,-16(1)
-	std	31,-8(1)
-	cfi_offset(31,-8)
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
 				    code.  */
 
-	andi.   11,3,7	      /* Check alignment of DST.  */
-
-
-	clrldi  10,4,61       /* Check alignment of SRC.  */
-	cmpld   cr6,10,11     /* SRC and DST alignments match?  */
-	mr	12,4
-	mr	31,5
+#ifdef __LITTLE_ENDIAN__
+/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
+   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
+   loop is only used for quadword aligned copies.  */
+	andi.	10,3,15
+	clrldi	11,4,60
+#else
+	andi.	10,3,7		/* Check alignment of DST.  */
+	clrldi	11,4,61		/* Check alignment of SRC.  */
+#endif
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
+
+	mr	dst,3
 	bne	cr6,L(copy_GE_32_unaligned)
+	beq	L(aligned_copy)
 
-	srdi    9,5,3	      /* Number of full quadwords remaining.  */
-
-	beq    L(copy_GE_32_aligned_cont)
-
-	clrldi  0,0,61
-	mtcrf   0x01,0
-	subf    31,0,5
-
-	/* Get the SRC aligned to 8 bytes.  */
-
-1:	bf	31,2f
-	lbz	6,0(12)
-	addi    12,12,1
-	stb	6,0(3)
-	addi    3,3,1
-2:	bf      30,4f
-	lhz     6,0(12)
-	addi    12,12,2
-	sth     6,0(3)
-	addi    3,3,2
-4:	bf      29,0f
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-0:
-	clrldi  10,12,61      /* Check alignment of SRC again.  */
-	srdi    9,31,3	      /* Number of full doublewords remaining.  */
-
-L(copy_GE_32_aligned_cont):
-
-	clrldi  11,31,61
-	mtcrf   0x01,9
-
-	srdi    8,31,5
-	cmpldi  cr1,9,4
-	cmpldi  cr6,11,0
-	mr	11,12
-
-	/* Copy 1~3 doublewords so the main loop starts
-	at a multiple of 32 bytes.  */
+	mtocrf	0x01,0
+#ifdef __LITTLE_ENDIAN__
+	clrldi	0,0,60
+#else
+	clrldi	0,0,61
+#endif
 
-	bf	30,1f
-	ld      6,0(12)
-	ld      7,8(12)
-	addi    11,12,16
-	mtctr   8
-	std     6,0(3)
-	std     7,8(3)
-	addi    10,3,16
-	bf      31,4f
-	ld      0,16(12)
-	std     0,16(3)
-	blt     cr1,3f
-	addi    11,12,24
-	addi    10,3,24
-	b       4f
-
-	.align  4
-1:	/* Copy 1 doubleword and set the counter.  */
-	mr	10,3
-	mtctr   8
-	bf      31,4f
-	ld      6,0(12)
-	addi    11,12,8
-	std     6,0(3)
-	addi    10,3,8
-
-L(aligned_copy):
-	/* Main aligned copy loop. Copies up to 128-bytes at a time. */
-	.align  4
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
+1:
+	bf	31,2f
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
+	bf	30,4f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
 4:
-	/* check for any 32-byte or 64-byte lumps that are outside of a
-	   nice 128-byte range.  R8 contains the number of 32-byte
-	   lumps, so drop this into the CR, and use the SO/EQ bits to help
-	   handle the 32- or 64- byte lumps.  Then handle the rest with an
-	   unrolled 128-bytes-at-a-time copy loop. */
-	mtocrf	1,8
-	li	6,16	# 16() index
-	li	7,32	# 32() index
-	li	8,48	# 48() index
-
-L(aligned_32byte):
-	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
-	bns	cr7,L(aligned_64byte)
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	addi	11,11,32
-	stxvd2x	6,0,10
-	stxvd2x	7,10,6
-	addi	10,10,32
-
-L(aligned_64byte):
-	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
-	bne	cr7,L(aligned_128setup)
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	lxvd2x	8,11,7
-	lxvd2x	9,11,8
-	addi	11,11,64
-	stxvd2x	6,0,10
-	stxvd2x	7,10,6
-	stxvd2x	8,10,7
-	stxvd2x	9,10,8
-	addi	10,10,64
-
-L(aligned_128setup):
-	/* Set up for the 128-byte at a time copy loop.  */
-	srdi	8,31,7
-	cmpdi	8,0	# Any 4x lumps left?
-	beq	3f	# if not, move along.
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	mtctr	8	# otherwise, load the ctr and begin.
-	li	8,48	# 48() index
+	bf	29,8f
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
+#ifdef __LITTLE_ENDIAN__
+	bf	28,16f
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
+16:
+#endif
+	subf	cnt,0,cnt
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+	li	6,16
+	li	7,32
+	li	8,48
+	mtocrf	0x02,cnt
+	srdi	12,cnt,7
+	cmpdi	12,0
+	beq	L(aligned_tail)
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	mtctr	12
 	b	L(aligned_128loop)
 
+	.align  4
 L(aligned_128head):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
 L(aligned_128loop):
-	lxvd2x	8,11,7
-	lxvd2x	9,11,8
-	stxvd2x	6,0,10
-	addi	11,11,64
-	stxvd2x	7,10,6
-	stxvd2x	8,10,7
-	stxvd2x	9,10,8
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	addi	10,10,64
-	lxvd2x	8,11,7
-	lxvd2x	9,11,8
-	addi	11,11,64
-	stxvd2x	6,0,10
-	stxvd2x	7,10,6
-	stxvd2x	8,10,7
-	stxvd2x	9,10,8
-	addi	10,10,64
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	stxvd2x	6,0,dst
+	addi	src,src,64
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	addi	dst,dst,64
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	addi	src,src,64
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	addi	dst,dst,64
 	bdnz	L(aligned_128head)
 
-3:
-	/* Check for tail bytes.  */
-	rldicr  0,31,0,60
-	mtcrf   0x01,31
-	beq	cr6,0f
-
-.L9:
-	add	3,3,0
-	add	12,12,0
-
-	/*  At this point we have a tail of 0-7 bytes and we know that the
-	destination is doubleword-aligned.  */
-4:	/* Copy 4 bytes.  */
-	bf	29,2f
-
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-2:	/* Copy 2 bytes.  */
-	bf	30,1f
-
-	lhz     6,0(12)
-	addi    12,12,2
-	sth     6,0(3)
-	addi    3,3,2
-1:	/* Copy 1 byte.  */
-	bf	31,0f
-
-	lbz	6,0(12)
-	stb	6,0(3)
-0:	/* Return original DST pointer.  */
-	ld	31,-8(1)
-	ld	3,-16(1)
+L(aligned_tail):
+	mtocrf	0x01,cnt
+	bf	25,32f
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	addi	src,src,64
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	addi	dst,dst,64
+32:
+	bf	26,16f
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	addi	src,src,32
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	addi	dst,dst,32
+16:
+	bf	27,8f
+	lxvd2x	6,0,src
+	addi	src,src,16
+	stxvd2x	6,0,dst
+	addi	dst,dst,16
+8:
+	bf	28,4f
+	ld	6,0(src)
+	addi	src,src,8
+	std     6,0(dst)
+	addi	dst,dst,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw     6,0(dst)
+	bf      30,L(tail5)
+	lhz     7,4(src)
+	sth     7,4(dst)
+	bflr	31
+	lbz     8,6(src)
+	stb     8,6(dst)
+	/* Return original DST pointer.  */
 	blr
 
-	/* Handle copies of 0~31 bytes.  */
-	.align  4
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
 L(copy_LT_32):
-	cmpldi  cr6,5,8
-	mr	12,4
-	mtcrf   0x01,5
+	mr	dst,3
+	cmpldi	cr6,cnt,8
+	mtocrf	0x01,cnt
 	ble	cr6,L(copy_LE_8)
 
 	/* At least 9 bytes to go.  */
 	neg	8,4
-	clrrdi  11,4,2
-	andi.   0,8,3
-	cmpldi  cr1,5,16
-	mr	10,5
+	andi.	0,8,3
+	cmpldi	cr1,cnt,16
 	beq	L(copy_LT_32_aligned)
 
-	/* Force 4-bytes alignment for SRC.  */
-	mtocrf  0x01,0
-	subf    10,0,5
-2:	bf	30,1f
-
-	lhz	6,0(12)
-	addi    12,12,2
-	sth	6,0(3)
-	addi    3,3,2
-1:	bf	31,L(end_4bytes_alignment)
-
-	lbz	6,0(12)
-	addi    12,12,1
-	stb	6,0(3)
-	addi    3,3,1
-
-	.align  4
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
+2:
+	bf	30,1f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+1:
+	bf	31,L(end_4bytes_alignment)
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+
+	.align	4
 L(end_4bytes_alignment):
-	cmpldi  cr1,10,16
-	mtcrf   0x01,10
+	cmpldi	cr1,cnt,16
+	mtocrf	0x01,cnt
 
 L(copy_LT_32_aligned):
 	/* At least 6 bytes to go, and SRC is word-aligned.  */
 	blt	cr1,8f
 
 	/* Copy 16 bytes.  */
-	lwz	6,0(12)
-	lwz     7,4(12)
-	stw     6,0(3)
-	lwz     8,8(12)
-	stw     7,4(3)
-	lwz     6,12(12)
-	addi    12,12,16
-	stw     8,8(3)
-	stw     6,12(3)
-	addi    3,3,16
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	lwz	8,8(src)
+	stw	7,4(dst)
+	lwz	6,12(src)
+	addi	src,src,16
+	stw	8,8(dst)
+	stw	6,12(dst)
+	addi	dst,dst,16
 8:	/* Copy 8 bytes.  */
-	bf	28,4f
+	bf	28,L(tail4)
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
+	blr
 
-	lwz     6,0(12)
-	lwz     7,4(12)
-	addi    12,12,8
-	stw     6,0(3)
-	stw     7,4(3)
-	addi    3,3,8
-4:	/* Copy 4 bytes.  */
-	bf	29,2f
-
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-2:	/* Copy 2-3 bytes.  */
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
 	bf	30,1f
-
-	lhz     6,0(12)
-	sth     6,0(3)
-	bf      31,0f
-	lbz     7,2(12)
-	stb     7,2(3)
-	ld	3,-16(1)
+	lhz	6,0(src)
+	sth	6,0(dst)
+	bflr	31
+	lbz	7,2(src)
+	stb	7,2(dst)
 	blr
 
-	.align  4
-1:	/* Copy 1 byte.  */
-	bf	31,0f
+	.align	4
+L(tail5):
+	bflr	31
+	lbz	6,4(src)
+	stb	6,4(dst)
+	blr
 
-	lbz	6,0(12)
-	stb	6,0(3)
-0:	/* Return original DST pointer.  */
-	ld	3,-16(1)
+	.align	4
+1:
+	bflr	31
+	lbz	6,0(src)
+	stb	6,0(dst)
+	/* Return original DST pointer.  */
 	blr
 
-	/* Handles copies of 0~8 bytes.  */
-	.align  4
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
 L(copy_LE_8):
-	bne	cr6,4f
+	bne	cr6,L(tail4)
 
 	/* Though we could've used ld/std here, they are still
 	slow for unaligned cases.  */
 
-	lwz	6,0(4)
-	lwz     7,4(4)
-	stw     6,0(3)
-	stw     7,4(3)
-	ld      3,-16(1)      /* Return original DST pointers.  */
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	stw	7,4(dst)
 	blr
 
-	.align  4
-4:	/* Copies 4~7 bytes.  */
-	bf	29,2b
-
-	lwz	6,0(4)
-	stw     6,0(3)
-	bf      30,5f
-	lhz     7,4(4)
-	sth     7,4(3)
-	bf      31,0f
-	lbz     8,6(4)
-	stb     8,6(3)
-	ld	3,-16(1)
-	blr
-
-	.align  4
-5:	/* Copy 1 byte.  */
-	bf	31,0f
-
-	lbz	6,4(4)
-	stb	6,4(3)
-
-0:	/* Return original DST pointer.  */
-	ld	3,-16(1)
-	blr
 
-	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
-	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
-	the data, allowing for aligned DST stores.  */
-	.align  4
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
 L(copy_GE_32_unaligned):
-	clrldi  0,0,60	      /* Number of bytes until the 1st
-			      quadword.  */
-	andi.   11,3,15       /* Check alignment of DST (against
-			      quadwords).  */
-	srdi    9,5,4	      /* Number of full quadwords remaining.  */
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
+#ifndef __LITTLE_ENDIAN__
+	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
+#endif
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
 
 	beq	L(copy_GE_32_unaligned_cont)
 
-	/* SRC is not quadword aligned, get it aligned.  */
+	/* DST is not quadword aligned, get it aligned.  */
 
-	mtcrf   0x01,0
-	subf    31,0,5
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
 
 	/* Vector instructions work best when proper alignment (16-bytes)
 	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
-1:	/* Copy 1 byte.  */
+1:
 	bf	31,2f
-
-	lbz	6,0(12)
-	addi    12,12,1
-	stb	6,0(3)
-	addi    3,3,1
-2:	/* Copy 2 bytes.  */
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
 	bf	30,4f
-
-	lhz     6,0(12)
-	addi    12,12,2
-	sth     6,0(3)
-	addi    3,3,2
-4:	/* Copy 4 bytes.  */
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+4:
 	bf	29,8f
-
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-8:	/* Copy 8 bytes.  */
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
 	bf	28,0f
-
-	ld	6,0(12)
-	addi    12,12,8
-	std	6,0(3)
-	addi    3,3,8
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
 0:
-	clrldi  10,12,60      /* Check alignment of SRC.  */
-	srdi    9,31,4	      /* Number of full quadwords remaining.  */
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
 
 	/* The proper alignment is present, it is OK to copy the bytes now.  */
 L(copy_GE_32_unaligned_cont):
 
 	/* Setup two indexes to speed up the indexed vector operations.  */
-	clrldi  11,31,60
-	li      6,16	      /* Index for 16-bytes offsets.  */
+	clrldi	10,cnt,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
 	li	7,32	      /* Index for 32-bytes offsets.  */
-	cmpldi  cr1,11,0
-	srdi    8,31,5	      /* Setup the loop counter.  */
-	mr      10,3
-	mr      11,12
-	mtcrf   0x01,9
-	cmpldi  cr6,9,1
-	lvsl    5,0,12
-	lvx     3,0,12
-	bf      31,L(setup_unaligned_loop)
-
-	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
-	lvx     4,12,6
-	vperm   6,3,4,5
-	addi    11,12,16
-	addi    10,3,16
-	stvx    6,0,3
+	cmpldi	cr1,10,0
+	srdi	8,cnt,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	5,0,src
+#else
+	lvsl	5,0,src
+#endif
+	lvx	3,0,src
+	li	0,0
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	addi	src,src,16
+	stvx	6,0,dst
+	addi	dst,dst,16
 	vor	3,4,4
+	clrrdi	0,src,60
 
 L(setup_unaligned_loop):
-	mtctr   8
-	ble     cr6,L(end_unaligned_loop)
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
 
 	/* Copy 32 bytes at a time using vector instructions.  */
-	.align  4
+	.align	4
 L(unaligned_loop):
 
 	/* Note: vr6/vr10 may contain data that was already copied,
@@ -442,62 +385,55 @@ L(unaligned_loop):
 	some portions again. This is faster than having unaligned
 	vector instructions though.  */
 
-	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr6.  */
-	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr10.  */
-	addi    11,11,32
-	stvx    6,0,10
-	stvx    10,10,6
-	addi    10,10,32
-
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	lvx	3,src,7
+#ifdef __LITTLE_ENDIAN__
+	vperm	10,3,4,5
+#else
+	vperm	10,4,3,5
+#endif
+	addi	src,src,32
+	stvx	6,0,dst
+	stvx	10,dst,6
+	addi	dst,dst,32
 	bdnz	L(unaligned_loop)
 
-	.align  4
+	clrrdi	0,src,60
+
+	.align	4
 L(end_unaligned_loop):
 
 	/* Check for tail bytes.  */
-	rldicr  0,31,0,59
-	mtcrf   0x01,31
-	beq	cr1,0f
+	mtocrf	0x01,cnt
+	beqlr	cr1
 
-	add	3,3,0
-	add	12,12,0
+	add	src,src,0
 
 	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
-8:	/* Copy 8 bytes.  */
+	/* Copy 8 bytes.  */
 	bf	28,4f
-
-	lwz	6,0(12)
-	lwz	7,4(12)
-	addi    12,12,8
-	stw	6,0(3)
-	stw	7,4(3)
-	addi    3,3,8
-4:	/* Copy 4 bytes.  */
-	bf	29,2f
-
-	lwz	6,0(12)
-	addi    12,12,4
-	stw	6,0(3)
-	addi    3,3,4
-2:	/* Copy 2~3 bytes.  */
-	bf	30,1f
-
-	lhz	6,0(12)
-	addi    12,12,2
-	sth	6,0(3)
-	addi    3,3,2
-1:	/* Copy 1 byte.  */
-	bf	31,0f
-
-	lbz	6,0(12)
-	stb	6,0(3)
-0:	/* Return original DST pointer.  */
-	ld	31,-8(1)
-	ld	3,-16(1)
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
 	blr
 
 END_GEN_TB (memcpy,TB_TOCLESS)
diff --git a/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
index f20be93..b93ab7d 100644
--- a/sysdeps/powerpc/powerpc64/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
@@ -365,13 +365,21 @@ L(copy_GE_32_unaligned_cont):
 	mr	11,12
 	mtcrf	0x01,9
 	cmpldi	cr6,9,1
-	lvsl	5,0,12
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
+	lvsl    5,0,12
+#endif
 	lvx	3,0,12
 	bf	31,L(setup_unaligned_loop)
 
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 	lvx	4,12,6
-	vperm	6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	addi	11,12,16
 	addi	10,3,16
 	stvx	6,0,3
@@ -391,11 +399,17 @@ L(unaligned_loop):
 	vector instructions though.  */
 
 	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
 	addi	11,11,32
 	stvx	6,0,10
 	stvx	10,10,6

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f316eedfc4a5e8871729c6a3cffc4bbc27e2813f

commit f316eedfc4a5e8871729c6a3cffc4bbc27e2813f
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:16:05 2013 -0500

    PowerPC: LE strchr
    
    Adds little-endian support to optimised strchr assembly.  I've also
    tweaked the big-endian code a little.  In power7/strchr.S the tail of
    the function checks that we didn't match a 0 before finding a c match,
    which was done by comparing leading zero counts.  It's just as valid,
    and quicker, to compare the raw output from cmpb.
    
    Another little tweak is to use rldimi/insrdi in place of rlwimi for
    the power7 strchr functions.  Since rlwimi is cracked, it is a few
    cycles slower.  rldimi can be used on the 32-bit power7 functions
    too.
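
    Comparing the raw cmpb outputs works in big-endian because an earlier
    match sets a more significant byte, so the numerically larger mask is
    the earlier one.  To make the tail concrete, here is a rough C sketch
    of how a cmpb-style match mask is turned into a byte offset on each
    endianness (the helper name is illustrative, not glibc code; mask is
    assumed non-zero, since the assembly only reaches this point after a
    match was found):

        #include <stdint.h>

        /* mask has 0xff in each byte position that matched.  */
        static inline unsigned int
        first_match_offset (uint32_t mask)
        {
        #ifdef __LITTLE_ENDIAN__
          /* The first byte in string order is the least significant, so
             count trailing zero bits; (mask - 1) & ~mask isolates them,
             matching the addi/andc/popcntw sequence in the patch.  */
          return __builtin_popcount ((mask - 1) & ~mask) / 8;
        #else
          /* The first byte in string order is the most significant, so
             count leading zero bits, as the old code did.  */
          return (unsigned int) __builtin_clz (mask) / 8;
        #endif
        }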

diff --git a/sysdeps/powerpc/powerpc32/power7/strchr.S b/sysdeps/powerpc/powerpc32/power7/strchr.S
index 0ecadb2..b662659 100644
--- a/sysdeps/powerpc/powerpc32/power7/strchr.S
+++ b/sysdeps/powerpc/powerpc32/power7/strchr.S
@@ -35,8 +35,8 @@ ENTRY (strchr)
 	beq	cr7,L(null_match)
 
 	/* Replicate byte to word.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 
 	/* Now r4 has a word of c bytes and r0 has
 	   a word of null bytes.  */
@@ -46,11 +46,17 @@ ENTRY (strchr)
 
 	/* Move the words left and right to discard the bits that are
 	   not part of the string and to bring them back as zeros.  */
-
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	srw	r11,r11,r6
+	slw	r10,r10,r6
+	slw	r11,r11,r6
+#else
 	slw	r10,r10,r6
 	slw	r11,r11,r6
 	srw	r10,r10,r6
 	srw	r11,r11,r6
+#endif
 	or	r5,r10,r11    /* OR the results to speed things up.  */
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
 				 have been found.  */
@@ -65,7 +71,7 @@ ENTRY (strchr)
 
 	/* Handle WORD2 of pair.  */
 	lwzu	r12,4(r8)
-	cmpb    r10,r12,r4
+	cmpb	r10,r12,r4
 	cmpb	r11,r12,r0
 	or	r5,r10,r11
 	cmpwi	cr7,r5,0
@@ -100,22 +106,31 @@ L(loop):
 	bne	cr6,L(done)
 
 	/* The c/null byte must be in the second word.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
-	   pointer.  */
+	   again and move the result of cmpb to r10/r11 so we can calculate
+	   the pointer.  */
 
 	mr	r10,r6
 	mr	r11,r7
 	addi	r8,r8,4
 
-	/* r5 has the output of the cmpb instruction, that is, it contains
+	/* r10/r11 have the output of the cmpb instructions, that is,
 	   0xff in the same position as the c/null byte in the original
 	   word from the string.  Use that to calculate the pointer.  */
 L(done):
-	cntlzw	r4,r10	      /* Count leading zeroes before c matches.  */
-	cntlzw	r0,r11	      /* Count leading zeroes before null matches.  */
-	cmplw	cr7,r4,r0
+#ifdef __LITTLE_ENDIAN__
+	addi    r3,r10,-1
+	andc    r3,r3,r10
+	popcntw	r0,r3
+	addi    r4,r11,-1
+	andc    r4,r4,r11
+	cmplw	cr7,r3,r4
+	bgt	cr7,L(no_match)
+#else
+	cntlzw	r0,r10	      /* Count leading zeros before c matches.  */
+	cmplw	cr7,r11,r10
 	bgt	cr7,L(no_match)
-	srwi	r0,r4,3	      /* Convert leading zeroes to bytes.  */
+#endif
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching c byte
 				 or null in case c was not found.  */
 	blr
@@ -133,10 +148,14 @@ L(null_match):
 	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
 
 	/* Move the words left and right to discard the bits that are
-	   not part of the string and to bring them back as zeros.  */
-
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r5,r5,r6
+	slw	r5,r5,r6
+#else
 	slw	r5,r5,r6
 	srw	r5,r5,r6
+#endif
 	cmpwi	cr7,r5,0      /* If r10 == 0, no c or null bytes
 				 have been found.  */
 	bne	cr7,L(done_null)
@@ -191,7 +210,13 @@ L(loop_null):
 	   0xff in the same position as the null byte in the original
 	   word from the string.  Use that to calculate the pointer.  */
 L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntw	r0,r0
+#else
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
+#endif
 	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching null byte.  */
 	blr
diff --git a/sysdeps/powerpc/powerpc32/power7/strchrnul.S b/sysdeps/powerpc/powerpc32/power7/strchrnul.S
index d4cacab..f5d24d4 100644
--- a/sysdeps/powerpc/powerpc32/power7/strchrnul.S
+++ b/sysdeps/powerpc/powerpc32/power7/strchrnul.S
@@ -27,8 +27,8 @@ ENTRY (__strchrnul)
 	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
 
 	/* Replicate byte to word.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
 	lwz	r12,0(r8)     /* Load word from memory.  */
@@ -43,10 +43,17 @@ ENTRY (__strchrnul)
 
 	/* Move the words left and right to discard the bits that are
 	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	srw	r9,r9,r6
+	slw	r10,r10,r6
+	slw	r9,r9,r6
+#else
 	slw	r10,r10,r6
 	slw	r9,r9,r6
 	srw	r10,r10,r6
 	srw	r9,r9,r6
+#endif
 	or	r5,r9,r10     /* OR the results to speed things up.  */
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
 				 have been found.  */
@@ -54,7 +61,7 @@ ENTRY (__strchrnul)
 
 	mtcrf   0x01,r8
 
-	/* Are we now aligned to a quadword boundary?  If so, skip to
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
 
 	bt	29,L(loop)
@@ -76,7 +83,7 @@ L(loop):
 	   single register for speed.  This is an attempt
 	   to speed up the null-checking process for bigger strings.  */
 	lwz	r12,4(r8)
-	lwzu     r11,8(r8)
+	lwzu	r11,8(r8)
 	cmpb	r10,r12,r0
 	cmpb	r9,r12,r4
 	cmpb	r6,r11,r0
@@ -95,9 +102,9 @@ L(loop):
 	addi	r8,r8,-4
 	bne	cr6,L(done)
 
-	/* The c/null byte must be in the second word.  Adjust the
-	   address again and move the result of cmpb to r10 so we can calculate
-	   the pointer.  */
+	/* The c/null byte must be in the second word.  Adjust the address
+	   again and move the result of cmpb to r5 so we can calculate the
+	   pointer.  */
 	mr	r5,r10
 	addi	r8,r8,4
 
@@ -105,7 +112,13 @@ L(loop):
 	   0xff in the same position as the c/null byte in the original
 	   word from the string.  Use that to calculate the pointer.  */
 L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntw	r0,r0
+#else
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
+#endif
 	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of matching c/null byte.  */
 	blr
diff --git a/sysdeps/powerpc/powerpc32/strchr.S b/sysdeps/powerpc/powerpc32/strchr.S
index c9952ee..6050565 100644
--- a/sysdeps/powerpc/powerpc32/strchr.S
+++ b/sysdeps/powerpc/powerpc32/strchr.S
@@ -36,6 +36,8 @@ ENTRY (strchr)
 #define rIGN	r10	/* number of bits we should ignore in the first word */
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
 #define rTMP3	r12
+#define rTMP4	rIGN
+#define rTMP5	rMASK
 
 
 	rlwimi	rCHR, rCHR, 8, 16, 23
@@ -49,64 +51,93 @@ ENTRY (strchr)
 	addi	r7F7F, r7F7F, 0x7f7f
 /* Test the first (partial?) word.  */
 	lwz	rWORD, 0(rSTR)
+#ifdef __LITTLE_ENDIAN__
+	slw	rMASK, rMASK, rIGN
+#else
 	srw	rMASK, rMASK, rIGN
+#endif
 	orc	rWORD, rWORD, rMASK
 	add	rTMP1, rFEFE, rWORD
 	nor	rTMP2, r7F7F, rWORD
-	and.	rTMP1, rTMP1, rTMP2
+	and.	rTMP4, rTMP1, rTMP2
 	xor	rTMP3, rCHR, rWORD
 	orc	rTMP3, rTMP3, rMASK
 	b	L(loopentry)
 
 /* The loop.  */
 
-L(loop):lwzu rWORD, 4(rSTR)
-	and.	rTMP1, rTMP1, rTMP2
+L(loop):
+	lwzu	rWORD, 4(rSTR)
+	and.	rTMP5, rTMP1, rTMP2
 /* Test for 0.	*/
-	add	rTMP1, rFEFE, rWORD
-	nor	rTMP2, r7F7F, rWORD
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
 	bne	L(foundit)
-	and.	rTMP1, rTMP1, rTMP2
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
 /* Start test for the bytes we're looking for.  */
 	xor	rTMP3, rCHR, rWORD
 L(loopentry):
 	add	rTMP1, rFEFE, rTMP3
 	nor	rTMP2, r7F7F, rTMP3
 	beq	L(loop)
+
 /* There is a zero byte in the word, but may also be a matching byte (either
    before or after the zero byte).  In fact, we may be looking for a
-   zero byte, in which case we return a match.  We guess that this hasn't
-   happened, though.  */
-L(missed):
-	and.	rTMP1, rTMP1, rTMP2
+   zero byte, in which case we return a match.  */
+	and.	rTMP5, rTMP1, rTMP2
 	li	rRTN, 0
 	beqlr
-/* It did happen. Decide which one was first...
-   I'm not sure if this is actually faster than a sequence of
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
+/* At this point:
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
+   But there may be false matches in the next most significant byte from
+   a true match due to carries.  This means we need to recalculate the
+   matches using a longer method for big-endian.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	rTMP1, rTMP5, -1
+	andc	rTMP1, rTMP1, rTMP5
+	cntlzw	rCLZB, rTMP1
+	addi	rTMP2, rTMP4, -1
+	andc	rTMP2, rTMP2, rTMP4
+	cmplw	rTMP1, rTMP2
+	bgtlr
+	subfic	rCLZB, rCLZB, 32-7
+#else
+/* I think we could reduce this by two instructions by keeping the "nor"
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
+   one instruction could be pruned from L(foundit).  */
 	and	rFEFE, r7F7F, rWORD
-	or	rMASK, r7F7F, rWORD
+	or	rTMP5, r7F7F, rWORD
 	and	rTMP1, r7F7F, rTMP3
-	or	rIGN, r7F7F, rTMP3
+	or	rTMP4, r7F7F, rTMP3
 	add	rFEFE, rFEFE, r7F7F
 	add	rTMP1, rTMP1, r7F7F
-	nor	rWORD, rMASK, rFEFE
-	nor	rTMP2, rIGN, rTMP1
+	nor	rWORD, rTMP5, rFEFE
+	nor	rTMP2, rTMP4, rTMP1
+	cntlzw	rCLZB, rTMP2
 	cmplw	rWORD, rTMP2
 	bgtlr
-	cntlzw	rCLZB, rTMP2
+#endif
 	srwi	rCLZB, rCLZB, 3
 	add	rRTN, rSTR, rCLZB
 	blr
 
 L(foundit):
+#ifdef __LITTLE_ENDIAN__
+	addi	rTMP1, rTMP5, -1
+	andc	rTMP1, rTMP1, rTMP5
+	cntlzw	rCLZB, rTMP1
+	subfic	rCLZB, rCLZB, 32-7-32
+	srawi	rCLZB, rCLZB, 3
+#else
 	and	rTMP1, r7F7F, rTMP3
-	or	rIGN, r7F7F, rTMP3
+	or	rTMP4, r7F7F, rTMP3
 	add	rTMP1, rTMP1, r7F7F
-	nor	rTMP2, rIGN, rTMP1
+	nor	rTMP2, rTMP4, rTMP1
 	cntlzw	rCLZB, rTMP2
 	subi	rSTR, rSTR, 4
 	srwi	rCLZB, rCLZB, 3
+#endif
 	add	rRTN, rSTR, rCLZB
 	blr
 END (strchr)
diff --git a/sysdeps/powerpc/powerpc64/power7/strchr.S b/sysdeps/powerpc/powerpc64/power7/strchr.S
index 3ffe7a1..4679a15 100644
--- a/sysdeps/powerpc/powerpc64/power7/strchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/strchr.S
@@ -35,8 +35,8 @@ ENTRY (strchr)
 	beq	cr7,L(null_match)
 
 	/* Replicate byte to doubleword.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 	insrdi  r4,r4,32,0
 
 	/* Now r4 has a doubleword of c bytes and r0 has
@@ -47,11 +47,17 @@ ENTRY (strchr)
 
 	/* Move the doublewords left and right to discard the bits that are
 	   not part of the string and bring them back as zeros.  */
-
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+#else
 	sld	r10,r10,r6
 	sld	r11,r11,r6
 	srd	r10,r10,r6
 	srd	r11,r11,r6
+#endif
 	or	r5,r10,r11    /* OR the results to speed things up.  */
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
 				 have been found.  */
@@ -108,15 +114,24 @@ L(loop):
 	mr	r11,r7
 	addi	r8,r8,8
 
-	/* r5 has the output of the cmpb instruction, that is, it contains
+	/* r10/r11 have the output of the cmpb instructions, that is,
 	   0xff in the same position as the c/null byte in the original
 	   doubleword from the string.  Use that to calculate the pointer.  */
 L(done):
-	cntlzd	r4,r10	      /* Count leading zeroes before c matches.  */
-	cntlzd	r0,r11	      /* Count leading zeroes before null matches.  */
-	cmpld	cr7,r4,r0
+#ifdef __LITTLE_ENDIAN__
+	addi    r3,r10,-1
+	andc    r3,r3,r10
+	popcntd	r0,r3
+	addi    r4,r11,-1
+	andc    r4,r4,r11
+	cmpld	cr7,r3,r4
 	bgt	cr7,L(no_match)
-	srdi	r0,r4,3	      /* Convert leading zeroes to bytes.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before c matches.  */
+	cmpld	cr7,r11,r10
+	bgt	cr7,L(no_match)
+#endif
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching c byte
 				 or null in case c was not found.  */
 	blr
@@ -135,9 +150,13 @@ L(null_match):
 
 	/* Move the doublewords left and right to discard the bits that are
 	   not part of the string and bring them back as zeros.  */
-
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
 	sld	r5,r5,r6
 	srd	r5,r5,r6
+#endif
 	cmpdi	cr7,r5,0      /* If r10 == 0, no c or null bytes
 				 have been found.  */
 	bne	cr7,L(done_null)
@@ -192,7 +211,13 @@ L(loop_null):
 	   0xff in the same position as the null byte in the original
 	   doubleword from the string.  Use that to calculate the pointer.  */
 L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntd	r0,r0
+#else
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
 	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching null byte.  */
 	blr
diff --git a/sysdeps/powerpc/powerpc64/power7/strchrnul.S b/sysdeps/powerpc/powerpc64/power7/strchrnul.S
index 9dbc51b..df45752 100644
--- a/sysdeps/powerpc/powerpc64/power7/strchrnul.S
+++ b/sysdeps/powerpc/powerpc64/power7/strchrnul.S
@@ -27,8 +27,8 @@ ENTRY (__strchrnul)
 	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
 
 	/* Replicate byte to doubleword.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 	insrdi	r4,r4,32,0
 
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
@@ -44,10 +44,17 @@ ENTRY (__strchrnul)
 
 	/* Move the doublewords left and right to discard the bits that are
 	   not part of the string and to bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r9,r9,r6
+	sld	r10,r10,r6
+	sld	r9,r9,r6
+#else
 	sld	r10,r10,r6
 	sld	r9,r9,r6
 	srd	r10,r10,r6
 	srd	r9,r9,r6
+#endif
 	or	r5,r9,r10     /* OR the results to speed things up.  */
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
 				 have been found.  */
@@ -97,7 +104,7 @@ L(loop):
 	bne	cr6,L(done)
 
 	/* The c/null byte must be in the second doubleword.  Adjust the
-	   address again and move the result of cmpb to r10 so we can calculate
+	   address again and move the result of cmpb to r5 so we can calculate
 	   the pointer.  */
 	mr	r5,r10
 	addi	r8,r8,8
@@ -106,7 +113,13 @@ L(loop):
 	   0xff in the same position as the c/null byte in the original
 	   doubleword from the string.  Use that to calculate the pointer.  */
 L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntd	r0,r0
+#else
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
 	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of matching c/null byte.  */
 	blr
diff --git a/sysdeps/powerpc/powerpc64/strchr.S b/sysdeps/powerpc/powerpc64/strchr.S
index d2d8cd3..da707ae 100644
--- a/sysdeps/powerpc/powerpc64/strchr.S
+++ b/sysdeps/powerpc/powerpc64/strchr.S
@@ -37,11 +37,13 @@ ENTRY (strchr)
 #define rIGN	r10	/* number of bits we should ignore in the first word */
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
 #define rTMP3	r12
+#define rTMP4	rIGN
+#define rTMP5	rMASK
 
 	dcbt	0,rRTN
-	rlwimi	rCHR, rCHR, 8, 16, 23
+	insrdi	rCHR, rCHR, 8, 48
 	li	rMASK, -1
-	rlwimi	rCHR, rCHR, 16, 0, 15
+	insrdi	rCHR, rCHR, 16, 32
 	rlwinm	rIGN, rRTN, 3, 26, 28
 	insrdi	rCHR, rCHR, 32, 0
 	lis	rFEFE, -0x101
@@ -54,64 +56,93 @@ ENTRY (strchr)
 	add	rFEFE, rFEFE, rTMP1
 /* Test the first (partial?) word.  */
 	ld	rWORD, 0(rSTR)
+#ifdef __LITTLE_ENDIAN__
+	sld	rMASK, rMASK, rIGN
+#else
 	srd	rMASK, rMASK, rIGN
+#endif
 	orc	rWORD, rWORD, rMASK
 	add	rTMP1, rFEFE, rWORD
 	nor	rTMP2, r7F7F, rWORD
-	and.	rTMP1, rTMP1, rTMP2
+	and.	rTMP4, rTMP1, rTMP2
 	xor	rTMP3, rCHR, rWORD
 	orc	rTMP3, rTMP3, rMASK
 	b	L(loopentry)
 
 /* The loop.  */
 
-L(loop):ldu rWORD, 8(rSTR)
-	and.	rTMP1, rTMP1, rTMP2
+L(loop):
+	ldu	rWORD, 8(rSTR)
+	and.	rTMP5, rTMP1, rTMP2
 /* Test for 0.	*/
-	add	rTMP1, rFEFE, rWORD
-	nor	rTMP2, r7F7F, rWORD
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
 	bne	L(foundit)
-	and.	rTMP1, rTMP1, rTMP2
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
 /* Start test for the bytes we're looking for.  */
 	xor	rTMP3, rCHR, rWORD
 L(loopentry):
 	add	rTMP1, rFEFE, rTMP3
 	nor	rTMP2, r7F7F, rTMP3
 	beq	L(loop)
+
 /* There is a zero byte in the word, but may also be a matching byte (either
    before or after the zero byte).  In fact, we may be looking for a
-   zero byte, in which case we return a match.  We guess that this hasn't
-   happened, though.  */
-L(missed):
-	and.	rTMP1, rTMP1, rTMP2
+   zero byte, in which case we return a match.  */
+	and.	rTMP5, rTMP1, rTMP2
 	li	rRTN, 0
 	beqlr
-/* It did happen. Decide which one was first...
-   I'm not sure if this is actually faster than a sequence of
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
+/* At this point:
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
+   But there may be false matches in the next most significant byte from
+   a true match due to carries.  This means we need to recalculate the
+   matches using a longer method for big-endian.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	rTMP1, rTMP5, -1
+	andc	rTMP1, rTMP1, rTMP5
+	cntlzd	rCLZB, rTMP1
+	addi	rTMP2, rTMP4, -1
+	andc	rTMP2, rTMP2, rTMP4
+	cmpld	rTMP1, rTMP2
+	bgtlr
+	subfic	rCLZB, rCLZB, 64-7
+#else
+/* I think we could reduce this by two instructions by keeping the "nor"
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
+   one instruction could be pruned from L(foundit).  */
 	and	rFEFE, r7F7F, rWORD
-	or	rMASK, r7F7F, rWORD
+	or	rTMP5, r7F7F, rWORD
 	and	rTMP1, r7F7F, rTMP3
-	or	rIGN, r7F7F, rTMP3
+	or	rTMP4, r7F7F, rTMP3
 	add	rFEFE, rFEFE, r7F7F
 	add	rTMP1, rTMP1, r7F7F
-	nor	rWORD, rMASK, rFEFE
-	nor	rTMP2, rIGN, rTMP1
+	nor	rWORD, rTMP5, rFEFE
+	nor	rTMP2, rTMP4, rTMP1
+	cntlzd	rCLZB, rTMP2
 	cmpld	rWORD, rTMP2
 	bgtlr
-	cntlzd	rCLZB, rTMP2
+#endif
 	srdi	rCLZB, rCLZB, 3
 	add	rRTN, rSTR, rCLZB
 	blr
 
 L(foundit):
+#ifdef __LITTLE_ENDIAN__
+	addi	rTMP1, rTMP5, -1
+	andc	rTMP1, rTMP1, rTMP5
+	cntlzd	rCLZB, rTMP1
+	subfic	rCLZB, rCLZB, 64-7-64
+	sradi	rCLZB, rCLZB, 3
+#else
 	and	rTMP1, r7F7F, rTMP3
-	or	rIGN, r7F7F, rTMP3
+	or	rTMP4, r7F7F, rTMP3
 	add	rTMP1, rTMP1, r7F7F
-	nor	rTMP2, rIGN, rTMP1
+	nor	rTMP2, rTMP4, rTMP1
 	cntlzd	rCLZB, rTMP2
 	subi	rSTR, rSTR, 8
 	srdi	rCLZB, rCLZB, 3
+#endif
 	add	rRTN, rSTR, rCLZB
 	blr
 END (strchr)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ceef5f29e57c98117682307c39d979ccf7fec5c4

commit ceef5f29e57c98117682307c39d979ccf7fec5c4
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:14:44 2013 -0500

    PowerPC: LE strcmp/strncmp
    
    More little-endian support.  I leave the main strcmp loops unchanged
    (well, except for renumbering rTMP to something other than r0, since
    it's needed in an addi insn) and modify the tail for little-endian.

    I noticed some of the big-endian tail code was a little untidy, so I
    have cleaned that up too.
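
    As an illustrative aside (not part of the patch): the little-endian
    tail first masks off the "gunk" bytes beyond the terminator, then
    byte-reverses both words so a plain unsigned compare ranks the
    strings by their first differing byte.  A rough C sketch of that
    idea for the 32-bit case, with a made-up function name (the real
    code also handles the strncmp length limit and the exact return
    conventions):

    #include <stdint.h>

    /* zmask has 0x80 set in the byte of w1 holding the terminating nul
       (from the (x - 0x01010101) & ~x & 0x80808080 test).  */
    static inline int
    strcmp_tail_le (uint32_t w1, uint32_t w2, uint32_t zmask)
    {
      uint32_t z = zmask << 1;
      uint32_t keep = (z - 1) & ~z;   /* bytes up to and including the nul */
      w1 &= keep;                     /* mask off gunk past the end */
      w2 &= keep;
      uint32_t b1 = __builtin_bswap32 (w1);  /* like the rlwinm/rlwimi */
      uint32_t b2 = __builtin_bswap32 (w2);  /* byte-reverse sequences */
      if (b1 == b2)
        return 0;
      return b1 < b2 ? -1 : 1;
    }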

diff --git a/sysdeps/powerpc/powerpc32/power4/strncmp.S b/sysdeps/powerpc/powerpc32/power4/strncmp.S
index 724d908..89b961e 100644
--- a/sysdeps/powerpc/powerpc32/power4/strncmp.S
+++ b/sysdeps/powerpc/powerpc32/power4/strncmp.S
@@ -24,7 +24,7 @@
 
 EALIGN (strncmp, 4, 0)
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -37,6 +37,7 @@ EALIGN (strncmp, 4, 0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	or	rTMP, rSTR2, rSTR1
@@ -75,12 +76,45 @@ L(g1):	add	rTMP, rFEFE, rWORD1
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
 
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	slwi	rTMP, rTMP, 1
+	addi    rTMP2, rTMP, -1
+	andc    rTMP2, rTMP2, rTMP
+	and	rWORD2, rWORD2, rTMP2		/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rldimi	rTMP2, rWORD2, 24, 32
+	rldimi	rTMP, rWORD1, 24, 32
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+L(different):
+	lwz	rWORD1, -4(rSTR1)
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rldimi	rTMP2, rWORD2, 24, 32
+	rldimi	rTMP, rWORD1, 24, 32
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+#else
 L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
 	xor.	rBITDIF, rWORD1, rWORD2
-
 	andc	rNEG, rNEG, rTMP
 	blt-	L(highbit)
 	cntlzw	rBITDIF, rBITDIF
@@ -88,28 +122,20 @@ L(endstring):
 	addi	rNEG, rNEG, 7
 	cmpw	cr1, rNEG, rBITDIF
 	sub	rRTN, rWORD1, rWORD2
-	blt-	cr1, L(equal)
-	srawi	rRTN, rRTN, 31
-	ori	rRTN, rRTN, 1
-	blr
+	bgelr+	cr1
 L(equal):
 	li	rRTN, 0
 	blr
 
 L(different):
-	lwzu	rWORD1, -4(rSTR1)
+	lwz	rWORD1, -4(rSTR1)
 	xor.	rBITDIF, rWORD1, rWORD2
 	sub	rRTN, rWORD1, rWORD2
-	blt-	L(highbit)
-	srawi	rRTN, rRTN, 31
-	ori	rRTN, rRTN, 1
-	blr
+	bgelr+
 L(highbit):
-	srwi	rWORD2, rWORD2, 24
-	srwi	rWORD1, rWORD1, 24
-	sub	rRTN, rWORD1, rWORD2
+	ori	rRTN, rWORD2, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc32/power7/strncmp.S b/sysdeps/powerpc/powerpc32/power7/strncmp.S
index fdae44d..10c9d25 100644
--- a/sysdeps/powerpc/powerpc32/power7/strncmp.S
+++ b/sysdeps/powerpc/powerpc32/power7/strncmp.S
@@ -26,7 +26,7 @@
 
 EALIGN (strncmp,5,0)
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -39,6 +39,7 @@ EALIGN (strncmp,5,0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	nop
@@ -78,13 +79,45 @@ L(g1):	add	rTMP,rFEFE,rWORD1
 /* OK. We've hit the end of the string. We need to be careful that
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	slwi	rTMP, rTMP, 1
+	addi    rTMP2, rTMP, -1
+	andc    rTMP2, rTMP2, rTMP
+	and	rWORD2, rWORD2, rTMP2		/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rldimi	rTMP2, rWORD2, 24, 32
+	rldimi	rTMP, rWORD1, 24, 32
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr
+	ori	rRTN, rTMP2, 1
+	blr
+
+L(different):
+	lwz	rWORD1, -4(rSTR1)
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rldimi	rTMP2, rWORD2, 24, 32
+	rldimi	rTMP, rWORD1, 24, 32
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr
+	ori	rRTN, rTMP2, 1
+	blr
 
+#else
 L(endstring):
 	and	rTMP,r7F7F,rWORD1
 	beq	cr1,L(equal)
 	add	rTMP,rTMP,r7F7F
 	xor.	rBITDIF,rWORD1,rWORD2
-
 	andc	rNEG,rNEG,rTMP
 	blt	L(highbit)
 	cntlzw	rBITDIF,rBITDIF
@@ -92,28 +125,20 @@ L(endstring):
 	addi	rNEG,rNEG,7
 	cmpw	cr1,rNEG,rBITDIF
 	sub	rRTN,rWORD1,rWORD2
-	blt	cr1,L(equal)
-	srawi	rRTN,rRTN,31
-	ori	rRTN,rRTN,1
-	blr
+	bgelr	cr1
 L(equal):
 	li	rRTN,0
 	blr
 
 L(different):
-	lwzu	rWORD1,-4(rSTR1)
+	lwz	rWORD1,-4(rSTR1)
 	xor.	rBITDIF,rWORD1,rWORD2
 	sub	rRTN,rWORD1,rWORD2
-	blt	L(highbit)
-	srawi	rRTN,rRTN,31
-	ori	rRTN,rRTN,1
-	blr
+	bgelr
 L(highbit):
-	srwi	rWORD2,rWORD2,24
-	srwi	rWORD1,rWORD1,24
-	sub	rRTN,rWORD1,rWORD2
+	ori	rRTN, rWORD2, 1
 	blr
-
+#endif
 
 /* Oh well. In this case, we just do a byte-by-byte comparison.  */
 	.align	4
diff --git a/sysdeps/powerpc/powerpc32/strcmp.S b/sysdeps/powerpc/powerpc32/strcmp.S
index 297ca3c..91d60c9 100644
--- a/sysdeps/powerpc/powerpc32/strcmp.S
+++ b/sysdeps/powerpc/powerpc32/strcmp.S
@@ -24,7 +24,7 @@
 
 EALIGN (strcmp, 4, 0)
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -34,6 +34,7 @@ EALIGN (strcmp, 4, 0)
 #define r7F7F	r8	/* constant 0x7f7f7f7f */
 #define rNEG	r9	/* ~(word in s1 | 0x7f7f7f7f) */
 #define rBITDIF	r10	/* bits that differ in s1 & s2 words */
+#define rTMP	r11
 
 
 	or	rTMP, rSTR2, rSTR1
@@ -56,10 +57,45 @@ L(g1):	add	rTMP, rFEFE, rWORD1
 	and.	rTMP, rTMP, rNEG
 	cmpw	cr1, rWORD1, rWORD2
 	beq+	L(g0)
-L(endstring):
+
 /* OK. We've hit the end of the string. We need to be careful that
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	andc    rTMP2, rTMP2, rTMP
+	rlwimi	rTMP2, rTMP2, 1, 0, 30
+	and	rWORD2, rWORD2, rTMP2		/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rlwimi	rTMP2, rWORD2, 24, 0, 7
+	rlwimi	rTMP, rWORD1, 24, 0, 7
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+L(different):
+	lwz	rWORD1, -4(rSTR1)
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rlwimi	rTMP2, rWORD2, 24, 0, 7
+	rlwimi	rTMP, rWORD1, 24, 0, 7
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+#else
+L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
@@ -84,7 +120,7 @@ L(different):
 L(highbit):
 	ori	rRTN, rWORD2, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc32/strncmp.S b/sysdeps/powerpc/powerpc32/strncmp.S
index fa345d2..e36a160 100644
--- a/sysdeps/powerpc/powerpc32/strncmp.S
+++ b/sysdeps/powerpc/powerpc32/strncmp.S
@@ -24,7 +24,7 @@
 
 EALIGN (strncmp, 4, 0)
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -35,6 +35,7 @@ EALIGN (strncmp, 4, 0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	or	rTMP, rSTR2, rSTR1
@@ -73,12 +74,45 @@ L(g1):	add	rTMP, rFEFE, rWORD1
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
 
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	slwi	rTMP, rTMP, 1
+	addi    rTMP2, rTMP, -1
+	andc    rTMP2, rTMP2, rTMP
+	and	rWORD2, rWORD2, rTMP2		/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rlwimi	rTMP2, rWORD2, 24, 0, 7
+	rlwimi	rTMP, rWORD1, 24, 0, 7
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+L(different):
+	lwz	rWORD1, -4(rSTR1)
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rlwimi	rTMP2, rWORD2, 24, 0, 7
+	rlwimi	rTMP, rWORD1, 24, 0, 7
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+#else
 L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
 	xor.	rBITDIF, rWORD1, rWORD2
-
 	andc	rNEG, rNEG, rTMP
 	blt-	L(highbit)
 	cntlzw	rBITDIF, rBITDIF
@@ -86,28 +120,20 @@ L(endstring):
 	addi	rNEG, rNEG, 7
 	cmpw	cr1, rNEG, rBITDIF
 	sub	rRTN, rWORD1, rWORD2
-	blt-	cr1, L(equal)
-	srawi	rRTN, rRTN, 31
-	ori	rRTN, rRTN, 1
-	blr
+	bgelr+	cr1
 L(equal):
 	li	rRTN, 0
 	blr
 
 L(different):
-	lwzu	rWORD1, -4(rSTR1)
+	lwz	rWORD1, -4(rSTR1)
 	xor.	rBITDIF, rWORD1, rWORD2
 	sub	rRTN, rWORD1, rWORD2
-	blt-	L(highbit)
-	srawi	rRTN, rRTN, 31
-	ori	rRTN, rRTN, 1
-	blr
+	bgelr+
 L(highbit):
-	srwi	rWORD2, rWORD2, 24
-	srwi	rWORD1, rWORD1, 24
-	sub	rRTN, rWORD1, rWORD2
+	ori	rRTN, rWORD2, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc64/power4/strncmp.S b/sysdeps/powerpc/powerpc64/power4/strncmp.S
index 1276e16..5d136cf 100644
--- a/sysdeps/powerpc/powerpc64/power4/strncmp.S
+++ b/sysdeps/powerpc/powerpc64/power4/strncmp.S
@@ -25,7 +25,7 @@
 EALIGN (strncmp, 4, 0)
 	CALL_MCOUNT 3
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -38,6 +38,7 @@ EALIGN (strncmp, 4, 0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	or	rTMP, rSTR2, rSTR1
@@ -79,12 +80,59 @@ L(g1):	add	rTMP, rFEFE, rWORD1
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
 
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	beq	cr1, L(equal)
+	andc    rTMP2, rTMP2, rTMP
+	rldimi	rTMP2, rTMP2, 1, 0
+	and	rWORD2, rWORD2, rTMP2	/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	cmpd	cr1, rWORD1, rWORD2
+	beq	cr1, L(equal)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
+	ori	rRTN, rRTN, 1
+	blr
+L(equal):
+	li	rRTN, 0
+	blr
+
+L(different):
+	ld	rWORD1, -8(rSTR1)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63
+	ori	rRTN, rRTN, 1
+	blr
+L(highbit):
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
+	blr
+
+#else
 L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
 	xor.	rBITDIF, rWORD1, rWORD2
-
 	andc	rNEG, rNEG, rTMP
 	blt-	L(highbit)
 	cntlzd	rBITDIF, rBITDIF
@@ -93,7 +141,7 @@ L(endstring):
 	cmpd	cr1, rNEG, rBITDIF
 	sub	rRTN, rWORD1, rWORD2
 	blt-	cr1, L(equal)
-	sradi	rRTN, rRTN, 63
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
 	ori	rRTN, rRTN, 1
 	blr
 L(equal):
@@ -101,7 +149,7 @@ L(equal):
 	blr
 
 L(different):
-	ldu	rWORD1, -8(rSTR1)
+	ld	rWORD1, -8(rSTR1)
 	xor.	rBITDIF, rWORD1, rWORD2
 	sub	rRTN, rWORD1, rWORD2
 	blt-	L(highbit)
@@ -109,11 +157,10 @@ L(different):
 	ori	rRTN, rRTN, 1
 	blr
 L(highbit):
-	srdi	rWORD2, rWORD2, 56
-	srdi	rWORD1, rWORD1, 56
-	sub	rRTN, rWORD1, rWORD2
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc64/power7/strncmp.S b/sysdeps/powerpc/powerpc64/power7/strncmp.S
index 77ecad5..e618b01 100644
--- a/sysdeps/powerpc/powerpc64/power7/strncmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/strncmp.S
@@ -27,7 +27,7 @@
 EALIGN (strncmp,5,0)
 	CALL_MCOUNT 3
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -40,6 +40,7 @@ EALIGN (strncmp,5,0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	nop
@@ -83,12 +84,57 @@ L(g1):	add	rTMP,rFEFE,rWORD1
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
 
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	beq	cr1, L(equal)
+	andc    rTMP2, rTMP2, rTMP
+	rldimi	rTMP2, rTMP2, 1, 0
+	and	rWORD2, rWORD2, rTMP2	/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	cmpd	cr1, rWORD1, rWORD2
+	beq	cr1, L(equal)
+	cmpb	rBITDIF, rWORD1, rWORD2	/* 0xff on equal bytes.  */
+	addi	rNEG, rBITDIF, 1
+	orc	rNEG, rNEG, rBITDIF	/* 0's below LS differing byte.  */
+	sldi	rNEG, rNEG, 8		/* 1's above LS differing byte.  */
+	andc	rWORD1, rWORD1, rNEG	/* mask off MS bytes.  */
+	andc	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt	L(highbit)
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
+	ori	rRTN, rRTN, 1
+	blr
+L(equal):
+	li	rRTN, 0
+	blr
+
+L(different):
+	ld	rWORD1, -8(rSTR1)
+	cmpb	rBITDIF, rWORD1, rWORD2	/* 0xff on equal bytes.  */
+	addi	rNEG, rBITDIF, 1
+	orc	rNEG, rNEG, rBITDIF	/* 0's below LS differing byte.  */
+	sldi	rNEG, rNEG, 8		/* 1's above LS differing byte.  */
+	andc	rWORD1, rWORD1, rNEG	/* mask off MS bytes.  */
+	andc	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt	L(highbit)
+	sradi	rRTN, rRTN, 63
+	ori	rRTN, rRTN, 1
+	blr
+L(highbit):
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
+	blr
+
+#else
 L(endstring):
 	and	rTMP,r7F7F,rWORD1
 	beq	cr1,L(equal)
 	add	rTMP,rTMP,r7F7F
 	xor.	rBITDIF,rWORD1,rWORD2
-
 	andc	rNEG,rNEG,rTMP
 	blt	L(highbit)
 	cntlzd	rBITDIF,rBITDIF
@@ -97,7 +143,7 @@ L(endstring):
 	cmpd	cr1,rNEG,rBITDIF
 	sub	rRTN,rWORD1,rWORD2
 	blt	cr1,L(equal)
-	sradi	rRTN,rRTN,63
+	sradi	rRTN,rRTN,63		/* must return an int.  */
 	ori	rRTN,rRTN,1
 	blr
 L(equal):
@@ -105,7 +151,7 @@ L(equal):
 	blr
 
 L(different):
-	ldu	rWORD1,-8(rSTR1)
+	ld	rWORD1,-8(rSTR1)
 	xor.	rBITDIF,rWORD1,rWORD2
 	sub	rRTN,rWORD1,rWORD2
 	blt	L(highbit)
@@ -113,11 +159,10 @@ L(different):
 	ori	rRTN,rRTN,1
 	blr
 L(highbit):
-	srdi	rWORD2,rWORD2,56
-	srdi	rWORD1,rWORD1,56
-	sub	rRTN,rWORD1,rWORD2
+	sradi	rRTN,rWORD2,63
+	ori	rRTN,rRTN,1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align	4
diff --git a/sysdeps/powerpc/powerpc64/strcmp.S b/sysdeps/powerpc/powerpc64/strcmp.S
index c9d6dac..7085468 100644
--- a/sysdeps/powerpc/powerpc64/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/strcmp.S
@@ -25,7 +25,7 @@
 EALIGN (strcmp, 4, 0)
 	CALL_MCOUNT 2
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -35,6 +35,7 @@ EALIGN (strcmp, 4, 0)
 #define r7F7F	r8	/* constant 0x7f7f7f7f7f7f7f7f */
 #define rNEG	r9	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
 #define rBITDIF	r10	/* bits that differ in s1 & s2 words */
+#define rTMP	r11
 
 	dcbt	0,rSTR1
 	or	rTMP, rSTR2, rSTR1
@@ -58,19 +59,66 @@ L(g0):	ldu	rWORD1, 8(rSTR1)
 	ldu	rWORD2, 8(rSTR2)
 L(g1):	add	rTMP, rFEFE, rWORD1
 	nor	rNEG, r7F7F, rWORD1
-
 	and.	rTMP, rTMP, rNEG
 	cmpd	cr1, rWORD1, rWORD2
 	beq+	L(g0)
-L(endstring):
+
 /* OK. We've hit the end of the string. We need to be careful that
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	beq	cr1, L(equal)
+	andc    rTMP2, rTMP2, rTMP
+	rldimi	rTMP2, rTMP2, 1, 0
+	and	rWORD2, rWORD2, rTMP2	/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	cmpd	cr1, rWORD1, rWORD2
+	beq	cr1, L(equal)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
+	ori	rRTN, rRTN, 1
+	blr
+L(equal):
+	li	rRTN, 0
+	blr
+
+L(different):
+	ld	rWORD1, -8(rSTR1)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63
+	ori	rRTN, rRTN, 1
+	blr
+L(highbit):
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
+	blr
+
+#else
+L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
 	xor.	rBITDIF, rWORD1, rWORD2
-
 	andc	rNEG, rNEG, rTMP
 	blt-	L(highbit)
 	cntlzd	rBITDIF, rBITDIF
@@ -79,7 +127,7 @@ L(endstring):
 	cmpd	cr1, rNEG, rBITDIF
 	sub	rRTN, rWORD1, rWORD2
 	blt-	cr1, L(equal)
-	sradi	rRTN, rRTN, 63
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
 	ori	rRTN, rRTN, 1
 	blr
 L(equal):
@@ -95,11 +143,10 @@ L(different):
 	ori	rRTN, rRTN, 1
 	blr
 L(highbit):
-	srdi	rWORD2, rWORD2, 56
-	srdi	rWORD1, rWORD1, 56
-	sub	rRTN, rWORD1, rWORD2
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc64/strncmp.S b/sysdeps/powerpc/powerpc64/strncmp.S
index 779d9f7..8f842c4 100644
--- a/sysdeps/powerpc/powerpc64/strncmp.S
+++ b/sysdeps/powerpc/powerpc64/strncmp.S
@@ -25,7 +25,7 @@
 EALIGN (strncmp, 4, 0)
 	CALL_MCOUNT 3
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -36,6 +36,7 @@ EALIGN (strncmp, 4, 0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	or	rTMP, rSTR2, rSTR1
@@ -77,12 +78,59 @@ L(g1):	add	rTMP, rFEFE, rWORD1
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
 
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	beq	cr1, L(equal)
+	andc    rTMP2, rTMP2, rTMP
+	rldimi	rTMP2, rTMP2, 1, 0
+	and	rWORD2, rWORD2, rTMP2	/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	cmpd	cr1, rWORD1, rWORD2
+	beq	cr1, L(equal)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
+	ori	rRTN, rRTN, 1
+	blr
+L(equal):
+	li	rRTN, 0
+	blr
+
+L(different):
+	ld	rWORD1, -8(rSTR1)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63
+	ori	rRTN, rRTN, 1
+	blr
+L(highbit):
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
+	blr
+
+#else
 L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
 	xor.	rBITDIF, rWORD1, rWORD2
-
 	andc	rNEG, rNEG, rTMP
 	blt-	L(highbit)
 	cntlzd	rBITDIF, rBITDIF
@@ -91,7 +139,7 @@ L(endstring):
 	cmpd	cr1, rNEG, rBITDIF
 	sub	rRTN, rWORD1, rWORD2
 	blt-	cr1, L(equal)
-	sradi	rRTN, rRTN, 63
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
 	ori	rRTN, rRTN, 1
 	blr
 L(equal):
@@ -99,7 +147,7 @@ L(equal):
 	blr
 
 L(different):
-	ldu	rWORD1, -8(rSTR1)
+	ld	rWORD1, -8(rSTR1)
 	xor.	rBITDIF, rWORD1, rWORD2
 	sub	rRTN, rWORD1, rWORD2
 	blt-	L(highbit)
@@ -107,11 +155,10 @@ L(different):
 	ori	rRTN, rRTN, 1
 	blr
 L(highbit):
-	srdi	rWORD2, rWORD2, 56
-	srdi	rWORD1, rWORD1, 56
-	sub	rRTN, rWORD1, rWORD2
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4ed29a55ac3a658d9d092c48dc7c7041ccc23962

commit 4ed29a55ac3a658d9d092c48dc7c7041ccc23962
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:13:55 2013 -0500

    PowerPC: LE strnlen
    
    The existing strnlen code has a number of defects, so this patch is more
    than just adding little-endian support.  The changes here are similar to
    those for memchr.
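
    As an illustrative aside (not part of the patch): the length
    calculation at L(done) converts the cmpb null mask into a byte
    offset (popcnt of the bits below the lowest set bit on
    little-endian, cntlz on big-endian), adds the distance already
    scanned, and clamps the result to the caller's maxlen.  A rough
    little-endian C sketch, with a made-up function name:

    #include <stddef.h>
    #include <stdint.h>

    /* mask has 0xff in each byte of the current word that is zero; the
       word was loaded from the word-aligned address p, and the scan
       started at s with limit maxlen.  */
    static inline size_t
    strnlen_done_le (const char *s, size_t maxlen, const char *p,
                     uint32_t mask)
    {
      uint32_t below = (mask - 1) & ~mask;   /* bits below the first nul */
      ptrdiff_t len = (p - s) + (__builtin_popcount (below) >> 3);
      return (size_t) len <= maxlen ? (size_t) len : maxlen;
    }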

diff --git a/sysdeps/powerpc/powerpc32/power7/strnlen.S b/sysdeps/powerpc/powerpc32/power7/strnlen.S
index ed08836..eb52afd 100644
--- a/sysdeps/powerpc/powerpc32/power7/strnlen.S
+++ b/sysdeps/powerpc/powerpc32/power7/strnlen.S
@@ -28,51 +28,47 @@ ENTRY (__strnlen)
 	add	r7,r3,r4      /* Calculate the last acceptable address.  */
 	cmplwi	r4,16
 	li	r0,0	      /* Word with null chars.  */
+	addi	r7,r7,-1
 	ble	L(small_range)
 
-	cmplw	cr7,r3,r7     /* Is the address equal or less than r3?  If
-				 it's equal or less, it means size is either 0
-				 or a negative number.  */
-	ble	cr7,L(proceed)
-
-	li	r7,-1	      /* Make r11 the biggest if r4 <= 0.  */
-L(proceed):
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
 	lwz	r12,0(r8)     /* Load word from memory.  */
 	cmpb	r10,r12,r0    /* Check for null bytes in DWORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	slw	r10,r10,r6
+#else
 	slw	r10,r10,r6
 	srw	r10,r10,r6
+#endif
 	cmplwi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
-	bge	cr6,L(end_max)
-
+	clrrwi	r7,r7,2       /* Address of last word.  */
 	mtcrf   0x01,r8
 	/* Are we now aligned to a doubleword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
 
 	bt	29,L(loop_setup)
 
-	/* Handle DWORD2 of pair.  */
+	/* Handle WORD2 of pair.  */
 	lwzu	r12,4(r8)
 	cmpb	r10,r12,r0
 	cmplwi	cr7,r10,0
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
-	bge	cr6,L(end_max)
-
 L(loop_setup):
-	sub	r5,r7,r9
+	/* The last word we want to read in the loop below is the one
+	   containing the last byte of the string, ie. the word at
+	   (s + size - 1) & ~3, or r7.  The first word read is at
+	   r8 + 4, we read 2 * cnt words, so the last word read will
+	   be at r8 + 4 + 8 * cnt - 4.  Solving for cnt gives
+	   cnt = (r7 - r8) / 8  */
+	sub	r5,r7,r8
 	srwi	r6,r5,3	      /* Number of loop iterations.  */
 	mtctr	r6	      /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for the null byte backwards in the string.  Since
+
+	/* Main loop to look for the null byte in the string.  Since
 	   it's a small loop (< 8 instructions), align it to 32-bytes.  */
 	.p2align  5
 L(loop):
@@ -88,15 +84,18 @@ L(loop):
 	cmplwi	cr7,r5,0
 	bne	cr7,L(found)
 	bdnz	L(loop)
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for null in the whole range.  Just return
-	   the original size.  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
-	blt	cr6,L(loop_small)
+
+	/* We may have one more word to read.  */
+	cmplw	cr6,r8,r7
+	beq	cr6,L(end_max)
+
+	lwzu	r12,4(r8)
+	cmpb	r10,r12,r0
+	cmplwi	cr6,r10,0
+	bne	cr6,L(done)
 
 L(end_max):
-	sub	r3,r7,r3
+	mr	r3,r4
 	blr
 
 	/* OK, one (or both) of the words contains a null byte.  Check
@@ -121,49 +120,56 @@ L(found):
 	   We need to make sure the null char is *before* the end of the
 	   range.  */
 L(done):
-	cntlzw	r0,r10	      /* Count leading zeroes before the match.  */
-	srwi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
-	add	r9,r8,r0
-	sub	r6,r9,r3      /* Length until the match.  */
-	cmplw	r9,r7
-	bgt	L(end_max)
-	mr	r3,r6
-	blr
-
-	.align	4
-L(zero):
-	li	r3,0
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r10,-1
+	andc	r0,r0,r10
+	popcntw	r0,r0
+#else
+	cntlzw	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	sub	r3,r8,r3
+	srwi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r3,r0      /* Length until the match.  */
+	cmplw	r3,r4
+	blelr
+	mr	r3,r4
 	blr
 
-/* Deals with size <= 32.  */
+/* Deals with size <= 16.  */
 	.align	4
 L(small_range):
 	cmplwi	r4,0
-	beq	L(zero)
+	beq	L(end_max)
+
+	clrrwi	r7,r7,2       /* Address of last word.  */
 
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
 	lwz	r12,0(r8)     /* Load word from memory.  */
 	cmpb	r10,r12,r0    /* Check for null bytes in WORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	slw	r10,r10,r6
+#else
 	slw	r10,r10,r6
 	srw	r10,r10,r6
+#endif
 	cmplwi	cr7,r10,0
 	bne	cr7,L(done)
 
-	addi    r9,r8,4
-	cmplw	r9,r7
-	bge	L(end_max)
-	b	L(loop_small)
+	cmplw	r8,r7
+	beq	L(end_max)
 
 	.p2align  5
 L(loop_small):
 	lwzu	r12,4(r8)
 	cmpb	r10,r12,r0
-	addi	r9,r8,4
 	cmplwi	cr6,r10,0
 	bne	cr6,L(done)
-	cmplw	r9,r7
-	bge	L(end_max)
-	b	L(loop_small)
+	cmplw	r8,r7
+	bne	L(loop_small)
+	mr	r3,r4
+	blr
+
 END (__strnlen)
 weak_alias (__strnlen, strnlen)
 libc_hidden_builtin_def (strnlen)
diff --git a/sysdeps/powerpc/powerpc64/power7/strnlen.S b/sysdeps/powerpc/powerpc64/power7/strnlen.S
index 37c7dbf..5159106 100644
--- a/sysdeps/powerpc/powerpc64/power7/strnlen.S
+++ b/sysdeps/powerpc/powerpc64/power7/strnlen.S
@@ -24,33 +24,29 @@
 ENTRY (__strnlen)
 	CALL_MCOUNT 2
 	dcbt	0,r3
-	clrrdi  r8,r3,3
+	clrrdi	r8,r3,3
 	add	r7,r3,r4      /* Calculate the last acceptable address.  */
 	cmpldi	r4,32
 	li	r0,0	      /* Doubleword with null chars.  */
+	addi	r7,r7,-1
+
 	/* If we have less than 33 bytes to search, skip to a faster code.  */
 	ble	L(small_range)
 
-	cmpld	cr7,r3,r7    /* Is the address equal or less than r3?  If
-				it's equal or less, it means size is either 0
-				or a negative number.  */
-	ble	cr7,L(proceed)
-
-	li	r7,-1	      /* Make r11 the biggest if r4 <= 0.  */
-L(proceed):
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
 	ld	r12,0(r8)     /* Load doubleword from memory.  */
 	cmpb	r10,r12,r0    /* Check for null bytes in DWORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	sld	r10,r10,r6
+#else
 	sld	r10,r10,r6
 	srd	r10,r10,r6
+#endif
 	cmpldi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
-	bge	cr6,L(end_max)
-
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
 	mtcrf   0x01,r8
 	/* Are we now aligned to a quadword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
@@ -63,17 +59,18 @@ L(proceed):
 	cmpldi	cr7,r10,0
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
-	bge	cr6,L(end_max)
-
 L(loop_setup):
-	sub	r5,r7,r9
+	/* The last dword we want to read in the loop below is the one
+	   containing the last byte of the string, ie. the dword at
+	   (s + size - 1) & ~7, or r7.  The first dword read is at
+	   r8 + 8, we read 2 * cnt dwords, so the last dword read will
+	   be at r8 + 8 + 16 * cnt - 8.  Solving for cnt gives
+	   cnt = (r7 - r8) / 16  */
+	sub	r5,r7,r8
 	srdi	r6,r5,4	      /* Number of loop iterations.  */
 	mtctr	r6	      /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for the null byte backwards in the string.  Since
+
+	/* Main loop to look for the null byte in the string.  Since
 	   it's a small loop (< 8 instructions), align it to 32-bytes.  */
 	.p2align  5
 L(loop):
@@ -89,15 +86,18 @@ L(loop):
 	cmpldi	cr7,r5,0
 	bne	cr7,L(found)
 	bdnz	L(loop)
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for null in the whole range.  Just return
-	   the original size.  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
-	blt	cr6,L(loop_small)
+
+	/* We may have one more dword to read.  */
+	cmpld	cr6,r8,r7
+	beq	cr6,L(end_max)
+
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r0
+	cmpldi	cr6,r10,0
+	bne	cr6,L(done)
 
 L(end_max):
-	sub	r3,r7,r3
+	mr	r3,r4
 	blr
 
 	/* OK, one (or both) of the doublewords contains a null byte.  Check
@@ -119,52 +119,59 @@ L(found):
 	/* r10 has the output of the cmpb instruction, that is, it contains
 	   0xff in the same position as the null byte in the original
 	   doubleword from the string.  Use that to calculate the length.
-	   We need to make sure the null char is *before* the start of the
-	   range (since we're going backwards).  */
+	   We need to make sure the null char is *before* the end of the
+	   range.  */
 L(done):
-	cntlzd	r0,r10	      /* Count leading zeroes before the match.  */
-	srdi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
-	add	r9,r8,r0
-	sub	r6,r9,r3      /* Length until the match.  */
-	cmpld	r9,r7
-	bgt	L(end_max)
-	mr	r3,r6
-	blr
-
-	.align	4
-L(zero):
-	li	r3,0
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r10,-1
+	andc	r0,r0,r10
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	sub	r3,r8,r3
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r3,r0      /* Length until the match.  */
+	cmpld	r3,r4
+	blelr
+	mr	r3,r4
 	blr
 
 /* Deals with size <= 32.  */
 	.align	4
 L(small_range):
 	cmpldi	r4,0
-	beq	L(zero)
+	beq	L(end_max)
+
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
 
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
-	ld	r12,0(r8)     /* Load word from memory.  */
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
 	cmpb	r10,r12,r0    /* Check for null bytes in DWORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	sld	r10,r10,r6
+#else
 	sld	r10,r10,r6
 	srd	r10,r10,r6
+#endif
 	cmpldi	cr7,r10,0
 	bne	cr7,L(done)
 
-	addi    r9,r8,8
-	cmpld	r9,r7
-	bge	L(end_max)
-	b	L(loop_small)
+	cmpld	r8,r7
+	beq	L(end_max)
 
 	.p2align  5
 L(loop_small):
 	ldu	r12,8(r8)
 	cmpb	r10,r12,r0
-	addi	r9,r8,8
 	cmpldi	cr6,r10,0
 	bne	cr6,L(done)
-	cmpld	r9,r7
-	bge	L(end_max)
-	b	L(loop_small)
+	cmpld	r8,r7
+	bne	L(loop_small)
+	mr	r3,r4
+	blr
+
 END (__strnlen)
 weak_alias (__strnlen, strnlen)
 libc_hidden_builtin_def (strnlen)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5137727bf9e2db0821c28766172621b5d79074da

commit 5137727bf9e2db0821c28766172621b5d79074da
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:11:56 2013 -0500

    PowerPC: LE memcmp
    
    This is a rather large patch due to formatting and renaming.  The
    formatting changes were to make it possible to compare power7 and
    power4 versions of memcmp.  Using different register defines came
    about while I was wrestling with the code, trying to find spare
    registers at one stage.  I found it much simpler if we refer to a reg
    by the same name throughout a function, so it's better if short-term,
    multiple-use regs like rTMP are referred to by their register
    number.  I made the cr field usage changes when attempting to reload
    the rWORDn regs in the exit path so they could be byte-swapped before
    comparing on little-endian.  That proved a bad idea due to the
    pipelining in the main loop; the offsets needed to reload the regs
    differ the first time around the loop.  Anyway, I left the cr field
    usage changes in place for consistency.
    
    Aside from these more-or-less cosmetic changes, I fixed a number of
    places where an early exit path restores regs unnecessarily, removed
    some dead code, and optimised one or two exits.
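
    As an illustrative aside (not part of the patch): the little-endian
    paths load each word with lwbrx so it lands in the register in
    big-endian (memory) order, letting the existing unsigned word
    compares rank the buffers by their first differing byte.  A rough C
    sketch of that load, with a made-up function name; __LITTLE_ENDIAN__
    is the same macro the patches test:

    #include <stdint.h>
    #include <string.h>

    static inline uint32_t
    load_word_memorder (const unsigned char *p)
    {
      uint32_t w;
      memcpy (&w, p, sizeof w);       /* native-order load, i.e. lwz */
    #ifdef __LITTLE_ENDIAN__
      w = __builtin_bswap32 (w);      /* stands in for lwbrx */
    #endif
      return w;
    }

    /* With both words in memory order, an unsigned compare gives the
       memcmp ordering for this word.  */
    static inline int
    compare_words (uint32_t w1, uint32_t w2)
    {
      if (w1 == w2)
        return 0;
      return w1 < w2 ? -1 : 1;
    }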

diff --git a/sysdeps/powerpc/powerpc32/power4/memcmp.S b/sysdeps/powerpc/powerpc32/power4/memcmp.S
index 9a455a3..35e1626 100644
--- a/sysdeps/powerpc/powerpc32/power4/memcmp.S
+++ b/sysdeps/powerpc/powerpc32/power4/memcmp.S
@@ -1,4 +1,4 @@
-/* Optimized strcmp implementation for PowerPC64.
+/* Optimized strcmp implementation for PowerPC32.
    Copyright (C) 2003-2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -18,13 +18,14 @@
 
 #include <sysdep.h>
 
-/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
+/* int [r3] memcmp (const char *s1 [r3],
+		    const char *s2 [r4],
+		    size_t size [r5])  */
 
 	.machine power4
 EALIGN (memcmp, 4, 0)
 	CALL_MCOUNT
 
-#define rTMP	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -35,33 +36,32 @@ EALIGN (memcmp, 4, 0)
 #define rWORD4	r9	/* next word in s2 */
 #define rWORD5	r10	/* next word in s1 */
 #define rWORD6	r11	/* next word in s2 */
-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
 #define rWORD7	r30	/* next word in s1 */
 #define rWORD8	r31	/* next word in s2 */
 
-	xor	rTMP, rSTR2, rSTR1
+	xor	r0, rSTR2, rSTR1
 	cmplwi	cr6, rN, 0
 	cmplwi	cr1, rN, 12
-	clrlwi.	rTMP, rTMP, 30
-	clrlwi	rBITDIF, rSTR1, 30
-	cmplwi	cr5, rBITDIF, 0
+	clrlwi.	r0, r0, 30
+	clrlwi	r12, rSTR1, 30
+	cmplwi	cr5, r12, 0
 	beq-	cr6, L(zeroLength)
-	dcbt	0,rSTR1
-	dcbt	0,rSTR2
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
 /* If less than 8 bytes or not aligned, use the unaligned
    byte loop.  */
 	blt	cr1, L(bytealigned)
-        stwu    1,-64(1)
+	stwu	1, -64(r1)
 	cfi_adjust_cfa_offset(64)
-        stw     r31,48(1)
-	cfi_offset(31,(48-64))
-        stw     r30,44(1)
-	cfi_offset(30,(44-64))
+	stw	rWORD8, 48(r1)
+	cfi_offset(rWORD8, (48-64))
+	stw	rWORD7, 44(r1)
+	cfi_offset(rWORD7, (44-64))
 	bne	L(unaligned)
 /* At this point we know both strings have the same alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    2 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then we are already word
+   of r12 to 0.  If r12 == 0 then we are already word
    aligned and can perform the word aligned loop.
 
    Otherwise we know the two strings have the same alignment (but not
@@ -70,74 +70,95 @@ EALIGN (memcmp, 4, 0)
    eliminate bits preceding the first byte.  Since we want to join the
    normal (word aligned) compare loop, starting at the second word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first word. This insures that the loop count is
+   versioning for the first word. This ensures that the loop count is
    correct and the first word (shifted) is in the expected register pair. */
-	.align 4
+	.align	4
 L(samealignment):
 	clrrwi	rSTR1, rSTR1, 2
 	clrrwi	rSTR2, rSTR2, 2
 	beq	cr5, L(Waligned)
-	add	rN, rN, rBITDIF
-	slwi	r11, rBITDIF, 3
-	srwi	rTMP, rN, 4	 /* Divide by 16 */
-	andi.	rBITDIF, rN, 12  /* Get the word remainder */
+	add	rN, rN, r12
+	slwi	rWORD6, r12, 3
+	srwi	r0, rN, 4	/* Divide by 16 */
+	andi.	r12, rN, 12	/* Get the word remainder */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 0(rSTR1)
 	lwz	rWORD2, 0(rSTR2)
-	cmplwi	cr1, rBITDIF, 8
+#endif
+	cmplwi	cr1, r12, 8
 	cmplwi	cr7, rN, 16
 	clrlwi	rN, rN, 30
 	beq	L(dPs4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 	bgt	cr1, L(dPs3)
 	beq	cr1, L(dPs2)
 
 /* Remainder is 4 */
-	.align 3
+	.align	3
 L(dsP1):
-	slw	rWORD5, rWORD1, r11
-	slw	rWORD6, rWORD2, r11
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD2, rWORD6
 	cmplw	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
 /* Do something useful in this cycle since we have to branch anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(dP1e)
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(dPs2):
-	slw	rWORD5, rWORD1, r11
-	slw	rWORD6, rWORD2, r11
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD2, rWORD6
 	cmplw	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
 /* Do something useful in this cycle since we have to branch anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD7, 4(rSTR1)
 	lwz	rWORD8, 4(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
 	b	L(dP2e)
 /* Remainder is 12 */
-	.align 4
+	.align	4
 L(dPs3):
-	slw	rWORD3, rWORD1, r11
-	slw	rWORD4, rWORD2, r11
+	slw	rWORD3, rWORD1, rWORD6
+	slw	rWORD4, rWORD2, rWORD6
 	cmplw	cr1, rWORD3, rWORD4
 	b	L(dP3e)
 /* Count is a multiple of 16, remainder is 0 */
-	.align 4
+	.align	4
 L(dPs4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	slw	rWORD1, rWORD1, r11
-	slw	rWORD2, rWORD2, r11
-	cmplw	cr0, rWORD1, rWORD2
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	slw	rWORD1, rWORD1, rWORD6
+	slw	rWORD2, rWORD2, rWORD6
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(dP4e)
 
 /* At this point we know both strings are word aligned and the
    compare length is at least 8 bytes.  */
-	.align 4
+	.align	4
 L(Waligned):
-	andi.	rBITDIF, rN, 12  /* Get the word remainder */
-	srwi	rTMP, rN, 4	 /* Divide by 16 */
-	cmplwi	cr1, rBITDIF, 8
+	andi.	r12, rN, 12	/* Get the word remainder */
+	srwi	r0, rN, 4	/* Divide by 16 */
+	cmplwi	cr1, r12, 8
 	cmplwi	cr7, rN, 16
 	clrlwi	rN, rN, 30
 	beq	L(dP4)
@@ -145,177 +166,352 @@ L(Waligned):
 	beq	cr1, L(dP2)
 
 /* Remainder is 4 */
-	.align 4
+	.align	4
 L(dP1):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
    (8-15 byte compare), we want to use only volatile registers.  This
    means we can avoid restoring non-volatile registers since we did not
    change any on the early exit path.  The key here is the non-early
    exit path only cares about the condition code (cr5), not about which
    register pair was used.  */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 0(rSTR1)
 	lwz	rWORD6, 0(rSTR2)
+#endif
 	cmplw	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 L(dP1e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 8(rSTR1)
 	lwz	rWORD4, 8(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 12(rSTR1)
 	lwz	rWORD6, 12(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
-	bne	cr5, L(dLcr5)
-	bne	cr0, L(dLcr0)
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
 
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwzu	rWORD7, 16(rSTR1)
 	lwzu	rWORD8, 16(rSTR2)
+#endif
 	bne	cr1, L(dLcr1)
 	cmplw	cr5, rWORD7, rWORD8
 	bdnz	L(dLoop)
 	bne	cr6, L(dLcr6)
-        lwz     r30,44(1)
-        lwz     r31,48(1)
-	.align 3
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+	.align	3
 L(dP1x):
 	slwi.	r12, rN, 3
-	bne	cr5, L(dLcr5)
+	bne	cr5, L(dLcr5x)
 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
-        lwz     1,0(1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Remainder is 8 */
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP2):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 0(rSTR1)
 	lwz	rWORD6, 0(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD7, 4(rSTR1)
 	lwz	rWORD8, 4(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
 L(dP2e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 8(rSTR1)
 	lwz	rWORD2, 8(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 12(rSTR1)
 	lwz	rWORD4, 12(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 4
 	addi	rSTR2, rSTR2, 4
+#endif
 	bne	cr6, L(dLcr6)
 	bne	cr5, L(dLcr5)
 	b	L(dLoop2)
 /* Again we are on an early exit path (16-23 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
-	.align 4
+	.align	4
 L(dP2x):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 4(rSTR1)
 	lwz	rWORD4, 4(rSTR2)
-	cmplw	cr5, rWORD3, rWORD4
+#endif
+	cmplw	cr1, rWORD3, rWORD4
 	slwi.	r12, rN, 3
-	bne	cr6, L(dLcr6)
+	bne	cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 4
 	addi	rSTR2, rSTR2, 4
-	bne	cr5, L(dLcr5)
+#endif
+	bne	cr1, L(dLcr1x)
 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
-        lwz     1,0(1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Remainder is 12 */
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP3):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 0(rSTR1)
 	lwz	rWORD4, 0(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
 L(dP3e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 4(rSTR1)
 	lwz	rWORD6, 4(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD7, 8(rSTR1)
 	lwz	rWORD8, 8(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 12(rSTR1)
 	lwz	rWORD2, 12(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
+#endif
 	bne	cr1, L(dLcr1)
 	bne	cr6, L(dLcr6)
 	b	L(dLoop1)
 /* Again we are on an early exit path (24-31 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
-	.align 4
+	.align	4
 L(dP3x):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 8(rSTR1)
 	lwz	rWORD2, 8(rSTR2)
-	cmplw	cr5, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	slwi.	r12, rN, 3
-	bne	cr1, L(dLcr1)
+	bne	cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-	bne	cr6, L(dLcr6)
+#endif
+	bne	cr6, L(dLcr6x)
 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
-	bne	cr5, L(dLcr5)
-        lwz     1,0(1)
+	bne	cr7, L(dLcr7x)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Count is a multiple of 16, remainder is 0 */
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 0(rSTR1)
 	lwz	rWORD2, 0(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 L(dP4e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 4(rSTR1)
 	lwz	rWORD4, 4(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 8(rSTR1)
 	lwz	rWORD6, 8(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwzu	rWORD7, 12(rSTR1)
 	lwzu	rWORD8, 12(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 	bne	cr1, L(dLcr1)
 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
-	.align 4
+	.align	4
 L(dLoop):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
 	bne	cr6, L(dLcr6)
 L(dLoop1):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 8(rSTR1)
 	lwz	rWORD4, 8(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	bne	cr5, L(dLcr5)
 L(dLoop2):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 12(rSTR1)
 	lwz	rWORD6, 12(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(dLoop3):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwzu	rWORD7, 16(rSTR1)
 	lwzu	rWORD8, 16(rSTR2)
+#endif
 	bne-	cr1, L(dLcr1)
-	cmplw	cr0, rWORD1, rWORD2
+	cmplw	cr7, rWORD1, rWORD2
 	bdnz+	L(dLoop)
 
 L(dL4):
@@ -325,7 +521,7 @@ L(dL4):
 	bne	cr5, L(dLcr5)
 	cmplw	cr5, rWORD7, rWORD8
 L(d44):
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(d34):
 	bne	cr1, L(dLcr1)
 L(d24):
@@ -334,69 +530,82 @@ L(d14):
 	slwi.	r12, rN, 3
 	bne	cr5, L(dLcr5)
 L(d04):
-        lwz     r30,44(1)
-        lwz     r31,48(1)
-        lwz     1,0(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
 	beq	L(zeroLength)
 /* At this point we have a remainder of 1 to 3 bytes to compare.  Since
    we are aligned it is safe to load the whole word, and use
-   shift right to eliminate bits beyond the compare length. */
+   shift right to eliminate bits beyond the compare length.  */
 L(d00):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
+#endif
 	srw	rWORD1, rWORD1, rN
 	srw	rWORD2, rWORD2, rN
-        cmplw   rWORD1,rWORD2
-        li      rRTN,0
-        beqlr
-        li      rRTN,1
-        bgtlr
-        li      rRTN,-1
-        blr
-
-	.align 4
-L(dLcr0):
-        lwz     r30,44(1)
-        lwz     r31,48(1)
+	sub	rRTN, rWORD1, rWORD2
+	blr
+
+	.align	4
+	cfi_adjust_cfa_offset(64)
+L(dLcr7):
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr7x):
 	li	rRTN, 1
-        lwz     1,0(1)
-	bgtlr	cr0
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
+	bgtlr	cr7
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr1):
-        lwz     r30,44(1)
-        lwz     r31,48(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr1x):
 	li	rRTN, 1
-        lwz     1,0(1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr1
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr6):
-        lwz     r30,44(1)
-        lwz     r31,48(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr6x):
 	li	rRTN, 1
-        lwz     1,0(1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr6
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr5):
-        lwz     r30,44(1)
-        lwz     r31,48(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
 L(dLcr5x):
 	li	rRTN, 1
-        lwz     1,0(1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr5
 	li	rRTN, -1
 	blr
 
-	.align 4
+	.align	4
 L(bytealigned):
-	cfi_adjust_cfa_offset(-64)
-	mtctr   rN	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
 
 /* We need to prime this loop.  This loop is swing modulo scheduled
    to avoid pipe delays.  The dependent instruction latencies (load to
@@ -411,7 +620,7 @@ L(bytealigned):
 	lbz	rWORD1, 0(rSTR1)
 	lbz	rWORD2, 0(rSTR2)
 	bdz-	L(b11)
-	cmplw	cr0, rWORD1, rWORD2
+	cmplw	cr7, rWORD1, rWORD2
 	lbz	rWORD3, 1(rSTR1)
 	lbz	rWORD4, 1(rSTR2)
 	bdz-	L(b12)
@@ -419,11 +628,11 @@ L(bytealigned):
 	lbzu	rWORD5, 2(rSTR1)
 	lbzu	rWORD6, 2(rSTR2)
 	bdz-	L(b13)
-	.align 4
+	.align	4
 L(bLoop):
 	lbzu	rWORD1, 1(rSTR1)
 	lbzu	rWORD2, 1(rSTR2)
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 
 	cmplw	cr6, rWORD5, rWORD6
 	bdz-	L(b3i)
@@ -432,7 +641,7 @@ L(bLoop):
 	lbzu	rWORD4, 1(rSTR2)
 	bne-	cr1, L(bLcr1)
 
-	cmplw	cr0, rWORD1, rWORD2
+	cmplw	cr7, rWORD1, rWORD2
 	bdz-	L(b2i)
 
 	lbzu	rWORD5, 1(rSTR1)
@@ -449,23 +658,23 @@ L(bLoop):
    tested.  In this case we must complete the pending operations
    before returning.  */
 L(b1i):
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 	bne-	cr1, L(bLcr1)
 	b	L(bx56)
-	.align 4
+	.align	4
 L(b2i):
 	bne-	cr6, L(bLcr6)
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 	b	L(bx34)
-	.align 4
+	.align	4
 L(b3i):
 	bne-	cr1, L(bLcr1)
 	bne-	cr6, L(bLcr6)
 	b	L(bx12)
-	.align 4
-L(bLcr0):
+	.align	4
+L(bLcr7):
 	li	rRTN, 1
-	bgtlr	cr0
+	bgtlr	cr7
 	li	rRTN, -1
 	blr
 L(bLcr1):
@@ -480,36 +689,31 @@ L(bLcr6):
 	blr
 
 L(b13):
-	bne-	cr0, L(bx12)
+	bne-	cr7, L(bx12)
 	bne-	cr1, L(bx34)
 L(bx56):
 	sub	rRTN, rWORD5, rWORD6
 	blr
 	nop
 L(b12):
-	bne-	cr0, L(bx12)
+	bne-	cr7, L(bx12)
 L(bx34):
 	sub	rRTN, rWORD3, rWORD4
 	blr
-
 L(b11):
 L(bx12):
 	sub	rRTN, rWORD1, rWORD2
 	blr
-
-	.align 4
-L(zeroLengthReturn):
-
+	.align	4
 L(zeroLength):
 	li	rRTN, 0
 	blr
 
-	cfi_adjust_cfa_offset(64)
-	.align 4
+	.align	4
 /* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    2 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is word aligned and can
+   of r12 to 0.  If r12 == 0 then rStr1 is word aligned and can
    perform the Wunaligned loop.
 
    Otherwise we know that rSTR1 is not already word aligned yet.
@@ -518,79 +722,88 @@ L(zeroLength):
    eliminate bits preceding the first byte.  Since we want to join the
    normal (Wunaligned) compare loop, starting at the second word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first W. This insures that the loop count is
+   versioning for the first W. This ensures that the loop count is
    correct and the first W (shifted) is in the expected register pair.  */
 #define rSHL		r29	/* Unaligned shift left count.  */
 #define rSHR		r28	/* Unaligned shift right count.  */
-#define rB		r27	/* Left rotation temp for rWORD2.  */
-#define rD		r26	/* Left rotation temp for rWORD4.  */
-#define rF		r25	/* Left rotation temp for rWORD6.  */
-#define rH		r24	/* Left rotation temp for rWORD8.  */
-#define rA		r0	/* Right rotation temp for rWORD2.  */
-#define rC		r12	/* Right rotation temp for rWORD4.  */
-#define rE		r0	/* Right rotation temp for rWORD6.  */
-#define rG		r12	/* Right rotation temp for rWORD8.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
+	cfi_adjust_cfa_offset(64)
 L(unaligned):
-	stw     r29,40(r1)
-	cfi_offset(r29,(40-64))
+	stw	rSHL, 40(r1)
+	cfi_offset(rSHL, (40-64))
 	clrlwi	rSHL, rSTR2, 30
-        stw     r28,36(r1)
-	cfi_offset(r28,(36-64))
+	stw	rSHR, 36(r1)
+	cfi_offset(rSHR, (36-64))
 	beq	cr5, L(Wunaligned)
-        stw     r27,32(r1)
-	cfi_offset(r27,(32-64))
+	stw	rWORD8_SHIFT, 32(r1)
+	cfi_offset(rWORD8_SHIFT, (32-64))
 /* Adjust the logical start of rSTR2 to compensate for the extra bits
    in the 1st rSTR1 W.  */
-	sub	r27, rSTR2, rBITDIF
+	sub	rWORD8_SHIFT, rSTR2, r12
 /* But do not attempt to address the W before that W that contains
    the actual start of rSTR2.  */
 	clrrwi	rSTR2, rSTR2, 2
-        stw     r26,28(r1)
-	cfi_offset(r26,(28-64))
-/* Compute the left/right shift counts for the unalign rSTR2,
+	stw	rWORD2_SHIFT, 28(r1)
+	cfi_offset(rWORD2_SHIFT, (28-64))
+/* Compute the left/right shift counts for the unaligned rSTR2,
    compensating for the logical (W aligned) start of rSTR1.  */
-	clrlwi	rSHL, r27, 30
+	clrlwi	rSHL, rWORD8_SHIFT, 30
 	clrrwi	rSTR1, rSTR1, 2
-        stw     r25,24(r1)
-	cfi_offset(r25,(24-64))
+	stw	rWORD4_SHIFT, 24(r1)
+	cfi_offset(rWORD4_SHIFT, (24-64))
 	slwi	rSHL, rSHL, 3
-	cmplw	cr5, r27, rSTR2
-	add	rN, rN, rBITDIF
-	slwi	r11, rBITDIF, 3
-        stw     r24,20(r1)
-	cfi_offset(r24,(20-64))
+	cmplw	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	slwi	rWORD6, r12, 3
+	stw	rWORD6_SHIFT, 20(r1)
+	cfi_offset(rWORD6_SHIFT, (20-64))
 	subfic	rSHR, rSHL, 32
-	srwi	rTMP, rN, 4      /* Divide by 16 */
-	andi.	rBITDIF, rN, 12  /* Get the W remainder */
+	srwi	r0, rN, 4	/* Divide by 16 */
+	andi.	r12, rN, 12	/* Get the W remainder */
 /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
    this special case those bits may be discarded anyway.  Also we
    must avoid loading a W where none of the bits are part of rSTR2 as
    this may cross a page boundary and cause a page fault.  */
 	li	rWORD8, 0
 	blt	cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD8, 0(rSTR2)
-	la	rSTR2, 4(rSTR2)
+	addi	rSTR2, rSTR2, 4
+#endif
 	slw	rWORD8, rWORD8, rSHL
 
 L(dus0):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 0(rSTR1)
 	lwz	rWORD2, 0(rSTR2)
-	cmplwi	cr1, rBITDIF, 8
+#endif
+	cmplwi	cr1, r12, 8
 	cmplwi	cr7, rN, 16
-	srw	rG, rWORD2, rSHR
+	srw	r12, rWORD2, rSHR
 	clrlwi	rN, rN, 30
 	beq	L(duPs4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	or	rWORD8, rG, rWORD8
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	or	rWORD8, r12, rWORD8
 	bgt	cr1, L(duPs3)
 	beq	cr1, L(duPs2)
 
 /* Remainder is 4 */
-	.align 4
+	.align	4
 L(dusP1):
-	slw	rB, rWORD2, rSHL
-	slw	rWORD7, rWORD1, r11
-	slw	rWORD8, rWORD8, r11
+	slw	rWORD8_SHIFT, rWORD2, rSHL
+	slw	rWORD7, rWORD1, rWORD6
+	slw	rWORD8, rWORD8, rWORD6
 	bge	cr7, L(duP1e)
 /* At this point we exit early with the first word compare
    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
@@ -600,95 +813,133 @@ L(dusP1):
 	bne	cr5, L(duLcr5)
 	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD2, 4(rSTR2)
-	srw	rA, rWORD2, rSHR
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(duPs2):
-	slw	rH, rWORD2, rSHL
-	slw	rWORD5, rWORD1, r11
-	slw	rWORD6, rWORD8, r11
+	slw	rWORD6_SHIFT, rWORD2, rSHL
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD8, rWORD6
 	b	L(duP2e)
 /* Remainder is 12 */
-	.align 4
+	.align	4
 L(duPs3):
-	slw	rF, rWORD2, rSHL
-	slw	rWORD3, rWORD1, r11
-	slw	rWORD4, rWORD8, r11
+	slw	rWORD4_SHIFT, rWORD2, rSHL
+	slw	rWORD3, rWORD1, rWORD6
+	slw	rWORD4, rWORD8, rWORD6
 	b	L(duP3e)
 /* Count is a multiple of 16, remainder is 0 */
-	.align 4
+	.align	4
 L(duPs4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	or	rWORD8, rG, rWORD8
-	slw	rD, rWORD2, rSHL
-	slw	rWORD1, rWORD1, r11
-	slw	rWORD2, rWORD8, r11
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	or	rWORD8, r12, rWORD8
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	slw	rWORD1, rWORD1, rWORD6
+	slw	rWORD2, rWORD8, rWORD6
 	b	L(duP4e)
 
 /* At this point we know rSTR1 is word aligned and the
    compare length is at least 8 bytes.  */
-	.align 4
+	.align	4
 L(Wunaligned):
-        stw     r27,32(r1)
-	cfi_offset(r27,(32-64))
+	stw	rWORD8_SHIFT, 32(r1)
+	cfi_offset(rWORD8_SHIFT, (32-64))
 	clrrwi	rSTR2, rSTR2, 2
-        stw     r26,28(r1)
-	cfi_offset(r26,(28-64))
-	srwi	rTMP, rN, 4	 /* Divide by 16 */
-        stw     r25,24(r1)
-	cfi_offset(r25,(24-64))
-	andi.	rBITDIF, rN, 12  /* Get the W remainder */
-        stw     r24,20(r1)
-	cfi_offset(r24,(20-64))
+	stw	rWORD2_SHIFT, 28(r1)
+	cfi_offset(rWORD2_SHIFT, (28-64))
+	srwi	r0, rN, 4	/* Divide by 16 */
+	stw	rWORD4_SHIFT, 24(r1)
+	cfi_offset(rWORD4_SHIFT, (24-64))
+	andi.	r12, rN, 12	/* Get the W remainder */
+	stw	rWORD6_SHIFT, 20(r1)
+	cfi_offset(rWORD6_SHIFT, (20-64))
 	slwi	rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD6, 0(rSTR2)
 	lwzu	rWORD8, 4(rSTR2)
-	cmplwi	cr1, rBITDIF, 8
+#endif
+	cmplwi	cr1, r12, 8
 	cmplwi	cr7, rN, 16
 	clrlwi	rN, rN, 30
 	subfic	rSHR, rSHL, 32
-	slw	rH, rWORD6, rSHL
+	slw	rWORD6_SHIFT, rWORD6, rSHL
 	beq	L(duP4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 	bgt	cr1, L(duP3)
 	beq	cr1, L(duP2)
 
 /* Remainder is 4 */
-	.align 4
+	.align	4
 L(duP1):
-	srw	rG, rWORD8, rSHR
+	srw	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
 	lwz	rWORD7, 0(rSTR1)
-	slw	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+#endif
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP1x)
 L(duP1e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
-	srw	rA, rWORD2, rSHR
-	slw	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 8(rSTR1)
 	lwz	rWORD4, 8(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
-	srw	rC, rWORD4, rSHR
-	slw	rF, rWORD4, rSHL
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
 	bne	cr5, L(duLcr5)
-	or	rWORD4, rC, rD
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 12(rSTR1)
 	lwz	rWORD6, 12(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
-	srw	rE, rWORD6, rSHR
-	slw	rH, rWORD6, rSHL
-	bne	cr0, L(duLcr0)
-	or	rWORD6, rE, rF
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
 	cmplw	cr6, rWORD5, rWORD6
 	b	L(duLoop3)
-	.align 4
+	.align	4
 /* At this point we exit early with the first word compare
    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
@@ -698,186 +949,321 @@ L(duP1x):
 	bne	cr5, L(duLcr5)
 	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
-	ld	rWORD2, 8(rSTR2)
-	srw	rA, rWORD2, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(duP2):
-	srw	rE, rWORD8, rSHR
+	srw	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
 	lwz	rWORD5, 0(rSTR1)
-	or	rWORD6, rE, rH
-	slw	rH, rWORD8, rSHL
+#endif
+	or	rWORD6, r0, rWORD6_SHIFT
+	slw	rWORD6_SHIFT, rWORD8, rSHL
 L(duP2e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD7, 4(rSTR1)
 	lwz	rWORD8, 4(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
-	srw	rG, rWORD8, rSHR
-	slw	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 8(rSTR1)
 	lwz	rWORD2, 8(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
-	srw	rA, rWORD2, rSHR
-	slw	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 12(rSTR1)
 	lwz	rWORD4, 12(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	bne	cr5, L(duLcr5)
-	srw	rC, rWORD4, rSHR
-	slw	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 4
 	addi	rSTR2, rSTR2, 4
+#endif
 	cmplw	cr1, rWORD3, rWORD4
 	b	L(duLoop2)
-	.align 4
+	.align	4
 L(duP2x):
 	cmplw	cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 4
 	addi	rSTR2, rSTR2, 4
+#endif
 	bne	cr6, L(duLcr6)
 	slwi.	rN, rN, 3
 	bne	cr5, L(duLcr5)
 	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD2, 4(rSTR2)
-	srw	rA, rWORD2, rSHR
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Remainder is 12 */
-	.align 4
+	.align	4
 L(duP3):
-	srw	rC, rWORD8, rSHR
+	srw	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
 	lwz	rWORD3, 0(rSTR1)
-	slw	rF, rWORD8, rSHL
-	or	rWORD4, rC, rH
+#endif
+	slw	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
 L(duP3e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 4(rSTR1)
 	lwz	rWORD6, 4(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
-	srw	rE, rWORD6, rSHR
-	slw	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD7, 8(rSTR1)
 	lwz	rWORD8, 8(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
-	srw	rG, rWORD8, rSHR
-	slw	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 12(rSTR1)
 	lwz	rWORD2, 12(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
-	srw	rA, rWORD2, rSHR
-	slw	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(duLoop1)
-	.align 4
+	.align	4
 L(duP3x):
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
+#endif
+#if 0
+/* Huh?  We've already branched on cr1!  */
 	bne	cr1, L(duLcr1)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
 	slwi.	rN, rN, 3
 	bne	cr5, L(duLcr5)
 	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD2, 4(rSTR2)
-	srw	rA, rWORD2, rSHR
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Count is a multiple of 16, remainder is 0 */
-	.align 4
+	.align	4
 L(duP4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	srw	rA, rWORD8, rSHR
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	srw	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
 	lwz	rWORD1, 0(rSTR1)
-	slw	rD, rWORD8, rSHL
-	or	rWORD2, rA, rH
+#endif
+	slw	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
 L(duP4e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 4(rSTR1)
 	lwz	rWORD4, 4(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
-	srw	rC, rWORD4, rSHR
-	slw	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 8(rSTR1)
 	lwz	rWORD6, 8(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
-	bne	cr0, L(duLcr0)
-	srw	rE, rWORD6, rSHR
-	slw	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	bne	cr7, L(duLcr7)
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwzu	rWORD7, 12(rSTR1)
 	lwzu	rWORD8, 12(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
-	srw	rG, rWORD8, rSHR
-	slw	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	cmplw	cr5, rWORD7, rWORD8
 	bdz-	L(du24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
-	.align 4
+	.align	4
 L(duLoop):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
-	srw	rA, rWORD2, rSHR
-	slw	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
 L(duLoop1):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 8(rSTR1)
 	lwz	rWORD4, 8(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	bne	cr5, L(duLcr5)
-	srw	rC, rWORD4, rSHR
-	slw	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
 L(duLoop2):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 12(rSTR1)
 	lwz	rWORD6, 12(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
-	bne	cr0, L(duLcr0)
-	srw	rE, rWORD6, rSHR
-	slw	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	bne	cr7, L(duLcr7)
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
 L(duLoop3):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwzu	rWORD7, 16(rSTR1)
 	lwzu	rWORD8, 16(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	bne-	cr1, L(duLcr1)
-	srw	rG, rWORD8, rSHR
-	slw	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	bdnz+	L(duLoop)
 
 L(duL4):
+#if 0
+/* Huh?  We've already branched on cr1!  */
 	bne	cr1, L(duLcr1)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
 	cmplw	cr6, rWORD5, rWORD6
 	bne	cr5, L(duLcr5)
 	cmplw	cr5, rWORD7, rWORD8
 L(du44):
-	bne	cr0, L(duLcr0)
+	bne	cr7, L(duLcr7)
 L(du34):
 	bne	cr1, L(duLcr1)
 L(du24):
@@ -887,95 +1273,101 @@ L(du14):
 	bne	cr5, L(duLcr5)
 /* At this point we have a remainder of 1 to 3 bytes to compare.  We use
    shift right to eliminate bits beyond the compare length.
+   This allows the use of word subtract to compute the final result.
 
    However it may not be safe to load rWORD2 which may be beyond the
    string length. So we compare the bit length of the remainder to
    the right shift count (rSHR). If the bit count is less than or equal
    we do not need to load rWORD2 (all significant bits are already in
-   rB).  */
+   rWORD8_SHIFT).  */
 	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD2, 4(rSTR2)
-	srw	rA, rWORD2, rSHR
-	.align 4
+#endif
+	srw	r0, rWORD2, rSHR
+	.align	4
 L(dutrim):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+#else
 	lwz	rWORD1, 4(rSTR1)
-        lwz     r31,48(1)
+#endif
+	lwz	rWORD8, 48(r1)
 	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
-	or	rWORD2, rA, rB
-        lwz     r30,44(1)
-        lwz     r29,40(r1)
+	or	rWORD2, r0, rWORD8_SHIFT
+	lwz	rWORD7, 44(r1)
+	lwz	rSHL, 40(r1)
 	srw	rWORD1, rWORD1, rN
 	srw	rWORD2, rWORD2, rN
-        lwz     r28,36(r1)
-        lwz     r27,32(r1)
-        cmplw   rWORD1,rWORD2
-        li      rRTN,0
-        beq     L(dureturn26)
-        li      rRTN,1
-        bgt     L(dureturn26)
-        li      rRTN,-1
-	b    L(dureturn26)
-	.align 4
-L(duLcr0):
-        lwz     r31,48(1)
-        lwz     r30,44(1)
+	lwz	rSHR, 36(r1)
+	lwz	rWORD8_SHIFT, 32(r1)
+	sub	rRTN, rWORD1, rWORD2
+	b	L(dureturn26)
+	.align	4
+L(duLcr7):
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 	li	rRTN, 1
-	bgt	cr0, L(dureturn29)
-	lwz     r29,40(r1)
-        lwz     r28,36(r1)
+	bgt	cr7, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr1):
-        lwz     r31,48(1)
-        lwz     r30,44(1)
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 	li	rRTN, 1
 	bgt	cr1, L(dureturn29)
-        lwz     r29,40(r1)
-        lwz     r28,36(r1)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr6):
-        lwz     r31,48(1)
-        lwz     r30,44(1)
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 	li	rRTN, 1
 	bgt	cr6, L(dureturn29)
-        lwz     r29,40(r1)
-        lwz     r28,36(r1)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr5):
-        lwz     r31,48(1)
-        lwz     r30,44(1)
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 	li	rRTN, 1
 	bgt	cr5, L(dureturn29)
-        lwz     r29,40(r1)
-        lwz     r28,36(r1)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
 	.align	3
 L(duZeroReturn):
-	li	rRTN,0
+	li	rRTN, 0
 	.align	4
 L(dureturn):
-        lwz     r31,48(1)
-        lwz     r30,44(1)
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 L(dureturn29):
-        lwz     r29,40(r1)
-        lwz     r28,36(r1)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 L(dureturn27):
-        lwz     r27,32(r1)
+	lwz	rWORD8_SHIFT, 32(r1)
 L(dureturn26):
-        lwz     r26,28(r1)
+	lwz	rWORD2_SHIFT, 28(r1)
 L(dureturn25):
-        lwz     r25,24(r1)
-        lwz     r24,20(r1)
-        lwz     1,0(1)
+	lwz	rWORD4_SHIFT, 24(r1)
+	lwz	rWORD6_SHIFT, 20(r1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	blr
 END (memcmp)
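
Note: the short C sketch below is illustrative only and is not part of the patch;
the helper and function names are invented for illustration.  Both memcmp.S diffs
in this commit follow the same pattern: on little-endian targets each lwz/lwzu is
replaced by lwbrx plus an explicit addi, so every 32-bit word is loaded with its
bytes reversed and the word-compare/branch chain (now on cr7, cr1, cr6, cr5) keeps
producing memcmp ordering.  The word-sized tails also become a single
"sub rRTN, rWORD1, rWORD2", since after the right shift that discards bytes beyond
the compare length at most three significant bytes remain in each word and the
subtraction cannot wrap.

#include <stddef.h>
#include <stdint.h>

/* Load 4 bytes with the lowest-addressed byte in the most significant
   position, i.e. what lwz gives on big-endian and lwbrx gives on
   little-endian PowerPC.  */
static inline uint32_t
load_be32 (const unsigned char *p)
{
  return ((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16)
	 | ((uint32_t) p[2] << 8) | (uint32_t) p[3];
}

/* Word-at-a-time compare for a length that is a multiple of 4; the real
   assembly additionally handles unaligned starts and 1 to 3 trailing
   bytes.  */
static int
wordwise_memcmp (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1;
  const unsigned char *p2 = s2;
  for (size_t i = 0; i < n; i += 4)
    {
      uint32_t w1 = load_be32 (p1 + i);
      uint32_t w2 = load_be32 (p2 + i);
      if (w1 != w2)
	return w1 > w2 ? 1 : -1;	/* Same ordering as byte-by-byte memcmp.  */
    }
  return 0;
}

Because each word then holds its bytes most-significant-first, the first differing
byte dominates the unsigned word comparison, which is exactly the property the
byte-reversed loads preserve on little-endian PowerPC.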
 
diff --git a/sysdeps/powerpc/powerpc32/power7/memcmp.S b/sysdeps/powerpc/powerpc32/power7/memcmp.S
index 075e19f..f160dde 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcmp.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcmp.S
@@ -23,10 +23,9 @@
 		    size_t size [r5])  */
 
 	.machine power7
-EALIGN (memcmp,4,0)
+EALIGN (memcmp, 4, 0)
 	CALL_MCOUNT
 
-#define rTMP	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -37,35 +36,32 @@ EALIGN (memcmp,4,0)
 #define rWORD4	r9	/* next word in s2 */
 #define rWORD5	r10	/* next word in s1 */
 #define rWORD6	r11	/* next word in s2 */
-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
 #define rWORD7	r30	/* next word in s1 */
 #define rWORD8	r31	/* next word in s2 */
 
-	xor	rTMP,rSTR2,rSTR1
-	cmplwi	cr6,rN,0
-	cmplwi	cr1,rN,12
-	clrlwi.	rTMP,rTMP,30
-	clrlwi	rBITDIF,rSTR1,30
-	cmplwi	cr5,rBITDIF,0
-	beq-	cr6,L(zeroLength)
-	dcbt	0,rSTR1
-	dcbt	0,rSTR2
-
-	/* If less than 8 bytes or not aligned, use the unaligned
-	   byte loop.  */
-
-	blt	cr1,L(bytealigned)
-	stwu	1,-64(1)
+	xor	r0, rSTR2, rSTR1
+	cmplwi	cr6, rN, 0
+	cmplwi	cr1, rN, 12
+	clrlwi.	r0, r0, 30
+	clrlwi	r12, rSTR1, 30
+	cmplwi	cr5, r12, 0
+	beq-	cr6, L(zeroLength)
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
+/* If less than 8 bytes or not aligned, use the unaligned
+   byte loop.  */
+	blt	cr1, L(bytealigned)
+	stwu	1, -64(r1)
 	cfi_adjust_cfa_offset(64)
-	stw	r31,48(1)
-	cfi_offset(31,(48-64))
-	stw	r30,44(1)
-	cfi_offset(30,(44-64))
+	stw	rWORD8, 48(r1)
+	cfi_offset(rWORD8, (48-64))
+	stw	rWORD7, 44(r1)
+	cfi_offset(rWORD7, (44-64))
 	bne	L(unaligned)
 /* At this point we know both strings have the same alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    2 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then we are already word
+   of r12 to 0.  If r12 == 0 then we are already word
    aligned and can perform the word aligned loop.
 
    Otherwise we know the two strings have the same alignment (but not
@@ -74,332 +70,541 @@ EALIGN (memcmp,4,0)
    eliminate bits preceding the first byte.  Since we want to join the
    normal (word aligned) compare loop, starting at the second word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first word. This insures that the loop count is
+   versioning for the first word. This ensures that the loop count is
    correct and the first word (shifted) is in the expected register pair. */
 	.align	4
 L(samealignment):
-	clrrwi	rSTR1,rSTR1,2
-	clrrwi	rSTR2,rSTR2,2
-	beq	cr5,L(Waligned)
-	add	rN,rN,rBITDIF
-	slwi	r11,rBITDIF,3
-	srwi	rTMP,rN,4	/* Divide by 16 */
-	andi.	rBITDIF,rN,12	/* Get the word remainder */
-	lwz	rWORD1,0(rSTR1)
-	lwz	rWORD2,0(rSTR2)
-	cmplwi	cr1,rBITDIF,8
-	cmplwi	cr7,rN,16
-	clrlwi	rN,rN,30
+	clrrwi	rSTR1, rSTR1, 2
+	clrrwi	rSTR2, rSTR2, 2
+	beq	cr5, L(Waligned)
+	add	rN, rN, r12
+	slwi	rWORD6, r12, 3
+	srwi	r0, rN, 4	/* Divide by 16 */
+	andi.	r12, rN, 12	/* Get the word remainder */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+	lwz	rWORD2, 0(rSTR2)
+#endif
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	clrlwi	rN, rN, 30
 	beq	L(dPs4)
-	mtctr	rTMP
-	bgt	cr1,L(dPs3)
-	beq	cr1,L(dPs2)
+	mtctr	r0
+	bgt	cr1, L(dPs3)
+	beq	cr1, L(dPs2)
 
 /* Remainder is 4 */
 	.align	3
 L(dsP1):
-	slw	rWORD5,rWORD1,r11
-	slw	rWORD6,rWORD2,r11
-	cmplw	cr5,rWORD5,rWORD6
-	blt	cr7,L(dP1x)
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD2, rWORD6
+	cmplw	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(dP1e)
 /* Remainder is 8 */
 	.align	4
 L(dPs2):
-	slw	rWORD5,rWORD1,r11
-	slw	rWORD6,rWORD2,r11
-	cmplw	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP2x)
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD2, rWORD6
+	cmplw	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-	lwz	rWORD7,4(rSTR1)
-	lwz	rWORD8,4(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 4(rSTR1)
+	lwz	rWORD8, 4(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
 	b	L(dP2e)
 /* Remainder is 12 */
 	.align	4
 L(dPs3):
-	slw	rWORD3,rWORD1,r11
-	slw	rWORD4,rWORD2,r11
-	cmplw	cr1,rWORD3,rWORD4
+	slw	rWORD3, rWORD1, rWORD6
+	slw	rWORD4, rWORD2, rWORD6
+	cmplw	cr1, rWORD3, rWORD4
 	b	L(dP3e)
 /* Count is a multiple of 16, remainder is 0 */
 	.align	4
 L(dPs4):
-	mtctr	rTMP
-	slw	rWORD1,rWORD1,r11
-	slw	rWORD2,rWORD2,r11
-	cmplw	cr0,rWORD1,rWORD2
+	mtctr	r0
+	slw	rWORD1, rWORD1, rWORD6
+	slw	rWORD2, rWORD2, rWORD6
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(dP4e)
 
 /* At this point we know both strings are word aligned and the
    compare length is at least 8 bytes.  */
 	.align	4
 L(Waligned):
-	andi.	rBITDIF,rN,12	/* Get the word remainder */
-	srwi	rTMP,rN,4	/* Divide by 16 */
-	cmplwi	cr1,rBITDIF,8
-	cmplwi	cr7,rN,16
-	clrlwi	rN,rN,30
+	andi.	r12, rN, 12	/* Get the word remainder */
+	srwi	r0, rN, 4	/* Divide by 16 */
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	clrlwi	rN, rN, 30
 	beq	L(dP4)
-	bgt	cr1,L(dP3)
-	beq	cr1,L(dP2)
+	bgt	cr1, L(dP3)
+	beq	cr1, L(dP2)
 
 /* Remainder is 4 */
 	.align	4
 L(dP1):
-	mtctr	rTMP
+	mtctr	r0
 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
    (8-15 byte compare), we want to use only volatile registers.  This
    means we can avoid restoring non-volatile registers since we did not
    change any on the early exit path.  The key here is the non-early
    exit path only cares about the condition code (cr5), not about which
    register pair was used.  */
-	lwz	rWORD5,0(rSTR1)
-	lwz	rWORD6,0(rSTR2)
-	cmplw	cr5,rWORD5,rWORD6
-	blt	cr7,L(dP1x)
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 0(rSTR1)
+	lwz	rWORD6, 0(rSTR2)
+#endif
+	cmplw	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 L(dP1e):
-	lwz	rWORD3,8(rSTR1)
-	lwz	rWORD4,8(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	lwz	rWORD5,12(rSTR1)
-	lwz	rWORD6,12(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
-	bne	cr0,L(dLcr0)
-
-	lwzu	rWORD7,16(rSTR1)
-	lwzu	rWORD8,16(rSTR2)
-	bne	cr1,L(dLcr1)
-	cmplw	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
+
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 16(rSTR1)
+	lwzu	rWORD8, 16(rSTR2)
+#endif
+	bne	cr1, L(dLcr1)
+	cmplw	cr5, rWORD7, rWORD8
 	bdnz	L(dLoop)
-	bne	cr6,L(dLcr6)
-	lwz	r30,44(1)
-	lwz	r31,48(1)
+	bne	cr6, L(dLcr6)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
 	.align	3
 L(dP1x):
-	slwi.	r12,rN,3
-	bne	cr5,L(dLcr5)
-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
-	lwz	1,0(1)
+	slwi.	r12, rN, 3
+	bne	cr5, L(dLcr5x)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Remainder is 8 */
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP2):
-	mtctr	rTMP
-	lwz	rWORD5,0(rSTR1)
-	lwz	rWORD6,0(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP2x)
-	lwz	rWORD7,4(rSTR1)
-	lwz	rWORD8,4(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 0(rSTR1)
+	lwz	rWORD6, 0(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 4(rSTR1)
+	lwz	rWORD8, 4(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
 L(dP2e):
-	lwz	rWORD1,8(rSTR1)
-	lwz	rWORD2,8(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	lwz	rWORD3,12(rSTR1)
-	lwz	rWORD4,12(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	addi	rSTR1,rSTR1,4
-	addi	rSTR2,rSTR2,4
-	bne	cr6,L(dLcr6)
-	bne	cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 8(rSTR1)
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 12(rSTR1)
+	lwz	rWORD4, 12(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	bne	cr6, L(dLcr6)
+	bne	cr5, L(dLcr5)
 	b	L(dLoop2)
 /* Again we are on an early exit path (16-23 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
 	.align	4
 L(dP2x):
-	lwz	rWORD3,4(rSTR1)
-	lwz	rWORD4,4(rSTR2)
-	cmplw	cr5,rWORD3,rWORD4
-	slwi.	r12,rN,3
-	bne	cr6,L(dLcr6)
-	addi	rSTR1,rSTR1,4
-	addi	rSTR2,rSTR2,4
-	bne	cr5,L(dLcr5)
-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
-	lwz	1,0(1)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 4(rSTR1)
+	lwz	rWORD4, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	slwi.	r12, rN, 3
+	bne	cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	bne	cr1, L(dLcr1x)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Remainder is 12 */
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP3):
-	mtctr	rTMP
-	lwz	rWORD3,0(rSTR1)
-	lwz	rWORD4,0(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 0(rSTR1)
+	lwz	rWORD4, 0(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
 L(dP3e):
-	lwz	rWORD5,4(rSTR1)
-	lwz	rWORD6,4(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP3x)
-	lwz	rWORD7,8(rSTR1)
-	lwz	rWORD8,8(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	lwz	rWORD1,12(rSTR1)
-	lwz	rWORD2,12(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr1,L(dLcr1)
-	bne	cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 4(rSTR1)
+	lwz	rWORD6, 4(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 8(rSTR1)
+	lwz	rWORD8, 8(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 12(rSTR1)
+	lwz	rWORD2, 12(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr1, L(dLcr1)
+	bne	cr6, L(dLcr6)
 	b	L(dLoop1)
 /* Again we are on an early exit path (24-31 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
 	.align	4
 L(dP3x):
-	lwz	rWORD1,8(rSTR1)
-	lwz	rWORD2,8(rSTR2)
-	cmplw	cr5,rWORD1,rWORD2
-	slwi.	r12,rN,3
-	bne	cr1,L(dLcr1)
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr6,L(dLcr6)
-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
-	bne	cr5,L(dLcr5)
-	lwz	1,0(1)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 8(rSTR1)
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	slwi.	r12, rN, 3
+	bne	cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr6, L(dLcr6x)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+	bne	cr7, L(dLcr7x)
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Count is a multiple of 16, remainder is 0 */
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP4):
-	mtctr	rTMP
-	lwz	rWORD1,0(rSTR1)
-	lwz	rWORD2,0(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+	lwz	rWORD2, 0(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 L(dP4e):
-	lwz	rWORD3,4(rSTR1)
-	lwz	rWORD4,4(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	lwz	rWORD5,8(rSTR1)
-	lwz	rWORD6,8(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	lwzu	rWORD7,12(rSTR1)
-	lwzu	rWORD8,12(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr0,L(dLcr0)
-	bne	cr1,L(dLcr1)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 4(rSTR1)
+	lwz	rWORD4, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 8(rSTR1)
+	lwz	rWORD6, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 12(rSTR1)
+	lwzu	rWORD8, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
+	bne	cr1, L(dLcr1)
 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
 	.align	4
 L(dLoop):
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	bne	cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
 L(dLoop1):
-	lwz	rWORD3,8(rSTR1)
-	lwz	rWORD4,8(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
 L(dLoop2):
-	lwz	rWORD5,12(rSTR1)
-	lwz	rWORD6,12(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr0,L(dLcr0)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
 L(dLoop3):
-	lwzu	rWORD7,16(rSTR1)
-	lwzu	rWORD8,16(rSTR2)
-	bne	cr1,L(dLcr1)
-	cmplw	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 16(rSTR1)
+	lwzu	rWORD8, 16(rSTR2)
+#endif
+	bne	cr1, L(dLcr1)
+	cmplw	cr7, rWORD1, rWORD2
 	bdnz	L(dLoop)
 
 L(dL4):
-	cmplw	cr1,rWORD3,rWORD4
-	bne	cr6,L(dLcr6)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
-	cmplw	cr5,rWORD7,rWORD8
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
+	cmplw	cr5, rWORD7, rWORD8
 L(d44):
-	bne	cr0,L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(d34):
-	bne	cr1,L(dLcr1)
+	bne	cr1, L(dLcr1)
 L(d24):
-	bne	cr6,L(dLcr6)
+	bne	cr6, L(dLcr6)
 L(d14):
-	slwi.	r12,rN,3
-	bne	cr5,L(dLcr5)
+	slwi.	r12, rN, 3
+	bne	cr5, L(dLcr5)
 L(d04):
-	lwz	r30,44(1)
-	lwz	r31,48(1)
-	lwz	1,0(1)
-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
 	beq	L(zeroLength)
 /* At this point we have a remainder of 1 to 3 bytes to compare.  Since
    we are aligned it is safe to load the whole word, and use
-   shift right to eliminate bits beyond the compare length. */
+   shift right to eliminate bits beyond the compare length.  */
 L(d00):
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	srw	rWORD1,rWORD1,rN
-	srw	rWORD2,rWORD2,rN
-	cmplw	rWORD1,rWORD2
-	li	rRTN,0
-	beqlr
-	li	rRTN,1
-	bgtlr
-	li	rRTN,-1
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	rWORD1, rWORD1, rN
+	srw	rWORD2, rWORD2, rN
+	sub	rRTN, rWORD1, rWORD2
 	blr
 
 	.align	4
-L(dLcr0):
-	lwz	r30,44(1)
-	lwz	r31,48(1)
-	li	rRTN,1
-	lwz	1,0(1)
-	bgtlr	cr0
-	li	rRTN,-1
+	cfi_adjust_cfa_offset(64)
+L(dLcr7):
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr7x):
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	bgtlr	cr7
+	li	rRTN, -1
 	blr
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr1):
-	lwz	r30,44(1)
-	lwz	r31,48(1)
-	li	rRTN,1
-	lwz	1,0(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr1x):
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr1
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr6):
-	lwz	r30,44(1)
-	lwz	r31,48(1)
-	li	rRTN,1
-	lwz	1,0(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr6x):
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr6
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr5):
-	lwz	r30,44(1)
-	lwz	r31,48(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
 L(dLcr5x):
-	li	rRTN,1
-	lwz	1,0(1)
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr5
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 
 	.align	4
 L(bytealigned):
-	cfi_adjust_cfa_offset(-64)
 	mtctr	rN
 
 /* We need to prime this loop.  This loop is swing modulo scheduled
@@ -411,38 +616,39 @@ L(bytealigned):
 
    So we must precondition some registers and condition codes so that
    we don't exit the loop early on the first iteration.  */
-	lbz	rWORD1,0(rSTR1)
-	lbz	rWORD2,0(rSTR2)
+
+	lbz	rWORD1, 0(rSTR1)
+	lbz	rWORD2, 0(rSTR2)
 	bdz	L(b11)
-	cmplw	cr0,rWORD1,rWORD2
-	lbz	rWORD3,1(rSTR1)
-	lbz	rWORD4,1(rSTR2)
+	cmplw	cr7, rWORD1, rWORD2
+	lbz	rWORD3, 1(rSTR1)
+	lbz	rWORD4, 1(rSTR2)
 	bdz	L(b12)
-	cmplw	cr1,rWORD3,rWORD4
-	lbzu	rWORD5,2(rSTR1)
-	lbzu	rWORD6,2(rSTR2)
+	cmplw	cr1, rWORD3, rWORD4
+	lbzu	rWORD5, 2(rSTR1)
+	lbzu	rWORD6, 2(rSTR2)
 	bdz	L(b13)
 	.align	4
 L(bLoop):
-	lbzu	rWORD1,1(rSTR1)
-	lbzu	rWORD2,1(rSTR2)
-	bne	cr0,L(bLcr0)
+	lbzu	rWORD1, 1(rSTR1)
+	lbzu	rWORD2, 1(rSTR2)
+	bne	cr7, L(bLcr7)
 
-	cmplw	cr6,rWORD5,rWORD6
+	cmplw	cr6, rWORD5, rWORD6
 	bdz	L(b3i)
 
-	lbzu	rWORD3,1(rSTR1)
-	lbzu	rWORD4,1(rSTR2)
-	bne	cr1,L(bLcr1)
+	lbzu	rWORD3, 1(rSTR1)
+	lbzu	rWORD4, 1(rSTR2)
+	bne	cr1, L(bLcr1)
 
-	cmplw	cr0,rWORD1,rWORD2
+	cmplw	cr7, rWORD1, rWORD2
 	bdz	L(b2i)
 
-	lbzu	rWORD5,1(rSTR1)
-	lbzu	rWORD6,1(rSTR2)
-	bne	cr6,L(bLcr6)
+	lbzu	rWORD5, 1(rSTR1)
+	lbzu	rWORD6, 1(rSTR2)
+	bne	cr6, L(bLcr6)
 
-	cmplw	cr1,rWORD3,rWORD4
+	cmplw	cr1, rWORD3, rWORD4
 	bdnz	L(bLoop)
 
 /* We speculatively load bytes before we have tested the previous
@@ -452,67 +658,62 @@ L(bLoop):
    tested.  In this case we must complete the pending operations
    before returning.  */
 L(b1i):
-	bne	cr0,L(bLcr0)
-	bne	cr1,L(bLcr1)
+	bne	cr7, L(bLcr7)
+	bne	cr1, L(bLcr1)
 	b	L(bx56)
 	.align	4
 L(b2i):
-	bne	cr6,L(bLcr6)
-	bne	cr0,L(bLcr0)
+	bne	cr6, L(bLcr6)
+	bne	cr7, L(bLcr7)
 	b	L(bx34)
 	.align	4
 L(b3i):
-	bne	cr1,L(bLcr1)
-	bne	cr6,L(bLcr6)
+	bne	cr1, L(bLcr1)
+	bne	cr6, L(bLcr6)
 	b	L(bx12)
 	.align	4
-L(bLcr0):
-	li	rRTN,1
-	bgtlr	cr0
-	li	rRTN,-1
+L(bLcr7):
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
 	blr
 L(bLcr1):
-	li	rRTN,1
+	li	rRTN, 1
 	bgtlr	cr1
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 L(bLcr6):
-	li	rRTN,1
+	li	rRTN, 1
 	bgtlr	cr6
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 
 L(b13):
-	bne	cr0,L(bx12)
-	bne	cr1,L(bx34)
+	bne	cr7, L(bx12)
+	bne	cr1, L(bx34)
 L(bx56):
-	sub	rRTN,rWORD5,rWORD6
+	sub	rRTN, rWORD5, rWORD6
 	blr
 	nop
 L(b12):
-	bne	cr0,L(bx12)
+	bne	cr7, L(bx12)
 L(bx34):
-	sub	rRTN,rWORD3,rWORD4
+	sub	rRTN, rWORD3, rWORD4
 	blr
-
 L(b11):
 L(bx12):
-	sub	rRTN,rWORD1,rWORD2
+	sub	rRTN, rWORD1, rWORD2
 	blr
-
 	.align	4
-L(zeroLengthReturn):
-
 L(zeroLength):
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
-	cfi_adjust_cfa_offset(64)
 	.align	4
 /* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    2 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is word aligned and can
+   of r12 to 0.  If r12 == 0 then rStr1 is word aligned and can
    perform the Wunaligned loop.
 
    Otherwise we know that rSTR1 is not already word aligned yet.
@@ -521,465 +722,654 @@ L(zeroLength):
    eliminate bits preceding the first byte.  Since we want to join the
    normal (Wunaligned) compare loop, starting at the second word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first W. This insures that the loop count is
+   versioning for the first W. This ensures that the loop count is
    correct and the first W (shifted) is in the expected register pair.  */
 #define rSHL		r29	/* Unaligned shift left count.  */
 #define rSHR		r28	/* Unaligned shift right count.  */
-#define rB		r27	/* Left rotation temp for rWORD2.  */
-#define rD		r26	/* Left rotation temp for rWORD4.  */
-#define rF		r25	/* Left rotation temp for rWORD6.  */
-#define rH		r24	/* Left rotation temp for rWORD8.  */
-#define rA		r0	/* Right rotation temp for rWORD2.  */
-#define rC		r12	/* Right rotation temp for rWORD4.  */
-#define rE		r0	/* Right rotation temp for rWORD6.  */
-#define rG		r12	/* Right rotation temp for rWORD8.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
+	cfi_adjust_cfa_offset(64)
 L(unaligned):
-	stw	r29,40(r1)
-	cfi_offset(r29,(40-64))
-	clrlwi	rSHL,rSTR2,30
-	stw	r28,36(r1)
-	cfi_offset(r28,(36-64))
-	beq	cr5,L(Wunaligned)
-	stw	r27,32(r1)
-	cfi_offset(r27,(32-64))
+	stw	rSHL, 40(r1)
+	cfi_offset(rSHL, (40-64))
+	clrlwi	rSHL, rSTR2, 30
+	stw	rSHR, 36(r1)
+	cfi_offset(rSHR, (36-64))
+	beq	cr5, L(Wunaligned)
+	stw	rWORD8_SHIFT, 32(r1)
+	cfi_offset(rWORD8_SHIFT, (32-64))
 /* Adjust the logical start of rSTR2 to compensate for the extra bits
    in the 1st rSTR1 W.  */
-	sub	r27,rSTR2,rBITDIF
+	sub	rWORD8_SHIFT, rSTR2, r12
 /* But do not attempt to address the W before that W that contains
    the actual start of rSTR2.  */
-	clrrwi	rSTR2,rSTR2,2
-	stw	r26,28(r1)
-	cfi_offset(r26,(28-64))
-/* Compute the left/right shift counts for the unalign rSTR2,
+	clrrwi	rSTR2, rSTR2, 2
+	stw	rWORD2_SHIFT, 28(r1)
+	cfi_offset(rWORD2_SHIFT, (28-64))
+/* Compute the left/right shift counts for the unaligned rSTR2,
    compensating for the logical (W aligned) start of rSTR1.  */
-	clrlwi	rSHL,r27,30
-	clrrwi	rSTR1,rSTR1,2
-	stw	r25,24(r1)
-	cfi_offset(r25,(24-64))
-	slwi	rSHL,rSHL,3
-	cmplw	cr5,r27,rSTR2
-	add	rN,rN,rBITDIF
-	slwi	r11,rBITDIF,3
-	stw	r24,20(r1)
-	cfi_offset(r24,(20-64))
-	subfic	rSHR,rSHL,32
-	srwi	rTMP,rN,4	/* Divide by 16 */
-	andi.	rBITDIF,rN,12	/* Get the W remainder */
+	clrlwi	rSHL, rWORD8_SHIFT, 30
+	clrrwi	rSTR1, rSTR1, 2
+	stw	rWORD4_SHIFT, 24(r1)
+	cfi_offset(rWORD4_SHIFT, (24-64))
+	slwi	rSHL, rSHL, 3
+	cmplw	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	slwi	rWORD6, r12, 3
+	stw	rWORD6_SHIFT, 20(r1)
+	cfi_offset(rWORD6_SHIFT, (20-64))
+	subfic	rSHR, rSHL, 32
+	srwi	r0, rN, 4	/* Divide by 16 */
+	andi.	r12, rN, 12	/* Get the W remainder */
 /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
    this special case those bits may be discarded anyway.  Also we
    must avoid loading a W where none of the bits are part of rSTR2 as
    this may cross a page boundary and cause a page fault.  */
-	li	rWORD8,0
-	blt	cr5,L(dus0)
-	lwz	rWORD8,0(rSTR2)
-	la	rSTR2,4(rSTR2)
-	slw	rWORD8,rWORD8,rSHL
+	li	rWORD8, 0
+	blt	cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD8, 0(rSTR2)
+	addi	rSTR2, rSTR2, 4
+#endif
+	slw	rWORD8, rWORD8, rSHL
 
 L(dus0):
-	lwz	rWORD1,0(rSTR1)
-	lwz	rWORD2,0(rSTR2)
-	cmplwi	cr1,rBITDIF,8
-	cmplwi	cr7,rN,16
-	srw	rG,rWORD2,rSHR
-	clrlwi	rN,rN,30
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+	lwz	rWORD2, 0(rSTR2)
+#endif
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	srw	r12, rWORD2, rSHR
+	clrlwi	rN, rN, 30
 	beq	L(duPs4)
-	mtctr	rTMP
-	or	rWORD8,rG,rWORD8
-	bgt	cr1,L(duPs3)
-	beq	cr1,L(duPs2)
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	bgt	cr1, L(duPs3)
+	beq	cr1, L(duPs2)
 
 /* Remainder is 4 */
 	.align	4
 L(dusP1):
-	slw	rB,rWORD2,rSHL
-	slw	rWORD7,rWORD1,r11
-	slw	rWORD8,rWORD8,r11
-	bge	cr7,L(duP1e)
+	slw	rWORD8_SHIFT, rWORD2, rSHL
+	slw	rWORD7, rWORD1, rWORD6
+	slw	rWORD8, rWORD8, rWORD6
+	bge	cr7, L(duP1e)
 /* At this point we exit early with the first word compare
    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
-	cmplw	cr5,rWORD7,rWORD8
-	slwi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmplw	cr7,rN,rSHR
+	cmplw	cr5, rWORD7, rWORD8
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	lwz	rWORD2,4(rSTR2)
-	srw	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 8 */
 	.align	4
 L(duPs2):
-	slw	rH,rWORD2,rSHL
-	slw	rWORD5,rWORD1,r11
-	slw	rWORD6,rWORD8,r11
+	slw	rWORD6_SHIFT, rWORD2, rSHL
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD8, rWORD6
 	b	L(duP2e)
 /* Remainder is 12 */
 	.align	4
 L(duPs3):
-	slw	rF,rWORD2,rSHL
-	slw	rWORD3,rWORD1,r11
-	slw	rWORD4,rWORD8,r11
+	slw	rWORD4_SHIFT, rWORD2, rSHL
+	slw	rWORD3, rWORD1, rWORD6
+	slw	rWORD4, rWORD8, rWORD6
 	b	L(duP3e)
 /* Count is a multiple of 16, remainder is 0 */
 	.align	4
 L(duPs4):
-	mtctr	rTMP
-	or	rWORD8,rG,rWORD8
-	slw	rD,rWORD2,rSHL
-	slw	rWORD1,rWORD1,r11
-	slw	rWORD2,rWORD8,r11
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	slw	rWORD1, rWORD1, rWORD6
+	slw	rWORD2, rWORD8, rWORD6
 	b	L(duP4e)
 
 /* At this point we know rSTR1 is word aligned and the
    compare length is at least 8 bytes.  */
 	.align	4
 L(Wunaligned):
-	stw	r27,32(r1)
-	cfi_offset(r27,(32-64))
-	clrrwi	rSTR2,rSTR2,2
-	stw	r26,28(r1)
-	cfi_offset(r26,(28-64))
-	srwi	rTMP,rN,4	/* Divide by 16 */
-	stw	r25,24(r1)
-	cfi_offset(r25,(24-64))
-	andi.	rBITDIF,rN,12	/* Get the W remainder */
-	stw	r24,20(r1)
-	cfi_offset(r24,(24-64))
-	slwi	rSHL,rSHL,3
-	lwz	rWORD6,0(rSTR2)
-	lwzu	rWORD8,4(rSTR2)
-	cmplwi	cr1,rBITDIF,8
-	cmplwi	cr7,rN,16
-	clrlwi	rN,rN,30
-	subfic	rSHR,rSHL,32
-	slw	rH,rWORD6,rSHL
+	stw	rWORD8_SHIFT, 32(r1)
+	cfi_offset(rWORD8_SHIFT, (32-64))
+	clrrwi	rSTR2, rSTR2, 2
+	stw	rWORD2_SHIFT, 28(r1)
+	cfi_offset(rWORD2_SHIFT, (28-64))
+	srwi	r0, rN, 4	/* Divide by 16 */
+	stw	rWORD4_SHIFT, 24(r1)
+	cfi_offset(rWORD4_SHIFT, (24-64))
+	andi.	r12, rN, 12	/* Get the W remainder */
+	stw	rWORD6_SHIFT, 20(r1)
+	cfi_offset(rWORD6_SHIFT, (20-64))
+	slwi	rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD6, 0(rSTR2)
+	lwzu	rWORD8, 4(rSTR2)
+#endif
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	clrlwi	rN, rN, 30
+	subfic	rSHR, rSHL, 32
+	slw	rWORD6_SHIFT, rWORD6, rSHL
 	beq	L(duP4)
-	mtctr	rTMP
-	bgt	cr1,L(duP3)
-	beq	cr1,L(duP2)
+	mtctr	r0
+	bgt	cr1, L(duP3)
+	beq	cr1, L(duP2)
 
 /* Remainder is 4 */
 	.align	4
 L(duP1):
-	srw	rG,rWORD8,rSHR
-	lwz	rWORD7,0(rSTR1)
-	slw	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP1x)
+	srw	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD7, 0(rSTR1)
+#endif
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP1x)
 L(duP1e):
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	srw	rA,rWORD2,rSHR
-	slw	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	lwz	rWORD3,8(rSTR1)
-	lwz	rWORD4,8(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	srw	rC,rWORD4,rSHR
-	slw	rF,rWORD4,rSHL
-	bne	cr5,L(duLcr5)
-	or	rWORD4,rC,rD
-	lwz	rWORD5,12(rSTR1)
-	lwz	rWORD6,12(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	srw	rE,rWORD6,rSHR
-	slw	rH,rWORD6,rSHL
-	bne	cr0,L(duLcr0)
-	or	rWORD6,rE,rF
-	cmplw	cr6,rWORD5,rWORD6
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	bne	cr5, L(duLcr5)
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
+	cmplw	cr6, rWORD5, rWORD6
 	b	L(duLoop3)
 	.align	4
 /* At this point we exit early with the first word compare
    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
 L(duP1x):
-	cmplw	cr5,rWORD7,rWORD8
-	slwi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmplw	cr7,rN,rSHR
+	cmplw	cr5, rWORD7, rWORD8
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srw	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 8 */
 	.align	4
 L(duP2):
-	srw	rE,rWORD8,rSHR
-	lwz	rWORD5,0(rSTR1)
-	or	rWORD6,rE,rH
-	slw	rH,rWORD8,rSHL
+	srw	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD5, 0(rSTR1)
+#endif
+	or	rWORD6, r0, rWORD6_SHIFT
+	slw	rWORD6_SHIFT, rWORD8, rSHL
 L(duP2e):
-	lwz	rWORD7,4(rSTR1)
-	lwz	rWORD8,4(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	srw	rG,rWORD8,rSHR
-	slw	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP2x)
-	lwz	rWORD1,8(rSTR1)
-	lwz	rWORD2,8(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	srw	rA,rWORD2,rSHR
-	slw	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	lwz	rWORD3,12(rSTR1)
-	lwz	rWORD4,12(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	bne	cr5,L(duLcr5)
-	srw	rC,rWORD4,rSHR
-	slw	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
-	addi	rSTR1,rSTR1,4
-	addi	rSTR2,rSTR2,4
-	cmplw	cr1,rWORD3,rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 4(rSTR1)
+	lwz	rWORD8, 4(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 8(rSTR1)
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 12(rSTR1)
+	lwz	rWORD4, 12(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	bne	cr5, L(duLcr5)
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	cmplw	cr1, rWORD3, rWORD4
 	b	L(duLoop2)
 	.align	4
 L(duP2x):
-	cmplw	cr5,rWORD7,rWORD8
-	addi	rSTR1,rSTR1,4
-	addi	rSTR2,rSTR2,4
-	bne	cr6,L(duLcr6)
-	slwi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmplw	cr7,rN,rSHR
+	cmplw	cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	bne	cr6, L(duLcr6)
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	lwz	rWORD2,4(rSTR2)
-	srw	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Remainder is 12 */
 	.align	4
 L(duP3):
-	srw	rC,rWORD8,rSHR
-	lwz	rWORD3,0(rSTR1)
-	slw	rF,rWORD8,rSHL
-	or	rWORD4,rC,rH
+	srw	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD3, 0(rSTR1)
+#endif
+	slw	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
 L(duP3e):
-	lwz	rWORD5,4(rSTR1)
-	lwz	rWORD6,4(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	srw	rE,rWORD6,rSHR
-	slw	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
-	lwz	rWORD7,8(rSTR1)
-	lwz	rWORD8,8(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr1,L(duLcr1)
-	srw	rG,rWORD8,rSHR
-	slw	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP3x)
-	lwz	rWORD1,12(rSTR1)
-	lwz	rWORD2,12(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	srw	rA,rWORD2,rSHR
-	slw	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	cmplw	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 4(rSTR1)
+	lwz	rWORD6, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 8(rSTR1)
+	lwz	rWORD8, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 12(rSTR1)
+	lwz	rWORD2, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(duLoop1)
 	.align	4
 L(duP3x):
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr1,L(duLcr1)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	slwi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmplw	cr7,rN,rSHR
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+#if 0
+/* Huh?  We've already branched on cr1!  */
+	bne	cr1, L(duLcr1)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	lwz	rWORD2,4(rSTR2)
-	srw	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Count is a multiple of 16, remainder is 0 */
 	.align	4
 L(duP4):
-	mtctr	rTMP
-	srw	rA,rWORD8,rSHR
-	lwz	rWORD1,0(rSTR1)
-	slw	rD,rWORD8,rSHL
-	or	rWORD2,rA,rH
+	mtctr	r0
+	srw	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+#endif
+	slw	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
 L(duP4e):
-	lwz	rWORD3,4(rSTR1)
-	lwz	rWORD4,4(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	srw	rC,rWORD4,rSHR
-	slw	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
-	lwz	rWORD5,8(rSTR1)
-	lwz	rWORD6,8(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	bne	cr0,L(duLcr0)
-	srw	rE,rWORD6,rSHR
-	slw	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
-	lwzu	rWORD7,12(rSTR1)
-	lwzu	rWORD8,12(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr1,L(duLcr1)
-	srw	rG,rWORD8,rSHR
-	slw	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	cmplw	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 4(rSTR1)
+	lwz	rWORD4, 4(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 8(rSTR1)
+	lwz	rWORD6, 8(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr7, L(duLcr7)
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 12(rSTR1)
+	lwzu	rWORD8, 12(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	cmplw	cr5, rWORD7, rWORD8
 	bdz	L(du24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
 	.align	4
 L(duLoop):
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	bne	cr6,L(duLcr6)
-	srw	rA,rWORD2,rSHR
-	slw	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
 L(duLoop1):
-	lwz	rWORD3,8(rSTR1)
-	lwz	rWORD4,8(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr5,L(duLcr5)
-	srw	rC,rWORD4,rSHR
-	slw	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
 L(duLoop2):
-	lwz	rWORD5,12(rSTR1)
-	lwz	rWORD6,12(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr0,L(duLcr0)
-	srw	rE,rWORD6,rSHR
-	slw	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr7, L(duLcr7)
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
 L(duLoop3):
-	lwzu	rWORD7,16(rSTR1)
-	lwzu	rWORD8,16(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	bne	cr1,L(duLcr1)
-	srw	rG,rWORD8,rSHR
-	slw	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 16(rSTR1)
+	lwzu	rWORD8, 16(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	bne	cr1, L(duLcr1)
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	bdnz	L(duLoop)
 
 L(duL4):
-	bne	cr1,L(duLcr1)
-	cmplw	cr1,rWORD3,rWORD4
-	bne	cr6,L(duLcr6)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr5,L(duLcr5)
-	cmplw	cr5,rWORD7,rWORD8
+#if 0
+/* Huh?  We've already branched on cr1!  */
+	bne	cr1, L(duLcr1)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	cmplw	cr5, rWORD7, rWORD8
 L(du44):
-	bne	cr0,L(duLcr0)
+	bne	cr7, L(duLcr7)
 L(du34):
-	bne	cr1,L(duLcr1)
+	bne	cr1, L(duLcr1)
 L(du24):
-	bne	cr6,L(duLcr6)
+	bne	cr6, L(duLcr6)
 L(du14):
-	slwi.	rN,rN,3
-	bne	cr5,L(duLcr5)
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
 /* At this point we have a remainder of 1 to 3 bytes to compare.  We use
    shift right to eliminate bits beyond the compare length.
+   This allows the use of word subtract to compute the final result.
 
    However it may not be safe to load rWORD2 which may be beyond the
    string length. So we compare the bit length of the remainder to
    the right shift count (rSHR). If the bit count is less than or equal
    we do not need to load rWORD2 (all significant bits are already in
-   rB).  */
-	cmplw	cr7,rN,rSHR
+   rWORD8_SHIFT).  */
+	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	lwz	rWORD2,4(rSTR2)
-	srw	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	.align	4
 L(dutrim):
-	lwz	rWORD1,4(rSTR1)
-	lwz	r31,48(1)
-	subfic	rN,rN,32	/* Shift count is 32 - (rN * 8).  */
-	or	rWORD2,rA,rB
-	lwz	r30,44(1)
-	lwz	r29,40(r1)
-	srw	rWORD1,rWORD1,rN
-	srw	rWORD2,rWORD2,rN
-	lwz	r28,36(r1)
-	lwz	r27,32(r1)
-	cmplw	rWORD1,rWORD2
-	li	rRTN,0
-	beq	L(dureturn26)
-	li	rRTN,1
-	bgt	L(dureturn26)
-	li	rRTN,-1
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+#else
+	lwz	rWORD1, 4(rSTR1)
+#endif
+	lwz	rWORD8, 48(r1)
+	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
+	or	rWORD2, r0, rWORD8_SHIFT
+	lwz	rWORD7, 44(r1)
+	lwz	rSHL, 40(r1)
+	srw	rWORD1, rWORD1, rN
+	srw	rWORD2, rWORD2, rN
+	lwz	rSHR, 36(r1)
+	lwz	rWORD8_SHIFT, 32(r1)
+	sub	rRTN, rWORD1, rWORD2
 	b	L(dureturn26)
 	.align	4
-L(duLcr0):
-	lwz	r31,48(1)
-	lwz	r30,44(1)
-	li	rRTN,1
-	bgt	cr0,L(dureturn29)
-	lwz	r29,40(r1)
-	lwz	r28,36(r1)
-	li	rRTN,-1
+L(duLcr7):
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr7, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr1):
-	lwz	r31,48(1)
-	lwz	r30,44(1)
-	li	rRTN,1
-	bgt	cr1,L(dureturn29)
-	lwz	r29,40(r1)
-	lwz	r28,36(r1)
-	li	rRTN,-1
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr1, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr6):
-	lwz	r31,48(1)
-	lwz	r30,44(1)
-	li	rRTN,1
-	bgt	cr6,L(dureturn29)
-	lwz	r29,40(r1)
-	lwz	r28,36(r1)
-	li	rRTN,-1
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr6, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr5):
-	lwz	r31,48(1)
-	lwz	r30,44(1)
-	li	rRTN,1
-	bgt	cr5,L(dureturn29)
-	lwz	r29,40(r1)
-	lwz	r28,36(r1)
-	li	rRTN,-1
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr5, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	3
 L(duZeroReturn):
-	li	rRTN,0
+	li	rRTN, 0
 	.align	4
 L(dureturn):
-	lwz	r31,48(1)
-	lwz	r30,44(1)
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 L(dureturn29):
-	lwz	r29,40(r1)
-	lwz	r28,36(r1)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 L(dureturn27):
-	lwz	r27,32(r1)
+	lwz	rWORD8_SHIFT, 32(r1)
 L(dureturn26):
-	lwz	r26,28(r1)
+	lwz	rWORD2_SHIFT, 28(r1)
 L(dureturn25):
-	lwz	r25,24(r1)
-	lwz	r24,20(r1)
-	lwz	1,0(1)
+	lwz	rWORD4_SHIFT, 24(r1)
+	lwz	rWORD6_SHIFT, 20(r1)
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	blr
 END (memcmp)
+
 libc_hidden_builtin_def (memcmp)
-weak_alias (memcmp,bcmp)
+weak_alias (memcmp, bcmp)
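
The rewrite above keeps the word-at-a-time structure of the 32-bit POWER4 memcmp and, when built for little endian, swaps every plain load for a byte-reversed one (lwbrx), so an unsigned compare of the loaded words still matches memcmp's byte order.  A minimal C sketch of that idea follows; the function name, the bswap builtin and the overall shape are illustrative assumptions, not code from the patch (the real routine also handles mutual misalignment, unrolls by four words and drives the loop with the CTR as seen above).

#include <stdint.h>
#include <string.h>

/* Compare word by word; on a little-endian host the bytes of each
   loaded word are reversed first (the job lwbrx does during the load),
   so the unsigned word comparison agrees with byte-wise memcmp.  */
static int
memcmp_word_sketch (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1;
  const unsigned char *p2 = s2;

  while (n >= 4)
    {
      uint32_t w1, w2;
      memcpy (&w1, p1, 4);
      memcpy (&w2, p2, 4);
#if defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      w1 = __builtin_bswap32 (w1);	/* GCC/Clang builtin standing in for lwbrx.  */
      w2 = __builtin_bswap32 (w2);
#endif
      if (w1 != w2)
	return w1 > w2 ? 1 : -1;
      p1 += 4;
      p2 += 4;
      n -= 4;
    }
  while (n-- > 0)	/* Trailing 1-3 bytes.  */
    {
      if (*p1 != *p2)
	return *p1 > *p2 ? 1 : -1;
      p1++;
      p2++;
    }
  return 0;
}

Loading byte-reversed and comparing whole words is what lets the body of the compare loop stay identical on both endiannesses; only the loads differ.
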
diff --git a/sysdeps/powerpc/powerpc64/power4/memcmp.S b/sysdeps/powerpc/powerpc64/power4/memcmp.S
index 69caedc..80d67c9 100644
--- a/sysdeps/powerpc/powerpc64/power4/memcmp.S
+++ b/sysdeps/powerpc/powerpc64/power4/memcmp.S
@@ -1,4 +1,4 @@
-/* Optimized strcmp implementation for PowerPC64.
+/* Optimized memcmp implementation for PowerPC64.
    Copyright (C) 2003-2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -18,13 +18,14 @@
 
 #include <sysdep.h>
 
-/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
+/* int [r3] memcmp (const char *s1 [r3],
+		    const char *s2 [r4],
+		    size_t size [r5])  */
 
 	.machine power4
 EALIGN (memcmp, 4, 0)
 	CALL_MCOUNT 3
 
-#define rTMP	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -35,107 +36,127 @@ EALIGN (memcmp, 4, 0)
 #define rWORD4	r9	/* next word in s2 */
 #define rWORD5	r10	/* next word in s1 */
 #define rWORD6	r11	/* next word in s2 */
-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
 #define rWORD7	r30	/* next word in s1 */
 #define rWORD8	r31	/* next word in s2 */
 
-	xor	rTMP, rSTR2, rSTR1
+	xor	r0, rSTR2, rSTR1
 	cmpldi	cr6, rN, 0
 	cmpldi	cr1, rN, 12
-	clrldi.	rTMP, rTMP, 61
-	clrldi	rBITDIF, rSTR1, 61
-	cmpldi	cr5, rBITDIF, 0
+	clrldi.	r0, r0, 61
+	clrldi	r12, rSTR1, 61
+	cmpldi	cr5, r12, 0
 	beq-	cr6, L(zeroLength)
-	dcbt	0,rSTR1
-	dcbt	0,rSTR2
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
 /* If less than 8 bytes or not aligned, use the unaligned
    byte loop.  */
 	blt	cr1, L(bytealigned)
-	std	rWORD8,-8(r1)
-	cfi_offset(rWORD8,-8)
-	std	rWORD7,-16(r1)
-	cfi_offset(rWORD7,-16)
+	std	rWORD8, -8(r1)
+	cfi_offset(rWORD8, -8)
+	std	rWORD7, -16(r1)
+	cfi_offset(rWORD7, -16)
 	bne	L(unaligned)
 /* At this point we know both strings have the same alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
-   aligned and can perform the DWaligned loop.
+   of r12 to 0.  If r12 == 0 then we are already double word
+   aligned and can perform the DW aligned loop.
 
    Otherwise we know the two strings have the same alignment (but not
-   yet DW).  So we can force the string addresses to the next lower DW
-   boundary and special case this first DW word using shift left to
+   yet DW).  So we force the string addresses to the next lower DW
+   boundary and special case this first DW using shift left to
    eliminate bits preceding the first byte.  Since we want to join the
-   normal (DWaligned) compare loop, starting at the second double word,
+   normal (DW aligned) compare loop, starting at the second double word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first DW. This insures that the loop count is
-   correct and the first DW (shifted) is in the expected resister pair.  */
-	.align 4
+   versioning for the first DW. This ensures that the loop count is
+   correct and the first DW (shifted) is in the expected register pair.  */
+	.align	4
 L(samealignment):
 	clrrdi	rSTR1, rSTR1, 3
 	clrrdi	rSTR2, rSTR2, 3
 	beq	cr5, L(DWaligned)
-	add	rN, rN, rBITDIF
-	sldi	r11, rBITDIF, 3
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	srdi	r0, rN, 5	/* Divide by 32 */
+	andi.	r12, rN, 24	/* Get the DW remainder */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 0(rSTR1)
 	ld	rWORD2, 0(rSTR2)
-	cmpldi	cr1, rBITDIF, 16
+#endif
+	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	clrldi	rN, rN, 61
 	beq	L(dPs4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 	bgt	cr1, L(dPs3)
 	beq	cr1, L(dPs2)
 
 /* Remainder is 8 */
-	.align 3
+	.align	3
 L(dsP1):
-	sld	rWORD5, rWORD1, r11
-	sld	rWORD6, rWORD2, r11
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
 	cmpld	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
 /* Do something useful in this cycle since we have to branch anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(dP1e)
 /* Remainder is 16 */
-	.align 4
+	.align	4
 L(dPs2):
-	sld	rWORD5, rWORD1, r11
-	sld	rWORD6, rWORD2, r11
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
 /* Do something useful in this cycle since we have to branch anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD7, 8(rSTR1)
 	ld	rWORD8, 8(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
 	b	L(dP2e)
 /* Remainder is 24 */
-	.align 4
+	.align	4
 L(dPs3):
-	sld	rWORD3, rWORD1, r11
-	sld	rWORD4, rWORD2, r11
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD2, rWORD6
 	cmpld	cr1, rWORD3, rWORD4
 	b	L(dP3e)
 /* Count is a multiple of 32, remainder is 0 */
-	.align 4
+	.align	4
 L(dPs4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	sld	rWORD1, rWORD1, r11
-	sld	rWORD2, rWORD2, r11
-	cmpld	cr0, rWORD1, rWORD2
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD2, rWORD6
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(dP4e)
 
 /* At this point we know both strings are double word aligned and the
    compare length is at least 8 bytes.  */
-	.align 4
+	.align	4
 L(DWaligned):
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	cmpldi	cr1, rBITDIF, 16
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	srdi	r0, rN, 5	/* Divide by 32 */
+	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	clrldi	rN, rN, 61
 	beq	L(dP4)
@@ -143,174 +164,343 @@ L(DWaligned):
 	beq	cr1, L(dP2)
 
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(dP1):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
    (8-15 byte compare), we want to use only volatile registers.  This
    means we can avoid restoring non-volatile registers since we did not
    change any on the early exit path.  The key here is the non-early
    exit path only cares about the condition code (cr5), not about which
    register pair was used.  */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 0(rSTR1)
 	ld	rWORD6, 0(rSTR2)
+#endif
 	cmpld	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 L(dP1e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 16(rSTR1)
 	ld	rWORD4, 16(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 24(rSTR1)
 	ld	rWORD6, 24(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(dLcr5)
-	bne	cr0, L(dLcr0)
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
 
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ldu	rWORD7, 32(rSTR1)
 	ldu	rWORD8, 32(rSTR2)
+#endif
 	bne	cr1, L(dLcr1)
 	cmpld	cr5, rWORD7, rWORD8
 	bdnz	L(dLoop)
 	bne	cr6, L(dLcr6)
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	.align 3
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	.align	3
 L(dP1x):
 	sldi.	r12, rN, 3
-	bne	cr5, L(dLcr5)
+	bne	cr5, L(dLcr5x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Remainder is 16 */
-	.align 4
+	.align	4
 L(dP2):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 0(rSTR1)
 	ld	rWORD6, 0(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD7, 8(rSTR1)
 	ld	rWORD8, 8(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
 L(dP2e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 16(rSTR1)
 	ld	rWORD2, 16(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 24(rSTR1)
 	ld	rWORD4, 24(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
+#endif
 	bne	cr6, L(dLcr6)
 	bne	cr5, L(dLcr5)
 	b	L(dLoop2)
 /* Again we are on a early exit path (16-23 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
-	.align 4
+	.align	4
 L(dP2x):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 8(rSTR1)
 	ld	rWORD4, 8(rSTR2)
-	cmpld	cr5, rWORD3, rWORD4
+#endif
+	cmpld	cr1, rWORD3, rWORD4
 	sldi.	r12, rN, 3
-	bne	cr6, L(dLcr6)
+	bne	cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-	bne	cr5, L(dLcr5)
+#endif
+	bne	cr1, L(dLcr1x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Remainder is 24 */
-	.align 4
+	.align	4
 L(dP3):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 0(rSTR1)
 	ld	rWORD4, 0(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
 L(dP3e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 8(rSTR1)
 	ld	rWORD6, 8(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD7, 16(rSTR1)
 	ld	rWORD8, 16(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 24(rSTR1)
 	ld	rWORD2, 24(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
+#endif
 	bne	cr1, L(dLcr1)
 	bne	cr6, L(dLcr6)
 	b	L(dLoop1)
 /* Again we are on a early exit path (24-31 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
-	.align 4
+	.align	4
 L(dP3x):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 16(rSTR1)
 	ld	rWORD2, 16(rSTR2)
-	cmpld	cr5, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	sldi.	r12, rN, 3
-	bne	cr1, L(dLcr1)
+	bne	cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-	bne	cr6, L(dLcr6)
+#endif
+	bne	cr6, L(dLcr6x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	bne	cr5, L(dLcr5)
+	bne	cr7, L(dLcr7x)
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Count is a multiple of 32, remainder is 0 */
-	.align 4
+	.align	4
 L(dP4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 0(rSTR1)
 	ld	rWORD2, 0(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 L(dP4e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 8(rSTR1)
 	ld	rWORD4, 8(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 16(rSTR1)
 	ld	rWORD6, 16(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ldu	rWORD7, 24(rSTR1)
 	ldu	rWORD8, 24(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 	bne	cr1, L(dLcr1)
 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
-	.align 4
+	.align	4
 L(dLoop):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(dLcr6)
 L(dLoop1):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 16(rSTR1)
 	ld	rWORD4, 16(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(dLcr5)
 L(dLoop2):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 24(rSTR1)
 	ld	rWORD6, 24(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(dLoop3):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ldu	rWORD7, 32(rSTR1)
 	ldu	rWORD8, 32(rSTR2)
+#endif
 	bne-	cr1, L(dLcr1)
-	cmpld	cr0, rWORD1, rWORD2
+	cmpld	cr7, rWORD1, rWORD2
 	bdnz+	L(dLoop)
 
 L(dL4):
@@ -320,7 +510,7 @@ L(dL4):
 	bne	cr5, L(dLcr5)
 	cmpld	cr5, rWORD7, rWORD8
 L(d44):
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(d34):
 	bne	cr1, L(dLcr1)
 L(d24):
@@ -329,60 +519,74 @@ L(d14):
 	sldi.	r12, rN, 3
 	bne	cr5, L(dLcr5)
 L(d04):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	beq	L(zeroLength)
 /* At this point we have a remainder of 1 to 7 bytes to compare.  Since
    we are aligned it is safe to load the whole double word, and use
    shift right double to eliminate bits beyond the compare length.  */
 L(d00):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
+#endif
 	srd	rWORD1, rWORD1, rN
 	srd	rWORD2, rWORD2, rN
-	cmpld	cr5, rWORD1, rWORD2
- 	bne	cr5, L(dLcr5x)
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr7, L(dLcr7x)
 	li	rRTN, 0
 	blr
-	.align 4
-L(dLcr0):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+
+	.align	4
+L(dLcr7):
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr7x):
 	li	rRTN, 1
-	bgtlr	cr0
+	bgtlr	cr7
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
 L(dLcr1):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr1x):
 	li	rRTN, 1
 	bgtlr	cr1
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
 L(dLcr6):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr6x):
 	li	rRTN, 1
 	bgtlr	cr6
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
 L(dLcr5):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 L(dLcr5x):
 	li	rRTN, 1
 	bgtlr	cr5
 	li	rRTN, -1
 	blr
 
-	.align 4
+	.align	4
 L(bytealigned):
-	mtctr   rN	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
+#if 0
+/* Huh?  We've already branched on cr6!  */
 	beq-	cr6, L(zeroLength)
+#endif
 
 /* We need to prime this loop.  This loop is swing modulo scheduled
    to avoid pipe delays.  The dependent instruction latencies (load to
@@ -397,7 +601,7 @@ L(bytealigned):
 	lbz	rWORD1, 0(rSTR1)
 	lbz	rWORD2, 0(rSTR2)
 	bdz-	L(b11)
-	cmpld	cr0, rWORD1, rWORD2
+	cmpld	cr7, rWORD1, rWORD2
 	lbz	rWORD3, 1(rSTR1)
 	lbz	rWORD4, 1(rSTR2)
 	bdz-	L(b12)
@@ -405,11 +609,11 @@ L(bytealigned):
 	lbzu	rWORD5, 2(rSTR1)
 	lbzu	rWORD6, 2(rSTR2)
 	bdz-	L(b13)
-	.align 4
+	.align	4
 L(bLoop):
 	lbzu	rWORD1, 1(rSTR1)
 	lbzu	rWORD2, 1(rSTR2)
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 
 	cmpld	cr6, rWORD5, rWORD6
 	bdz-	L(b3i)
@@ -418,7 +622,7 @@ L(bLoop):
 	lbzu	rWORD4, 1(rSTR2)
 	bne-	cr1, L(bLcr1)
 
-	cmpld	cr0, rWORD1, rWORD2
+	cmpld	cr7, rWORD1, rWORD2
 	bdz-	L(b2i)
 
 	lbzu	rWORD5, 1(rSTR1)
@@ -435,23 +639,23 @@ L(bLoop):
    tested.  In this case we must complete the pending operations
    before returning.  */
 L(b1i):
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 	bne-	cr1, L(bLcr1)
 	b	L(bx56)
-	.align 4
+	.align	4
 L(b2i):
 	bne-	cr6, L(bLcr6)
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 	b	L(bx34)
-	.align 4
+	.align	4
 L(b3i):
 	bne-	cr1, L(bLcr1)
 	bne-	cr6, L(bLcr6)
 	b	L(bx12)
-	.align 4
-L(bLcr0):
+	.align	4
+L(bLcr7):
 	li	rRTN, 1
-	bgtlr	cr0
+	bgtlr	cr7
 	li	rRTN, -1
 	blr
 L(bLcr1):
@@ -466,14 +670,14 @@ L(bLcr6):
 	blr
 
 L(b13):
-	bne-	cr0, L(bx12)
+	bne-	cr7, L(bx12)
 	bne-	cr1, L(bx34)
 L(bx56):
 	sub	rRTN, rWORD5, rWORD6
 	blr
 	nop
 L(b12):
-	bne-	cr0, L(bx12)
+	bne-	cr7, L(bx12)
 L(bx34):
 	sub	rRTN, rWORD3, rWORD4
 	blr
@@ -481,101 +685,106 @@ L(b11):
 L(bx12):
 	sub	rRTN, rWORD1, rWORD2
 	blr
-	.align 4
-L(zeroLengthReturn):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	.align	4
 L(zeroLength):
 	li	rRTN, 0
 	blr
 
-	.align 4
+	.align	4
 /* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is double word
+   of r12 to 0.  If r12 == 0 then rStr1 is double word
    aligned and can perform the DWunaligned loop.
 
    Otherwise we know that rSTR1 is not already DW aligned yet.
    So we can force the string addresses to the next lower DW
-   boundary and special case this first DW word using shift left to
+   boundary and special case this first DW using shift left to
    eliminate bits preceding the first byte.  Since we want to join the
    normal (DWaligned) compare loop, starting at the second double word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first DW. This insures that the loop count is
+   versioning for the first DW. This ensures that the loop count is
    correct and the first DW (shifted) is in the expected register pair.  */
-#define rSHL	r29	/* Unaligned shift left count.  */
-#define rSHR	r28	/* Unaligned shift right count.  */
-#define rB		r27	/* Left rotation temp for rWORD2.  */
-#define rD		r26	/* Left rotation temp for rWORD4.  */
-#define rF		r25	/* Left rotation temp for rWORD6.  */
-#define rH		r24	/* Left rotation temp for rWORD8.  */
-#define rA		r0	/* Right rotation temp for rWORD2.  */
-#define rC		r12	/* Right rotation temp for rWORD4.  */
-#define rE		r0	/* Right rotation temp for rWORD6.  */
-#define rG		r12	/* Right rotation temp for rWORD8.  */
+#define rSHL		r29	/* Unaligned shift left count.  */
+#define rSHR		r28	/* Unaligned shift right count.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
 L(unaligned):
-	std	r29,-24(r1)
-	cfi_offset(r29,-24)
+	std	rSHL, -24(r1)
+	cfi_offset(rSHL, -24)
 	clrldi	rSHL, rSTR2, 61
 	beq-	cr6, L(duzeroLength)
-	std	r28,-32(r1)
-	cfi_offset(r28,-32)
+	std	rSHR, -32(r1)
+	cfi_offset(rSHR, -32)
 	beq	cr5, L(DWunaligned)
-	std	r27,-40(r1)
-	cfi_offset(r27,-40)
-/* Adjust the logical start of rSTR2 ro compensate for the extra bits
+	std	rWORD8_SHIFT, -40(r1)
+	cfi_offset(rWORD8_SHIFT, -40)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
    in the 1st rSTR1 DW.  */
-	sub	r27, rSTR2, rBITDIF
+	sub	rWORD8_SHIFT, rSTR2, r12
 /* But do not attempt to address the DW before that DW that contains
    the actual start of rSTR2.  */
 	clrrdi	rSTR2, rSTR2, 3
-	std	r26,-48(r1)
-	cfi_offset(r26,-48)
-/* Compute the left/right shift counts for the unalign rSTR2,
+	std	rWORD2_SHIFT, -48(r1)
+	cfi_offset(rWORD2_SHIFT, -48)
+/* Compute the left/right shift counts for the unaligned rSTR2,
    compensating for the logical (DW aligned) start of rSTR1.  */
-	clrldi	rSHL, r27, 61
+	clrldi	rSHL, rWORD8_SHIFT, 61
 	clrrdi	rSTR1, rSTR1, 3
-	std	r25,-56(r1)
-	cfi_offset(r25,-56)
+	std	rWORD4_SHIFT, -56(r1)
+	cfi_offset(rWORD4_SHIFT, -56)
 	sldi	rSHL, rSHL, 3
-	cmpld	cr5, r27, rSTR2
-	add	rN, rN, rBITDIF
-	sldi	r11, rBITDIF, 3
-	std	r24,-64(r1)
-	cfi_offset(r24,-64)
+	cmpld	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	std	rWORD6_SHIFT, -64(r1)
+	cfi_offset(rWORD6_SHIFT, -64)
 	subfic	rSHR, rSHL, 64
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
+	srdi	r0, rN, 5	/* Divide by 32 */
+	andi.	r12, rN, 24	/* Get the DW remainder */
 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
    this special case those bits may be discarded anyway.  Also we
    must avoid loading a DW where none of the bits are part of rSTR2 as
    this may cross a page boundary and cause a page fault.  */
 	li	rWORD8, 0
 	blt	cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD8, 0(rSTR2)
-	la	rSTR2, 8(rSTR2)
+	addi	rSTR2, rSTR2, 8
+#endif
 	sld	rWORD8, rWORD8, rSHL
 
 L(dus0):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 0(rSTR1)
 	ld	rWORD2, 0(rSTR2)
-	cmpldi	cr1, rBITDIF, 16
+#endif
+	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
-	srd	rG, rWORD2, rSHR
+	srd	r12, rWORD2, rSHR
 	clrldi	rN, rN, 61
 	beq	L(duPs4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	or	rWORD8, rG, rWORD8
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	or	rWORD8, r12, rWORD8
 	bgt	cr1, L(duPs3)
 	beq	cr1, L(duPs2)
 
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(dusP1):
-	sld	rB, rWORD2, rSHL
-	sld	rWORD7, rWORD1, r11
-	sld	rWORD8, rWORD8, r11
+	sld	rWORD8_SHIFT, rWORD2, rSHL
+	sld	rWORD7, rWORD1, rWORD6
+	sld	rWORD8, rWORD8, rWORD6
 	bge	cr7, L(duP1e)
 /* At this point we exit early with the first double word compare
    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
@@ -585,95 +794,133 @@ L(dusP1):
 	bne	cr5, L(duLcr5)
 	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
-	.align 4
+	.align	4
 L(duPs2):
-	sld	rH, rWORD2, rSHL
-	sld	rWORD5, rWORD1, r11
-	sld	rWORD6, rWORD8, r11
+	sld	rWORD6_SHIFT, rWORD2, rSHL
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD8, rWORD6
 	b	L(duP2e)
 /* Remainder is 24 */
-	.align 4
+	.align	4
 L(duPs3):
-	sld	rF, rWORD2, rSHL
-	sld	rWORD3, rWORD1, r11
-	sld	rWORD4, rWORD8, r11
+	sld	rWORD4_SHIFT, rWORD2, rSHL
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD8, rWORD6
 	b	L(duP3e)
 /* Count is a multiple of 32, remainder is 0 */
-	.align 4
+	.align	4
 L(duPs4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	or	rWORD8, rG, rWORD8
-	sld	rD, rWORD2, rSHL
-	sld	rWORD1, rWORD1, r11
-	sld	rWORD2, rWORD8, r11
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	or	rWORD8, r12, rWORD8
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD8, rWORD6
 	b	L(duP4e)
 
 /* At this point we know rSTR1 is double word aligned and the
    compare length is at least 8 bytes.  */
-	.align 4
+	.align	4
 L(DWunaligned):
-	std	r27,-40(r1)
-	cfi_offset(r27,-40)
+	std	rWORD8_SHIFT, -40(r1)
+	cfi_offset(rWORD8_SHIFT, -40)
 	clrrdi	rSTR2, rSTR2, 3
-	std	r26,-48(r1)
-	cfi_offset(r26,-48)
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	std	r25,-56(r1)
-	cfi_offset(r25,-56)
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
-	std	r24,-64(r1)
-	cfi_offset(r24,-64)
+	std	rWORD2_SHIFT, -48(r1)
+	cfi_offset(rWORD2_SHIFT, -48)
+	srdi	r0, rN, 5	/* Divide by 32 */
+	std	rWORD4_SHIFT, -56(r1)
+	cfi_offset(rWORD4_SHIFT, -56)
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	std	rWORD6_SHIFT, -64(r1)
+	cfi_offset(rWORD6_SHIFT, -64)
 	sldi	rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD6, 0(rSTR2)
 	ldu	rWORD8, 8(rSTR2)
-	cmpldi	cr1, rBITDIF, 16
+#endif
+	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	clrldi	rN, rN, 61
 	subfic	rSHR, rSHL, 64
-	sld	rH, rWORD6, rSHL
+	sld	rWORD6_SHIFT, rWORD6, rSHL
 	beq	L(duP4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 	bgt	cr1, L(duP3)
 	beq	cr1, L(duP2)
 
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(duP1):
-	srd	rG, rWORD8, rSHR
+	srd	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
 	ld	rWORD7, 0(rSTR1)
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+#endif
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP1x)
 L(duP1e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 16(rSTR1)
 	ld	rWORD4, 16(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
 	bne	cr5, L(duLcr5)
-	or	rWORD4, rC, rD
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 24(rSTR1)
 	ld	rWORD6, 24(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	bne	cr0, L(duLcr0)
-	or	rWORD6, rE, rF
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
 	cmpld	cr6, rWORD5, rWORD6
 	b	L(duLoop3)
-	.align 4
+	.align	4
 /* At this point we exit early with the first double word compare
    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
@@ -683,186 +930,321 @@ L(duP1x):
 	bne	cr5, L(duLcr5)
 	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
-	.align 4
+	.align	4
 L(duP2):
-	srd	rE, rWORD8, rSHR
+	srd	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
 	ld	rWORD5, 0(rSTR1)
-	or	rWORD6, rE, rH
-	sld	rH, rWORD8, rSHL
+#endif
+	or	rWORD6, r0, rWORD6_SHIFT
+	sld	rWORD6_SHIFT, rWORD8, rSHL
 L(duP2e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD7, 8(rSTR1)
 	ld	rWORD8, 8(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 16(rSTR1)
 	ld	rWORD2, 16(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 24(rSTR1)
 	ld	rWORD4, 24(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	bne	cr5, L(duLcr5)
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
+#endif
 	cmpld	cr1, rWORD3, rWORD4
 	b	L(duLoop2)
-	.align 4
+	.align	4
 L(duP2x):
 	cmpld	cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
+#endif
 	bne	cr6, L(duLcr6)
 	sldi.	rN, rN, 3
 	bne	cr5, L(duLcr5)
 	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Remainder is 24 */
-	.align 4
+	.align	4
 L(duP3):
-	srd	rC, rWORD8, rSHR
+	srd	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
 	ld	rWORD3, 0(rSTR1)
-	sld	rF, rWORD8, rSHL
-	or	rWORD4, rC, rH
+#endif
+	sld	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
 L(duP3e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 8(rSTR1)
 	ld	rWORD6, 8(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD7, 16(rSTR1)
 	ld	rWORD8, 16(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 24(rSTR1)
 	ld	rWORD2, 24(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(duLoop1)
-	.align 4
+	.align	4
 L(duP3x):
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
+#endif
+#if 0
+/* Huh?  We've already branched on cr1!  */
 	bne	cr1, L(duLcr1)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
 	sldi.	rN, rN, 3
 	bne	cr5, L(duLcr5)
 	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Count is a multiple of 32, remainder is 0 */
-	.align 4
+	.align	4
 L(duP4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	srd	rA, rWORD8, rSHR
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	srd	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
 	ld	rWORD1, 0(rSTR1)
-	sld	rD, rWORD8, rSHL
-	or	rWORD2, rA, rH
+#endif
+	sld	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
 L(duP4e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 8(rSTR1)
 	ld	rWORD4, 8(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 16(rSTR1)
 	ld	rWORD6, 16(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
-	bne	cr0, L(duLcr0)
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ldu	rWORD7, 24(rSTR1)
 	ldu	rWORD8, 24(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	cmpld	cr5, rWORD7, rWORD8
 	bdz-	L(du24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
-	.align 4
+	.align	4
 L(duLoop):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
 L(duLoop1):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 16(rSTR1)
 	ld	rWORD4, 16(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(duLcr5)
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
 L(duLoop2):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 24(rSTR1)
 	ld	rWORD6, 24(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
-	bne	cr0, L(duLcr0)
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
 L(duLoop3):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ldu	rWORD7, 32(rSTR1)
 	ldu	rWORD8, 32(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	bne-	cr1, L(duLcr1)
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	bdnz+	L(duLoop)
 
 L(duL4):
+#if 0
+/* Huh?  We've already branched on cr1!  */
 	bne	cr1, L(duLcr1)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(duLcr5)
 	cmpld	cr5, rWORD7, rWORD8
 L(du44):
-	bne	cr0, L(duLcr0)
+	bne	cr7, L(duLcr7)
 L(du34):
 	bne	cr1, L(duLcr1)
 L(du24):
@@ -872,103 +1254,110 @@ L(du14):
 	bne	cr5, L(duLcr5)
 /* At this point we have a remainder of 1 to 7 bytes to compare.  We use
    shift right double to eliminate bits beyond the compare length.
-   This allows the use of double word subtract to compute the final
-   result.
 
    However it may not be safe to load rWORD2 which may be beyond the
    string length. So we compare the bit length of the remainder to
    the right shift count (rSHR). If the bit count is less than or equal
    we do not need to load rWORD2 (all significant bits are already in
-   rB).  */
+   rWORD8_SHIFT).  */
 	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
-	.align 4
+#endif
+	srd	r0, rWORD2, rSHR
+	.align	4
 L(dutrim):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+#else
 	ld	rWORD1, 8(rSTR1)
-	ld	rWORD8,-8(r1)
+#endif
+	ld	rWORD8, -8(r1)
 	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
-	or	rWORD2, rA, rB
-	ld	rWORD7,-16(r1)
-	ld	r29,-24(r1)
+	or	rWORD2, r0, rWORD8_SHIFT
+	ld	rWORD7, -16(r1)
+	ld	rSHL, -24(r1)
 	srd	rWORD1, rWORD1, rN
 	srd	rWORD2, rWORD2, rN
-	ld	r28,-32(r1)
-	ld	r27,-40(r1)
+	ld	rSHR, -32(r1)
+	ld	rWORD8_SHIFT, -40(r1)
 	li	rRTN, 0
-	cmpld	cr0, rWORD1, rWORD2
-	ld	r26,-48(r1)
-	ld	r25,-56(r1)
- 	beq	cr0, L(dureturn24)
+	cmpld	cr7, rWORD1, rWORD2
+	ld	rWORD2_SHIFT, -48(r1)
+	ld	rWORD4_SHIFT, -56(r1)
+	beq	cr7, L(dureturn24)
 	li	rRTN, 1
-	ld	r24,-64(r1)
-	bgtlr	cr0
+	ld	rWORD6_SHIFT, -64(r1)
+	bgtlr	cr7
 	li	rRTN, -1
 	blr
-	.align 4
-L(duLcr0):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	.align	4
+L(duLcr7):
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	li	rRTN, 1
-	bgt	cr0, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	bgt	cr7, L(dureturn29)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr1):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	li	rRTN, 1
 	bgt	cr1, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr6):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	li	rRTN, 1
 	bgt	cr6, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr5):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	li	rRTN, 1
 	bgt	cr5, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
 	.align	3
 L(duZeroReturn):
-	li	rRTN,0
+	li	rRTN, 0
 	.align	4
 L(dureturn):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 L(dureturn29):
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 L(dureturn27):
-	ld	r27,-40(r1)
+	ld	rWORD8_SHIFT, -40(r1)
 L(dureturn26):
-	ld	r26,-48(r1)
+	ld	rWORD2_SHIFT, -48(r1)
 L(dureturn25):
-	ld	r25,-56(r1)
+	ld	rWORD4_SHIFT, -56(r1)
 L(dureturn24):
-	ld	r24,-64(r1)
+	ld	rWORD6_SHIFT, -64(r1)
 	blr
 L(duzeroLength):
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 END (memcmp)
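
The little-endian blocks in the memcmp diff above all follow one pattern: the plain ld/ldu loads become ldbrx, the byte-reversed indexed load, so each doubleword is held in big-endian byte order regardless of host endianness, and the pointers are bumped explicitly with addi because the byte-reversed loads have no update form.  A rough C sketch of why byte-reversed loads keep the word-at-a-time compare correct (illustrative only; the helper name cmp8 is made up, and __builtin_bswap64 stands in for ldbrx):

#include <stdint.h>
#include <string.h>

/* Compare 8 bytes with memcmp ordering using a single word compare.
   A natively loaded word on little-endian has the first string byte in
   the least significant position, so a plain unsigned compare would
   weight the last byte most heavily.  Reversing the bytes on load
   (what ldbrx does) restores lexicographic order.  */
static int
cmp8 (const unsigned char *a, const unsigned char *b)
{
  uint64_t wa, wb;
  memcpy (&wa, a, 8);
  memcpy (&wb, b, 8);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  wa = __builtin_bswap64 (wa);
  wb = __builtin_bswap64 (wb);
#endif
  return wa < wb ? -1 : wa > wb ? 1 : 0;
}
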
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
index f190c64..6851cdc 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -23,10 +23,9 @@
 		    size_t size [r5])  */
 
 	.machine power7
-EALIGN (memcmp,4,0)
+EALIGN (memcmp, 4, 0)
 	CALL_MCOUNT 3
 
-#define rTMP	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -37,354 +36,557 @@ EALIGN (memcmp,4,0)
 #define rWORD4	r9	/* next word in s2 */
 #define rWORD5	r10	/* next word in s1 */
 #define rWORD6	r11	/* next word in s2 */
-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
 #define rWORD7	r30	/* next word in s1 */
 #define rWORD8	r31	/* next word in s2 */
 
-	xor	rTMP,rSTR2,rSTR1
-	cmpldi	cr6,rN,0
-	cmpldi	cr1,rN,12
-	clrldi.	rTMP,rTMP,61
-	clrldi	rBITDIF,rSTR1,61
-	cmpldi	cr5,rBITDIF,0
-	beq-	cr6,L(zeroLength)
-	dcbt	0,rSTR1
-	dcbt	0,rSTR2
+	xor	r0, rSTR2, rSTR1
+	cmpldi	cr6, rN, 0
+	cmpldi	cr1, rN, 12
+	clrldi.	r0, r0, 61
+	clrldi	r12, rSTR1, 61
+	cmpldi	cr5, r12, 0
+	beq-	cr6, L(zeroLength)
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
 /* If less than 8 bytes or not aligned, use the unaligned
    byte loop.  */
-	blt	cr1,L(bytealigned)
-	std	rWORD8,-8(r1)
-	cfi_offset(rWORD8,-8)
-	std	rWORD7,-16(r1)
-	cfi_offset(rWORD7,-16)
+	blt	cr1, L(bytealigned)
+	std	rWORD8, -8(r1)
+	cfi_offset(rWORD8, -8)
+	std	rWORD7, -16(r1)
+	cfi_offset(rWORD7, -16)
 	bne	L(unaligned)
 /* At this point we know both strings have the same alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
-   aligned and can perform the DWaligned loop.
+   of r12 to 0.  If r12 == 0 then we are already double word
+   aligned and can perform the DW aligned loop.
 
    Otherwise we know the two strings have the same alignment (but not
-   yet DW).  So we can force the string addresses to the next lower DW
-   boundary and special case this first DW word using shift left to
+   yet DW).  So we force the string addresses to the next lower DW
+   boundary and special case this first DW using shift left to
    eliminate bits preceding the first byte.  Since we want to join the
-   normal (DWaligned) compare loop, starting at the second double word,
+   normal (DW aligned) compare loop, starting at the second double word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first DW. This insures that the loop count is
-   correct and the first DW (shifted) is in the expected resister pair.  */
+   versioning for the first DW. This ensures that the loop count is
+   correct and the first DW (shifted) is in the expected register pair.  */
 	.align	4
 L(samealignment):
-	clrrdi	rSTR1,rSTR1,3
-	clrrdi	rSTR2,rSTR2,3
-	beq	cr5,L(DWaligned)
-	add	rN,rN,rBITDIF
-	sldi	r11,rBITDIF,3
-	srdi	rTMP,rN,5	/* Divide by 32 */
-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
-	ld	rWORD1,0(rSTR1)
-	ld	rWORD2,0(rSTR2)
-	cmpldi	cr1,rBITDIF,16
-	cmpldi	cr7,rN,32
-	clrldi	rN,rN,61
+	clrrdi	rSTR1, rSTR1, 3
+	clrrdi	rSTR2, rSTR2, 3
+	beq	cr5, L(DWaligned)
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	srdi	r0, rN, 5	/* Divide by 32 */
+	andi.	r12, rN, 24	/* Get the DW remainder */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 0(rSTR1)
+	ld	rWORD2, 0(rSTR2)
+#endif
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
 	beq	L(dPs4)
-	mtctr	rTMP
-	bgt	cr1,L(dPs3)
-	beq	cr1,L(dPs2)
+	mtctr	r0
+	bgt	cr1, L(dPs3)
+	beq	cr1, L(dPs2)
 
 /* Remainder is 8 */
 	.align	3
 L(dsP1):
-	sld	rWORD5,rWORD1,r11
-	sld	rWORD6,rWORD2,r11
-	cmpld	cr5,rWORD5,rWORD6
-	blt	cr7,L(dP1x)
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
+	cmpld	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(dP1e)
 /* Remainder is 16 */
 	.align	4
 L(dPs2):
-	sld	rWORD5,rWORD1,r11
-	sld	rWORD6,rWORD2,r11
-	cmpld	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP2x)
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-	ld	rWORD7,8(rSTR1)
-	ld	rWORD8,8(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD7, 8(rSTR1)
+	ld	rWORD8, 8(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
 	b	L(dP2e)
 /* Remainder is 24 */
 	.align	4
 L(dPs3):
-	sld	rWORD3,rWORD1,r11
-	sld	rWORD4,rWORD2,r11
-	cmpld	cr1,rWORD3,rWORD4
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD2, rWORD6
+	cmpld	cr1, rWORD3, rWORD4
 	b	L(dP3e)
 /* Count is a multiple of 32, remainder is 0 */
 	.align	4
 L(dPs4):
-	mtctr	rTMP
-	sld	rWORD1,rWORD1,r11
-	sld	rWORD2,rWORD2,r11
-	cmpld	cr0,rWORD1,rWORD2
+	mtctr	r0
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD2, rWORD6
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(dP4e)
 
 /* At this point we know both strings are double word aligned and the
    compare length is at least 8 bytes.  */
 	.align	4
 L(DWaligned):
-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
-	srdi	rTMP,rN,5	/* Divide by 32 */
-	cmpldi	cr1,rBITDIF,16
-	cmpldi	cr7,rN,32
-	clrldi	rN,rN,61
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	srdi	r0, rN, 5	/* Divide by 32 */
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
 	beq	L(dP4)
-	bgt	cr1,L(dP3)
-	beq	cr1,L(dP2)
+	bgt	cr1, L(dP3)
+	beq	cr1, L(dP2)
 
 /* Remainder is 8 */
 	.align	4
 L(dP1):
-	mtctr	rTMP
+	mtctr	r0
 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
    (8-15 byte compare), we want to use only volatile registers.  This
    means we can avoid restoring non-volatile registers since we did not
    change any on the early exit path.  The key here is the non-early
    exit path only cares about the condition code (cr5), not about which
    register pair was used.  */
-	ld	rWORD5,0(rSTR1)
-	ld	rWORD6,0(rSTR2)
-	cmpld	cr5,rWORD5,rWORD6
-	blt	cr7,L(dP1x)
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 0(rSTR1)
+	ld	rWORD6, 0(rSTR2)
+#endif
+	cmpld	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 L(dP1e):
-	ld	rWORD3,16(rSTR1)
-	ld	rWORD4,16(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	ld	rWORD5,24(rSTR1)
-	ld	rWORD6,24(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
-	bne	cr0,L(dLcr0)
-
-	ldu	rWORD7,32(rSTR1)
-	ldu	rWORD8,32(rSTR2)
-	bne	cr1,L(dLcr1)
-	cmpld	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 16(rSTR1)
+	ld	rWORD4, 16(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 24(rSTR1)
+	ld	rWORD6, 24(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
+
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ldu	rWORD7, 32(rSTR1)
+	ldu	rWORD8, 32(rSTR2)
+#endif
+	bne	cr1, L(dLcr1)
+	cmpld	cr5, rWORD7, rWORD8
 	bdnz	L(dLoop)
-	bne	cr6,L(dLcr6)
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	bne	cr6, L(dLcr6)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	.align	3
 L(dP1x):
-	sldi.	r12,rN,3
-	bne	cr5,L(dLcr5)
-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
+	sldi.	r12, rN, 3
+	bne	cr5, L(dLcr5x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Remainder is 16 */
 	.align	4
 L(dP2):
-	mtctr	rTMP
-	ld	rWORD5,0(rSTR1)
-	ld	rWORD6,0(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP2x)
-	ld	rWORD7,8(rSTR1)
-	ld	rWORD8,8(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 0(rSTR1)
+	ld	rWORD6, 0(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD7, 8(rSTR1)
+	ld	rWORD8, 8(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
 L(dP2e):
-	ld	rWORD1,16(rSTR1)
-	ld	rWORD2,16(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	ld	rWORD3,24(rSTR1)
-	ld	rWORD4,24(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr6,L(dLcr6)
-	bne	cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 16(rSTR1)
+	ld	rWORD2, 16(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 24(rSTR1)
+	ld	rWORD4, 24(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr6, L(dLcr6)
+	bne	cr5, L(dLcr5)
 	b	L(dLoop2)
 /* Again we are on an early exit path (16-23 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
 	.align	4
 L(dP2x):
-	ld	rWORD3,8(rSTR1)
-	ld	rWORD4,8(rSTR2)
-	cmpld	cr5,rWORD3,rWORD4
-	sldi.	r12,rN,3
-	bne	cr6,L(dLcr6)
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr5,L(dLcr5)
-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 8(rSTR1)
+	ld	rWORD4, 8(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	sldi.	r12, rN, 3
+	bne	cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr1, L(dLcr1x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Remainder is 24 */
 	.align	4
 L(dP3):
-	mtctr	rTMP
-	ld	rWORD3,0(rSTR1)
-	ld	rWORD4,0(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 0(rSTR1)
+	ld	rWORD4, 0(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
 L(dP3e):
-	ld	rWORD5,8(rSTR1)
-	ld	rWORD6,8(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP3x)
-	ld	rWORD7,16(rSTR1)
-	ld	rWORD8,16(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	ld	rWORD1,24(rSTR1)
-	ld	rWORD2,24(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	addi	rSTR1,rSTR1,16
-	addi	rSTR2,rSTR2,16
-	bne	cr1,L(dLcr1)
-	bne	cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 8(rSTR1)
+	ld	rWORD6, 8(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD7, 16(rSTR1)
+	ld	rWORD8, 16(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 24(rSTR1)
+	ld	rWORD2, 24(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+#endif
+	bne	cr1, L(dLcr1)
+	bne	cr6, L(dLcr6)
 	b	L(dLoop1)
 /* Again we are on an early exit path (24-31 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
 	.align	4
 L(dP3x):
-	ld	rWORD1,16(rSTR1)
-	ld	rWORD2,16(rSTR2)
-	cmpld	cr5,rWORD1,rWORD2
-	sldi.	r12,rN,3
-	bne	cr1,L(dLcr1)
-	addi	rSTR1,rSTR1,16
-	addi	rSTR2,rSTR2,16
-	bne	cr6,L(dLcr6)
-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
-	bne	cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 16(rSTR1)
+	ld	rWORD2, 16(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	sldi.	r12, rN, 3
+	bne	cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+#endif
+	bne	cr6, L(dLcr6x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+	bne	cr7, L(dLcr7x)
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Count is a multiple of 32, remainder is 0 */
 	.align	4
 L(dP4):
-	mtctr	rTMP
-	ld	rWORD1,0(rSTR1)
-	ld	rWORD2,0(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 0(rSTR1)
+	ld	rWORD2, 0(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 L(dP4e):
-	ld	rWORD3,8(rSTR1)
-	ld	rWORD4,8(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	ld	rWORD5,16(rSTR1)
-	ld	rWORD6,16(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	ldu	rWORD7,24(rSTR1)
-	ldu	rWORD8,24(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr0,L(dLcr0)
-	bne	cr1,L(dLcr1)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 8(rSTR1)
+	ld	rWORD4, 8(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 16(rSTR1)
+	ld	rWORD6, 16(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ldu	rWORD7, 24(rSTR1)
+	ldu	rWORD8, 24(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
+	bne	cr1, L(dLcr1)
 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
 	.align	4
 L(dLoop):
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	bne	cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
 L(dLoop1):
-	ld	rWORD3,16(rSTR1)
-	ld	rWORD4,16(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 16(rSTR1)
+	ld	rWORD4, 16(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
 L(dLoop2):
-	ld	rWORD5,24(rSTR1)
-	ld	rWORD6,24(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr0,L(dLcr0)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 24(rSTR1)
+	ld	rWORD6, 24(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
 L(dLoop3):
-	ldu	rWORD7,32(rSTR1)
-	ldu	rWORD8,32(rSTR2)
-	bne	cr1,L(dLcr1)
-	cmpld	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ldu	rWORD7, 32(rSTR1)
+	ldu	rWORD8, 32(rSTR2)
+#endif
+	bne	cr1, L(dLcr1)
+	cmpld	cr7, rWORD1, rWORD2
 	bdnz	L(dLoop)
 
 L(dL4):
-	cmpld	cr1,rWORD3,rWORD4
-	bne	cr6,L(dLcr6)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
-	cmpld	cr5,rWORD7,rWORD8
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
+	cmpld	cr5, rWORD7, rWORD8
 L(d44):
-	bne	cr0,L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(d34):
-	bne	cr1,L(dLcr1)
+	bne	cr1, L(dLcr1)
 L(d24):
-	bne	cr6,L(dLcr6)
+	bne	cr6, L(dLcr6)
 L(d14):
-	sldi.	r12,rN,3
-	bne	cr5,L(dLcr5)
+	sldi.	r12, rN, 3
+	bne	cr5, L(dLcr5)
 L(d04):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	beq	L(zeroLength)
 /* At this point we have a remainder of 1 to 7 bytes to compare.  Since
    we are aligned it is safe to load the whole double word, and use
    shift right double to eliminate bits beyond the compare length.  */
 L(d00):
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	srd	rWORD1,rWORD1,rN
-	srd	rWORD2,rWORD2,rN
-	cmpld	cr5,rWORD1,rWORD2
-	bne	cr5,L(dLcr5x)
-	li	rRTN,0
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	rWORD1, rWORD1, rN
+	srd	rWORD2, rWORD2, rN
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr7, L(dLcr7x)
+	li	rRTN, 0
 	blr
+
 	.align	4
-L(dLcr0):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
-	bgtlr	cr0
-	li	rRTN,-1
+L(dLcr7):
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr7x):
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr1):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr1x):
+	li	rRTN, 1
 	bgtlr	cr1
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr6):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr6x):
+	li	rRTN, 1
 	bgtlr	cr6
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr5):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 L(dLcr5x):
-	li	rRTN,1
+	li	rRTN, 1
 	bgtlr	cr5
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 
 	.align	4
 L(bytealigned):
 	mtctr	rN
-	beq	cr6,L(zeroLength)
+#if 0
+/* Huh?  We've already branched on cr6!  */
+	beq	cr6, L(zeroLength)
+#endif
 
 /* We need to prime this loop.  This loop is swing modulo scheduled
    to avoid pipe delays.  The dependent instruction latencies (load to
@@ -396,38 +598,38 @@ L(bytealigned):
    So we must precondition some registers and condition codes so that
    we don't exit the loop early on the first iteration.  */
 
-	lbz	rWORD1,0(rSTR1)
-	lbz	rWORD2,0(rSTR2)
+	lbz	rWORD1, 0(rSTR1)
+	lbz	rWORD2, 0(rSTR2)
 	bdz	L(b11)
-	cmpld	cr0,rWORD1,rWORD2
-	lbz	rWORD3,1(rSTR1)
-	lbz	rWORD4,1(rSTR2)
+	cmpld	cr7, rWORD1, rWORD2
+	lbz	rWORD3, 1(rSTR1)
+	lbz	rWORD4, 1(rSTR2)
 	bdz	L(b12)
-	cmpld	cr1,rWORD3,rWORD4
-	lbzu	rWORD5,2(rSTR1)
-	lbzu	rWORD6,2(rSTR2)
+	cmpld	cr1, rWORD3, rWORD4
+	lbzu	rWORD5, 2(rSTR1)
+	lbzu	rWORD6, 2(rSTR2)
 	bdz	L(b13)
 	.align	4
 L(bLoop):
-	lbzu	rWORD1,1(rSTR1)
-	lbzu	rWORD2,1(rSTR2)
-	bne	cr0,L(bLcr0)
+	lbzu	rWORD1, 1(rSTR1)
+	lbzu	rWORD2, 1(rSTR2)
+	bne	cr7, L(bLcr7)
 
-	cmpld	cr6,rWORD5,rWORD6
+	cmpld	cr6, rWORD5, rWORD6
 	bdz	L(b3i)
 
-	lbzu	rWORD3,1(rSTR1)
-	lbzu	rWORD4,1(rSTR2)
-	bne	cr1,L(bLcr1)
+	lbzu	rWORD3, 1(rSTR1)
+	lbzu	rWORD4, 1(rSTR2)
+	bne	cr1, L(bLcr1)
 
-	cmpld	cr0,rWORD1,rWORD2
+	cmpld	cr7, rWORD1, rWORD2
 	bdz	L(b2i)
 
-	lbzu	rWORD5,1(rSTR1)
-	lbzu	rWORD6,1(rSTR2)
-	bne	cr6,L(bLcr6)
+	lbzu	rWORD5, 1(rSTR1)
+	lbzu	rWORD6, 1(rSTR2)
+	bne	cr6, L(bLcr6)
 
-	cmpld	cr1,rWORD3,rWORD4
+	cmpld	cr1, rWORD3, rWORD4
 	bdnz	L(bLoop)
 
 /* We speculatively load bytes before we have tested the previous
@@ -437,542 +639,727 @@ L(bLoop):
    tested.  In this case we must complete the pending operations
    before returning.  */
 L(b1i):
-	bne	cr0,L(bLcr0)
-	bne	cr1,L(bLcr1)
+	bne	cr7, L(bLcr7)
+	bne	cr1, L(bLcr1)
 	b	L(bx56)
 	.align	4
 L(b2i):
-	bne	cr6,L(bLcr6)
-	bne	cr0,L(bLcr0)
+	bne	cr6, L(bLcr6)
+	bne	cr7, L(bLcr7)
 	b	L(bx34)
 	.align	4
 L(b3i):
-	bne	cr1,L(bLcr1)
-	bne	cr6,L(bLcr6)
+	bne	cr1, L(bLcr1)
+	bne	cr6, L(bLcr6)
 	b	L(bx12)
 	.align	4
-L(bLcr0):
-	li	rRTN,1
-	bgtlr	cr0
-	li	rRTN,-1
+L(bLcr7):
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
 	blr
 L(bLcr1):
-	li	rRTN,1
+	li	rRTN, 1
 	bgtlr	cr1
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 L(bLcr6):
-	li	rRTN,1
+	li	rRTN, 1
 	bgtlr	cr6
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 
 L(b13):
-	bne	cr0,L(bx12)
-	bne	cr1,L(bx34)
+	bne	cr7, L(bx12)
+	bne	cr1, L(bx34)
 L(bx56):
-	sub	rRTN,rWORD5,rWORD6
+	sub	rRTN, rWORD5, rWORD6
 	blr
 	nop
 L(b12):
-	bne	cr0,L(bx12)
+	bne	cr7, L(bx12)
 L(bx34):
-	sub	rRTN,rWORD3,rWORD4
+	sub	rRTN, rWORD3, rWORD4
 	blr
 L(b11):
 L(bx12):
-	sub	rRTN,rWORD1,rWORD2
+	sub	rRTN, rWORD1, rWORD2
 	blr
 	.align	4
-L(zeroLengthReturn):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
 L(zeroLength):
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 	.align	4
 /* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is double word
+   of r12 to 0.  If r12 == 0 then rSTR1 is double word
    aligned and we can perform the DWunaligned loop.
 
    Otherwise we know that rSTR1 is not already DW aligned yet.
    So we can force the string addresses to the next lower DW
-   boundary and special case this first DW word using shift left to
+   boundary and special case this first DW using shift left to
    eliminate bits preceding the first byte.  Since we want to join the
    normal (DWaligned) compare loop, starting at the second double word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first DW. This insures that the loop count is
+   versioning for the first DW. This ensures that the loop count is
    correct and the first DW (shifted) is in the expected register pair.  */
-#define rSHL	r29	/* Unaligned shift left count.  */
-#define rSHR	r28	/* Unaligned shift right count.  */
-#define rB		r27	/* Left rotation temp for rWORD2.  */
-#define rD		r26	/* Left rotation temp for rWORD4.  */
-#define rF		r25	/* Left rotation temp for rWORD6.  */
-#define rH		r24	/* Left rotation temp for rWORD8.  */
-#define rA		r0	/* Right rotation temp for rWORD2.  */
-#define rC		r12	/* Right rotation temp for rWORD4.  */
-#define rE		r0	/* Right rotation temp for rWORD6.  */
-#define rG		r12	/* Right rotation temp for rWORD8.  */
+#define rSHL		r29	/* Unaligned shift left count.  */
+#define rSHR		r28	/* Unaligned shift right count.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
 L(unaligned):
-	std	r29,-24(r1)
-	cfi_offset(r29,-24)
-	clrldi	rSHL,rSTR2,61
-	beq	cr6,L(duzeroLength)
-	std	r28,-32(r1)
-	cfi_offset(r28,-32)
-	beq	cr5,L(DWunaligned)
-	std	r27,-40(r1)
-	cfi_offset(r27,-40)
-/* Adjust the logical start of rSTR2 ro compensate for the extra bits
+	std	rSHL, -24(r1)
+	cfi_offset(rSHL, -24)
+	clrldi	rSHL, rSTR2, 61
+	beq	cr6, L(duzeroLength)
+	std	rSHR, -32(r1)
+	cfi_offset(rSHR, -32)
+	beq	cr5, L(DWunaligned)
+	std	rWORD8_SHIFT, -40(r1)
+	cfi_offset(rWORD8_SHIFT, -40)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
    in the 1st rSTR1 DW.  */
-	sub	r27,rSTR2,rBITDIF
+	sub	rWORD8_SHIFT, rSTR2, r12
 /* But do not attempt to address the DW before that DW that contains
    the actual start of rSTR2.  */
-	clrrdi	rSTR2,rSTR2,3
-	std	r26,-48(r1)
-	cfi_offset(r26,-48)
+	clrrdi	rSTR2, rSTR2, 3
+	std	rWORD2_SHIFT, -48(r1)
+	cfi_offset(rWORD2_SHIFT, -48)
 /* Compute the left/right shift counts for the unaligned rSTR2,
    compensating for the logical (DW aligned) start of rSTR1.  */
-	clrldi	rSHL,r27,61
-	clrrdi	rSTR1,rSTR1,3
-	std	r25,-56(r1)
-	cfi_offset(r25,-56)
-	sldi	rSHL,rSHL,3
-	cmpld	cr5,r27,rSTR2
-	add	rN,rN,rBITDIF
-	sldi	r11,rBITDIF,3
-	std	r24,-64(r1)
-	cfi_offset(r24,-64)
-	subfic	rSHR,rSHL,64
-	srdi	rTMP,rN,5	/* Divide by 32 */
-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
+	clrldi	rSHL, rWORD8_SHIFT, 61
+	clrrdi	rSTR1, rSTR1, 3
+	std	rWORD4_SHIFT, -56(r1)
+	cfi_offset(rWORD4_SHIFT, -56)
+	sldi	rSHL, rSHL, 3
+	cmpld	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	std	rWORD6_SHIFT, -64(r1)
+	cfi_offset(rWORD6_SHIFT, -64)
+	subfic	rSHR, rSHL, 64
+	srdi	r0, rN, 5	/* Divide by 32 */
+	andi.	r12, rN, 24	/* Get the DW remainder */
 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
    this special case those bits may be discarded anyway.  Also we
    must avoid loading a DW where none of the bits are part of rSTR2 as
    this may cross a page boundary and cause a page fault.  */
-	li	rWORD8,0
-	blt	cr5,L(dus0)
-	ld	rWORD8,0(rSTR2)
-	la	rSTR2,8(rSTR2)
-	sld	rWORD8,rWORD8,rSHL
+	li	rWORD8, 0
+	blt	cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD8, 0(rSTR2)
+	addi	rSTR2, rSTR2, 8
+#endif
+	sld	rWORD8, rWORD8, rSHL
 
 L(dus0):
-	ld	rWORD1,0(rSTR1)
-	ld	rWORD2,0(rSTR2)
-	cmpldi	cr1,rBITDIF,16
-	cmpldi	cr7,rN,32
-	srd	rG,rWORD2,rSHR
-	clrldi	rN,rN,61
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 0(rSTR1)
+	ld	rWORD2, 0(rSTR2)
+#endif
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	srd	r12, rWORD2, rSHR
+	clrldi	rN, rN, 61
 	beq	L(duPs4)
-	mtctr	rTMP
-	or	rWORD8,rG,rWORD8
-	bgt	cr1,L(duPs3)
-	beq	cr1,L(duPs2)
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	bgt	cr1, L(duPs3)
+	beq	cr1, L(duPs2)
 
 /* Remainder is 8 */
 	.align	4
 L(dusP1):
-	sld	rB,rWORD2,rSHL
-	sld	rWORD7,rWORD1,r11
-	sld	rWORD8,rWORD8,r11
-	bge	cr7,L(duP1e)
+	sld	rWORD8_SHIFT, rWORD2, rSHL
+	sld	rWORD7, rWORD1, rWORD6
+	sld	rWORD8, rWORD8, rWORD6
+	bge	cr7, L(duP1e)
 /* At this point we exit early with the first double word compare
    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
-	cmpld	cr5,rWORD7,rWORD8
-	sldi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmpld	cr7,rN,rSHR
+	cmpld	cr5, rWORD7, rWORD8
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srd	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
 	.align	4
 L(duPs2):
-	sld	rH,rWORD2,rSHL
-	sld	rWORD5,rWORD1,r11
-	sld	rWORD6,rWORD8,r11
+	sld	rWORD6_SHIFT, rWORD2, rSHL
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD8, rWORD6
 	b	L(duP2e)
 /* Remainder is 24 */
 	.align	4
 L(duPs3):
-	sld	rF,rWORD2,rSHL
-	sld	rWORD3,rWORD1,r11
-	sld	rWORD4,rWORD8,r11
+	sld	rWORD4_SHIFT, rWORD2, rSHL
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD8, rWORD6
 	b	L(duP3e)
 /* Count is a multiple of 32, remainder is 0 */
 	.align	4
 L(duPs4):
-	mtctr	rTMP
-	or	rWORD8,rG,rWORD8
-	sld	rD,rWORD2,rSHL
-	sld	rWORD1,rWORD1,r11
-	sld	rWORD2,rWORD8,r11
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD8, rWORD6
 	b	L(duP4e)
 
 /* At this point we know rSTR1 is double word aligned and the
    compare length is at least 8 bytes.  */
 	.align	4
 L(DWunaligned):
-	std	r27,-40(r1)
-	cfi_offset(r27,-40)
-	clrrdi	rSTR2,rSTR2,3
-	std	r26,-48(r1)
-	cfi_offset(r26,-48)
-	srdi	rTMP,rN,5	/* Divide by 32 */
-	std	r25,-56(r1)
-	cfi_offset(r25,-56)
-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
-	std	r24,-64(r1)
-	cfi_offset(r24,-64)
-	sldi	rSHL,rSHL,3
-	ld	rWORD6,0(rSTR2)
-	ldu	rWORD8,8(rSTR2)
-	cmpldi	cr1,rBITDIF,16
-	cmpldi	cr7,rN,32
-	clrldi	rN,rN,61
-	subfic	rSHR,rSHL,64
-	sld	rH,rWORD6,rSHL
+	std	rWORD8_SHIFT, -40(r1)
+	cfi_offset(rWORD8_SHIFT, -40)
+	clrrdi	rSTR2, rSTR2, 3
+	std	rWORD2_SHIFT, -48(r1)
+	cfi_offset(rWORD2_SHIFT, -48)
+	srdi	r0, rN, 5	/* Divide by 32 */
+	std	rWORD4_SHIFT, -56(r1)
+	cfi_offset(rWORD4_SHIFT, -56)
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	std	rWORD6_SHIFT, -64(r1)
+	cfi_offset(rWORD6_SHIFT, -64)
+	sldi	rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD6, 0(rSTR2)
+	ldu	rWORD8, 8(rSTR2)
+#endif
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
+	subfic	rSHR, rSHL, 64
+	sld	rWORD6_SHIFT, rWORD6, rSHL
 	beq	L(duP4)
-	mtctr	rTMP
-	bgt	cr1,L(duP3)
-	beq	cr1,L(duP2)
+	mtctr	r0
+	bgt	cr1, L(duP3)
+	beq	cr1, L(duP2)
 
 /* Remainder is 8 */
 	.align	4
 L(duP1):
-	srd	rG,rWORD8,rSHR
-	ld	rWORD7,0(rSTR1)
-	sld	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP1x)
+	srd	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
+	ld	rWORD7, 0(rSTR1)
+#endif
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP1x)
 L(duP1e):
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	srd	rA,rWORD2,rSHR
-	sld	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	ld	rWORD3,16(rSTR1)
-	ld	rWORD4,16(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	srd	rC,rWORD4,rSHR
-	sld	rF,rWORD4,rSHL
-	bne	cr5,L(duLcr5)
-	or	rWORD4,rC,rD
-	ld	rWORD5,24(rSTR1)
-	ld	rWORD6,24(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	srd	rE,rWORD6,rSHR
-	sld	rH,rWORD6,rSHL
-	bne	cr0,L(duLcr0)
-	or	rWORD6,rE,rF
-	cmpld	cr6,rWORD5,rWORD6
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 16(rSTR1)
+	ld	rWORD4, 16(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	bne	cr5, L(duLcr5)
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 24(rSTR1)
+	ld	rWORD6, 24(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
+	cmpld	cr6, rWORD5, rWORD6
 	b	L(duLoop3)
 	.align	4
 /* At this point we exit early with the first double word compare
    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
 L(duP1x):
-	cmpld	cr5,rWORD7,rWORD8
-	sldi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmpld	cr7,rN,rSHR
+	cmpld	cr5, rWORD7, rWORD8
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srd	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
 	.align	4
 L(duP2):
-	srd	rE,rWORD8,rSHR
-	ld	rWORD5,0(rSTR1)
-	or	rWORD6,rE,rH
-	sld	rH,rWORD8,rSHL
+	srd	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
+	ld	rWORD5, 0(rSTR1)
+#endif
+	or	rWORD6, r0, rWORD6_SHIFT
+	sld	rWORD6_SHIFT, rWORD8, rSHL
 L(duP2e):
-	ld	rWORD7,8(rSTR1)
-	ld	rWORD8,8(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	srd	rG,rWORD8,rSHR
-	sld	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP2x)
-	ld	rWORD1,16(rSTR1)
-	ld	rWORD2,16(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	srd	rA,rWORD2,rSHR
-	sld	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	ld	rWORD3,24(rSTR1)
-	ld	rWORD4,24(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	bne	cr5,L(duLcr5)
-	srd	rC,rWORD4,rSHR
-	sld	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	cmpld	cr1,rWORD3,rWORD4
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD7, 8(rSTR1)
+	ld	rWORD8, 8(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 16(rSTR1)
+	ld	rWORD2, 16(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 24(rSTR1)
+	ld	rWORD4, 24(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr5, L(duLcr5)
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	cmpld	cr1, rWORD3, rWORD4
 	b	L(duLoop2)
 	.align	4
 L(duP2x):
-	cmpld	cr5,rWORD7,rWORD8
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr6,L(duLcr6)
-	sldi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmpld	cr7,rN,rSHR
+	cmpld	cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr6, L(duLcr6)
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srd	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Remainder is 24 */
 	.align	4
 L(duP3):
-	srd	rC,rWORD8,rSHR
-	ld	rWORD3,0(rSTR1)
-	sld	rF,rWORD8,rSHL
-	or	rWORD4,rC,rH
+	srd	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
+	ld	rWORD3, 0(rSTR1)
+#endif
+	sld	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
 L(duP3e):
-	ld	rWORD5,8(rSTR1)
-	ld	rWORD6,8(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	srd	rE,rWORD6,rSHR
-	sld	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
-	ld	rWORD7,16(rSTR1)
-	ld	rWORD8,16(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr1,L(duLcr1)
-	srd	rG,rWORD8,rSHR
-	sld	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP3x)
-	ld	rWORD1,24(rSTR1)
-	ld	rWORD2,24(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	srd	rA,rWORD2,rSHR
-	sld	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	addi	rSTR1,rSTR1,16
-	addi	rSTR2,rSTR2,16
-	cmpld	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 8(rSTR1)
+	ld	rWORD6, 8(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD7, 16(rSTR1)
+	ld	rWORD8, 16(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 24(rSTR1)
+	ld	rWORD2, 24(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(duLoop1)
 	.align	4
 L(duP3x):
-	addi	rSTR1,rSTR1,16
-	addi	rSTR2,rSTR2,16
-	bne	cr1,L(duLcr1)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	sldi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmpld	cr7,rN,rSHR
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+#endif
+#if 0
+/* Huh?  We've already branched on cr1!  */
+	bne	cr1, L(duLcr1)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srd	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Count is a multiple of 32, remainder is 0 */
 	.align	4
 L(duP4):
-	mtctr	rTMP
-	srd	rA,rWORD8,rSHR
-	ld	rWORD1,0(rSTR1)
-	sld	rD,rWORD8,rSHL
-	or	rWORD2,rA,rH
+	mtctr	r0
+	srd	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
+	ld	rWORD1, 0(rSTR1)
+#endif
+	sld	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
 L(duP4e):
-	ld	rWORD3,8(rSTR1)
-	ld	rWORD4,8(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	srd	rC,rWORD4,rSHR
-	sld	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
-	ld	rWORD5,16(rSTR1)
-	ld	rWORD6,16(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	bne	cr0,L(duLcr0)
-	srd	rE,rWORD6,rSHR
-	sld	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
-	ldu	rWORD7,24(rSTR1)
-	ldu	rWORD8,24(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr1,L(duLcr1)
-	srd	rG,rWORD8,rSHR
-	sld	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	cmpld	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 8(rSTR1)
+	ld	rWORD4, 8(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 16(rSTR1)
+	ld	rWORD6, 16(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ldu	rWORD7, 24(rSTR1)
+	ldu	rWORD8, 24(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	cmpld	cr5, rWORD7, rWORD8
 	bdz	L(du24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
 	.align	4
 L(duLoop):
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	bne	cr6,L(duLcr6)
-	srd	rA,rWORD2,rSHR
-	sld	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
 L(duLoop1):
-	ld	rWORD3,16(rSTR1)
-	ld	rWORD4,16(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr5,L(duLcr5)
-	srd	rC,rWORD4,rSHR
-	sld	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 16(rSTR1)
+	ld	rWORD4, 16(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
 L(duLoop2):
-	ld	rWORD5,24(rSTR1)
-	ld	rWORD6,24(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr0,L(duLcr0)
-	srd	rE,rWORD6,rSHR
-	sld	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 24(rSTR1)
+	ld	rWORD6, 24(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
 L(duLoop3):
-	ldu	rWORD7,32(rSTR1)
-	ldu	rWORD8,32(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	bne-	cr1,L(duLcr1)
-	srd	rG,rWORD8,rSHR
-	sld	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ldu	rWORD7, 32(rSTR1)
+	ldu	rWORD8, 32(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	bdnz	L(duLoop)
 
 L(duL4):
-	bne	cr1,L(duLcr1)
-	cmpld	cr1,rWORD3,rWORD4
-	bne	cr6,L(duLcr6)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr5,L(duLcr5)
-	cmpld	cr5,rWORD7,rWORD8
+#if 0
+/* Huh?  We've already branched on cr1!  */
+	bne	cr1, L(duLcr1)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	cmpld	cr5, rWORD7, rWORD8
 L(du44):
-	bne	cr0,L(duLcr0)
+	bne	cr7, L(duLcr7)
 L(du34):
-	bne	cr1,L(duLcr1)
+	bne	cr1, L(duLcr1)
 L(du24):
-	bne	cr6,L(duLcr6)
+	bne	cr6, L(duLcr6)
 L(du14):
-	sldi.	rN,rN,3
-	bne	cr5,L(duLcr5)
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
 /* At this point we have a remainder of 1 to 7 bytes to compare.  We use
    shift right double to eliminate bits beyond the compare length.
-   This allows the use of double word subtract to compute the final
-   result.
 
    However it may not be safe to load rWORD2 which may be beyond the
    string length. So we compare the bit length of the remainder to
    the right shift count (rSHR). If the bit count is less than or equal
    we do not need to load rWORD2 (all significant bits are already in
-   rB).  */
-	cmpld	cr7,rN,rSHR
+   rWORD8_SHIFT).  */
+	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srd	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	r0, rWORD2, rSHR
 	.align	4
 L(dutrim):
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD8,-8(r1)
-	subfic	rN,rN,64	/* Shift count is 64 - (rN * 8).  */
-	or	rWORD2,rA,rB
-	ld	rWORD7,-16(r1)
-	ld	r29,-24(r1)
-	srd	rWORD1,rWORD1,rN
-	srd	rWORD2,rWORD2,rN
-	ld	r28,-32(r1)
-	ld	r27,-40(r1)
-	li	rRTN,0
-	cmpld	cr0,rWORD1,rWORD2
-	ld	r26,-48(r1)
-	ld	r25,-56(r1)
-	beq	cr0,L(dureturn24)
-	li	rRTN,1
-	ld	r24,-64(r1)
-	bgtlr	cr0
-	li	rRTN,-1
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+#else
+	ld	rWORD1, 8(rSTR1)
+#endif
+	ld	rWORD8, -8(r1)
+	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
+	or	rWORD2, r0, rWORD8_SHIFT
+	ld	rWORD7, -16(r1)
+	ld	rSHL, -24(r1)
+	srd	rWORD1, rWORD1, rN
+	srd	rWORD2, rWORD2, rN
+	ld	rSHR, -32(r1)
+	ld	rWORD8_SHIFT, -40(r1)
+	li	rRTN, 0
+	cmpld	cr7, rWORD1, rWORD2
+	ld	rWORD2_SHIFT, -48(r1)
+	ld	rWORD4_SHIFT, -56(r1)
+	beq	cr7, L(dureturn24)
+	li	rRTN, 1
+	ld	rWORD6_SHIFT, -64(r1)
+	bgtlr	cr7
+	li	rRTN, -1
 	blr
 	.align	4
-L(duLcr0):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
-	bgt	cr0,L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN,-1
+L(duLcr7):
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	li	rRTN, 1
+	bgt	cr7, L(dureturn29)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr1):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
-	bgt	cr1,L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN,-1
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	li	rRTN, 1
+	bgt	cr1, L(dureturn29)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr6):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
-	bgt	cr6,L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN,-1
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	li	rRTN, 1
+	bgt	cr6, L(dureturn29)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr5):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
-	bgt	cr5,L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN,-1
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	li	rRTN, 1
+	bgt	cr5, L(dureturn29)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	3
 L(duZeroReturn):
-	li	rRTN,0
+	li	rRTN, 0
 	.align	4
 L(dureturn):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 L(dureturn29):
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 L(dureturn27):
-	ld	r27,-40(r1)
+	ld	rWORD8_SHIFT, -40(r1)
 L(dureturn26):
-	ld	r26,-48(r1)
+	ld	rWORD2_SHIFT, -48(r1)
 L(dureturn25):
-	ld	r25,-56(r1)
+	ld	rWORD4_SHIFT, -56(r1)
 L(dureturn24):
-	ld	r24,-64(r1)
+	ld	rWORD6_SHIFT, -64(r1)
 	blr
 L(duzeroLength):
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 END (memcmp)
 libc_hidden_builtin_def (memcmp)
-weak_alias (memcmp,bcmp)
+weak_alias (memcmp, bcmp)
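
The 1-to-7 byte tails described in the comments above (the aligned L(d00) path and the unaligned L(du14)/L(dutrim) path) load a whole aligned doubleword and shift it right by 64 - 8*remainder bits, so the bytes beyond the compare length fall off the low end of the big-endian-ordered word.  A minimal C sketch of that arithmetic, under the same assumption as the earlier sketch that the words are already in big-endian byte order (cmp_tail is an invented name):

#include <stdint.h>

/* wa and wb hold the final doubleword of each buffer in big-endian
   byte order; n (1..7) is how many of those bytes are still inside the
   compare length.  Shifting right by (8 - n) * 8 bits, the same value
   the asm forms with sldi./subfic, discards the excess bytes.  */
static int
cmp_tail (uint64_t wa, uint64_t wb, unsigned int n)
{
  unsigned int shift = (8 - n) * 8;
  wa >>= shift;
  wb >>= shift;
  return wa < wb ? -1 : wa > wb ? 1 : 0;
}
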

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=477a5dffbfce9b839682fc716e590a9b89bba86c

commit 477a5dffbfce9b839682fc716e590a9b89bba86c
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:01:25 2013 -0500

    PowerPC: LE strlen
    
    This one is LE support for strlen.  Like most of the string functions,
    I leave the main word or multiple-word loops substantially unchanged,
    just needing to modify the tail.
    
    Removing the branch in the power7 functions is just a tidy.  .align
    produces a branch anyway.  Modifying regs in the non-power7 functions
    is to suit the new little-endian tail.

diff --git a/sysdeps/powerpc/powerpc32/power7/strlen.S b/sysdeps/powerpc/powerpc32/power7/strlen.S
index b71a10f..b08d6c0 100644
--- a/sysdeps/powerpc/powerpc32/power7/strlen.S
+++ b/sysdeps/powerpc/powerpc32/power7/strlen.S
@@ -29,7 +29,11 @@ ENTRY (strlen)
 	li	r0,0	      /* Word with null chars to use with cmpb.  */
 	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
 	lwz	r12,0(r4)     /* Load word from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	slw	r5,r5,r6
+#else
 	srw	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
 	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
 	cmpb	r10,r9,r0     /* Check for null bytes in WORD1.  */
 	cmpwi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
@@ -47,9 +51,6 @@ ENTRY (strlen)
 	cmpb	r10,r12,r0
 	cmpwi	cr7,r10,0
 	bne	cr7,L(done)
-	b	L(loop)	      /* We branch here (rather than falling through)
-				 to skip the nops due to heavy alignment
-				 of the loop below.  */
 
 	/* Main loop to look for the end of the string.  Since it's a
 	   small loop (< 8 instructions), align it to 32-bytes.  */
@@ -86,9 +87,15 @@ L(loop):
 	   0xff in the same position as the null byte in the original
 	   word from the string.  Use that to calculate the length.  */
 L(done):
-	cntlzw	r0,r10	      /* Count leading zeroes before the match.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10, -1   /* Form a mask from trailing zeros.  */
+	andc	r9, r9, r10
+	popcntw r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzw	r0,r10	      /* Count leading zeros before the match.  */
+#endif
 	subf	r5,r3,r4
-	srwi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r5,r0      /* Compute final length.  */
 	blr
 END (strlen)
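
In the power7 strlen above, cmpb leaves 0xff in every byte of r10 that was zero in the loaded word.  Big-endian wants the most significant such byte, so it keeps cntlzw/cntlzd; little-endian wants the least significant one, hence the new mask-of-trailing-zeros plus popcnt sequence in L(done).  A hedged C sketch of the little-endian calculation for the 32-bit case (the helper name is invented):

#include <stdint.h>

/* r10 is the cmpb result: 0xff in each byte that was zero in the
   string word, 0x00 elsewhere (r10 != 0 here).  Returns the offset in
   bytes of the first zero byte on a little-endian machine.  */
static unsigned int
first_null_offset_le (uint32_t r10)
{
  uint32_t mask = (r10 - 1) & ~r10;   /* ones below the lowest set bit */
  return (unsigned int) __builtin_popcount (mask) >> 3;   /* bits to bytes */
}
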
diff --git a/sysdeps/powerpc/powerpc32/strlen.S b/sysdeps/powerpc/powerpc32/strlen.S
index 9a6eafc..a7153ed 100644
--- a/sysdeps/powerpc/powerpc32/strlen.S
+++ b/sysdeps/powerpc/powerpc32/strlen.S
@@ -29,7 +29,12 @@
       1 is subtracted you get a value in the range 0x00-0x7f, none of which
       have their high bit set. The expression here is
       (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
-      there were no 0x00 bytes in the word.
+      there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+      match, but possibly false 0x80 matches in the next more significant
+      byte to a true match due to carries.  For little-endian this is
+      of no consequence since the least significant match is the one
+      we're interested in, but big-endian needs method 2 to find which
+      byte matches.
 
    2) Given a word 'x', we can test to see _which_ byte was zero by
       calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
@@ -72,7 +77,7 @@
 
 ENTRY (strlen)
 
-#define rTMP1	r0
+#define rTMP4	r0
 #define rRTN	r3	/* incoming STR arg, outgoing result */
 #define rSTR	r4	/* current string position */
 #define rPADN	r5	/* number of padding bits we prepend to the
@@ -82,9 +87,9 @@ ENTRY (strlen)
 #define rWORD1	r8	/* current string word */
 #define rWORD2	r9	/* next string word */
 #define rMASK	r9	/* mask for first string word */
-#define rTMP2	r10
-#define rTMP3	r11
-#define rTMP4	r12
+#define rTMP1	r10
+#define rTMP2	r11
+#define rTMP3	r12
 
 
 	clrrwi	rSTR, rRTN, 2
@@ -93,15 +98,20 @@ ENTRY (strlen)
 	lwz	rWORD1, 0(rSTR)
 	li	rMASK, -1
 	addi	r7F7F, r7F7F, 0x7f7f
-/* That's the setup done, now do the first pair of words.
-   We make an exception and use method (2) on the first two words, to reduce
-   overhead.  */
+/* We use method (2) on the first two words, because rFEFE isn't
+   required which reduces setup overhead.  Also gives a faster return
+   for small strings on big-endian due to needing to recalculate with
+   method (2) anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	slw	rMASK, rMASK, rPADN
+#else
 	srw	rMASK, rMASK, rPADN
+#endif
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	add	rTMP1, rTMP1, r7F7F
-	nor	rTMP1, rTMP2, rTMP1
-	and.	rWORD1, rTMP1, rMASK
+	nor	rTMP3, rTMP2, rTMP1
+	and.	rTMP3, rTMP3, rMASK
 	mtcrf	0x01, rRTN
 	bne	L(done0)
 	lis	rFEFE, -0x101
@@ -110,11 +120,12 @@ ENTRY (strlen)
 	bt	29, L(loop)
 
 /* Handle second word of pair.  */
+/* Perhaps use method (1) here for little-endian, saving one instruction?  */
 	lwzu	rWORD1, 4(rSTR)
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	add	rTMP1, rTMP1, r7F7F
-	nor.	rWORD1, rTMP2, rTMP1
+	nor.	rTMP3, rTMP2, rTMP1
 	bne	L(done0)
 
 /* The loop.  */
@@ -128,28 +139,52 @@ L(loop):
 	add	rTMP3, rFEFE, rWORD2
 	nor	rTMP4, r7F7F, rWORD2
 	bne	L(done1)
-	and.	rTMP1, rTMP3, rTMP4
+	and.	rTMP3, rTMP3, rTMP4
 	beq	L(loop)
 
+#ifndef __LITTLE_ENDIAN__
 	and	rTMP1, r7F7F, rWORD2
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP4, rTMP1
+	andc	rTMP3, rTMP4, rTMP1
 	b	L(done0)
 
 L(done1):
 	and	rTMP1, r7F7F, rWORD1
 	subi	rSTR, rSTR, 4
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP2, rTMP1
+	andc	rTMP3, rTMP2, rTMP1
 
 /* When we get to here, rSTR points to the first word in the string that
-   contains a zero byte, and the most significant set bit in rWORD1 is in that
-   byte.  */
+   contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
+   and 0x00 otherwise.  */
 L(done0):
-	cntlzw	rTMP3, rWORD1
+	cntlzw	rTMP3, rTMP3
 	subf	rTMP1, rRTN, rSTR
 	srwi	rTMP3, rTMP3, 3
 	add	rRTN, rTMP1, rTMP3
 	blr
+#else
+
+L(done0):
+	addi	rTMP1, rTMP3, -1	/* Form a mask from trailing zeros.  */
+	andc	rTMP1, rTMP1, rTMP3
+	cntlzw	rTMP1, rTMP1		/* Count bits not in the mask.  */
+	subf	rTMP3, rRTN, rSTR
+	subfic	rTMP1, rTMP1, 32-7
+	srwi	rTMP1, rTMP1, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+
+L(done1):
+	addi	rTMP3, rTMP1, -1
+	andc	rTMP3, rTMP3, rTMP1
+	cntlzw	rTMP3, rTMP3
+	subf	rTMP1, rRTN, rSTR
+	subfic	rTMP3, rTMP3, 32-7-32
+	srawi	rTMP3, rTMP3, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+#endif
+
 END (strlen)
 libc_hidden_builtin_def (strlen)
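
The reworked comments in the generic strlen above distinguish method (1), (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which can also set spurious 0x80 flags above a real zero because of carry behaviour, from the exact method (2).  Little-endian only cares about the least significant match, which is always genuine, so method (1) alone suffices there; big-endian needs method (2) to identify which byte matched.  A small C illustration of the two tests (illustrative only, not the library code; the function names are invented):

#include <stdint.h>

/* Method (1): cheap zero-byte test.  Flags every zero byte with 0x80,
   but may also set spurious flags in bytes above a real zero, because
   of how carries propagate.  The lowest flagged byte is always a true
   zero.  */
static uint32_t
zero_bytes_fast (uint32_t x)
{
  return (x + 0xfefefeffu) & ~(x | 0x7f7f7f7fu);
}

/* Method (2): exact test, 0x80 only in bytes that really are zero.  */
static uint32_t
zero_bytes_exact (uint32_t x)
{
  return ~(((x & 0x7f7f7f7fu) + 0x7f7f7f7fu) | x | 0x7f7f7f7fu);
}

Assuming the word does contain a zero byte, on little-endian its offset is __builtin_ctz (zero_bytes_fast (x)) >> 3; big-endian instead takes __builtin_clz of the exact mask and divides by 8.
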
diff --git a/sysdeps/powerpc/powerpc64/power7/strlen.S b/sysdeps/powerpc/powerpc64/power7/strlen.S
index 3432169..807ef10 100644
--- a/sysdeps/powerpc/powerpc64/power7/strlen.S
+++ b/sysdeps/powerpc/powerpc64/power7/strlen.S
@@ -30,7 +30,11 @@ ENTRY (strlen)
 				 with cmpb.  */
 	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
 	ld	r12,0(r4)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r5,r5,r6
+#else
 	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
 	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
 	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
 	cmpdi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
@@ -48,9 +52,6 @@ ENTRY (strlen)
 	cmpb	r10,r12,r0
 	cmpdi	cr7,r10,0
 	bne	cr7,L(done)
-	b	L(loop)	      /* We branch here (rather than falling through)
-				 to skip the nops due to heavy alignment
-				 of the loop below.  */
 
 	/* Main loop to look for the end of the string.  Since it's a
 	   small loop (< 8 instructions), align it to 32-bytes.  */
@@ -87,9 +88,15 @@ L(loop):
 	   0xff in the same position as the null byte in the original
 	   doubleword from the string.  Use that to calculate the length.  */
 L(done):
-	cntlzd	r0,r10	      /* Count leading zeroes before the match.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10, -1   /* Form a mask from trailing zeros.  */
+	andc	r9, r9, r10
+	popcntd r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
 	subf	r5,r3,r4
-	srdi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
 	add	r3,r5,r0      /* Compute final length.  */
 	blr
 END (strlen)
diff --git a/sysdeps/powerpc/powerpc64/strlen.S b/sysdeps/powerpc/powerpc64/strlen.S
index 0f9b5ee..4ed1ba3 100644
--- a/sysdeps/powerpc/powerpc64/strlen.S
+++ b/sysdeps/powerpc/powerpc64/strlen.S
@@ -29,7 +29,12 @@
       1 is subtracted you get a value in the range 0x00-0x7f, none of which
       have their high bit set. The expression here is
       (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
-      there were no 0x00 bytes in the word.
+      there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+      match, but possibly false 0x80 matches in the next more significant
+      byte to a true match due to carries.  For little-endian this is
+      of no consequence since the least significant match is the one
+      we're interested in, but big-endian needs method 2 to find which
+      byte matches.
 
    2) Given a word 'x', we can test to see _which_ byte was zero by
       calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
@@ -62,7 +67,7 @@
    Answer:
    1) Added a Data Cache Block Touch early to prefetch the first 128
    byte cache line. Adding dcbt instructions to the loop would not be
-   effective since most strings will be shorter than the cache line.*/
+   effective since most strings will be shorter than the cache line.  */
 
 /* Some notes on register usage: Under the SVR4 ABI, we can use registers
    0 and 3 through 12 (so long as we don't call any procedures) without
@@ -78,7 +83,7 @@
 ENTRY (strlen)
 	CALL_MCOUNT 1
 
-#define rTMP1	r0
+#define rTMP4	r0
 #define rRTN	r3	/* incoming STR arg, outgoing result */
 #define rSTR	r4	/* current string position */
 #define rPADN	r5	/* number of padding bits we prepend to the
@@ -88,9 +93,9 @@ ENTRY (strlen)
 #define rWORD1	r8	/* current string doubleword */
 #define rWORD2	r9	/* next string doubleword */
 #define rMASK	r9	/* mask for first string doubleword */
-#define rTMP2	r10
-#define rTMP3	r11
-#define rTMP4	r12
+#define rTMP1	r10
+#define rTMP2	r11
+#define rTMP3	r12
 
 	dcbt	0,rRTN
 	clrrdi	rSTR, rRTN, 3
@@ -100,30 +105,36 @@ ENTRY (strlen)
 	addi	r7F7F, r7F7F, 0x7f7f
 	li	rMASK, -1
 	insrdi	r7F7F, r7F7F, 32, 0
-/* That's the setup done, now do the first pair of doublewords.
-   We make an exception and use method (2) on the first two doublewords,
-   to reduce overhead.  */
+/* We use method (2) on the first two doublewords, because rFEFE isn't
+   required which reduces setup overhead.  Also gives a faster return
+   for small strings on big-endian due to needing to recalculate with
+   method (2) anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	rMASK, rMASK, rPADN
+#else
 	srd	rMASK, rMASK, rPADN
+#endif
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	lis	rFEFE, -0x101
 	add	rTMP1, rTMP1, r7F7F
 	addi	rFEFE, rFEFE, -0x101
-	nor	rTMP1, rTMP2, rTMP1
-	and.	rWORD1, rTMP1, rMASK
+	nor	rTMP3, rTMP2, rTMP1
+	and.	rTMP3, rTMP3, rMASK
 	mtcrf	0x01, rRTN
 	bne	L(done0)
-	sldi  rTMP1, rFEFE, 32
-	add  rFEFE, rFEFE, rTMP1
+	sldi	rTMP1, rFEFE, 32
+	add	rFEFE, rFEFE, rTMP1
 /* Are we now aligned to a doubleword boundary?  */
 	bt	28, L(loop)
 
 /* Handle second doubleword of pair.  */
+/* Perhaps use method (1) here for little-endian, saving one instruction?  */
 	ldu	rWORD1, 8(rSTR)
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	add	rTMP1, rTMP1, r7F7F
-	nor.	rWORD1, rTMP2, rTMP1
+	nor.	rTMP3, rTMP2, rTMP1
 	bne	L(done0)
 
 /* The loop.  */
@@ -137,28 +148,52 @@ L(loop):
 	add	rTMP3, rFEFE, rWORD2
 	nor	rTMP4, r7F7F, rWORD2
 	bne	L(done1)
-	and.	rTMP1, rTMP3, rTMP4
+	and.	rTMP3, rTMP3, rTMP4
 	beq	L(loop)
 
+#ifndef __LITTLE_ENDIAN__
 	and	rTMP1, r7F7F, rWORD2
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP4, rTMP1
+	andc	rTMP3, rTMP4, rTMP1
 	b	L(done0)
 
 L(done1):
 	and	rTMP1, r7F7F, rWORD1
 	subi	rSTR, rSTR, 8
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP2, rTMP1
+	andc	rTMP3, rTMP2, rTMP1
 
 /* When we get to here, rSTR points to the first doubleword in the string that
-   contains a zero byte, and the most significant set bit in rWORD1 is in that
-   byte.  */
+   contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, and 0x00
+   otherwise.  */
 L(done0):
-	cntlzd	rTMP3, rWORD1
+	cntlzd	rTMP3, rTMP3
 	subf	rTMP1, rRTN, rSTR
 	srdi	rTMP3, rTMP3, 3
 	add	rRTN, rTMP1, rTMP3
 	blr
+#else
+
+L(done0):
+	addi	rTMP1, rTMP3, -1	/* Form a mask from trailing zeros.  */
+	andc	rTMP1, rTMP1, rTMP3
+	cntlzd	rTMP1, rTMP1		/* Count bits not in the mask.  */
+	subf	rTMP3, rRTN, rSTR
+	subfic	rTMP1, rTMP1, 64-7
+	srdi	rTMP1, rTMP1, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+
+L(done1):
+	addi	rTMP3, rTMP1, -1
+	andc	rTMP3, rTMP3, rTMP1
+	cntlzd	rTMP3, rTMP3
+	subf	rTMP1, rRTN, rSTR
+	subfic	rTMP3, rTMP3, 64-7-64
+	sradi	rTMP3, rTMP3, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+#endif
+
 END (strlen)
 libc_hidden_builtin_def (strlen)
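
As a rough C restatement of the final length calculation in the strlen changes
above (a sketch only: it assumes a doubleword with 0x80 marking each zero byte,
as method (2) produces, and the helper name is invented here):

#include <stdint.h>
#include <stddef.h>

/* Given the non-zero doubleword T that flags the terminating NUL, with
   0x80 set in each byte that was zero, return how many non-NUL bytes
   precede the NUL within that doubleword.  */
static size_t
bytes_before_nul (uint64_t t)
{
#ifdef __LITTLE_ENDIAN__
  /* The lowest-addressed match is the least significant 0x80: form a
     mask of the trailing zero bits and count them, as the patch does
     with addi/andc/popcntd.  */
  uint64_t mask = (t - 1) & ~t;
  return (size_t) __builtin_popcountll (mask) >> 3;
#else
  /* The lowest-addressed match is the most significant 0x80: count the
     leading zero bits, as the original cntlzd does.  */
  return (size_t) __builtin_clzll (t) >> 3;
#endif
}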

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9da8004533e94932815d7f7ea1ecd791a83bd5a4

commit 9da8004533e94932815d7f7ea1ecd791a83bd5a4
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:01:06 2013 -0500

    PowerPC: LE strcpy
    
    The strcpy changes for little-endian are quite straightforward, just
    a matter of rotating the last word differently.
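
In rough C terms (a sketch only; the helper and its name are invented here, and
the same __LITTLE_ENDIAN__ test as the patch is assumed), the difference is
just which end of the word supplies the first byte stored:

#include <stdint.h>

/* Store the bytes of the final word W, in memory order, stopping after
   the NUL byte has been written.  */
static char *
copy_last_word (char *dst, uint32_t w)
{
  for (int i = 0; i < 4; i++)
    {
#ifdef __LITTLE_ENDIAN__
      unsigned char b = (w >> (8 * i)) & 0xff;        /* LSB sits at the lowest address */
#else
      unsigned char b = (w >> (8 * (3 - i))) & 0xff;  /* MSB sits at the lowest address */
#endif
      *dst++ = (char) b;
      if (b == 0)
        break;
    }
  return dst;
}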

diff --git a/sysdeps/powerpc/powerpc32/stpcpy.S b/sysdeps/powerpc/powerpc32/stpcpy.S
index 03c6ddd..7e106e0 100644
--- a/sysdeps/powerpc/powerpc32/stpcpy.S
+++ b/sysdeps/powerpc/powerpc32/stpcpy.S
@@ -62,7 +62,22 @@ L(g2):	add	rTMP, rFEFE, rWORD
 
 	mr	rALT, rWORD
 /* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31
+	stbu	rALT, 4(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rDEST)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 1(rDEST)
+	blr
+#else
+	rlwinm.	rTMP, rALT, 8, 24, 31
 	stbu	rTMP, 4(rDEST)
 	beqlr-
 	rlwinm.	rTMP, rALT, 16, 24, 31
@@ -73,6 +88,7 @@ L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
 	beqlr-
 	stbu	rALT, 1(rDEST)
 	blr
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc32/strcpy.S b/sysdeps/powerpc/powerpc32/strcpy.S
index 4ae577d..e938cc4 100644
--- a/sysdeps/powerpc/powerpc32/strcpy.S
+++ b/sysdeps/powerpc/powerpc32/strcpy.S
@@ -62,7 +62,22 @@ L(g2):	add	rTMP, rFEFE, rWORD
 
 	mr	rALT, rWORD
 /* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31
+	stb	rALT, 4(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stb	rTMP, 5(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stb	rTMP, 6(rDEST)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31
+	stb	rTMP, 7(rDEST)
+	blr
+#else
+	rlwinm.	rTMP, rALT, 8, 24, 31
 	stb	rTMP, 4(rDEST)
 	beqlr-
 	rlwinm.	rTMP, rALT, 16, 24, 31
@@ -73,6 +88,7 @@ L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
 	beqlr-
 	stb	rALT, 7(rDEST)
 	blr
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc64/stpcpy.S b/sysdeps/powerpc/powerpc64/stpcpy.S
index 070cd46..c0b3972 100644
--- a/sysdeps/powerpc/powerpc64/stpcpy.S
+++ b/sysdeps/powerpc/powerpc64/stpcpy.S
@@ -62,7 +62,22 @@ L(g2):	add	rTMP, rFEFE, rWORD
 
 	mr	rALT, rWORD
 /* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31
+	stbu	rALT, 4(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rDEST)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 1(rDEST)
+	blr
+#else
+	rlwinm.	rTMP, rALT, 8, 24, 31
 	stbu	rTMP, 4(rDEST)
 	beqlr-
 	rlwinm.	rTMP, rALT, 16, 24, 31
@@ -73,6 +88,7 @@ L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
 	beqlr-
 	stbu	rALT, 1(rDEST)
 	blr
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc64/strcpy.S b/sysdeps/powerpc/powerpc64/strcpy.S
index 4c6fd3f..a7fd85b 100644
--- a/sysdeps/powerpc/powerpc64/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/strcpy.S
@@ -68,6 +68,32 @@ L(g2):	add	rTMP, rFEFE, rWORD
 	mr	rALT, rWORD
 /* We've hit the end of the string.  Do the rest byte-by-byte.  */
 L(g1):
+#ifdef __LITTLE_ENDIAN__
+	extrdi.	rTMP, rALT, 8, 56
+	stb	rALT, 8(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 48
+	stb	rTMP, 9(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 40
+	stb	rTMP, 10(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 32
+	stb	rTMP, 11(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 24
+	stb	rTMP, 12(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 16
+	stb	rTMP, 13(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 8
+	stb	rTMP, 14(rDEST)
+	beqlr-
+	extrdi	rTMP, rALT, 8, 0
+	stb	rTMP, 15(rDEST)
+	blr
+#else
 	extrdi.	rTMP, rALT, 8, 0
 	stb	rTMP, 8(rDEST)
 	beqlr-
@@ -91,6 +117,7 @@ L(g1):
 	beqlr-
 	stb	rALT, 15(rDEST)
 	blr
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3bda0ef3e03e63f3bbab893eb31a5c71811522fa

commit 3bda0ef3e03e63f3bbab893eb31a5c71811522fa
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:00:51 2013 -0500

    PowerPC: LE setjmp/longjmp
    
    Little-endian fixes for setjmp/longjmp.  When writing these I noticed
    the setjmp code corrupts the non-volatile VMX registers when using an
    unaligned buffer.  Anton fixed this, and also simplified it quite a
    bit.
    
    The current code uses boilerplate for the case where we want to store
    16 bytes to an unaligned address.  For that we have to do a
    read/modify/write of two aligned 16-byte quantities.  In our case we
    are storing a bunch of back-to-back data (consecutive VMX registers),
    and only the start and end of the region need the read/modify/write.
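
A byte-level C sketch of that idea (purely illustrative: it stands in for the
VMX lvsr/vperm/stvx sequence, the helper name is invented, and the length is
assumed to be a non-zero multiple of 16):

#include <stdint.h>
#include <string.h>

/* Store LEN bytes from SRC at the possibly misaligned DST using only
   aligned 16-byte stores.  Only the first and last aligned slots need a
   read/modify/write; the slots in between are built purely from source
   data.  */
static void
store_unaligned_run (unsigned char *dst, const unsigned char *src, size_t len)
{
  size_t mis = (uintptr_t) dst & 15;
  if (mis == 0)
    {
      memcpy (dst, src, len);			/* already aligned: nothing to merge */
      return;
    }

  unsigned char *first = dst - mis;		/* aligned slot covering the start */
  unsigned char *last = dst + len - mis;	/* aligned slot covering the end */
  unsigned char block[16];

  memcpy (block, first, 16);			/* read/modify/write at the start */
  memcpy (block + mis, src, 16 - mis);
  memcpy (first, block, 16);

  for (unsigned char *p = first + 16; p < last; p += 16)
    memcpy (p, src + (p - dst), 16);		/* middle slots: plain stores */

  memcpy (block, last, 16);			/* read/modify/write at the end */
  memcpy (block, src + (len - mis), mis);
  memcpy (last, block, 16);
}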

diff --git a/sysdeps/powerpc/jmpbuf-offsets.h b/sysdeps/powerpc/jmpbuf-offsets.h
index 64c658a..f2116bd 100644
--- a/sysdeps/powerpc/jmpbuf-offsets.h
+++ b/sysdeps/powerpc/jmpbuf-offsets.h
@@ -21,12 +21,10 @@
 #define JB_LR     2  /* The address we will return to */
 #if __WORDSIZE == 64
 # define JB_GPRS   3  /* GPRs 14 through 31 are saved, 18*2 words total.  */
-# define JB_CR     21 /* Condition code registers with the VRSAVE at */
-                       /* offset 172 (low half of the double word.  */
+# define JB_CR     21 /* Shared dword with VRSAVE.  CR word at offset 172.  */
 # define JB_FPRS   22 /* FPRs 14 through 31 are saved, 18*2 words total.  */
 # define JB_SIZE   (64 * 8) /* As per PPC64-VMX ABI.  */
-# define JB_VRSAVE 21 /* VRSAVE shares a double word with the CR at offset */
-                       /* 168 (high half of the double word).  */
+# define JB_VRSAVE 21 /* Shared dword with CR.  VRSAVE word at offset 168.  */
 # define JB_VRS    40 /* VRs 20 through 31 are saved, 12*4 words total.  */
 #else
 # define JB_GPRS   3  /* GPRs 14 through 31 are saved, 18 in total.  */
diff --git a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
index 9d34cd9..d02aa57 100644
--- a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
+++ b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
@@ -43,16 +43,16 @@ ENTRY (__longjmp)
 #   endif
 	mtlr    r6
 	cfi_same_value (lr)
-	lwz     r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r5)
+	lwz     r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r5)
 #  else
 	lwz     r5,_dl_hwcap@got(r5)
 	mtlr    r6
 	cfi_same_value (lr)
-	lwz     r5,4(r5)
+	lwz     r5,LOWORD(r5)
 #  endif
 # else
-	lis	r5,(_dl_hwcap+4)@ha
-	lwz     r5,(_dl_hwcap+4)@l(r5)
+	lis	r5,(_dl_hwcap+LOWORD)@ha
+	lwz     r5,(_dl_hwcap+LOWORD)@l(r5)
 # endif
 	andis.	r5,r5,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 	beq	L(no_vmx)
diff --git a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
index 46ea2b0..f324406 100644
--- a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
+++ b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
@@ -94,14 +94,14 @@ ENTRY (__sigsetjmp)
 #   else
 	lwz     r5,_rtld_global_ro@got(r5)
 #   endif
-	lwz     r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r5)
+	lwz     r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r5)
 #  else
 	lwz     r5,_dl_hwcap@got(r5)
-	lwz     r5,4(r5)
+	lwz     r5,LOWORD(r5)
 #  endif
 # else
-	lis	r6,(_dl_hwcap+4)@ha
-	lwz     r5,(_dl_hwcap+4)@l(r6)
+	lis	r6,(_dl_hwcap+LOWORD)@ha
+	lwz     r5,(_dl_hwcap+LOWORD)@l(r6)
 # endif
 	andis.	r5,r5,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 	beq	L(no_vmx)
@@ -111,44 +111,43 @@ ENTRY (__sigsetjmp)
 	stw	r0,((JB_VRSAVE)*4)(3)
 	addi	r6,r5,16
 	beq+	L(aligned_save_vmx)
-	lvsr	v0,0,r5
-	vspltisb v1,-1         /* set v1 to all 1's */
-	vspltisb v2,0          /* set v2 to all 0's */
-	vperm   v3,v2,v1,v0   /* v3 contains shift mask with num all 1 bytes on left = misalignment  */
 
+	lvsr	v0,0,r5
+	lvsl	v1,0,r5
+	addi	r6,r5,-16
 
-	/* Special case for v20 we need to preserve what is in save area below v20 before obliterating it */
-	lvx     v5,0,r5
-	vperm   v20,v20,v20,v0
-	vsel    v5,v5,v20,v3
-	vsel    v20,v20,v2,v3
-	stvx    v5,0,r5
+# define save_misaligned_vmx(savevr,prevvr,shiftvr,tmpvr,savegpr,addgpr) \
+	addi	addgpr,addgpr,32;					 \
+	vperm	tmpvr,prevvr,savevr,shiftvr;				 \
+	stvx	tmpvr,0,savegpr
 
-#define save_2vmx_partial(savevr,prev_savevr,hivr,shiftvr,maskvr,savegpr,addgpr) \
-	addi    addgpr,addgpr,32; \
-	vperm   savevr,savevr,savevr,shiftvr; \
-	vsel    hivr,prev_savevr,savevr,maskvr; \
-	stvx    hivr,0,savegpr;
+	/*
+	 * We have to be careful not to corrupt the data below v20 and
+	 * above v31. To keep things simple we just rotate both ends in
+	 * the opposite direction to our main permute so we can use
+	 * the common macro.
+	 */
 
-	save_2vmx_partial(v21,v20,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v22,v21,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v23,v22,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v24,v23,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v25,v24,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v26,v25,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v27,v26,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v28,v27,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v29,v28,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v30,v29,v5,v0,v3,r5,r6)
+	/* load and rotate data below v20 */
+	lvx	v2,0,r5
+	vperm	v2,v2,v2,v1
+	save_misaligned_vmx(v20,v2,v0,v3,r5,r6)
+	save_misaligned_vmx(v21,v20,v0,v3,r6,r5)
+	save_misaligned_vmx(v22,v21,v0,v3,r5,r6)
+	save_misaligned_vmx(v23,v22,v0,v3,r6,r5)
+	save_misaligned_vmx(v24,v23,v0,v3,r5,r6)
+	save_misaligned_vmx(v25,v24,v0,v3,r6,r5)
+	save_misaligned_vmx(v26,v25,v0,v3,r5,r6)
+	save_misaligned_vmx(v27,v26,v0,v3,r6,r5)
+	save_misaligned_vmx(v28,v27,v0,v3,r5,r6)
+	save_misaligned_vmx(v29,v28,v0,v3,r6,r5)
+	save_misaligned_vmx(v30,v29,v0,v3,r5,r6)
+	save_misaligned_vmx(v31,v30,v0,v3,r6,r5)
+	/* load and rotate data above v31 */
+	lvx	v2,0,r6
+	vperm	v2,v2,v2,v1
+	save_misaligned_vmx(v2,v31,v0,v3,r5,r6)
 
-	/* Special case for r31 we need to preserve what is in save area above v31 before obliterating it */
-	addi    r5,r5,32
-	vperm   v31,v31,v31,v0
-	lvx     v4,0,r5
-	vsel    v5,v30,v31,v3
-	stvx    v5,0,r6
-	vsel    v4,v31,v4,v3
-	stvx    v4,0,r5
 	b	L(no_vmx)
 
 L(aligned_save_vmx):
diff --git a/sysdeps/powerpc/powerpc64/__longjmp-common.S b/sysdeps/powerpc/powerpc64/__longjmp-common.S
index 70c3704..21ff50f 100644
--- a/sysdeps/powerpc/powerpc64/__longjmp-common.S
+++ b/sysdeps/powerpc/powerpc64/__longjmp-common.S
@@ -153,7 +153,7 @@ L(no_vmx):
 	lfd fp21,((JB_FPRS+7)*8)(r3)
 	ld r22,((JB_GPRS+8)*8)(r3)
 	lfd fp22,((JB_FPRS+8)*8)(r3)
-	ld r0,(JB_CR*8)(r3)
+	lwz r0,((JB_CR*8)+4)(r3)
 	ld r23,((JB_GPRS+9)*8)(r3)
 	lfd fp23,((JB_FPRS+9)*8)(r3)
 	ld r24,((JB_GPRS+10)*8)(r3)
diff --git a/sysdeps/powerpc/powerpc64/setjmp-common.S b/sysdeps/powerpc/powerpc64/setjmp-common.S
index 58ec610..1c8b7cb 100644
--- a/sysdeps/powerpc/powerpc64/setjmp-common.S
+++ b/sysdeps/powerpc/powerpc64/setjmp-common.S
@@ -95,7 +95,7 @@ JUMPTARGET(GLUE(__sigsetjmp,_ent)):
 	mfcr r0
 	std  r16,((JB_GPRS+2)*8)(3)
 	stfd fp16,((JB_FPRS+2)*8)(3)
-	std  r0,(JB_CR*8)(3)
+	stw  r0,((JB_CR*8)+4)(3)
 	std  r17,((JB_GPRS+3)*8)(3)
 	stfd fp17,((JB_FPRS+3)*8)(3)
 	std  r18,((JB_GPRS+4)*8)(3)
@@ -142,47 +142,43 @@ JUMPTARGET(GLUE(__sigsetjmp,_ent)):
 	stw	r0,((JB_VRSAVE)*8)(3)
 	addi	r6,r5,16
 	beq+	L(aligned_save_vmx)
-	lvsr	v0,0,r5
-	vspltisb v1,-1         /* set v1 to all 1's */
-	vspltisb v2,0          /* set v2 to all 0's */
-	vperm   v3,v2,v1,v0   /* v3 contains shift mask with num all 1 bytes
-				 on left = misalignment  */
 
+	lvsr	v0,0,r5
+	lvsl	v1,0,r5
+	addi	r6,r5,-16
 
-	/* Special case for v20 we need to preserve what is in save area
-	   below v20 before obliterating it */
-	lvx     v5,0,r5
-	vperm   v20,v20,v20,v0
-	vsel    v5,v5,v20,v3
-	vsel    v20,v20,v2,v3
-	stvx    v5,0,r5
+# define save_misaligned_vmx(savevr,prevvr,shiftvr,tmpvr,savegpr,addgpr) \
+	addi	addgpr,addgpr,32;					 \
+	vperm	tmpvr,prevvr,savevr,shiftvr;				 \
+	stvx	tmpvr,0,savegpr
 
-# define save_2vmx_partial(savevr,prev_savevr,hivr,shiftvr,maskvr,savegpr,addgpr) \
-	addi    addgpr,addgpr,32; \
-	vperm   savevr,savevr,savevr,shiftvr; \
-	vsel    hivr,prev_savevr,savevr,maskvr; \
-	stvx    hivr,0,savegpr;
+	/*
+	 * We have to be careful not to corrupt the data below v20 and
+	 * above v31. To keep things simple we just rotate both ends in
+	 * the opposite direction to our main permute so we can use
+	 * the common macro.
+	 */
 
-	save_2vmx_partial(v21,v20,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v22,v21,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v23,v22,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v24,v23,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v25,v24,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v26,v25,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v27,v26,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v28,v27,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v29,v28,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v30,v29,v5,v0,v3,r5,r6)
+	/* load and rotate data below v20 */
+	lvx	v2,0,r5
+	vperm	v2,v2,v2,v1
+	save_misaligned_vmx(v20,v2,v0,v3,r5,r6)
+	save_misaligned_vmx(v21,v20,v0,v3,r6,r5)
+	save_misaligned_vmx(v22,v21,v0,v3,r5,r6)
+	save_misaligned_vmx(v23,v22,v0,v3,r6,r5)
+	save_misaligned_vmx(v24,v23,v0,v3,r5,r6)
+	save_misaligned_vmx(v25,v24,v0,v3,r6,r5)
+	save_misaligned_vmx(v26,v25,v0,v3,r5,r6)
+	save_misaligned_vmx(v27,v26,v0,v3,r6,r5)
+	save_misaligned_vmx(v28,v27,v0,v3,r5,r6)
+	save_misaligned_vmx(v29,v28,v0,v3,r6,r5)
+	save_misaligned_vmx(v30,v29,v0,v3,r5,r6)
+	save_misaligned_vmx(v31,v30,v0,v3,r6,r5)
+	/* load and rotate data above v31 */
+	lvx	v2,0,r6
+	vperm	v2,v2,v2,v1
+	save_misaligned_vmx(v2,v31,v0,v3,r5,r6)
 
-	/* Special case for r31 we need to preserve what is in save area
-	   above v31 before obliterating it */
-	addi    r5,r5,32
-	vperm   v31,v31,v31,v0
-	lvx     v4,0,r5
-	vsel    v5,v30,v31,v3
-	stvx    v5,0,r6
-	vsel    v4,v31,v4,v3
-	stvx    v4,0,r5
 	b	L(no_vmx)
 
 L(aligned_save_vmx):

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e2685c784b36cb8beb2341cbe5b02939eaf01da4

commit e2685c784b36cb8beb2341cbe5b02939eaf01da4
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 15:00:29 2013 -0500

    PowerPC: ugly symbol versioning
    
    This patch fixes symbol versioning in setjmp/longjmp.  The existing
    code uses raw versions, which results in wrong symbol versioning when
    you want to build glibc with a base version of 2.19 for LE.
    
    Note that merging the 64-bit and 32-bit versions in novmx-longjmp.c
    and pt-longjmp.c doesn't result in GLIBC_2.0 versions for 64-bit, due
    to the base in shlib-versions.
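
For reference, a minimal sketch of the pattern the patch converts to, using the
compat_symbol/versioned_symbol macros from glibc's include/shlib-compat.h (the
declarations around them are invented for illustration):

#include <setjmp.h>
#include <shlib-compat.h>

void __novmx_longjmp (jmp_buf env, int val);
void __vmx_longjmp (jmp_buf env, int val);

/* Symbolic version tokens let shlib-compat.h do the right thing when the
   library's base version is newer than the named version: versions older
   than the base collapse to the base, and compat-only ranges such as
   GLIBC_2_0..GLIBC_2_3_4 drop out entirely, which is what a 2.19-based
   little-endian build needs.  */
#if defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
compat_symbol (libc, __novmx_longjmp, longjmp, GLIBC_2_0);
#endif
versioned_symbol (libc, __vmx_longjmp, longjmp, GLIBC_2_3_4);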

diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/pt-longjmp.c b/nptl/sysdeps/unix/sysv/linux/powerpc/pt-longjmp.c
index ace858f..4ac913c 100644
--- a/nptl/sysdeps/unix/sysv/linux/powerpc/pt-longjmp.c
+++ b/nptl/sysdeps/unix/sysv/linux/powerpc/pt-longjmp.c
@@ -41,13 +41,8 @@ void __novmx_longjmp (jmp_buf env, int val)
   __novmx__libc_longjmp (env, val);
 }
 
-# if __WORDSIZE == 64
-symbol_version (__novmx_longjmp,longjmp,GLIBC_2.3);
-symbol_version (__novmx_siglongjmp,siglongjmp,GLIBC_2.3);
-# else
-symbol_version (__novmx_longjmp,longjmp,GLIBC_2.0);
-symbol_version (__novmx_siglongjmp,siglongjmp,GLIBC_2.0);
-# endif
+compat_symbol (libpthread, __novmx_longjmp, longjmp, GLIBC_2_0);
+compat_symbol (libpthread, __novmx_siglongjmp, siglongjmp, GLIBC_2_0);
 #endif /* defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4))  */
 
 void
@@ -62,5 +57,5 @@ __vmx_siglongjmp (jmp_buf env, int val)
   __libc_siglongjmp (env, val);
 }
 
-versioned_symbol (libc, __vmx_longjmp, longjmp, GLIBC_2_3_4);
-versioned_symbol (libc, __vmx_siglongjmp, siglongjmp, GLIBC_2_3_4);
+versioned_symbol (libpthread, __vmx_longjmp, longjmp, GLIBC_2_3_4);
+versioned_symbol (libpthread, __vmx_siglongjmp, siglongjmp, GLIBC_2_3_4);
diff --git a/sysdeps/powerpc/longjmp.c b/sysdeps/powerpc/longjmp.c
index 198c894..189fc03 100644
--- a/sysdeps/powerpc/longjmp.c
+++ b/sysdeps/powerpc/longjmp.c
@@ -55,6 +55,6 @@ weak_alias (__vmx__libc_siglongjmp, __vmxsiglongjmp)
 
 default_symbol_version (__vmx__libc_longjmp, __libc_longjmp, GLIBC_PRIVATE);
 default_symbol_version (__vmx__libc_siglongjmp, __libc_siglongjmp, GLIBC_PRIVATE);
-default_symbol_version (__vmx_longjmp, _longjmp, GLIBC_2.3.4);
-default_symbol_version (__vmxlongjmp, longjmp, GLIBC_2.3.4);
-default_symbol_version (__vmxsiglongjmp, siglongjmp, GLIBC_2.3.4);
+versioned_symbol (libc, __vmx_longjmp, _longjmp, GLIBC_2_3_4);
+versioned_symbol (libc, __vmxlongjmp, longjmp, GLIBC_2_3_4);
+versioned_symbol (libc, __vmxsiglongjmp, siglongjmp, GLIBC_2_3_4);
diff --git a/sysdeps/powerpc/novmx-longjmp.c b/sysdeps/powerpc/novmx-longjmp.c
index 8f6ea35..b2c0e4c 100644
--- a/sysdeps/powerpc/novmx-longjmp.c
+++ b/sysdeps/powerpc/novmx-longjmp.c
@@ -50,13 +50,7 @@ weak_alias (__novmx__libc_siglongjmp, __novmx_longjmp)
 weak_alias (__novmx__libc_siglongjmp, __novmxlongjmp)
 weak_alias (__novmx__libc_siglongjmp, __novmxsiglongjmp)
 
-# if __WORDSIZE == 64
-symbol_version (__novmx_longjmp,_longjmp,GLIBC_2.3);
-symbol_version (__novmxlongjmp,longjmp,GLIBC_2.3);
-symbol_version (__novmxsiglongjmp,siglongjmp,GLIBC_2.3);
-# else
-symbol_version (__novmx_longjmp,_longjmp,GLIBC_2.0);
-symbol_version (__novmxlongjmp,longjmp,GLIBC_2.0);
-symbol_version (__novmxsiglongjmp,siglongjmp,GLIBC_2.0);
-# endif
+compat_symbol (libc, __novmx_longjmp, _longjmp, GLIBC_2_0);
+compat_symbol (libc, __novmxlongjmp, longjmp, GLIBC_2_0);
+compat_symbol (libc, __novmxsiglongjmp, siglongjmp, GLIBC_2_0);
 #endif /* defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4))  */
diff --git a/sysdeps/powerpc/powerpc32/bsd-_setjmp.S b/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
index 95e8a5a..ad2b5ff 100644
--- a/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
+++ b/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
@@ -30,7 +30,7 @@ libc_hidden_def (_setjmp)
 /* Build a versioned object for libc.  */
 
 # if defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
-symbol_version (__novmx_setjmp,_setjmp,GLIBC_2.0);
+compat_symbol (libc, __novmx_setjmp, _setjmp, GLIBC_2_0);
 
 ENTRY (__novmx_setjmp)
 	li r4,0			/* Set second argument to 0.  */
@@ -39,7 +39,7 @@ END (__novmx_setjmp)
 libc_hidden_def (__novmx_setjmp)
 # endif /* defined SHARED  && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) */
 
-default_symbol_version (__vmx_setjmp,_setjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx_setjmp, _setjmp, GLIBC_2_3_4)
 /* __GI__setjmp prototype is needed for ntpl i.e. _setjmp is defined
    as a libc_hidden_proto & is used in sysdeps/generic/libc-start.c
    if HAVE_CLEANUP_JMP_BUF is defined */
diff --git a/sysdeps/powerpc/powerpc32/bsd-setjmp.S b/sysdeps/powerpc/powerpc32/bsd-setjmp.S
index 1113ea5..5e1e860 100644
--- a/sysdeps/powerpc/powerpc32/bsd-setjmp.S
+++ b/sysdeps/powerpc/powerpc32/bsd-setjmp.S
@@ -26,7 +26,7 @@ ENTRY (__novmxsetjmp)
 	b __novmx__sigsetjmp@local
 END (__novmxsetjmp)
 strong_alias (__novmxsetjmp, __novmx__setjmp)
-symbol_version (__novmxsetjmp, setjmp, GLIBC_2.0)
+compat_symbol (libc, __novmxsetjmp, setjmp, GLIBC_2_0)
 
 #endif  /* defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) ) */
 
@@ -36,4 +36,4 @@ ENTRY (__vmxsetjmp)
 END (__vmxsetjmp)
 strong_alias (__vmxsetjmp, __vmx__setjmp)
 strong_alias (__vmx__setjmp, __setjmp)
-default_symbol_version (__vmxsetjmp,setjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmxsetjmp, setjmp, GLIBC_2_3_4)
diff --git a/sysdeps/powerpc/powerpc32/fpu/__longjmp.S b/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
index 96e50de..27166c4 100644
--- a/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
+++ b/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
@@ -26,14 +26,14 @@
 
 #else /* !NOT_IN_libc */
 /* Build a versioned object for libc.  */
-default_symbol_version (__vmx__longjmp,__longjmp,GLIBC_2.3.4);
+versioned_symbol (libc, __vmx__longjmp, __longjmp, GLIBC_2_3_4);
 # define __longjmp  __vmx__longjmp
 # include "__longjmp-common.S"
 
 # if defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 #  define __NO_VMX__
 #  undef JB_SIZE
-symbol_version (__novmx__longjmp,__longjmp,GLIBC_2.0);
+compat_symbol (libc, __novmx__longjmp, __longjmp, GLIBC_2_0);
 #  undef __longjmp
 #  define __longjmp  __novmx__longjmp
 #  include "__longjmp-common.S"
diff --git a/sysdeps/powerpc/powerpc32/fpu/setjmp.S b/sysdeps/powerpc/powerpc32/fpu/setjmp.S
index 60cd350..92acff1 100644
--- a/sysdeps/powerpc/powerpc32/fpu/setjmp.S
+++ b/sysdeps/powerpc/powerpc32/fpu/setjmp.S
@@ -26,7 +26,7 @@
 
 #else /* !NOT_IN_libc */
 /* Build a versioned object for libc.  */
-default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx__sigsetjmp, __sigsetjmp, GLIBC_2_3_4)
 # define __sigsetjmp __vmx__sigsetjmp
 # define __sigjmp_save __vmx__sigjmp_save
 # include "setjmp-common.S"
@@ -36,7 +36,7 @@ default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
 #  undef __sigsetjmp
 #  undef __sigjmp_save
 #  undef JB_SIZE
-symbol_version (__novmx__sigsetjmp,__sigsetjmp,GLIBC_2.0)
+compat_symbol (libc, __novmx__sigsetjmp, __sigsetjmp, GLIBC_2_0)
 #  define __sigsetjmp __novmx__sigsetjmp
 #  define __sigjmp_save __novmx__sigjmp_save
 #  include "setjmp-common.S"
diff --git a/sysdeps/powerpc/powerpc32/mcount.c b/sysdeps/powerpc/powerpc32/mcount.c
index 0476bf6..d8c0632 100644
--- a/sysdeps/powerpc/powerpc32/mcount.c
+++ b/sysdeps/powerpc/powerpc32/mcount.c
@@ -9,7 +9,7 @@
 /* __mcount_internal was added in glibc 2.15 with version GLIBC_PRIVATE,
    but it should have been put in version GLIBC_2.15.  Mark the
    GLIBC_PRIVATE version obsolete and add it to GLIBC_2.16 instead.  */
-default_symbol_version (___mcount_internal, __mcount_internal, GLIBC_2.16);
+versioned_symbol (libc, ___mcount_internal, __mcount_internal, GLIBC_2_16);
 
 #if SHLIB_COMPAT (libc, GLIBC_2_15, GLIBC_2_16)
 strong_alias (___mcount_internal, ___mcount_internal_private);
diff --git a/sysdeps/powerpc/powerpc32/setjmp.S b/sysdeps/powerpc/powerpc32/setjmp.S
index 8a8cf0d..49b64ec 100644
--- a/sysdeps/powerpc/powerpc32/setjmp.S
+++ b/sysdeps/powerpc/powerpc32/setjmp.S
@@ -25,7 +25,7 @@
 
 #else /* !NOT_IN_libc */
 /* Build a versioned object for libc.  */
-default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx__sigsetjmp, __sigsetjmp, GLIBC_2_3_4)
 # define __sigsetjmp __vmx__sigsetjmp
 # define __sigjmp_save __vmx__sigjmp_save
 # include "setjmp-common.S"
@@ -35,7 +35,7 @@ default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
 #  undef __sigsetjmp
 #  undef __sigjmp_save
 #  undef JB_SIZE
-symbol_version (__novmx__sigsetjmp,__sigsetjmp,GLIBC_2.0)
+compat_symbol (libc, __novmx__sigsetjmp, __sigsetjmp, GLIBC_2_0)
 #  define __sigsetjmp __novmx__sigsetjmp
 #  define __sigjmp_save __novmx__sigjmp_save
 #  include "setjmp-common.S"
diff --git a/sysdeps/powerpc/powerpc64/setjmp.S b/sysdeps/powerpc/powerpc64/setjmp.S
index 667b9d1..0a3b2fc 100644
--- a/sysdeps/powerpc/powerpc64/setjmp.S
+++ b/sysdeps/powerpc/powerpc64/setjmp.S
@@ -26,9 +26,9 @@
 
 #else /* !NOT_IN_libc */
 /* Build a versioned object for libc.  */
-default_symbol_version (__vmxsetjmp, setjmp, GLIBC_2.3.4)
-default_symbol_version (__vmx_setjmp,_setjmp,GLIBC_2.3.4)
-default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmxsetjmp, setjmp, GLIBC_2_3_4)
+versioned_symbol (libc, __vmx_setjmp, _setjmp, GLIBC_2_3_4)
+versioned_symbol (libc, __vmx__sigsetjmp, __sigsetjmp, GLIBC_2_3_4)
 # define setjmp __vmxsetjmp
 # define _setjmp __vmx_setjmp
 # define __sigsetjmp __vmx__sigsetjmp
@@ -44,9 +44,9 @@ strong_alias (__vmx__sigsetjmp, __setjmp)
 #  undef __sigjmp_save
 #  undef JB_SIZE
 #  define __NO_VMX__
-symbol_version (__novmxsetjmp, setjmp, GLIBC_2.3)
-symbol_version (__novmx_setjmp,_setjmp,GLIBC_2.3);
-symbol_version (__novmx__sigsetjmp,__sigsetjmp,GLIBC_2.3)
+compat_symbol (libc, __novmxsetjmp, setjmp, GLIBC_2_3)
+compat_symbol (libc, __novmx_setjmp,_setjmp, GLIBC_2_3);
+compat_symbol (libc, __novmx__sigsetjmp,__sigsetjmp, GLIBC_2_3)
 #  define setjmp __novmxsetjmp
 #  define _setjmp __novmx_setjmp
 #  define __sigsetjmp __novmx__sigsetjmp

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=836e8fa0fb4d9487df01090f2d7d49772c363346

commit 836e8fa0fb4d9487df01090f2d7d49772c363346
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 14:58:25 2013 -0500

    PowerPC: Fix SIGSTKSZ definition
    
    This copies the sparc version of sigstack.h, which gives powerpc a
    MINSIGSTKSZ of 4096 and a SIGSTKSZ of 16384 (as defined in the new
    header below).
    
    Before the VSX changes, struct rt_sigframe size was 1920 plus 128 for
    __SIGNAL_FRAMESIZE giving ppc64 exactly the default MINSIGSTKSZ of
    2048.
    
    After VSX, ucontext increased by 256 bytes.  Oops, we're over
    MINSIGSTKSZ, so powerpc has been using the wrong value for quite a
    while.  Add another ucontext for TM and rt_sigframe is now at 3872,
    giving actual MINSIGSTKSZ of 4000.
    
    The glibc testcase that I was looking at was tst-cancel21, which
    allocates 2*SIGSTKSZ (not because the test is trying to be
    conservative, but because the test actually has nested signal stack
    frames).  We blew the allocation by 48 bytes when using current
    mainline gcc to compile glibc (le ppc64).
    
    The required stack depth in _dl_lookup_symbol_x from the top of the
    next signal frame was 10944 bytes.  I guess you'd want to add 288 to
    that, implying an actual SIGSTKSZ of 11232.
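
The arithmetic above, condensed into a compile-time sketch (the frame sizes are
taken from the text, not recomputed):

/* Sizes quoted in the commit message.  */
#define SIGNAL_FRAMESIZE     128
#define RT_SIGFRAME_PRE_VSX 1920	/* before the VSX changes */
#define RT_SIGFRAME_NOW     3872	/* with the VSX and TM ucontexts */

_Static_assert (RT_SIGFRAME_PRE_VSX + SIGNAL_FRAMESIZE == 2048,
		"old frame exactly matched the default MINSIGSTKSZ");
_Static_assert (RT_SIGFRAME_NOW + SIGNAL_FRAMESIZE == 4000,
		"actual requirement is now 4000, hence the MINSIGSTKSZ of 4096 below");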

diff --git a/sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h b/sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h
new file mode 100644
index 0000000..33be9e8
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h
@@ -0,0 +1,54 @@
+/* sigstack, sigaltstack definitions.
+   Copyright (C) 1998-2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SIGNAL_H
+# error "Never include this file directly.  Use <signal.h> instead"
+#endif
+
+
+/* Structure describing a signal stack (obsolete).  */
+struct sigstack
+  {
+    void *ss_sp;		/* Signal stack pointer.  */
+    int ss_onstack;		/* Nonzero if executing on this stack.  */
+  };
+
+
+/* Possible values for `ss_flags.'.  */
+enum
+{
+  SS_ONSTACK = 1,
+#define SS_ONSTACK	SS_ONSTACK
+  SS_DISABLE
+#define SS_DISABLE	SS_DISABLE
+};
+
+/* Minimum stack size for a signal handler.  */
+#define MINSIGSTKSZ	4096
+
+/* System default stack size.  */
+#define SIGSTKSZ	16384
+
+
+/* Alternate, preferred interface.  */
+typedef struct sigaltstack
+  {
+    void *ss_sp;
+    int ss_flags;
+    size_t ss_size;
+  } stack_t;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e528b6ce9dadcf496c62a20d6cdbb6ff54e5a009

commit e528b6ce9dadcf496c62a20d6cdbb6ff54e5a009
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 14:57:36 2013 -0500

    PowerPC: LE makecontext
    
    Use conditional form of branch and link to avoid destroying the cpu
    link stack used to predict blr return addresses.

diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S
index 95902b1..54f44c8 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S
@@ -47,7 +47,7 @@ ENTRY(__makecontext)
 #ifdef PIC
 	mflr	r0
 	cfi_register(lr,r0)
-	bl	1f
+	bcl	20,31,1f
 1:	mflr	r6
 	addi	r6,r6,L(exitcode)-1b
 	mtlr	r0
@@ -136,7 +136,7 @@ ENTRY(__novec_makecontext)
 #ifdef PIC
 	mflr	r0
 	cfi_register(lr,r0)
-	bl	1f
+	bcl	20,31,1f
 1:	mflr	r6
 	addi	r6,r6,L(novec_exitcode)-1b
 	mtlr	r0
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
index 4a16669..4b5550b 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
@@ -125,7 +125,7 @@ L(noparms):
   /* If the target function returns we need to do some cleanup.  We use a
      code trick to get the address of our cleanup function into the link
      register.  Do not add any code between here and L(exitcode).  */
-  bl  L(gotexitcodeaddr);
+  bcl	20,31,L(gotexitcodeaddr);
 
 	/* This is the helper code which gets called if a function which
 	   is registered with 'makecontext' returns.  In this case we

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c42fdd75b956add5ab1da1cdd2468b98825ff4aa

commit c42fdd75b956add5ab1da1cdd2468b98825ff4aa
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 14:54:38 2013 -0500

    PowerPC: LE _dl_hwcap access
    
    More LE support, correcting word accesses to _dl_hwcap.
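
The LOWORD macro used below is assumed to expand to the byte offset of the low
32-bit word of the 64-bit _dl_hwcap value: 4 on big-endian, 0 on little-endian
(its definition lives elsewhere in this series).  A stand-alone C illustration
of why the offset differs:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  uint64_t hwcap = 0x00000001deadbeefULL;	/* stand-in for the 64-bit _dl_hwcap */
  uint32_t words[2];
  memcpy (words, &hwcap, sizeof hwcap);

  /* The low 32 bits (0xdeadbeef) sit at byte offset 4 on big-endian but
     at byte offset 0 on little-endian, so a hard-coded "+4" only works
     for big-endian.  */
  printf ("word at offset 0: 0x%08x\n", words[0]);
  printf ("word at offset 4: 0x%08x\n", words[1]);
  return 0;
}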

diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
index 6330780..14f39d6 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
@@ -151,15 +151,15 @@ ENTRY(__CONTEXT_FUNC_NAME)
 #   ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
-	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r7)
+	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r7)
 #   else
 	lwz     r7,_dl_hwcap@got(r7)
 	mtlr    r8
-	lwz     r7,4(r7)
+	lwz     r7,LOWORD(r7)
 #   endif
 #  else
-	lis	r7,(_dl_hwcap+4)@ha
-	lwz     r7,(_dl_hwcap+4)@l(r7)
+	lis	r7,(_dl_hwcap+LOWORD)@ha
+	lwz     r7,(_dl_hwcap+LOWORD)@l(r7)
 #  endif
 	andis.	r7,r7,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
index bedebf0..f980d28 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
@@ -79,15 +79,15 @@ ENTRY(__CONTEXT_FUNC_NAME)
 # ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
-	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r7)
+	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r7)
 # else
 	lwz     r7,_dl_hwcap@got(r7)
 	mtlr    r8
-	lwz     r7,4(r7)
+	lwz     r7,LOWORD(r7)
 # endif
 #else
-	lis	r7,(_dl_hwcap+4)@ha
-	lwz     r7,(_dl_hwcap+4)@l(r7)
+	lis	r7,(_dl_hwcap+LOWORD)@ha
+	lwz     r7,(_dl_hwcap+LOWORD)@l(r7)
 #endif
 
 #ifdef __CONTEXT_ENABLE_FPRS
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
index 21c2e1f..90d7d72 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
@@ -152,15 +152,15 @@ ENTRY(__CONTEXT_FUNC_NAME)
 #  ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
-	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r7)
+	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r7)
 #  else
 	lwz     r7,_dl_hwcap@got(r7)
 	mtlr    r8
-	lwz     r7,4(r7)
+	lwz     r7,LOWORD(r7)
 #  endif
 # else
-	lis	r7,(_dl_hwcap+4)@ha
-	lwz     r7,(_dl_hwcap+4)@l(r7)
+	lis	r7,(_dl_hwcap+LOWORD)@ha
+	lwz     r7,(_dl_hwcap+LOWORD)@l(r7)
 # endif
 
 # ifdef __CONTEXT_ENABLE_VRS
@@ -308,14 +308,14 @@ ENTRY(__CONTEXT_FUNC_NAME)
 	mtlr    r8
 #   ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
-	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r7)
+	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r7)
 #   else
 	lwz     r7,_dl_hwcap@got(r7)
-	lwz     r7,4(r7)
+	lwz     r7,LOWORD(r7)
 #   endif
 #  else
-	lis	r7,(_dl_hwcap+4)@ha
-	lwz     r7,(_dl_hwcap+4)@l(r7)
+	lis	r7,(_dl_hwcap+LOWORD)@ha
+	lwz     r7,(_dl_hwcap+LOWORD)@l(r7)
 #  endif
 	andis.	r7,r7,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 	la	r10,(_UC_VREGS)(r31)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=88aa8fe04f00cdfefb9f0aed36ffdec8cf63c7a8

commit 88aa8fe04f00cdfefb9f0aed36ffdec8cf63c7a8
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 14:53:56 2013 -0500

    PowerPC: IBM long double fixes
    
    Fixes for denormals and other corner cases:
    
    - printf_fphex.c: Testing for the low double exponent being zero is
    unnecessary.  If the difference in exponents is less than 53 then the
    high double exponent must be nearing the low end of its range, and the
    low double exponent hit rock bottom.
    
    - ldbl2mpn.c: A denormal (i.e. exponent of zero) value is treated as
    if the exponent were one, so shift the mantissa left by one.  The code
    handling normalisation of the low double mantissa lacked a test for a
    shift count greater than the number of bits in the type being shifted,
    and lacked anything to handle the case where the difference in
    exponents is less than 53, as in printf_fphex.c.  (Why do we have so
    many copies of this code?)
    
    - math_ldbl.h (ldbl_extract_mantissa): Same as above, but worse, with
    code testing for exponent > 1 for some reason, probably a typo for
    >= 1.
    
    - math_ldbl.h (ldbl_insert_mantissa): Round the high double as per
    mpn2ldbl.c (hi is odd or explicit mantissas non-zero) so that the
    number we return won't change when applying ldbl_canonicalize().
    Add missing overflow checks and normalisation of high mantissa.
    Correct misleading comment: "The hidden bit of the lo mantissa is
    zero" is not always true as can be seen from the code rounding the hi
    mantissa.  Also by inspection, lzcount can never be less than zero so
    remove that test.  Lastly, masking bitfields to their widths can be
    left to the compiler.
    
    - mpn2ldbl.c: The overflow checks here on rounding of high double were
    just plain wrong.  Incrementing the exponent must be accompanied by a
    shift right of the mantissa to keep the value unchanged.  Above notes
    for ldbl_insert_mantissa are also relevant.
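
A C sketch of the exponent-difference handling several of these fixes share
(illustrative only; the real code works on the raw ieee754 bit fields):

#include <stdint.h>

/* Align the low double's 53-bit mantissa LO with the high double's,
   given their biased exponents.  For a canonical IBM long double the
   difference is normally 53, but it can be larger (the low part is far
   below 1ulp of the high part) or smaller (the high exponent is near
   its minimum and the low part is denormal), and the shift count can
   exceed the width of the type.  */
static uint64_t
align_low_mantissa (uint64_t lo, int exp_hi, int exp_lo)
{
  int ediff = exp_hi - exp_lo - 53;
  if (ediff > 0)
    return ediff < 64 ? lo >> ediff : 0;
  if (ediff < 0)
    return lo << -ediff;
  return lo;
}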

diff --git a/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c b/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
index 6a72d6a..8885def 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
@@ -243,7 +243,7 @@ int32_t __ieee754_rem_pio2l(long double x, long double *y)
      We split the 113 bits of the mantissa into 5 24bit integers
      stored in a double array.  */
   /* Make the IBM extended format 105 bit mantissa look like the ieee854 112
-     bit mantissa so the next operatation will give the correct result.  */
+     bit mantissa so the next operation will give the correct result.  */
   ldbl_extract_mantissa (&ixd, &lxd, &exp, x);
   exp = exp - 23;
   /* This is faster than doing this in floating point, because we
diff --git a/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c b/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
index 5149ba1..e46fde7 100644
--- a/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
+++ b/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
@@ -36,6 +36,7 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
   union ibm_extended_long_double u;
   unsigned long long hi, lo;
   int ediff;
+
   u.ld = value;
 
   *is_neg = u.d[0].ieee.negative;
@@ -43,27 +44,36 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
 
   lo = ((long long) u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;
   hi = ((long long) u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;
-  /* If the lower double is not a denomal or zero then set the hidden
+
+  /* If the lower double is not a denormal or zero then set the hidden
      53rd bit.  */
-  if (u.d[1].ieee.exponent > 0)
-    {
-      lo |= 1LL << 52;
+  if (u.d[1].ieee.exponent != 0)
+    lo |= 1ULL << 52;
+  else
+    lo = lo << 1;
 
-      /* The lower double is normalized separately from the upper.  We may
-	 need to adjust the lower manitissa to reflect this.  */
-      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;
-      if (ediff > 53)
-	lo = lo >> (ediff-53);
+  /* The lower double is normalized separately from the upper.  We may
+     need to adjust the lower manitissa to reflect this.  */
+  ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent - 53;
+  if (ediff > 0)
+    {
+      if (ediff < 64)
+	lo = lo >> ediff;
+      else
+	lo = 0;
     }
+  else if (ediff < 0)
+    lo = lo << -ediff;
+
   /* The high double may be rounded and the low double reflects the
      difference between the long double and the rounded high double
      value.  This is indicated by a differnce between the signs of the
      high and low doubles.  */
-  if ((u.d[0].ieee.negative != u.d[1].ieee.negative)
-      && ((u.d[1].ieee.exponent != 0) && (lo != 0L)))
+  if (u.d[0].ieee.negative != u.d[1].ieee.negative
+      && lo != 0)
     {
       lo = (1ULL << 53) - lo;
-      if (hi == 0LL)
+      if (hi == 0)
 	{
 	  /* we have a borrow from the hidden bit, so shift left 1.  */
 	  hi = 0x0ffffffffffffeLL | (lo >> 51);
diff --git a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
index 3036f14..69bb2e6 100644
--- a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
+++ b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
@@ -6,85 +6,131 @@
 #include <ieee754.h>
 #include <stdint.h>
 
+/* To suit our callers we return *hi64 and *lo64 as if they came from
+   an ieee854 112 bit mantissa, that is, 48 bits in *hi64 (plus one
+   implicit bit) and 64 bits in *lo64.  */
+
 static inline void
-ldbl_extract_mantissa (int64_t *hi64, uint64_t *lo64, int *exp, long double x)
+ldbl_extract_mantissa (uint64_t *hi64, uint64_t *lo64, int *exp, long double x)
 {
   /* We have 105 bits of mantissa plus one implicit digit.  Since
      106 bits are representable we use the first implicit digit for
      the number before the decimal point and the second implicit bit
      as bit 53 of the mantissa.  */
   uint64_t hi, lo;
-  int ediff;
   union ibm_extended_long_double u;
+
   u.ld = x;
   *exp = u.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
 
   lo = ((uint64_t) u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;
   hi = ((uint64_t) u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;
-  /* If the lower double is not a denomal or zero then set the hidden
-     53rd bit.  */
-  if (u.d[1].ieee.exponent > 0x001)
-    {
-      lo |= (1ULL << 52);
-      lo = lo << 7; /* pre-shift lo to match ieee854.  */
-      /* The lower double is normalized separately from the upper.  We
-	 may need to adjust the lower manitissa to reflect this.  */
-      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;
-      if (ediff > 53)
-	lo = lo >> (ediff-53);
-      hi |= (1ULL << 52);
-    }
 
-  if ((u.d[0].ieee.negative != u.d[1].ieee.negative)
-      && ((u.d[1].ieee.exponent != 0) && (lo != 0LL)))
+  if (u.d[0].ieee.exponent != 0)
     {
-      hi--;
-      lo = (1ULL << 60) - lo;
-      if (hi < (1ULL << 52))
+      int ediff;
+
+      /* If not a denormal or zero then we have an implicit 53rd bit.  */
+      hi |= (uint64_t) 1 << 52;
+
+      if (u.d[1].ieee.exponent != 0)
+	lo |= (uint64_t) 1 << 52;
+      else
+	/* A denormal is to be interpreted as having a biased exponent
+	   of 1.  */
+	lo = lo << 1;
+
+      /* We are going to shift 4 bits out of hi later, because we only
+	 want 48 bits in *hi64.  That means we want 60 bits in lo, but
+	 we currently only have 53.  Shift the value up.  */
+      lo = lo << 7;
+
+      /* The lower double is normalized separately from the upper.
+	 We may need to adjust the lower mantissa to reflect this.
+	 The difference between the exponents can be larger than 53
+	 when the low double is much less than 1ULP of the upper
+	 (in which case there are significant bits, all 0's or all
+	 1's, between the two significands).  The difference between
+	 the exponents can be less than 53 when the upper double
+	 exponent is nearing its minimum value (in which case the low
+	 double is denormal ie. has an exponent of zero).  */
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent - 53;
+      if (ediff > 0)
 	{
-	  /* we have a borrow from the hidden bit, so shift left 1.  */
-	  hi = (hi << 1) | (lo >> 59);
-	  lo = 0xfffffffffffffffLL & (lo << 1);
-	  *exp = *exp - 1;
+	  if (ediff < 64)
+	    lo = lo >> ediff;
+	  else
+	    lo = 0;
+	}
+      else if (ediff < 0)
+	lo = lo << -ediff;
+
+      if (u.d[0].ieee.negative != u.d[1].ieee.negative
+	  && lo != 0)
+	{
+	  hi--;
+	  lo = ((uint64_t) 1 << 60) - lo;
+	  if (hi < (uint64_t) 1 << 52)
+	    {
+	      /* We have a borrow from the hidden bit, so shift left 1.  */
+	      hi = (hi << 1) | (lo >> 59);
+	      lo = (((uint64_t) 1 << 60) - 1) & (lo << 1);
+	      *exp = *exp - 1;
+	    }
 	}
     }
+  else
+    /* If the larger magnitude double is denormal then the smaller
+       one must be zero.  */
+    hi = hi << 1;
+
   *lo64 = (hi << 60) | lo;
   *hi64 = hi >> 4;
 }
 
 static inline long double
-ldbl_insert_mantissa (int sign, int exp, int64_t hi64, u_int64_t lo64)
+ldbl_insert_mantissa (int sign, int exp, uint64_t hi64, uint64_t lo64)
 {
   union ibm_extended_long_double u;
-  unsigned long hidden2, lzcount;
-  unsigned long long hi, lo;
+  int expnt2;
+  uint64_t hi, lo;
 
   u.d[0].ieee.negative = sign;
   u.d[1].ieee.negative = sign;
   u.d[0].ieee.exponent = exp + IEEE754_DOUBLE_BIAS;
-  u.d[1].ieee.exponent = exp-53 + IEEE754_DOUBLE_BIAS;
+  u.d[1].ieee.exponent = 0;
+  expnt2 = exp - 53 + IEEE754_DOUBLE_BIAS;
+
   /* Expect 113 bits (112 bits + hidden) right justified in two longs.
      The low order 53 bits (52 + hidden) go into the lower double */
-  lo = (lo64 >> 7)& ((1ULL << 53) - 1);
-  hidden2 = (lo64 >> 59) &  1ULL;
+  lo = (lo64 >> 7) & (((uint64_t) 1 << 53) - 1);
   /* The high order 53 bits (52 + hidden) go into the upper double */
-  hi = (lo64 >> 60) & ((1ULL << 11) - 1);
-  hi |= (hi64 << 4);
+  hi = lo64 >> 60;
+  hi |= hi64 << 4;
 
-  if (lo != 0LL)
+  if (lo != 0)
     {
-      /* hidden2 bit of low double controls rounding of the high double.
-	 If hidden2 is '1' then round up hi and adjust lo (2nd mantissa)
+      int lzcount;
+
+      /* hidden bit of low double controls rounding of the high double.
+	 If hidden is '1' and either the explicit mantissa is non-zero
+	 or hi is odd, then round up hi and adjust lo (2nd mantissa)
 	 plus change the sign of the low double to compensate.  */
-      if (hidden2)
+      if ((lo & ((uint64_t) 1 << 52)) != 0
+	  && ((hi & 1) != 0 || (lo & (((uint64_t) 1 << 52) - 1)) != 0))
 	{
 	  hi++;
+	  if ((hi & ((uint64_t) 1 << 53)) != 0)
+	    {
+	      hi = hi >> 1;
+	      u.d[0].ieee.exponent++;
+	    }
 	  u.d[1].ieee.negative = !sign;
-	  lo = (1ULL << 53) - lo;
+	  lo = ((uint64_t) 1 << 53) - lo;
 	}
-      /* The hidden bit of the lo mantissa is zero so we need to
-	 normalize the it for the low double.  Shift it left until the
-	 hidden bit is '1' then adjust the 2nd exponent accordingly.  */
+
+      /* Normalize the low double.  Shift the mantissa left until
+	 the hidden bit is '1' and adjust the exponent accordingly.  */
 
       if (sizeof (lo) == sizeof (long))
 	lzcount = __builtin_clzl (lo);
@@ -92,34 +138,30 @@ ldbl_insert_mantissa (int sign, int exp, int64_t hi64, u_int64_t lo64)
 	lzcount = __builtin_clzl ((long) (lo >> 32));
       else
 	lzcount = __builtin_clzl ((long) lo) + 32;
-      lzcount = lzcount - 11;
-      if (lzcount > 0)
+      lzcount = lzcount - (64 - 53);
+      lo <<= lzcount;
+      expnt2 -= lzcount;
+
+      if (expnt2 >= 1)
+	/* Not denormal.  */
+	u.d[1].ieee.exponent = expnt2;
+      else
 	{
-	  int expnt2 = u.d[1].ieee.exponent - lzcount;
-	  if (expnt2 >= 1)
-	    {
-	      /* Not denormal.  Normalize and set low exponent.  */
-	      lo = lo << lzcount;
-	      u.d[1].ieee.exponent = expnt2;
-	    }
+	  /* Is denormal.  Note that biased exponent of 0 is treated
+	     as if it was 1, hence the extra shift.  */
+	  if (expnt2 > -53)
+	    lo >>= 1 - expnt2;
 	  else
-	    {
-	      /* Is denormal.  */
-	      lo = lo << (lzcount + expnt2);
-	      u.d[1].ieee.exponent = 0;
-	    }
+	    lo = 0;
 	}
     }
   else
-    {
-      u.d[1].ieee.negative = 0;
-      u.d[1].ieee.exponent = 0;
-    }
+    u.d[1].ieee.negative = 0;
 
-  u.d[1].ieee.mantissa1 = lo & ((1ULL << 32) - 1);
-  u.d[1].ieee.mantissa0 = (lo >> 32) & ((1ULL << 20) - 1);
-  u.d[0].ieee.mantissa1 = hi & ((1ULL << 32) - 1);
-  u.d[0].ieee.mantissa0 = (hi >> 32) & ((1ULL << 20) - 1);
+  u.d[1].ieee.mantissa1 = lo;
+  u.d[1].ieee.mantissa0 = lo >> 32;
+  u.d[0].ieee.mantissa1 = hi;
+  u.d[0].ieee.mantissa0 = hi >> 32;
   return u.ld;
 }
 
@@ -163,13 +205,13 @@ ldbl_canonicalize (double *a, double *aa)
   *aa = xl;
 }
 
-/* Simple inline nearbyint (double) function .
+/* Simple inline nearbyint (double) function.
    Only works in the default rounding mode
    but is useful in long double rounding functions.  */
 static inline double
 ldbl_nearbyint (double a)
 {
-  double two52 = 0x10000000000000LL;
+  double two52 = (uint64_t) 1 << 52;
 
   if (__builtin_expect ((__builtin_fabs (a) < two52), 1))
     {
diff --git a/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c b/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
index c3e42f2..c96852d 100644
--- a/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
@@ -69,9 +69,9 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
       else
 	lzcount = __builtin_clzl ((long) val) + 32;
       if (hi)
-	lzcount = lzcount - 11;
+	lzcount = lzcount - (64 - 53);
       else
-	lzcount = lzcount + 42;
+	lzcount = lzcount + 53 - (64 - 53);
 
       if (lzcount > u.d[0].ieee.exponent)
 	{
@@ -97,29 +97,27 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
 	}
     }
 
-  if (lo != 0L)
+  if (lo != 0)
     {
-      /* hidden2 bit of low double controls rounding of the high double.
-	 If hidden2 is '1' and either the explicit mantissa is non-zero
+      /* hidden bit of low double controls rounding of the high double.
+	 If hidden is '1' and either the explicit mantissa is non-zero
 	 or hi is odd, then round up hi and adjust lo (2nd mantissa)
 	 plus change the sign of the low double to compensate.  */
       if ((lo & (1LL << 52)) != 0
-	  && ((hi & 1) != 0 || (lo & ((1LL << 52) - 1))))
+	  && ((hi & 1) != 0 || (lo & ((1LL << 52) - 1)) != 0))
 	{
 	  hi++;
-	  if ((hi & ((1LL << 52) - 1)) == 0)
+	  if ((hi & (1LL << 53)) != 0)
 	    {
-	      if ((hi & (1LL << 53)) != 0)
-		hi -= 1LL << 52;
+	      hi >>= 1;
 	      u.d[0].ieee.exponent++;
 	    }
 	  u.d[1].ieee.negative = !sign;
 	  lo = (1LL << 53) - lo;
 	}
 
-      /* The hidden bit of the lo mantissa is zero so we need to normalize
-	 it for the low double.  Shift it left until the hidden bit is '1'
-	 then adjust the 2nd exponent accordingly.  */
+      /* Normalize the low double.  Shift the mantissa left until
+	 the hidden bit is '1' and adjust the exponent accordingly.  */
 
       if (sizeof (lo) == sizeof (long))
 	lzcount = __builtin_clzl (lo);
@@ -127,24 +125,24 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
 	lzcount = __builtin_clzl ((long) (lo >> 32));
       else
 	lzcount = __builtin_clzl ((long) lo) + 32;
-      lzcount = lzcount - 11;
-      if (lzcount > 0)
-	{
-	  lo = lo << lzcount;
-	  exponent2 = exponent2 - lzcount;
-	}
+      lzcount = lzcount - (64 - 53);
+      lo <<= lzcount;
+      exponent2 -= lzcount;
+
       if (exponent2 > 0)
 	u.d[1].ieee.exponent = exponent2;
-      else
+      else if (exponent2 > -53)
 	lo >>= 1 - exponent2;
+      else
+	lo = 0;
     }
   else
     u.d[1].ieee.negative = 0;
 
-  u.d[1].ieee.mantissa1 = lo & 0xffffffffLL;
-  u.d[1].ieee.mantissa0 = (lo >> 32) & 0xfffff;
-  u.d[0].ieee.mantissa1 = hi & 0xffffffffLL;
-  u.d[0].ieee.mantissa0 = (hi >> 32) & ((1LL << (LDBL_MANT_DIG - 86)) - 1);
+  u.d[1].ieee.mantissa1 = lo;
+  u.d[1].ieee.mantissa0 = lo >> 32;
+  u.d[0].ieee.mantissa1 = hi;
+  u.d[0].ieee.mantissa0 = hi >> 32;
 
   return u.ld;
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
index 3fe8333..453c2be 100644
--- a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
+++ b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
@@ -42,15 +42,15 @@ do {									      \
 	lo <<= 1;							      \
       /* The lower double is normalized separately from the upper.  We	      \
 	 may need to adjust the lower manitissa to reflect this.  */	      \
-      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;		      \
-      if (ediff > 53 + 63)						      \
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent - 53;		      \
+      if (ediff > 63)							      \
 	lo = 0;								      \
-      else if (ediff > 53)						      \
-	lo = lo >> (ediff - 53);					      \
-      else if (u.d[1].ieee.exponent == 0 && ediff < 53)			      \
-	lo = lo << (53 - ediff);					      \
+      else if (ediff > 0)						      \
+	lo = lo >> ediff;						      \
+      else if (ediff < 0)						      \
+	lo = lo << -ediff;						      \
       if (u.d[0].ieee.negative != u.d[1].ieee.negative			      \
-	  && (u.d[1].ieee.exponent != 0 || lo != 0L))			      \
+	  && lo != 0)							      \
 	{								      \
 	  lo = (1ULL << 60) - lo;					      \
 	  if (hi == 0L)							      \

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c2e294dfaab1002d10dd677aba6f16abccf80f73

commit c2e294dfaab1002d10dd677aba6f16abccf80f73
Author: Alan Modra <amodra@gmail.com>
Date:   Mon Sep 2 14:48:33 2013 -0500

    PowerPC: IBM long double fixes
    
    This patch starts the process of supporting powerpc64 little-endian
    long double in glibc.  IBM long double is an array of two ieee
    doubles, so making union ibm_extended_long_double reflect this fact is
    the correct way to access fields of the doubles IMO, rather than
    adding a little-endian version of the current union.  That of course
    results in a much larger but mechanical patch.
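
In condensed form the reworked union looks like this (a sketch assuming the
ldbl-128ibm <ieee754.h> from this patch, which supplies the union and
IEEE754_DOUBLE_BIAS):

#include <ieee754.h>

/* union ibm_extended_long_double is now simply
     long double ld;
     union ieee754_double d[2];
   with d[0] the high double and d[1] the low double, so fields are
   reached through the generic ieee754_double layout, e.g.
   u.d[0].ieee.exponent instead of the old u.ieee.exponent.  */

static int
ibm_high_exponent (long double x)
{
  union ibm_extended_long_double u;
  u.ld = x;
  return u.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
}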

diff --git a/sysdeps/ieee754/ldbl-128ibm/e_exp10l.c b/sysdeps/ieee754/ldbl-128ibm/e_exp10l.c
index 1eaf2fe..49121ca 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_exp10l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_exp10l.c
@@ -36,9 +36,9 @@ __ieee754_exp10l (long double arg)
   else if (arg > LDBL_MAX_10_EXP + 1)
     return LDBL_MAX * LDBL_MAX;
 
-  u.d = arg;
-  arg_high = u.dd[0];
-  arg_low = u.dd[1];
+  u.ld = arg;
+  arg_high = u.d[0].d;
+  arg_low = u.d[1].d;
   exp_high = arg_high * log10_high;
   exp_low = arg_high * log10_low + arg_low * M_LN10l;
   return __ieee754_expl (exp_high) * __ieee754_expl (exp_low);
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_expl.c b/sysdeps/ieee754/ldbl-128ibm/e_expl.c
index 1b994cd..f7c50bf 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_expl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_expl.c
@@ -162,39 +162,39 @@ __ieee754_expl (long double x)
       x = x + xl;
 
       /* Compute ex2 = 2^n_0 e^(argtable[tval1]) e^(argtable[tval2]).  */
-      ex2_u.d = __expl_table[T_EXPL_RES1 + tval1]
-		* __expl_table[T_EXPL_RES2 + tval2];
+      ex2_u.ld = (__expl_table[T_EXPL_RES1 + tval1]
+		  * __expl_table[T_EXPL_RES2 + tval2]);
       n_i = (int)n;
       /* 'unsafe' is 1 iff n_1 != 0.  */
       unsafe = fabsl(n_i) >= -LDBL_MIN_EXP - 1;
-      ex2_u.ieee.exponent += n_i >> unsafe;
+      ex2_u.d[0].ieee.exponent += n_i >> unsafe;
       /* Fortunately, there are no subnormal lowpart doubles in
 	 __expl_table, only normal values and zeros.
 	 But after scaling it can be subnormal.  */
-      exponent2 = ex2_u.ieee.exponent2 + (n_i >> unsafe);
-      if (ex2_u.ieee.exponent2 == 0)
-	/* assert ((ex2_u.ieee.mantissa2|ex2_u.ieee.mantissa3) == 0) */;
+      exponent2 = ex2_u.d[1].ieee.exponent + (n_i >> unsafe);
+      if (ex2_u.d[1].ieee.exponent == 0)
+	/* assert ((ex2_u.d[1].ieee.mantissa0|ex2_u.d[1].ieee.mantissa1) == 0) */;
       else if (exponent2 > 0)
-	ex2_u.ieee.exponent2 = exponent2;
+	ex2_u.d[1].ieee.exponent = exponent2;
       else if (exponent2 <= -54)
 	{
-	  ex2_u.ieee.exponent2 = 0;
-	  ex2_u.ieee.mantissa2 = 0;
-	  ex2_u.ieee.mantissa3 = 0;
+	  ex2_u.d[1].ieee.exponent = 0;
+	  ex2_u.d[1].ieee.mantissa0 = 0;
+	  ex2_u.d[1].ieee.mantissa1 = 0;
 	}
       else
 	{
 	  static const double
 	    two54 = 1.80143985094819840000e+16, /* 4350000000000000 */
 	    twom54 = 5.55111512312578270212e-17; /* 3C90000000000000 */
-	  ex2_u.dd[1] *= two54;
-	  ex2_u.ieee.exponent2 += n_i >> unsafe;
-	  ex2_u.dd[1] *= twom54;
+	  ex2_u.d[1].d *= two54;
+	  ex2_u.d[1].ieee.exponent += n_i >> unsafe;
+	  ex2_u.d[1].d *= twom54;
 	}
 
       /* Compute scale = 2^n_1.  */
-      scale_u.d = 1.0L;
-      scale_u.ieee.exponent += n_i - (n_i >> unsafe);
+      scale_u.ld = 1.0L;
+      scale_u.d[0].ieee.exponent += n_i - (n_i >> unsafe);
 
       /* Approximate e^x2 - 1, using a seventh-degree polynomial,
 	 with maximum error in [-2^-16-2^-53,2^-16+2^-53]
@@ -204,7 +204,7 @@ __ieee754_expl (long double x)
       /* Return result.  */
       fesetenv (&oldenv);
 
-      result = x22 * ex2_u.d + ex2_u.d;
+      result = x22 * ex2_u.ld + ex2_u.ld;
 
       /* Now we can test whether the result is ultimate or if we are unsure.
 	 In the later case we should probably call a mpn based routine to give
@@ -238,7 +238,7 @@ __ieee754_expl (long double x)
       if (!unsafe)
 	return result;
       else
-	return result * scale_u.d;
+	return result * scale_u.ld;
     }
   /* Exceptional cases:  */
   else if (isless (x, himark))
diff --git a/sysdeps/ieee754/ldbl-128ibm/ieee754.h b/sysdeps/ieee754/ldbl-128ibm/ieee754.h
index 9e94f53..0778b1f 100644
--- a/sysdeps/ieee754/ldbl-128ibm/ieee754.h
+++ b/sysdeps/ieee754/ldbl-128ibm/ieee754.h
@@ -179,49 +179,10 @@ union ieee854_long_double
 
 union ibm_extended_long_double
   {
-    long double d;
-    double dd[2];
-
-    /* This is the IBM extended format long double.  */
-    struct
-      { /* Big endian.  There is no other.  */
-
-	unsigned int negative:1;
-	unsigned int exponent:11;
-	/* Together Mantissa0-3 comprise the mantissa.  */
-	unsigned int mantissa0:20;
-	unsigned int mantissa1:32;
-
-	unsigned int negative2:1;
-	unsigned int exponent2:11;
-	/* There is an implied 1 here?  */
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa2:20;
-	unsigned int mantissa3:32;
-      } ieee;
-
-    /* This format makes it easier to see if a NaN is a signalling NaN.  */
-    struct
-      { /* Big endian.  There is no other.  */
-
-	unsigned int negative:1;
-	unsigned int exponent:11;
-	unsigned int quiet_nan:1;
-	/* Together Mantissa0-3 comprise the mantissa.  */
-	unsigned int mantissa0:19;
-	unsigned int mantissa1:32;
-
-	unsigned int negative2:1;
-	unsigned int exponent2:11;
-	/* There is an implied 1 here?  */
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa2:20;
-	unsigned int mantissa3:32;
-      } ieee_nan;
+    long double ld;
+    union ieee754_double d[2];
    };
 
-#define IBM_EXTENDED_LONG_DOUBLE_BIAS 0x3ff /* Added to exponent.  */
-
 __END_DECLS
 
 #endif /* ieee754.h */
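
The new union leaves all bit-level knowledge to the generic union ieee754_double, so the high and low doubles of the IBM 128-bit format are d[0] and d[1], and the removed IBM_EXTENDED_LONG_DOUBLE_BIAS is replaced by IEEE754_DOUBLE_BIAS at the use sites. A hedged stand-alone sketch of the resulting layout (ibm_extended_long_double_sketch and high_part are illustrative names; the long double member only has this pair-of-doubles meaning on ldbl-128ibm targets):

#include <stdio.h>
#include <ieee754.h>	/* union ieee754_double, IEEE754_DOUBLE_BIAS */

/* Illustrative mirror of the patched internal union: the IBM 128-bit long
   double is an ordinary pair of doubles, viewed through the generic
   ieee754_double bitfields instead of hand-rolled ones.  */
union ibm_extended_long_double_sketch
  {
    long double ld;		/* the full 128-bit value */
    union ieee754_double d[2];	/* d[0] = high double, d[1] = low double */
  };

/* The high double carries the leading 53 bits of the 106-bit significand.  */
static double
high_part (union ibm_extended_long_double_sketch u)
{
  return u.d[0].d;
}

int
main (void)
{
  union ibm_extended_long_double_sketch u;
  u.d[0].d = 1.0;
  u.d[1].d = 0x1p-60;
  printf ("high part = %g\n", high_part (u));	/* 1 */
  return 0;
}
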
diff --git a/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c b/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
index 00e44b8..5149ba1 100644
--- a/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
+++ b/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
@@ -36,22 +36,22 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
   union ibm_extended_long_double u;
   unsigned long long hi, lo;
   int ediff;
-  u.d = value;
+  u.ld = value;
 
-  *is_neg = u.ieee.negative;
-  *expt = (int) u.ieee.exponent - IBM_EXTENDED_LONG_DOUBLE_BIAS;
+  *is_neg = u.d[0].ieee.negative;
+  *expt = (int) u.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
 
-  lo = ((long long) u.ieee.mantissa2 << 32) | u.ieee.mantissa3;
-  hi = ((long long) u.ieee.mantissa0 << 32) | u.ieee.mantissa1;
+  lo = ((long long) u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;
+  hi = ((long long) u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;
   /* If the lower double is not a denormal or zero then set the hidden
      53rd bit.  */
-  if (u.ieee.exponent2 > 0)
+  if (u.d[1].ieee.exponent > 0)
     {
       lo |= 1LL << 52;
 
       /* The lower double is normalized separately from the upper.  We may
 	 need to adjust the lower mantissa to reflect this.  */
-      ediff = u.ieee.exponent - u.ieee.exponent2;
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;
       if (ediff > 53)
 	lo = lo >> (ediff-53);
     }
@@ -59,8 +59,8 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
      difference between the long double and the rounded high double
     value.  This is indicated by a difference between the signs of the
      high and low doubles.  */
-  if ((u.ieee.negative != u.ieee.negative2)
-      && ((u.ieee.exponent2 != 0) && (lo != 0L)))
+  if ((u.d[0].ieee.negative != u.d[1].ieee.negative)
+      && ((u.d[1].ieee.exponent != 0) && (lo != 0L)))
     {
       lo = (1ULL << 53) - lo;
       if (hi == 0LL)
@@ -92,7 +92,7 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
 #define NUM_LEADING_ZEROS (BITS_PER_MP_LIMB \
 			   - (LDBL_MANT_DIG - ((N - 1) * BITS_PER_MP_LIMB)))
 
-  if (u.ieee.exponent == 0)
+  if (u.d[0].ieee.exponent == 0)
     {
       /* A biased exponent of zero is a special case.
 	 Either it is a zero or it is a denormal number.  */
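
With the generic view, extracting the significand of either half is the ordinary double recipe: mantissa0 holds the top 20 fraction bits, mantissa1 the low 32, and the hidden 53rd bit is implied whenever the biased exponent is nonzero. A small self-contained sketch for a single double (significand53 is a hypothetical helper, not the glibc routine):

#include <stdint.h>
#include <stdio.h>
#include <ieee754.h>

/* Assemble the 53-bit significand of one double the way the patched
   __mpn_extract_long_double now does for each half of the pair.  */
static uint64_t
significand53 (double x)
{
  union ieee754_double u;
  u.d = x;
  uint64_t m = ((uint64_t) u.ieee.mantissa0 << 32) | u.ieee.mantissa1;
  if (u.ieee.exponent != 0)	/* normal number: add the implicit bit */
    m |= 1ULL << 52;
  return m;
}

int
main (void)
{
  printf ("%#llx\n", (unsigned long long) significand53 (1.0));	/* 0x10000000000000 */
  printf ("%#llx\n", (unsigned long long) significand53 (1.5));	/* 0x18000000000000 */
  return 0;
}
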
diff --git a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
index 046293e..3036f14 100644
--- a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
+++ b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
@@ -15,28 +15,28 @@ ldbl_extract_mantissa (int64_t *hi64, uint64_t *lo64, int *exp, long double x)
      as bit 53 of the mantissa.  */
   uint64_t hi, lo;
   int ediff;
-  union ibm_extended_long_double eldbl;
-  eldbl.d = x;
-  *exp = eldbl.ieee.exponent - IBM_EXTENDED_LONG_DOUBLE_BIAS;
+  union ibm_extended_long_double u;
+  u.ld = x;
+  *exp = u.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
 
-  lo = ((int64_t)eldbl.ieee.mantissa2 << 32) | eldbl.ieee.mantissa3;
-  hi = ((int64_t)eldbl.ieee.mantissa0 << 32) | eldbl.ieee.mantissa1;
+  lo = ((uint64_t) u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;
+  hi = ((uint64_t) u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;
   /* If the lower double is not a denormal or zero then set the hidden
      53rd bit.  */
-  if (eldbl.ieee.exponent2 > 0x001)
+  if (u.d[1].ieee.exponent > 0x001)
     {
       lo |= (1ULL << 52);
       lo = lo << 7; /* pre-shift lo to match ieee854.  */
       /* The lower double is normalized separately from the upper.  We
 	 may need to adjust the lower mantissa to reflect this.  */
-      ediff = eldbl.ieee.exponent - eldbl.ieee.exponent2;
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;
       if (ediff > 53)
 	lo = lo >> (ediff-53);
       hi |= (1ULL << 52);
     }
 
-  if ((eldbl.ieee.negative != eldbl.ieee.negative2)
-      && ((eldbl.ieee.exponent2 != 0) && (lo != 0LL)))
+  if ((u.d[0].ieee.negative != u.d[1].ieee.negative)
+      && ((u.d[1].ieee.exponent != 0) && (lo != 0LL)))
     {
       hi--;
       lo = (1ULL << 60) - lo;
@@ -59,10 +59,10 @@ ldbl_insert_mantissa (int sign, int exp, int64_t hi64, u_int64_t lo64)
   unsigned long hidden2, lzcount;
   unsigned long long hi, lo;
 
-  u.ieee.negative = sign;
-  u.ieee.negative2 = sign;
-  u.ieee.exponent = exp + IBM_EXTENDED_LONG_DOUBLE_BIAS;
-  u.ieee.exponent2 = exp-53 + IBM_EXTENDED_LONG_DOUBLE_BIAS;
+  u.d[0].ieee.negative = sign;
+  u.d[1].ieee.negative = sign;
+  u.d[0].ieee.exponent = exp + IEEE754_DOUBLE_BIAS;
+  u.d[1].ieee.exponent = exp-53 + IEEE754_DOUBLE_BIAS;
   /* Expect 113 bits (112 bits + hidden) right justified in two longs.
      The low order 53 bits (52 + hidden) go into the lower double */
   lo = (lo64 >> 7)& ((1ULL << 53) - 1);
@@ -79,7 +79,7 @@ ldbl_insert_mantissa (int sign, int exp, int64_t hi64, u_int64_t lo64)
       if (hidden2)
 	{
 	  hi++;
-	  u.ieee.negative2 = !sign;
+	  u.d[1].ieee.negative = !sign;
 	  lo = (1ULL << 53) - lo;
 	}
       /* The hidden bit of the lo mantissa is zero so we need to
@@ -95,32 +95,32 @@ ldbl_insert_mantissa (int sign, int exp, int64_t hi64, u_int64_t lo64)
       lzcount = lzcount - 11;
       if (lzcount > 0)
 	{
-	  int expnt2 = u.ieee.exponent2 - lzcount;
+	  int expnt2 = u.d[1].ieee.exponent - lzcount;
 	  if (expnt2 >= 1)
 	    {
 	      /* Not denormal.  Normalize and set low exponent.  */
 	      lo = lo << lzcount;
-	      u.ieee.exponent2 = expnt2;
+	      u.d[1].ieee.exponent = expnt2;
 	    }
 	  else
 	    {
 	      /* Is denormal.  */
 	      lo = lo << (lzcount + expnt2);
-	      u.ieee.exponent2 = 0;
+	      u.d[1].ieee.exponent = 0;
 	    }
 	}
     }
   else
     {
-      u.ieee.negative2 = 0;
-      u.ieee.exponent2 = 0;
+      u.d[1].ieee.negative = 0;
+      u.d[1].ieee.exponent = 0;
     }
 
-  u.ieee.mantissa3 = lo & ((1ULL << 32) - 1);
-  u.ieee.mantissa2 = (lo >> 32) & ((1ULL << 20) - 1);
-  u.ieee.mantissa1 = hi & ((1ULL << 32) - 1);
-  u.ieee.mantissa0 = (hi >> 32) & ((1ULL << 20) - 1);
-  return u.d;
+  u.d[1].ieee.mantissa1 = lo & ((1ULL << 32) - 1);
+  u.d[1].ieee.mantissa0 = (lo >> 32) & ((1ULL << 20) - 1);
+  u.d[0].ieee.mantissa1 = hi & ((1ULL << 32) - 1);
+  u.d[0].ieee.mantissa0 = (hi >> 32) & ((1ULL << 20) - 1);
+  return u.ld;
 }
 
 /* Handy utility functions to pack/unpack/canonicalize and find the nearbyint
@@ -129,18 +129,18 @@ static inline long double
 default_ldbl_pack (double a, double aa)
 {
   union ibm_extended_long_double u;
-  u.dd[0] = a;
-  u.dd[1] = aa;
-  return u.d;
+  u.d[0].d = a;
+  u.d[1].d = aa;
+  return u.ld;
 }
 
 static inline void
 default_ldbl_unpack (long double l, double *a, double *aa)
 {
   union ibm_extended_long_double u;
-  u.d = l;
-  *a = u.dd[0];
-  *aa = u.dd[1];
+  u.ld = l;
+  *a = u.d[0].d;
+  *aa = u.d[1].d;
 }
 
 #ifndef ldbl_pack
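
default_ldbl_pack/default_ldbl_unpack now just copy the two component doubles through d[0].d and d[1].d; the long double value is simply the sum of the pair. A hedged stand-alone check of that property (ibm_pair_sketch is an illustrative stand-in for the internal union, and the #if guard is an assumption about how to detect an ldbl-128ibm target):

#include <stdio.h>

union ibm_pair_sketch		/* double d[2] stands in for the ieee754_double pair */
  {
    long double ld;
    double d[2];
  };

int
main (void)
{
#if defined __powerpc__ && __LDBL_MANT_DIG__ == 106
  union ibm_pair_sketch u;
  u.ld = 1.0L + 0x1p-80L;	/* needs more than 53 bits of significand */
  printf ("high = %a  low = %a\n", u.d[0], u.d[1]);
  printf ("sum == ld: %d\n", (long double) u.d[0] + u.d[1] == u.ld);
#else
  puts ("not an ldbl-128ibm target; layout sketch only");
#endif
  return 0;
}
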
diff --git a/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c b/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
index 3df42c5..c3e42f2 100644
--- a/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
@@ -33,11 +33,11 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
   unsigned long long hi, lo;
   int exponent2;
 
-  u.ieee.negative = sign;
-  u.ieee.negative2 = sign;
-  u.ieee.exponent = expt + IBM_EXTENDED_LONG_DOUBLE_BIAS;
-  u.ieee.exponent2 = 0;
-  exponent2 = expt - 53 + IBM_EXTENDED_LONG_DOUBLE_BIAS;
+  u.d[0].ieee.negative = sign;
+  u.d[1].ieee.negative = sign;
+  u.d[0].ieee.exponent = expt + IEEE754_DOUBLE_BIAS;
+  u.d[1].ieee.exponent = 0;
+  exponent2 = expt - 53 + IEEE754_DOUBLE_BIAS;
 
 #if BITS_PER_MP_LIMB == 32
   /* The low order 53 bits (52 + hidden) go into the lower double */
@@ -73,15 +73,15 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
       else
 	lzcount = lzcount + 42;
 
-      if (lzcount > u.ieee.exponent)
+      if (lzcount > u.d[0].ieee.exponent)
 	{
-	  lzcount = u.ieee.exponent;
-	  u.ieee.exponent = 0;
+	  lzcount = u.d[0].ieee.exponent;
+	  u.d[0].ieee.exponent = 0;
 	  exponent2 -= lzcount;
 	}
       else
 	{
-	  u.ieee.exponent -= (lzcount - 1);
+	  u.d[0].ieee.exponent -= (lzcount - 1);
 	  exponent2 -= (lzcount - 1);
 	}
 
@@ -111,9 +111,9 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
 	    {
 	      if ((hi & (1LL << 53)) != 0)
 		hi -= 1LL << 52;
-	      u.ieee.exponent++;
+	      u.d[0].ieee.exponent++;
 	    }
-	  u.ieee.negative2 = !sign;
+	  u.d[1].ieee.negative = !sign;
 	  lo = (1LL << 53) - lo;
 	}
 
@@ -134,17 +134,17 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
 	  exponent2 = exponent2 - lzcount;
 	}
       if (exponent2 > 0)
-	u.ieee.exponent2 = exponent2;
+	u.d[1].ieee.exponent = exponent2;
       else
 	lo >>= 1 - exponent2;
     }
   else
-    u.ieee.negative2 = 0;
+    u.d[1].ieee.negative = 0;
 
-  u.ieee.mantissa3 = lo & 0xffffffffLL;
-  u.ieee.mantissa2 = (lo >> 32) & 0xfffff;
-  u.ieee.mantissa1 = hi & 0xffffffffLL;
-  u.ieee.mantissa0 = (hi >> 32) & ((1LL << (LDBL_MANT_DIG - 86)) - 1);
+  u.d[1].ieee.mantissa1 = lo & 0xffffffffLL;
+  u.d[1].ieee.mantissa0 = (lo >> 32) & 0xfffff;
+  u.d[0].ieee.mantissa1 = hi & 0xffffffffLL;
+  u.d[0].ieee.mantissa0 = (hi >> 32) & ((1LL << (LDBL_MANT_DIG - 86)) - 1);
 
-  return u.d;
+  return u.ld;
 }
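
As in the old code, the low double of a freshly constructed value sits 53 binary places below the high one, so its tentative biased exponent is expt - 53 + IEEE754_DOUBLE_BIAS; only the name of the bias constant changes. A tiny worked example of that arithmetic (stand-alone, using the installed <ieee754.h>):

#include <stdio.h>
#include <ieee754.h>	/* IEEE754_DOUBLE_BIAS is 0x3ff */

int
main (void)
{
  int expt = 0;	/* value in [1, 2) */
  int exponent  = expt + IEEE754_DOUBLE_BIAS;		/* 0x3ff: high double ~ 2^0 */
  int exponent2 = expt - 53 + IEEE754_DOUBLE_BIAS;	/* 0x3ca: low double ~ 2^-53 */
  printf ("high biased exponent = %#x, low = %#x\n", exponent, exponent2);
  return 0;
}
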
diff --git a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
index 247dc20..3fe8333 100644
--- a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
+++ b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
@@ -26,31 +26,31 @@ do {									      \
       unsigned long long int num0, num1;				      \
       unsigned long long hi, lo;					      \
       int ediff;							      \
-      union ibm_extended_long_double eldbl;				      \
-      eldbl.d = fpnum.ldbl.d;						      \
+      union ibm_extended_long_double u;					      \
+      u.ld = fpnum.ldbl.d;						      \
 									      \
       assert (sizeof (long double) == 16);				      \
 									      \
-      lo = ((long long)eldbl.ieee.mantissa2 << 32) | eldbl.ieee.mantissa3;    \
-      hi = ((long long)eldbl.ieee.mantissa0 << 32) | eldbl.ieee.mantissa1;    \
+      lo = ((long long)u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;  \
+      hi = ((long long)u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;  \
       lo <<= 7; /* pre-shift lo to match ieee854.  */			      \
      /* If the lower double is not a denormal or zero then set the hidden    \
 	 53rd bit.  */							      \
-      if (eldbl.ieee.exponent2 != 0)					      \
+      if (u.d[1].ieee.exponent != 0)					      \
 	lo |= (1ULL << (52 + 7));					      \
       else								      \
 	lo <<= 1;							      \
       /* The lower double is normalized separately from the upper.  We	      \
 	 may need to adjust the lower mantissa to reflect this.  */	      \
-      ediff = eldbl.ieee.exponent - eldbl.ieee.exponent2;		      \
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;		      \
       if (ediff > 53 + 63)						      \
 	lo = 0;								      \
       else if (ediff > 53)						      \
 	lo = lo >> (ediff - 53);					      \
-      else if (eldbl.ieee.exponent2 == 0 && ediff < 53)			      \
+      else if (u.d[1].ieee.exponent == 0 && ediff < 53)			      \
 	lo = lo << (53 - ediff);					      \
-      if (eldbl.ieee.negative != eldbl.ieee.negative2			      \
-	  && (eldbl.ieee.exponent2 != 0 || lo != 0L))			      \
+      if (u.d[0].ieee.negative != u.d[1].ieee.negative			      \
+	  && (u.d[1].ieee.exponent != 0 || lo != 0L))			      \
 	{								      \
 	  lo = (1ULL << 60) - lo;					      \
 	  if (hi == 0L)							      \
@@ -58,7 +58,7 @@ do {									      \
 	      /* we have a borrow from the hidden bit, so shift left 1.  */   \
 	      hi = 0xffffffffffffeLL | (lo >> 59);			      \
 	      lo = 0xfffffffffffffffLL & (lo << 1);			      \
-	      eldbl.ieee.exponent--;					      \
+	      u.d[0].ieee.exponent--;					      \
 	    }								      \
 	  else								      \
 	    hi--;							      \
@@ -109,9 +109,9 @@ do {									      \
 	  *--wnumstr = L'0';						      \
 	}								      \
 									      \
-      leading = eldbl.ieee.exponent == 0 ? '0' : '1';			      \
+      leading = u.d[0].ieee.exponent == 0 ? '0' : '1';			      \
 									      \
-      exponent = eldbl.ieee.exponent;					      \
+      exponent = u.d[0].ieee.exponent;					      \
 									      \
       if (exponent == 0)						      \
 	{								      \
@@ -121,18 +121,18 @@ do {									      \
 	    {								      \
 	      /* This is a denormalized number.  */			      \
 	      expnegative = 1;						      \
-	      exponent = IBM_EXTENDED_LONG_DOUBLE_BIAS - 1;		      \
+	      exponent = IEEE754_DOUBLE_BIAS - 1;			      \
 	    }								      \
 	}								      \
-      else if (exponent >= IBM_EXTENDED_LONG_DOUBLE_BIAS)		      \
+      else if (exponent >= IEEE754_DOUBLE_BIAS)				      \
 	{								      \
 	  expnegative = 0;						      \
-	  exponent -= IBM_EXTENDED_LONG_DOUBLE_BIAS;			      \
+	  exponent -= IEEE754_DOUBLE_BIAS;				      \
 	}								      \
       else								      \
 	{								      \
 	  expnegative = 1;						      \
-	  exponent = -(exponent - IBM_EXTENDED_LONG_DOUBLE_BIAS);	      \
+	  exponent = -(exponent - IEEE754_DOUBLE_BIAS);			      \
 	}								      \
 } while (0)
 
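When the two halves have opposite signs, the pre-shifted 60-bit low word is a negative correction, so the hex-printing code borrows: lo becomes 2^60 - lo and hi drops by one (the hunk also handles the hi == 0 case, which is omitted here). A minimal sketch of that borrow (borrow60 is a hypothetical helper, not the glibc macro):

#include <stdint.h>
#include <stdio.h>

/* (hi, -lo) -> (hi - 1, 2^60 - lo): rewrite a split value whose low part is
   a negative correction into plain non-negative digits for printing.  */
static void
borrow60 (uint64_t *hi, uint64_t *lo)
{
  *lo = (1ULL << 60) - *lo;
  *hi -= 1;
}

int
main (void)
{
  uint64_t hi = 0x10000000000000ULL, lo = 0x123;	/* encodes hi*2^60 - lo */
  borrow60 (&hi, &lo);
  printf ("hi=%#llx lo=%#llx\n", (unsigned long long) hi, (unsigned long long) lo);
  return 0;
}
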
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_nearbyintl.c b/sysdeps/ieee754/ldbl-128ibm/s_nearbyintl.c
index bfcd110..92ced52 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_nearbyintl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_nearbyintl.c
@@ -34,11 +34,11 @@ __nearbyintl (long double x)
   fenv_t env;
   static const long double TWO52 = 4503599627370496.0L;
   union ibm_extended_long_double u;
-  u.d = x;
+  u.ld = x;
 
-  if (fabs (u.dd[0]) < TWO52)
+  if (fabs (u.d[0].d) < TWO52)
     {
-      double high = u.dd[0];
+      double high = u.d[0].d;
       feholdexcept (&env);
       if (high > 0.0)
 	{
@@ -52,13 +52,13 @@ __nearbyintl (long double x)
 	  high += TWO52;
           if (high == 0.0) high = -0.0;
 	}
-      u.dd[0] = high;
-      u.dd[1] = 0.0;
-      math_force_eval (u.dd[0]);
-      math_force_eval (u.dd[1]);
+      u.d[0].d = high;
+      u.d[1].d = 0.0;
+      math_force_eval (u.d[0]);
+      math_force_eval (u.d[1]);
       fesetenv (&env);
     }
-  else if (fabs (u.dd[1]) < TWO52 && u.dd[1] != 0.0)
+  else if (fabs (u.d[1].d) < TWO52 && u.d[1].d != 0.0)
     {
       double high, low, tau;
       /* In this case we have to round the low double and handle any
@@ -67,57 +67,57 @@ __nearbyintl (long double x)
          may already be rounded and the low double may have the
          opposite sign to compensate.  */
       feholdexcept (&env);
-      if (u.dd[0] > 0.0)
+      if (u.d[0].d > 0.0)
 	{
-	  if (u.dd[1] > 0.0)
+	  if (u.d[1].d > 0.0)
 	    {
 	      /* If the high/low doubles are the same sign then simply
 	         round the low double.  */
-	      high = u.dd[0];
-	      low = u.dd[1];
+	      high = u.d[0].d;
+	      low = u.d[1].d;
 	    }
-	  else if (u.dd[1] < 0.0)
+	  else if (u.d[1].d < 0.0)
 	    {
 	      /* Else the high double is pre-rounded and we need to
 	         adjust for that.  */
 
-	      tau = __nextafter (u.dd[0], 0.0);
-	      tau = (u.dd[0] - tau) * 2.0;
-	      high = u.dd[0] - tau;
-	      low = u.dd[1] + tau;
+	      tau = __nextafter (u.d[0].d, 0.0);
+	      tau = (u.d[0].d - tau) * 2.0;
+	      high = u.d[0].d - tau;
+	      low = u.d[1].d + tau;
 	    }
 	  low += TWO52;
 	  low -= TWO52;
 	}
-      else if (u.dd[0] < 0.0)
+      else if (u.d[0].d < 0.0)
 	{
-	  if (u.dd[1] < 0.0)
+	  if (u.d[1].d < 0.0)
 	    {
 	      /* If the high/low doubles are the same sign then simply
 	         round the low double.  */
-	      high = u.dd[0];
-	      low = u.dd[1];
+	      high = u.d[0].d;
+	      low = u.d[1].d;
 	    }
-	  else if (u.dd[1] > 0.0)
+	  else if (u.d[1].d > 0.0)
 	    {
 	      /* Else the high double is pre-rounded and we need to
 	         adjust for that.  */
-	      tau = __nextafter (u.dd[0], 0.0);
-	      tau = (u.dd[0] - tau) * 2.0;
-	      high = u.dd[0] - tau;
-	      low = u.dd[1] + tau;
+	      tau = __nextafter (u.d[0].d, 0.0);
+	      tau = (u.d[0].d - tau) * 2.0;
+	      high = u.d[0].d - tau;
+	      low = u.d[1].d + tau;
 	    }
 	  low = TWO52 - low;
 	  low = -(low - TWO52);
 	}
-      u.dd[0] = high + low;
-      u.dd[1] = high - u.dd[0] + low;
-      math_force_eval (u.dd[0]);
-      math_force_eval (u.dd[1]);
+      u.d[0].d = high + low;
+      u.d[1].d = high - u.d[0].d + low;
+      math_force_eval (u.d[0]);
+      math_force_eval (u.d[1]);
       fesetenv (&env);
     }
 
-  return u.d;
+  return u.ld;
 }
 
 long_double_symbol (libm, __nearbyintl, nearbyintl);
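
The rounding itself is unchanged: each half is rounded with the classic add-and-subtract-2^52 trick inside feholdexcept/fesetenv so no spurious inexact flag is left behind. The same idea for a single non-negative double, as a sketch rather than the libm code (round_to_int_positive is a hypothetical helper; link with -lm):

#include <fenv.h>
#include <stdio.h>

static double
round_to_int_positive (double x)	/* assumes 0 <= x < 2^52 */
{
  static const double TWO52 = 4503599627370496.0;	/* 2^52 */
  fenv_t env;
  feholdexcept (&env);
  x += TWO52;		/* forces rounding to an integer in the current mode */
  x -= TWO52;
  fesetenv (&env);	/* discard the inexact exception this raised */
  return x;
}

int
main (void)
{
  printf ("%g\n", round_to_int_positive (2.3));	/* 2 */
  printf ("%g\n", round_to_int_positive (2.7));	/* 3 */
  return 0;
}
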
diff --git a/sysdeps/ieee754/ldbl-128ibm/strtold_l.c b/sysdeps/ieee754/ldbl-128ibm/strtold_l.c
index 04e3288..93a80c5 100644
--- a/sysdeps/ieee754/ldbl-128ibm/strtold_l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/strtold_l.c
@@ -43,11 +43,11 @@ libc_hidden_proto (STRTOF)
 #define FLOAT_HUGE_VAL	HUGE_VALL
 # define SET_MANTISSA(flt, mant) \
   do { union ibm_extended_long_double u;				      \
-       u.d = (flt);							      \
-       u.ieee_nan.mantissa0 = (mant) >> 32;				      \
-       u.ieee_nan.mantissa1 = (mant);					      \
-       if ((u.ieee.mantissa0 | u.ieee.mantissa1) != 0)			      \
-	 (flt) = u.d;							      \
+       u.ld = (flt);							      \
+       u.d[0].ieee_nan.mantissa0 = (mant) >> 32;			      \
+       u.d[0].ieee_nan.mantissa1 = (mant);				      \
+       if ((u.d[0].ieee.mantissa0 | u.d[0].ieee.mantissa1) != 0)	      \
+	 (flt) = u.ld;							      \
   } while (0)
 
 #include <strtod_l.c>
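
SET_MANTISSA now stores the NaN payload through the ieee_nan view of the high double only. The same operation on a plain double, as a sketch of the idea rather than the glibc macro (nan_with_payload is a hypothetical helper):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <ieee754.h>

static double
nan_with_payload (uint64_t mant)
{
  union ieee754_double u;
  u.d = NAN;
  u.ieee_nan.mantissa0 = mant >> 32;	/* top 19 payload bits */
  u.ieee_nan.mantissa1 = mant;		/* low 32 payload bits */
  /* Mirror the macro's guard: an all-zero fraction would denote an
     infinity rather than a NaN, so fall back to the default NaN.  */
  if ((u.ieee.mantissa0 | u.ieee.mantissa1) == 0)
    u.d = NAN;
  return u.d;
}

int
main (void)
{
  union ieee754_double u = { .d = nan_with_payload (0x2a) };
  printf ("payload low bits: %#x\n", u.ieee.mantissa1);	/* 0x2a */
  return 0;
}
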
diff --git a/sysdeps/ieee754/ldbl-128ibm/x2y2m1l.c b/sysdeps/ieee754/ldbl-128ibm/x2y2m1l.c
index ed0d4a5..06dcf02 100644
--- a/sysdeps/ieee754/ldbl-128ibm/x2y2m1l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/x2y2m1l.c
@@ -89,23 +89,23 @@ __x2y2m1l (long double x, long double y)
   double vals[12];
   SET_RESTORE_ROUND (FE_TONEAREST);
   union ibm_extended_long_double xu, yu;
-  xu.d = x;
-  yu.d = y;
-  if (fabs (xu.dd[1]) < 0x1p-500)
-    xu.dd[1] = 0.0;
-  if (fabs (yu.dd[1]) < 0x1p-500)
-    yu.dd[1] = 0.0;
-  mul_split (&vals[1], &vals[0], xu.dd[0], xu.dd[0]);
-  mul_split (&vals[3], &vals[2], xu.dd[0], xu.dd[1]);
+  xu.ld = x;
+  yu.ld = y;
+  if (fabs (xu.d[1].d) < 0x1p-500)
+    xu.d[1].d = 0.0;
+  if (fabs (yu.d[1].d) < 0x1p-500)
+    yu.d[1].d = 0.0;
+  mul_split (&vals[1], &vals[0], xu.d[0].d, xu.d[0].d);
+  mul_split (&vals[3], &vals[2], xu.d[0].d, xu.d[1].d);
   vals[2] *= 2.0;
   vals[3] *= 2.0;
-  mul_split (&vals[5], &vals[4], xu.dd[1], xu.dd[1]);
-  mul_split (&vals[7], &vals[6], yu.dd[0], yu.dd[0]);
-  mul_split (&vals[9], &vals[8], yu.dd[0], yu.dd[1]);
+  mul_split (&vals[5], &vals[4], xu.d[1].d, xu.d[1].d);
+  mul_split (&vals[7], &vals[6], yu.d[0].d, yu.d[0].d);
+  mul_split (&vals[9], &vals[8], yu.d[0].d, yu.d[1].d);
   vals[8] *= 2.0;
   vals[9] *= 2.0;
-  mul_split (&vals[11], &vals[10], yu.dd[1], yu.dd[1]);
-  if (xu.dd[0] >= 0.75)
+  mul_split (&vals[11], &vals[10], yu.d[1].d, yu.d[1].d);
+  if (xu.d[0].d >= 0.75)
     vals[1] -= 1.0;
   else
     {

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources

