aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/powerpc64/mode64
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/powerpc64/mode64')
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm189
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm225
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm43
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm43
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm187
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm132
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm146
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm196
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm135
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm274
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm187
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm77
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h82
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm88
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm164
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm270
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm132
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm117
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm168
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm708
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h179
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h214
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h219
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm185
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h160
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm589
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm135
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm128
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm43
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm43
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm129
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm67
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm146
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h175
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h171
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm53
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm112
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm106
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm130
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm193
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm179
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm64
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm143
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h254
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm126
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm181
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm415
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm555
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm173
-rw-r--r--gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm863
50 files changed, 9693 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm
new file mode 100644
index 0000000..0e8474f
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm
@@ -0,0 +1,189 @@
+dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 1.5
+C POWER4/PPC970 2
+C POWER5 2
+C POWER6 2.63
+C POWER7 2.25-2.87
+
+C This code is a little bit slower for POWER3/PPC630 than the simple code used
+C previously, but it is much faster for POWER4/PPC970. The reason for the
+C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4
+C registers.
+
+C INPUT PARAMETERS
+C rp r3
+C up r4
+C vp r5
+C n r6
+
+ifdef(`OPERATION_add_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+ SETCBR(r7)
+ b L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+ CLRCB
+L(ent): std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+
+ rldicl. r0, r6, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi r6, r6, 3 C compute count...
+ srdi r6, r6, 2 C ...for ctr
+ mtctr r6 C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): ld r8, 0(r4) C load s1 limb
+ ld r9, 0(r5) C load s2 limb
+ ld r10, 8(r4) C load s1 limb
+ ld r11, 8(r5) C load s2 limb
+ ld r12, 16(r4) C load s1 limb
+ addi r4, r4, 24
+ ld r0, 16(r5) C load s2 limb
+ addi r5, r5, 24
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r29, 0(r3)
+ std r30, 8(r3)
+ std r31, 16(r3)
+ addi r3, r3, 24
+ bdnz L(go)
+ b L(ret)
+
+L(b01): ld r12, 0(r4) C load s1 limb
+ addi r4, r4, 8
+ ld r0, 0(r5) C load s2 limb
+ addi r5, r5, 8
+ ADDSUBC r31, r0, r12 C add
+ std r31, 0(r3)
+ addi r3, r3, 8
+ bdnz L(go)
+ b L(ret)
+
+L(b10): ld r10, 0(r4) C load s1 limb
+ ld r11, 0(r5) C load s2 limb
+ ld r12, 8(r4) C load s1 limb
+ addi r4, r4, 16
+ ld r0, 8(r5) C load s2 limb
+ addi r5, r5, 16
+ ADDSUBC r30, r11, r10 C add
+ ADDSUBC r31, r0, r12 C add
+ std r30, 0(r3)
+ std r31, 8(r3)
+ addi r3, r3, 16
+ bdnz L(go)
+ b L(ret)
+
+L(b00): C INITCY C clear/set cy
+L(go): ld r6, 0(r4) C load s1 limb
+ ld r7, 0(r5) C load s2 limb
+ ld r8, 8(r4) C load s1 limb
+ ld r9, 8(r5) C load s2 limb
+ ld r10, 16(r4) C load s1 limb
+ ld r11, 16(r5) C load s2 limb
+ ld r12, 24(r4) C load s1 limb
+ ld r0, 24(r5) C load s2 limb
+ bdz L(end)
+
+ addi r4, r4, 32
+ addi r5, r5, 32
+
+ ALIGN(16)
+L(top): ADDSUBC r28, r7, r6
+ ld r6, 0(r4) C load s1 limb
+ ld r7, 0(r5) C load s2 limb
+ ADDSUBC r29, r9, r8
+ ld r8, 8(r4) C load s1 limb
+ ld r9, 8(r5) C load s2 limb
+ ADDSUBC r30, r11, r10
+ ld r10, 16(r4) C load s1 limb
+ ld r11, 16(r5) C load s2 limb
+ ADDSUBC r31, r0, r12
+ ld r12, 24(r4) C load s1 limb
+ ld r0, 24(r5) C load s2 limb
+ std r28, 0(r3)
+ addi r4, r4, 32
+ std r29, 8(r3)
+ addi r5, r5, 32
+ std r30, 16(r3)
+ std r31, 24(r3)
+ addi r3, r3, 32
+ bdnz L(top) C decrement ctr and loop back
+
+L(end): ADDSUBC r28, r7, r6
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r28, 0(r3)
+ std r29, 8(r3)
+ std r30, 16(r3)
+ std r31, 24(r3)
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+
+ subfe r3, r0, r0 C -cy
+ GENRVAL
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm
new file mode 100644
index 0000000..0c12f9b
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -0,0 +1,225 @@
+dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1.
+
+dnl Copyright 1999-2001, 2003-2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 6-18 6-18
+C POWER4/PPC970 8 8.3
+C POWER5 8 8.25
+C POWER6 16.25 16.75
+C POWER7 3.77 4.9
+
+C TODO
+C * Try to reduce the number of needed live registers
+C * Add support for _1c entry points
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`vl', `r6')
+
+ifdef(`OPERATION_addmul_1',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_addmul_1)
+ define(func_nc, mpn_addmul_1c) C FIXME: not really supported
+ define(SM, `')
+')
+ifdef(`OPERATION_submul_1',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_submul_1)
+ define(func_nc, mpn_submul_1c) C FIXME: not really supported
+ define(SM, `$1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1)
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ std r30, -16(r1)
+ cmpdi cr6, r0, 2
+ std r29, -24(r1)
+ addi n, n, 3 C compute count...
+ std r28, -32(r1)
+ srdi n, n, 2 C ...for ctr
+ std r27, -40(r1)
+ mtctr n C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): ld r9, 0(up)
+ ld r28, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r12, r9, r6
+ ADDSUB r0, r0, r28
+ std r0, 0(rp)
+ addi rp, rp, 8
+ ld r9, 8(up)
+ ld r27, 16(up)
+ addi up, up, 24
+SM(` subfe r11, r11, r11 ')
+ b L(bot)
+
+ ALIGN(16)
+L(b00): ld r9, 0(up)
+ ld r27, 8(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ mulld r0, r9, r6
+ mulhdu r5, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ addc r7, r7, r5
+ addze r12, r8
+ ADDSUB r0, r0, r28
+ std r0, 0(rp)
+ ADDSUBC r7, r7, r29
+ std r7, 8(rp)
+ addi rp, rp, 16
+ ld r9, 16(up)
+ ld r27, 24(up)
+ addi up, up, 32
+SM(` subfe r11, r11, r11 ')
+ b L(bot)
+
+ ALIGN(16)
+L(b01): bdnz L(gt1)
+ ld r9, 0(up)
+ ld r11, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r8, r9, r6
+ ADDSUB r0, r0, r11
+ std r0, 0(rp)
+SM(` subfe r11, r11, r11 ')
+SM(` addic r11, r11, 1 ')
+ addze r3, r8
+ blr
+L(gt1): ld r9, 0(up)
+ ld r27, 8(up)
+ mulld r0, r9, r6
+ mulhdu r5, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 16(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ ld r30, 16(rp)
+ mulld r11, r9, r6
+ mulhdu r10, r9, r6
+ addc r7, r7, r5
+ adde r11, r11, r8
+ addze r12, r10
+ ADDSUB r0, r0, r28
+ std r0, 0(rp)
+ ADDSUBC r7, r7, r29
+ std r7, 8(rp)
+ ADDSUBC r11, r11, r30
+ std r11, 16(rp)
+ addi rp, rp, 24
+ ld r9, 24(up)
+ ld r27, 32(up)
+ addi up, up, 40
+SM(` subfe r11, r11, r11 ')
+ b L(bot)
+
+L(b10): addic r0, r0, 0
+ li r12, 0 C cy_limb = 0
+ ld r9, 0(up)
+ ld r27, 8(up)
+ bdz L(end)
+ addi up, up, 16
+
+ ALIGN(16)
+L(top): mulld r0, r9, r6
+ mulhdu r5, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r5 C 5 7
+ mulld r5, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r5, r5, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ ADDSUB r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ ADDSUBC r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ ADDSUBC r5, r5, r30 C 5 30
+ std r5, 16(rp) C 5
+ ADDSUBC r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+SM(` subfe r11, r11, r11 ')
+ addi rp, rp, 32
+L(bot):
+SM(` addic r11, r11, 1 ')
+ bdnz L(top)
+
+L(end): mulld r0, r9, r6
+ mulhdu r5, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r5
+ addze r8, r8
+ ADDSUB r0, r0, r28
+ std r0, 0(rp)
+ ADDSUBC r7, r7, r29
+ std r7, 8(rp)
+SM(` subfe r11, r11, r11 ')
+SM(` addic r11, r11, 1 ')
+ addze r3, r8
+ ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..2c5400a
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..447791a
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm
new file mode 100644
index 0000000..6158f54
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm
@@ -0,0 +1,187 @@
+dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+C cycles/limb
+C POWER3/PPC630 1.83 (1.5 c/l should be possible)
+C POWER4/PPC970 3 (2.0 c/l should be possible)
+C POWER5 3
+C POWER6 3.5-47
+C POWER7 3
+
+C TODO
+C * Try combining upx+up, and vpx+vp.
+C * The worst case 47 c/l for POWER6 happens if the 3rd operand for ldx is
+C greater than the 2nd operand. Yes, this addition is non-commutative wrt
+C performance.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+
+ifdef(`DO_add', `
+ define(`ADDSUBC', `addc $1, $2, $3')
+ define(`ADDSUBE', `adde $1, $2, $3')
+ define(INITCY, `addic $1, r1, 0')
+ define(RETVAL, `addze r3, $1')
+ define(`func', mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+ define(`ADDSUBC', `subfc $1, $2, $3')
+ define(`ADDSUBE', `subfe $1, $2, $3')
+ define(INITCY, `addic $1, r1, -1')
+ define(RETVAL, `subfze r3, $1
+ neg r3, r3')
+ define(`func', mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+ define(`ADDSUBC', `subfc $1, $3, $2')
+ define(`ADDSUBE', `subfe $1, $3, $2')
+ define(INITCY, `addic $1, r1, -1')
+ define(RETVAL, `addme r3, $1')
+ define(`func', mpn_rsblsh`'LSH`'_n)')
+
+define(`rpx', `r6')
+define(`upx', `r7')
+define(`vpx', `r12')
+
+define(`s0', `r0') define(`s1', `r9')
+define(`u0', `r8')
+define(`v0', `r10') define(`v1', `r11')
+
+
+ASM_START()
+PROLOGUE(func)
+ cmpldi cr0, n, 13
+ bgt L(big)
+
+ mtctr n C copy n in ctr
+ INITCY( r0) C clear cy
+
+ ld v0, 0(vp) C load v limb
+ ld u0, 0(up) C load u limb
+ addi up, up, -8 C update up
+ addi rp, rp, -8 C update rp
+ sldi s1, v0, LSH
+ bdz L(ex1) C If done, skip loop
+
+ ALIGN(16)
+L(lo0): ld v1, 8(vp) C load v limb
+ ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
+ ldu u0, 16(up) C load u limb and update up
+ srdi s0, v0, RSH C shift down previous v limb
+ std s1, 8(rp) C store result limb
+ rldimi s0, v1, LSH, 0 C left shift v limb and merge with prev v limb
+ bdz L(ex0) C decrement ctr and exit if done
+ ldu v0, 16(vp) C load v limb and update vp
+ ADDSUBE(s0, s0, u0) C add limbs with cy, set cy
+ ld u0, 8(up) C load u limb
+ srdi s1, v1, RSH C shift down previous v limb
+ stdu s0, 16(rp) C store result limb and update rp
+ rldimi s1, v0, LSH, 0 C left shift v limb and merge with prev v limb
+ bdnz L(lo0) C decrement ctr and loop back
+
+L(ex1): ADDSUBE(r7, s1, u0)
+ std r7, 8(rp) C store last result limb
+ srdi r0, v0, RSH
+ RETVAL( r0)
+ blr
+L(ex0): ADDSUBE(r7, s0, u0)
+ std r7, 16(rp) C store last result limb
+ srdi r0, v1, RSH
+ RETVAL( r0)
+ blr
+
+
+L(big): rldicl. r0, n, 0,63 C r0 = n & 1, set cr0
+ addi r6, n, -1 C ...for ctr
+ srdi r6, r6, 1 C ...for ctr
+ mtctr r6 C copy count into ctr
+ beq cr0, L(b0)
+
+L(b1): ld v1, 0(vp)
+ ld u0, 0(up)
+ sldi s1, v1, LSH
+ srdi s0, v1, RSH
+ ld v0, 8(vp)
+ ADDSUBC(s1, s1, u0) C add limbs without cy, set cy
+ addi rpx, rp, -16
+ addi rp, rp, -8
+ sub upx, up, rp
+ sub vpx, vp, rp
+ sub up, up, rpx
+ sub vp, vp, rpx
+ addi up, up, 8
+ addi upx, upx, 16
+ addi vp, vp, 16
+ addi vpx, vpx, 24
+ b L(mid)
+
+L(b0): ld v0, 0(vp)
+ ld u0, 0(up)
+ sldi s0, v0, LSH
+ srdi s1, v0, RSH
+ ld v1, 8(vp)
+ ADDSUBC(s0, s0, u0) C add limbs without cy, set cy
+ addi rpx, rp, -8
+ addi rp, rp, -16
+ sub upx, up, rpx
+ sub vpx, vp, rpx
+ sub up, up, rp
+ sub vp, vp, rp
+ addi up, up, 8
+ addi upx, upx, 16
+ addi vp, vp, 16
+ addi vpx, vpx, 24
+
+ ALIGN(32)
+L(top): ldx u0, rp, up
+ ldx v0, rp, vp
+ rldimi s1, v1, LSH, 0
+ stdu s0, 16(rp)
+ srdi s0, v1, RSH
+ ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
+L(mid): ldx u0, rpx, upx
+ ldx v1, rpx, vpx
+ rldimi s0, v0, LSH, 0
+ stdu s1, 16(rpx)
+ srdi s1, v0, RSH
+ ADDSUBE(s0, s0, u0) C add limbs with cy, set cy
+ bdnz L(top) C decrement CTR and loop back
+
+ ldx u0, rp, up
+ rldimi s1, v1, LSH, 0
+ std s0, 16(rp)
+ srdi s0, v1, RSH
+ ADDSUBE(s1, s1, u0) C add limbs with cy, set cy
+ std s1, 24(rp)
+
+ RETVAL( s0)
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm
new file mode 100644
index 0000000..45cded9
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm
@@ -0,0 +1,132 @@
+dnl PPC64 mpn_bdiv_dbm1c.
+
+dnl Copyright 2008, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8.25
+C POWER5 8.5 fluctuating as function of n % 3
+C POWER6 15
+C POWER7 4.75
+
+C TODO
+C * Nothing to do...
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`bd', `r6')
+define(`cy', `r7')
+
+ASM_START()
+PROLOGUE(mpn_bdiv_dbm1c)
+ ld r0, 0(r4)
+
+ rldicl. r12, r5, 0,62
+ cmpldi cr6, r12, 2
+ cmpldi cr7, r5, 4
+ addi r5, r5, 1
+ srwi r5, r5, 2
+ mtctr r5
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+ ALIGN(16)
+L(b11): mulld r5, r0, r6
+ mulhdu r12, r0, r6
+ ld r0, 8(r4)
+ addi r4, r4, -24
+ addi r3, r3, -24
+ b L(3)
+
+ ALIGN(16)
+L(b00): mulld r9, r0, r6
+ mulhdu r8, r0, r6
+ addi r4, r4, -16
+ addi r3, r3, -16
+ b L(0)
+
+ ALIGN(16)
+L(b01): mulld r5, r0, r6
+ mulhdu r12, r0, r6
+ addi r3, r3, -8
+ ble cr7, L(e1)
+ ld r0, 8(r4)
+ addi r4, r4, -8
+ b L(1)
+
+ ALIGN(16)
+L(b10): mulld r9, r0, r6
+ mulhdu r8, r0, r6
+ ble cr7, L(e2)
+
+ ALIGN(16)
+L(top): subfc r11, r9, r7
+ ld r10, 8(r4)
+ ld r0, 16(r4)
+ subfe r7, r8, r11
+ std r11, 0(r3)
+ mulld r5, r10, r6
+ mulhdu r12, r10, r6
+L(1): mulld r9, r0, r6
+ mulhdu r8, r0, r6
+ subfc r11, r5, r7
+ subfe r7, r12, r11
+ std r11, 8(r3)
+L(0): subfc r11, r9, r7
+ ld r10, 24(r4)
+ ld r0, 32(r4)
+ subfe r7, r8, r11
+ std r11, 16(r3)
+ mulld r5, r10, r6
+ mulhdu r12, r10, r6
+L(3): mulld r9, r0, r6
+ mulhdu r8, r0, r6
+ subfc r11, r5, r7
+ subfe r7, r12, r11
+ std r11, 24(r3)
+ addi r4, r4, 32
+ addi r3, r3, 32
+ bdnz L(top)
+
+L(e2): ld r10, 8(r4)
+ mulld r5, r10, r6
+ mulhdu r12, r10, r6
+ subfc r11, r9, r7
+ subfe r7, r8, r11
+ std r11, 0(r3)
+L(e1): subfc r11, r5, r7
+ std r11, 8(r3)
+ subfe r3, r12, r11
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm
new file mode 100644
index 0000000..307aafc
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm
@@ -0,0 +1,146 @@
+dnl PowerPC-64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb
+dnl divisor.
+
+dnl Copyright 2006, 2010, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm unorm
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16 16
+C POWER6 37 46
+C POWER7 12 12
+C POWER8 12 12
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`d', `r6')
+define(`di', `r7')
+define(`cnt',`r8')
+
+define(`tnc',`r10')
+
+ASM_START()
+
+EXTERN(binvert_limb_table)
+
+PROLOGUE(mpn_bdiv_q_1,toc)
+ addi r7, n, -1
+ cmpdi cr1, n, 1
+ ld r12, 0(up)
+ li cnt, 0
+ neg r0, d
+ and r0, d, r0
+ cntlzd r0, r0
+ subfic cnt, r0, 63
+ srd d, d, cnt
+L(7):
+ mtctr r7
+ LEA( r10, binvert_limb_table)
+ rldicl r11, d, 63, 57
+ lbzx r0, r10, r11
+ mulld r9, r0, r0
+ sldi r0, r0, 1
+ mulld r9, d, r9
+ subf r0, r9, r0
+ mulld r10, r0, r0
+ sldi r0, r0, 1
+ mulld r10, d, r10
+ subf r0, r10, r0
+ mulld r9, r0, r0
+ sldi r0, r0, 1
+ mulld r9, d, r9
+ subf di, r9, r0 C di = 1/d mod 2^64
+ifdef(`AIX',
+` C For AIX it is not clear how to jump into another function.
+ b .mpn_pi1_bdiv_q_1
+',`
+ C For non-AIX, dispatch into the pi1 variant.
+ bne cr0, L(norm)
+ b L(unorm)
+')
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+ cmpdi cr0, cnt, 0
+ ld r12, 0(up)
+ addic r0, n, -1 C set carry as side effect
+ cmpdi cr1, n, 1
+ mtctr r0
+ beq cr0, L(norm)
+
+L(unorm):
+ subfic tnc, cnt, 64 C set carry as side effect
+ li r5, 0
+ srd r11, r12, cnt
+ beq cr1, L(ed1)
+
+ ALIGN(16)
+L(tpu): ld r12, 8(up)
+ nop
+ addi up, up, 8
+ sld r0, r12, tnc
+ or r11, r11, r0
+ subfe r9, r5, r11
+ srd r11, r12, cnt
+ mulld r0, di, r9
+ mulhdu r5, r0, d
+ std r0, 0(rp)
+ addi rp, rp, 8
+ bdnz L(tpu)
+
+ subfe r11, r5, r11
+L(ed1): mulld r0, di, r11
+ std r0, 0(rp)
+ blr
+
+ ALIGN(16)
+L(norm):
+ mulld r11, r12, di
+ mulhdu r5, r11, d
+ std r11, 0(rp)
+ beqlr cr1
+
+ ALIGN(16)
+L(tpn): ld r9, 8(up)
+ addi up, up, 8
+ subfe r5, r5, r9
+ mulld r11, di, r5
+ mulhdu r5, r11, d C result not used in last iteration
+ std r11, 8(rp)
+ addi rp, rp, 8
+ bdnz L(tpn)
+
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm
new file mode 100644
index 0000000..24968c1
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm
@@ -0,0 +1,196 @@
+dnl PowerPC-64 mpn_cnd_add_n/mpn_cnd_sub_n.
+
+dnl Copyright 1999-2001, 2003-2005, 2007, 2011, 2012 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 2.25
+C POWER5 ?
+C POWER6 3
+C POWER7 2
+
+C INPUT PARAMETERS
+define(`cnd', `r3')
+define(`rp', `r4')
+define(`up', `r5')
+define(`vp', `r6')
+define(`n', `r7')
+
+ifdef(`OPERATION_cnd_add_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_cnd_add_n)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_cnd_sub_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_cnd_sub_n)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+
+ subfic cnd, cnd, 0
+ subfe cnd, cnd, cnd
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 3 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): ld r8, 0(up) C load s1 limb
+ ld r9, 0(vp) C load s2 limb
+ ld r10, 8(up) C load s1 limb
+ ld r11, 8(vp) C load s2 limb
+ ld r12, 16(up) C load s1 limb
+ addi up, up, 24
+ ld r0, 16(vp) C load s2 limb
+ addi vp, vp, 24
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ ADDSUB r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r29, 0(rp)
+ std r30, 8(rp)
+ std r31, 16(rp)
+ addi rp, rp, 24
+ bdnz L(go)
+ b L(ret)
+
+L(b01): ld r12, 0(up) C load s1 limb
+ addi up, up, 8
+ ld r0, 0(vp) C load s2 limb
+ addi vp, vp, 8
+ and r0, r0, cnd
+ ADDSUB r31, r0, r12 C add
+ std r31, 0(rp)
+ addi rp, rp, 8
+ bdnz L(go)
+ b L(ret)
+
+L(b10): ld r10, 0(up) C load s1 limb
+ ld r11, 0(vp) C load s2 limb
+ ld r12, 8(up) C load s1 limb
+ addi up, up, 16
+ ld r0, 8(vp) C load s2 limb
+ addi vp, vp, 16
+ and r11, r11, cnd
+ and r0, r0, cnd
+ ADDSUB r30, r11, r10 C add
+ ADDSUBC r31, r0, r12 C add
+ std r30, 0(rp)
+ std r31, 8(rp)
+ addi rp, rp, 16
+ bdnz L(go)
+ b L(ret)
+
+L(b00): CLRCB C clear/set cy
+L(go): ld r7, 0(up) C load s1 limb
+ ld r27, 0(vp) C load s2 limb
+ ld r8, 8(up) C load s1 limb
+ ld r9, 8(vp) C load s2 limb
+ ld r10, 16(up) C load s1 limb
+ ld r11, 16(vp) C load s2 limb
+ ld r12, 24(up) C load s1 limb
+ ld r0, 24(vp) C load s2 limb
+ and r27, r27, cnd
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ bdz L(end)
+
+ addi up, up, 32
+ addi vp, vp, 32
+
+L(top): ADDSUBC r28, r27, r7
+ ld r7, 0(up) C load s1 limb
+ ld r27, 0(vp) C load s2 limb
+ ADDSUBC r29, r9, r8
+ ld r8, 8(up) C load s1 limb
+ ld r9, 8(vp) C load s2 limb
+ ADDSUBC r30, r11, r10
+ ld r10, 16(up) C load s1 limb
+ ld r11, 16(vp) C load s2 limb
+ ADDSUBC r31, r0, r12
+ ld r12, 24(up) C load s1 limb
+ ld r0, 24(vp) C load s2 limb
+ std r28, 0(rp)
+ addi up, up, 32
+ std r29, 8(rp)
+ addi vp, vp, 32
+ std r30, 16(rp)
+ std r31, 24(rp)
+ addi rp, rp, 32
+ and r27, r27, cnd
+ and r9, r9, cnd
+ and r11, r11, cnd
+ and r0, r0, cnd
+ bdnz L(top) C decrement ctr and loop back
+
+L(end): ADDSUBC r28, r27, r7
+ ADDSUBC r29, r9, r8
+ ADDSUBC r30, r11, r10
+ ADDSUBC r31, r0, r12
+ std r28, 0(rp)
+ std r29, 8(rp)
+ std r30, 16(rp)
+ std r31, 24(rp)
+
+L(ret): ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+
+ subfe r3, r0, r0 C -cy
+ GENRVAL
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm
new file mode 100644
index 0000000..c2d10bd
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm
@@ -0,0 +1,135 @@
+dnl PowerPC-64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2006, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm unorm
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16 16
+C POWER6 37 46
+C POWER7 12 12
+C POWER8 12 12
+
+C TODO
+C * Check if n=1 code is really an improvement. It probably isn't.
+C * Make more similar to mode1o.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`d', `r6')
+
+
+ASM_START()
+
+EXTERN(binvert_limb_table)
+
+PROLOGUE(mpn_divexact_1,toc)
+	addic.	n, n, -1		C n--; cr0 for n==1 test, CA set for n>=1
+	ld	r12, 0(up)		C up[0]
+	bne	cr0, L(2)		C more than one limb?
+	divdu	r0, r12, d		C n==1: a single plain division suffices
+	std	r0, 0(rp)
+	blr
+L(2):
+	rldicl.	r0, d, 0, 63		C r0 = d & 1; cr0 tells if d is odd
+	li	r10, 0			C shift count 0 for odd d
+	bne	cr0, L(7)
+	neg	r0, d
+	and	r0, d, r0		C d & -d = lowest set bit of d
+	cntlzd	r0, r0
+	subfic	r0, r0, 63		C r0 = count of trailing zeros of d
+	rldicl	r10, r0, 0, 32		C zero-extend shift count
+	srd	d, d, r0		C make d odd
+L(7):
+	mtctr	n			C loop count
+	LEA(	r5, binvert_limb_table)
+	rldicl	r11, d, 63, 57		C (d/2) & 0x7f, table index
+	lbzx	r0, r5, r11		C 8-bit inverse of d from table
+	mulld	r9, r0, r0		C Newton step: 8 -> 16 valid bits
+	sldi	r0, r0, 1
+	mulld	r9, d, r9
+	subf	r0, r9, r0
+	mulld	r5, r0, r0		C Newton step: 16 -> 32 valid bits
+	sldi	r0, r0, 1
+	mulld	r5, d, r5
+	subf	r0, r5, r0
+	mulld	r9, r0, r0		C Newton step: 32 -> 64 valid bits
+	sldi	r0, r0, 1
+	mulld	r9, d, r9
+	subf	r7, r9, r0	C r7 = 1/d mod 2^64
+
+	bne	cr0, L(norm)		C cr0 still from rldicl.: d odd from start?
+	subfic	r8, r10, 64	C set carry as side effect
+	li	r5, 0			C borrow accumulator
+	srd	r11, r12, r10		C low part of first shifted limb
+
+	ALIGN(16)
+L(loop0):				C even-d loop: shift limbs down as we go
+	ld	r12, 8(up)		C next limb
+	nop
+	addi	up, up, 8
+	sld	r0, r12, r8		C bits borrowed from next limb
+	or	r11, r11, r0		C full shifted limb
+	subfe	r9, r5, r11		C limb - borrow (CA carry chain)
+	srd	r11, r12, r10
+	mulld	r0, r7, r9		C quotient limb = limb * (1/d mod 2^64)
+	mulhdu	r5, r0, d		C borrow for next iteration
+	std	r0, 0(rp)
+	addi	rp, rp, 8
+	bdnz	L(loop0)
+
+	subfe	r0, r5, r11		C last (shifted-in) limb minus borrow
+	mulld	r0, r7, r0
+	std	r0, 0(rp)
+	blr
+
+	ALIGN(16)
+L(norm):				C odd-d loop
+	mulld	r11, r12, r7		C first quotient limb = up[0] * inverse
+	mulhdu	r5, r11, d		C borrow for next limb
+	std	r11, 0(rp)
+	ALIGN(16)
+L(loop1):
+	ld	r9, 8(up)
+	addi	up, up, 8
+	subfe	r5, r5, r9		C limb - borrow (CA still set by addic.)
+	mulld	r11, r7, r5		C next quotient limb
+	mulhdu	r5, r11, d	C result not used in last iteration
+	std	r11, 8(rp)
+	addi	rp, rp, 8
+	bdnz	L(loop1)
+
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm
new file mode 100644
index 0000000..b283877
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm
@@ -0,0 +1,274 @@
+dnl PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.
+
+dnl Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm unorm frac
+C POWER3/PPC630 16-34 16-34 ~11 outdated figures
+C POWER4/PPC970 28 28 19
+C POWER5 29 29 ~19
+C POWER6 49 59 ~42
+C POWER7 24.5 23 ~14
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C d = r7
+
+C We use a not very predictable branch in the frac code, therefore the cycle
+C count wobbles somewhat. With the alternative branch-free code, things run
+C considerably slower on POWER4/PPC970 and POWER5.
+
+C TODO: Add preinv entry point.
+
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_divrem_1,toc)
+
+	mfcr	r12			C save caller's CR (cr4 is clobbered below)
+	add.	r10, r6, r4		C r10 = un + fn = total quotient limbs
+	std	r25, -56(r1)
+	mr	r25, r4			C r25 = fn
+	mflr	r0
+	std	r26, -48(r1)
+	mr	r26, r5			C r26 = up
+	std	r28, -32(r1)
+	mr	r28, r6			C r28 = un
+	std	r29, -24(r1)
+	mr	r29, r3			C r29 = qp
+	li	r3, 0			C return value (remainder) = 0 so far
+	std	r30, -16(r1)
+	mr	r30, r7			C r30 = d
+	std	r31, -8(r1)
+	li	r31, 0			C r31 = partial remainder
+	std	r27, -40(r1)
+	std	r0, 16(r1)		C save LR
+	stw	r12, 8(r1)
+	stdu	r1, -176(r1)		C allocate stack frame
+	beq-	cr0, L(1)		C un + fn == 0: return 0
+	cmpdi	cr7, r7, 0		C cr7 "lt" <=> d's high bit set
+	sldi	r0, r10, 3
+	add	r11, r0, r29
+	addi	r29, r11, -8		C r29 = &qp[un+fn-1]; store downwards
+	blt-	cr7, L(162)		C d already normalized?
+	cmpdi	cr4, r6, 0
+	beq+	cr4, L(71)		C un == 0: fraction limbs only
+L(163):
+	sldi	r9, r6, 3
+	add	r9, r9, r5
+	ld	r7, -8(r9)		C r7 = up[un-1], high dividend limb
+	cmpld	cr7, r7, r30
+	bge-	cr7, L(71)		C high limb >= d: no free quotient limb
+	cmpdi	cr7, r10, 1
+	li	r0, 0
+	mr	r31, r7			C remainder = high limb
+	std	r0, -8(r11)		C top quotient limb is 0
+	addi	r29, r29, -8
+	mr	r3, r7
+	beq-	cr7, L(1)		C that was the only limb: done
+	addi	r28, r6, -1		C un--
+	cmpdi	cr4, r28, 0
+L(71):
+	cntlzd	r27, r30		C r27 = normalization shift count
+	sld	r30, r30, r27		C normalize d
+	sld	r31, r31, r27
+	mr	r3, r30
+	CALL(	mpn_invert_limb)	C r3 = inverse of normalized d
+	beq-	cr4, L(110)		C no integer-part limbs left
+	sldi	r9, r28, 3
+	addic.	r6, r28, -2
+	add	r9, r9, r26
+	subfic	r5, r27, 64		C r5 = 64 - shift count
+	ld	r8, -8(r9)		C up[un-1]
+	srd	r0, r8, r5
+	or	r31, r31, r0		C fold high bits into remainder
+	sld	r7, r8, r27		C pending low bits
+	blt-	cr0, L(154)		C only one limb in integer part
+	addi	r28, r28, -1
+	mtctr	r28			C loop over remaining integer limbs
+	sldi	r6, r6, 3
+	ALIGN(16)
+L(uloop):				C unnormalized main loop
+	ldx	r8, r26, r6		C next dividend limb
+	nop
+	mulld	r0, r31, r3		C (qh,ql) = remainder * dinv
+	mulhdu	r10, r31, r3
+	addi	r11, r31, 1
+	srd	r9, r8, r5
+	addi	r6, r6, -8
+	or	r9, r7, r9		C assembled shifted dividend limb
+	addc	r0, r0, r9
+	adde	r10, r10, r11		C candidate quotient limb
+	mulld	r31, r10, r30
+	subf	r31, r31, r9		C candidate remainder
+	subfc	r0, r31, r0	C r <= ql
+	subfe	r0, r0, r0	C r0 = -(r <= ql)
+	and	r9, r30, r0
+	add	r31, r31, r9		C conditionally add d back
+	add	r10, r0, r10	C qh -= (r >= ql)
+	cmpld	cr7, r31, r30
+	bge-	cr7, L(164)		C rare extra adjustment
+L(123):
+	std	r10, 0(r29)		C store quotient limb
+	addi	r29, r29, -8
+	sld	r7, r8, r27
+	bdnz	L(uloop)
+L(154):					C final integer-part limb
+	addi	r11, r31, 1
+	nop
+	mulld	r0, r31, r3
+	mulhdu	r8, r31, r3
+	addc	r0, r0, r7
+	adde	r8, r8, r11		C candidate quotient limb
+	mulld	r31, r8, r30
+	subf	r31, r31, r7		C candidate remainder
+	subfc	r0, r0, r31	C r >= ql
+	subfe	r0, r0, r0	C r0 = -(r >= ql)
+	not	r7, r0
+	add	r8, r7, r8	C qh -= (r >= ql)
+	andc	r0, r30, r0
+	add	r31, r31, r0		C conditionally add d back
+	cmpld	cr7, r31, r30
+	bge-	cr7, L(165)		C rare extra adjustment
+L(134):
+	std	r8, 0(r29)
+	addi	r29, r29, -8
+L(110):					C fraction part (dividend limbs all zero)
+	addic.	r0, r25, -1
+	blt-	cr0, L(156)		C fn == 0: finish up
+	mtctr	r25
+	neg	r9, r30			C r9 = -d
+	ALIGN(16)
+L(ufloop):				C fraction loop
+	addi	r11, r31, 1
+	nop
+	mulld	r0, r3, r31
+	mulhdu	r10, r3, r31
+	add	r10, r10, r11		C candidate quotient limb
+	mulld	r31, r9, r10		C candidate remainder = -d * qh
+ifelse(0,1,`
+	subfc	r0, r0, r31
+	subfe	r0, r0, r0	C r0 = -(r >= ql)
+	not	r7, r0
+	add	r10, r7, r10	C qh -= (r >= ql)
+	andc	r0, r30, r0
+	add	r31, r31, r0
+',`
+	cmpld	cr7, r31, r0	C r >= ql?
+	blt	cr7, L(29)
+	add	r31, r30, r31	C add d back
+	addi	r10, r10, -1	C and decrement quotient limb
+L(29):
+')
+	std	r10, 0(r29)
+	addi	r29, r29, -8
+	bdnz	L(ufloop)
+L(156):
+	srd	r3, r31, r27		C denormalize remainder for return
+L(1):					C restore and return
+	addi	r1, r1, 176
+	ld	r0, 16(r1)
+	lwz	r12, 8(r1)
+	mtlr	r0
+	ld	r25, -56(r1)
+	ld	r26, -48(r1)
+	mtcrf	8, r12			C restore cr4
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+L(162):					C d already normalized on entry
+	cmpdi	cr7, r6, 0
+	beq-	cr7, L(8)
+	sldi	r9, r6, 3
+	addi	r29, r29, -8
+	add	r9, r9, r5
+	addi	r28, r6, -1
+	ld	r31, -8(r9)		C up[un-1]
+	subfc	r9, r7, r31		C CA = (up[un-1] >= d)
+	li	r9, 0
+	adde	r9, r9, r9		C r9 = that carry, 0 or 1
+	neg	r0, r9
+	std	r9, -8(r11)		C top quotient limb is 0 or 1
+	and	r0, r0, r7
+	subf	r31, r0, r31		C conditionally subtract d once
+L(8):
+	mr	r3, r30
+	CALL(	mpn_invert_limb)	C r3 = inverse of d
+	li	r27, 0			C no shift in normalized case
+	addic.	r6, r28, -1
+	blt-	cr0, L(110)
+	mtctr	r28
+	sldi	r6, r6, 3
+	ALIGN(16)
+L(nloop):				C normalized main loop
+	addi	r11, r31, 1
+	ldx	r8, r26, r6		C next dividend limb
+	mulld	r0, r31, r3
+	mulhdu	r10, r31, r3
+	addi	r6, r6, -8
+	addc	r0, r0, r8
+	adde	r10, r10, r11		C candidate quotient limb
+	mulld	r31, r10, r30
+	subf	r31, r31, r8	C r = nl - qh * d
+	subfc	r0, r31, r0	C r <= ql
+	subfe	r0, r0, r0	C r0 = -(r <= ql)
+	and	r9, r30, r0
+	add	r31, r31, r9		C conditionally add d back
+	add	r10, r0, r10	C qh -= (r >= ql)
+	cmpld	cr7, r31, r30
+	bge-	cr7, L(167)		C rare extra adjustment
+L(51):
+	std	r10, 0(r29)		C store quotient limb
+	addi	r29, r29, -8
+	bdnz	L(nloop)
+	b	L(110)
+
+L(164):					C remainder still >= d: adjust once more
+	subf	r31, r30, r31
+	addi	r10, r10, 1
+	b	L(123)
+L(167):					C remainder still >= d: adjust once more
+	subf	r31, r30, r31
+	addi	r10, r10, 1
+	b	L(51)
+L(165):					C remainder still >= d: adjust once more
+	subf	r31, r30, r31
+	addi	r8, r8, 1
+	b	L(134)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm
new file mode 100644
index 0000000..752c3d6
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm
@@ -0,0 +1,187 @@
+dnl PPC-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm frac
+C POWER3/PPC630
+C POWER4/PPC970 ? ?
+C POWER5 37 ?
+C POWER6 62 ?
+C POWER7		30.5	 ?
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C dp = r7
+
+
+ifdef(`DARWIN',,`
+define(`r2',`r31')') C FIXME!
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_divrem_2,toc)
+	mflr	r0
+	std	r23, -72(r1)		C save callee-saved registers
+	std	r24, -64(r1)
+	std	r25, -56(r1)
+	std	r26, -48(r1)
+	std	r27, -40(r1)
+	std	r28, -32(r1)
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+	std	r0, 16(r1)		C save LR
+	stdu	r1, -192(r1)
+	mr	r24, r3			C r24 = qp
+	mr	r25, r4			C r25 = fn
+	sldi	r0, r6, 3
+	add	r26, r5, r0
+	addi	r26, r26, -24		C r26 = &up[un-3]
+	ld	r30, 8(r7)		C r30 = d1, high divisor limb
+	ld	r28, 0(r7)		C r28 = d0, low divisor limb
+	ld	r29, 16(r26)		C r29 = up[un-1]
+	ld	r31, 8(r26)		C r31 = up[un-2]
+
+ifelse(0,1,`
+	li	r23, 0
+	cmpld	cr7, r29, r30
+	blt	cr7, L(8)
+	bgt	cr7, L(9)
+	cmpld	cr0, r31, r28
+	blt	cr0, L(8)
+L(9):	subfc	r31, r28, r31
+	subfe	r29, r30, r29
+	li	r23, 1
+',`
+	li	r23, 0			C most significant quotient limb = 0
+	cmpld	cr7, r29, r30
+	blt	cr7, L(8)		C top < d1: no initial subtraction
+	mfcr	r0
+	rlwinm	r0, r0, 30, 31, 31	C r0 = cr7 "gt" bit, i.e. top > d1
+	subfc	r9, r28, r31		C CA = (up[un-2] >= d0)
+	addze.	r0, r0
+	nop
+	beq	cr0, L(8)		C {r29,r31} < {d1,d0}: skip
+	subfc	r31, r28, r31		C subtract divisor once
+	subfe	r29, r30, r29
+	li	r23, 1			C most significant quotient limb = 1
+')
+
+L(8):
+	add	r27, r25, r6		C r27 = fn + un
+	addic.	r27, r27, -3
+	blt	cr0, L(18)		C no quotient limbs to compute
+	mr	r3, r30
+	CALL(	mpn_invert_limb)	C invert high divisor limb
+	mulld	r10, r3, r30		C fold d0 into the inverse:
+	mulhdu	r0, r3, r28
+	addc	r8, r10, r28
+	subfe	r11, r1, r1		C r11 = -borrow
+	addc	r10, r8, r0
+	addze.	r11, r11
+	blt	cr0, L(91)
+L(40):					C decrement inverse while sum overflows
+	subfc	r10, r30, r10
+	addme.	r11, r11
+	addi	r3, r3, -1
+	bge	cr0, L(40)
+L(91):
+	addi	r5, r27, 1
+	mtctr	r5			C loop count = fn + un - 2
+	sldi	r0, r27, 3
+	add	r24, r24, r0		C point at most significant quotient limb
+	ALIGN(16)
+L(loop):
+	mulhdu	r8, r29, r3		C candidate quotient limb q
+	mulld	r6, r29, r3
+	addc	r6, r6, r31
+	adde	r8, r8, r29
+	cmpd	cr7, r27, r25		C past the integer part?
+	mulld	r0, r30, r8		C q * d1
+	mulhdu	r11, r28, r8		C q * d0
+	mulld	r10, r28, r8
+	subf	r31, r0, r31
+	li	r7, 0			C fraction: next dividend limb is 0
+	blt	cr7, L(60)
+	ld	r7, 0(r26)		C next dividend limb
+	addi	r26, r26, -8
+	nop
+L(60):	subfc	r7, r28, r7		C subtract q*{d1,d0} plus the divisor
+	subfe	r31, r30, r31
+	subfc	r7, r10, r7
+	subfe	r4, r11, r31
+	subfc	r9, r6, r4
+	subfe	r9, r1, r1		C r9 = -borrow mask
+	andc	r6, r28, r9
+	andc	r0, r30, r9
+	addc	r31, r7, r6		C conditionally add divisor back
+	adde	r29, r4, r0
+	subf	r8, r9, r8		C and adjust q accordingly
+	cmpld	cr7, r29, r30
+	bge-	cr7, L(fix)		C rare second adjustment
+L(bck):	std	r8, 0(r24)		C store quotient limb
+	addi	r24, r24, -8
+	addi	r27, r27, -1
+	bdnz	L(loop)
+L(18):
+	std	r31, 8(r26)		C store 2-limb remainder
+	std	r29, 16(r26)
+	mr	r3, r23			C return most significant quotient limb
+	addi	r1, r1, 192
+	ld	r0, 16(r1)
+	mtlr	r0
+	ld	r23, -72(r1)		C restore callee-saved registers
+	ld	r24, -64(r1)
+	ld	r25, -56(r1)
+	ld	r26, -48(r1)
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+L(fix):					C remainder high limb >= d1
+	mfcr	r0
+	rlwinm	r0, r0, 30, 31, 31	C r0 = cr7 "gt" bit, i.e. r29 > d1
+	subfc	r9, r28, r31		C CA = (r31 >= d0)
+	addze.	r0, r0
+	beq	cr0, L(bck)		C {r29,r31} < {d1,d0}: done
+	subfc	r31, r28, r31		C subtract divisor once more
+	subfe	r29, r30, r29
+	addi	r8, r8, 1		C and increment quotient limb
+	b	L(bck)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm b/gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm
new file mode 100644
index 0000000..f9792e5
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm
@@ -0,0 +1,77 @@
+dnl PowerPC-64 mpn_gcd_11.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C POWER3/PPC630 ?
+C POWER4/PPC970 8.5 obsolete
+C POWER5 ?
+C POWER6 ?
+C POWER7 9.4 obsolete
+C POWER8 ?
+C POWER9 ?
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
+
+define(`u0', `r3')
+define(`v0', `r4')
+
+define(`mask', `r0')dnl
+define(`a1', `r4')dnl
+define(`a2', `r5')dnl
+define(`d1', `r6')dnl
+define(`d2', `r7')dnl
+define(`cnt', `r9')dnl
+
+ASM_START()
+PROLOGUE(mpn_gcd_11)
+	li	r12, 63			C constant for 63 - cnt below
+	mr	r8, v0			C r8 = d
+	subf.	r10, u0, v0	C r10 = d - a
+	beq	L(end)			C a == d: gcd found
+
+	ALIGN(16)
+L(top):	subfc	r11, r8, r3	C r11 = a - d
+	and	d2, r11, r10		C (a-d) & (d-a) = lowest set bit of a-d
+	subfe	mask, mask, mask	C mask = -(a < d)
+	cntlzd	cnt, d2
+	and	a1, r10, mask	C d - a
+	andc	a2, r11, mask	C a - d
+	and	d1, r3, mask	C a
+	andc	d2, r8, mask	C d
+	or	r3, a1, a2	C new a = |a - d|
+	subf	cnt, cnt, r12		C cnt = index of lowest set bit
+	or	r8, d1, d2	C new d = min(a, d)
+	srd	r3, r3, cnt		C shift out low zero bits of new a
+	subf.	r10, r3, r8	C r10 = d - a
+	bne	L(top)
+
+L(end):	blr			C result in r3
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h
new file mode 100644
index 0000000..f8305f4
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h
@@ -0,0 +1,82 @@
+/* PowerPC-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64	/* bits per limb */
+#define GMP_LIMB_BYTES 8	/* bytes per limb */
+
+/* 1600MHz PPC970 */
+
+/* Generated by tuneup.c, 2009-01-14, gcc 4.0 */
+
+#define MUL_TOOM22_THRESHOLD 14
+#define MUL_TOOM33_THRESHOLD 93
+#define MUL_TOOM44_THRESHOLD 135
+
+#define SQR_BASECASE_THRESHOLD 6
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 74
+#define SQR_TOOM4_THRESHOLD 136
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 44
+#define MULLO_MUL_N_THRESHOLD 234
+
+#define DIV_SB_PREINV_THRESHOLD 0 /* always */
+#define DIV_DC_THRESHOLD 33
+#define POWM_THRESHOLD 89
+
+#define MATRIX22_STRASSEN_THRESHOLD 15
+#define HGCD_THRESHOLD 93
+#define GCD_DC_THRESHOLD 237
+#define GCDEXT_DC_THRESHOLD 273
+#define JACOBI_BASE_METHOD 1
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_1_THRESHOLD 6
+#define MOD_1_2_THRESHOLD 9
+#define MOD_1_4_THRESHOLD 23
+#define USE_PREINV_DIVREM_1 0
+#define USE_PREINV_MOD_1 0
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_DC_THRESHOLD 650
+#define SET_STR_PRECOMPUTE_THRESHOLD 1713
+
+#define MUL_FFT_TABLE { 336, 672, 1856, 2816, 7168, 20480, 81920, 327680, 0 }
+#define MUL_FFT_MODF_THRESHOLD 304
+#define MUL_FFT_THRESHOLD 4224
+
+#define SQR_FFT_TABLE { 272, 672, 1600, 2816, 7168, 20480, 81920, 327680, 786432, 0 }
+#define SQR_FFT_MODF_THRESHOLD 272
+#define SQR_FFT_THRESHOLD 2688
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm b/gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm
new file mode 100644
index 0000000..dfdba64
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm
@@ -0,0 +1,88 @@
+dnl PowerPC-64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl Copyright 2004-2006, 2008, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb (approximate)
+C POWER3/PPC630 80
+C POWER4/PPC970 86
+C POWER5 86
+C POWER6 170
+C POWER7 66
+
+ASM_START()
+PROLOGUE(mpn_invert_limb,toc)
+	LEAL(	r12, approx_tab)	C address of 16-bit reciprocal table
+	srdi	r9, r3, 32
+	rlwinm	r9, r9, 10, 23, 30	C (d >> 55) & 0x1fe
+	srdi	r10, r3, 24	C d >> 24
+	lis	r11, 0x1000
+	rldicl	r8, r3, 0, 63	C d mod 2
+	addi	r10, r10, 1	C d40
+	sldi	r11, r11, 32	C 2^60
+	srdi	r7, r3, 1	C d/2
+	add	r7, r7, r8	C d63 = ceil(d/2)
+	neg	r8, r8	C mask = -(d mod 2)
+	lhzx	r0, r9, r12		C v0 = initial approximation from table
+	mullw	r9, r0, r0	C v0*v0
+	sldi	r6, r0, 11	C v0 << 11
+	addi	r0, r6, -1	C (v0 << 11) - 1
+	mulld	r9, r9, r10	C v0*v0*d40
+	srdi	r9, r9, 40	C v0*v0*d40 >> 40
+	subf	r9, r9, r0	C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
+	mulld	r0, r9, r10	C v1*d40
+	sldi	r6, r9, 13	C v1 << 13
+	subf	r0, r0, r11	C 2^60 - v1*d40
+	mulld	r0, r0, r9	C v1 * (2^60 - v1*d40)
+	srdi	r0, r0, 47	C v1 * (2^60 - v1*d40) >> 47
+	add	r0, r0, r6	C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
+	mulld	r11, r0, r7	C v2 * d63
+	srdi	r10, r0, 1	C v2 >> 1
+	sldi	r9, r0, 31	C v2 << 31
+	and	r8, r10, r8	C (v2 >> 1) & mask
+	subf	r8, r11, r8	C ((v2 >> 1) & mask) - v2 * d63
+	mulhdu	r0, r8, r0	C p1 = v2 * (((v2 >> 1) & mask) - v2 * d63)
+	srdi	r0, r0, 1	C p1 >> 1
+	add	r0, r0, r9	C v3 = (v2 << 31) + (p1 >> 1)
+	nop
+	mulld	r11, r0, r3		C low(v3 * d)
+	mulhdu	r9, r0, r3		C high(v3 * d)
+	addc	r10, r11, r3		C low(v3 * d + d)
+	adde	r3, r9, r3		C high(v3 * d + d) + d
+	subf	r3, r3, r0		C v4 = v3 - high(v3 * d + d) - d
+	blr
+EPILOGUE()
+
+DEF_OBJECT(approx_tab)	C 256 16-bit entries: floor(0x7fd00/i), i = 256..511
+forloop(i,256,512-1,dnl
+`	.short	eval(0x7fd00/i)
+')dnl
+END_OBJECT(approx_tab)
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm
new file mode 100644
index 0000000..8733730
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm
@@ -0,0 +1,164 @@
+dnl PowerPC-64 mpn_mod_1_1p
+
+dnl Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 17
+C POWER5 16
+C POWER6 30
+C POWER7 10.2
+
+C TODO
+C * Optimise, in particular the cps function. This was compiler-generated and
+C then hand optimised.
+
+C INPUT PARAMETERS
+define(`ap', `r3')
+define(`n', `r4')
+define(`d', `r5')
+define(`cps', `r6')
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_mod_1_1p)
+	sldi	r10, r4, 3
+	addi	r4, r4, -1
+	add	r3, r3, r10		C r3 = ap + 8*n, end of operand
+	ld	r0, 16(r6)	C B1modb
+	ld	r12, 24(r6)	C B2modb
+	ld	r9, -8(r3)		C ap[n-1]
+	ld	r10, -16(r3)		C ap[n-2]
+	mtctr	r4
+	mulhdu	r8, r9, r0		C ap[n-1] * B1modb
+	mulld	r7, r9, r0
+	addc	r11, r7, r10		C (r9,r11) = ap[n-1] * B1modb + ap[n-2]
+	addze	r9, r8
+	bdz	L(end)
+
+	ALIGN(16)
+L(top):	ld	r4, -24(r3)		C next (lower) limb
+	addi	r3, r3, -8
+	nop
+	mulld	r10, r11, r0		C rl * B1modb
+	mulld	r8, r9, r12		C rh * B2modb
+	mulhdu	r11, r11, r0
+	mulhdu	r9, r9, r12
+	addc	r7, r10, r4
+	addze	r10, r11
+	addc	r11, r8, r7		C (r9,r11) = rl*B1modb + rh*B2modb + limb
+	adde	r9, r9, r10
+	bdnz	L(top)
+
+L(end):
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+`	lwz	r0, 8(r6)',
+`	lwz	r0, 12(r6)')		C r0 = cnt, normalization shift
+	ld	r3, 0(r6)		C r3 = inverse of normalized b
+	cmpdi	cr7, r0, 0
+	beq-	cr7, L(4)
+	subfic	r10, r0, 64
+	sld	r9, r9, r0		C normalize the 2-limb residue
+	srd	r10, r11, r10
+	or	r9, r10, r9
+L(4):	subfc	r10, r5, r9		C CA = (rh >= b)
+	subfe	r10, r10, r10
+	nand	r10, r10, r10		C r10 = -(rh >= b)
+	sld	r11, r11, r0
+	and	r10, r10, r5
+	subf	r9, r10, r9		C reduce rh below b
+	mulhdu	r10, r9, r3		C preinv division of {r9,r11} by b:
+	mulld	r3, r9, r3
+	addi	r9, r9, 1
+	addc	r8, r3, r11
+	adde	r3, r10, r9		C candidate quotient
+	mulld	r3, r3, r5
+	subf	r3, r3, r11		C candidate remainder
+	cmpld	cr7, r8, r3
+	bge	cr7, L(5)	C FIXME: Make branch-less
+	add	r3, r3, r5		C add b back
+L(5):	cmpld	cr7, r3, r5
+	bge-	cr7, L(10)		C final subtraction needed?
+	srd	r3, r3, r0		C denormalize remainder
+	blr
+
+L(10):	subf	r3, r5, r3		C final subtraction of b
+	srd	r3, r3, r0
+	blr
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps,toc)
+	mflr	r0
+	std	r29, -24(r1)		C save callee-saved registers
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+	cntlzd	r31, r4			C r31 = cnt = leading zeros of b
+	std	r0, 16(r1)
+	extsw	r31, r31
+	mr	r29, r3			C r29 = cps
+	stdu	r1, -144(r1)
+	sld	r30, r4, r31		C r30 = normalized b
+	mr	r3, r30
+	CALL(	mpn_invert_limb)	C r3 = inverse of normalized b
+	cmpdi	cr7, r31, 0
+	neg	r0, r30
+	beq-	cr7, L(13)		C b already normalized?
+	subfic	r11, r31, 64
+	li	r0, 1
+	neg	r9, r30
+	srd	r11, r3, r11
+	sld	r0, r0, r31
+	or	r0, r11, r0
+	mulld	r0, r0, r9		C r0 = B1modb
+L(13):	mulhdu	r9, r0, r3		C compute B2modb from B1modb
+	mulld	r11, r0, r3
+	add	r9, r0, r9
+	nor	r9, r9, r9
+	mulld	r9, r9, r30
+	cmpld	cr7, r11, r9
+	bge	cr7, L(14)
+	add	r9, r9, r30		C adjustment
+L(14):	addi	r1, r1, 144		C pop frame
+	srd	r0, r0, r31
+	std	r31, 8(r29)		C cps[1] = cnt
+	std	r3, 0(r29)		C cps[0] = inverse
+	std	r0, 16(r29)		C cps[2] = B1modb >> cnt
+	ld	r0, 16(r1)
+	srd	r9, r9, r31
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	std	r9, 24(r29)		C cps[3] = B2modb >> cnt
+	ld	r29, -24(r1)
+	mtlr	r0
+	blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm
new file mode 100644
index 0000000..0b7d6bf
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm
@@ -0,0 +1,270 @@
+dnl PowerPC-64 mpn_mod_1s_4p
+
+dnl Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 9
+C POWER5 9
+C POWER6 13
+C POWER7 3.5
+
+C TODO
+C * Optimise, in particular the cps function. This was compiler-generated and
+C then hand optimised.
+
+C INPUT PARAMETERS
+define(`ap', `r3')
+define(`n', `r4')
+define(`d', `r5')
+define(`cps', `r6')
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_mod_1s_4p)
+	std	r23, -72(r1)		C save callee-saved registers
+	ld	r23, 48(cps)		C r23 = B5modb
+	std	r24, -64(r1)
+	std	r25, -56(r1)
+	ld	r24, 32(cps)		C r24 = B3modb
+	ld	r25, 24(cps)		C r25 = B2modb
+	std	r26, -48(r1)
+	std	r27, -40(r1)
+	ld	r26, 16(cps)		C r26 = B1modb
+	std	r28, -32(r1)
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+	ld	r30, 40(cps)		C r30 = B4modb
+
+	rldicl.	r0, n, 0,62		C r0 = n mod 4
+	sldi	r31, n, 3
+	add	ap, ap, r31	C make ap point at end of operand
+
+	cmpdi	cr7, r0, 2
+	beq	cr0, L(b00)		C n mod 4 == 0
+	blt	cr7, L(b01)
+	beq	cr7, L(b10)
+
+L(b11):	ld	r11, -16(ap)		C n mod 4 == 3: fold top 3 limbs
+	ld	r9, -8(ap)
+	ld	r0, -24(ap)
+	mulhdu	r27, r11, r26		C * B1modb
+	mulld	r8, r11, r26
+	mulhdu	r11, r9, r25		C * B2modb
+	mulld	r9, r9, r25
+	addc	r31, r8, r0
+	addze	r10, r27
+	addc	r0, r9, r31
+	adde	r9, r11, r10		C residue in (r9,r0)
+	addi	ap, ap, -40
+	b	L(6)
+
+	ALIGN(16)
+L(b00):	ld	r11, -24(ap)		C n mod 4 == 0: fold top 4 limbs
+	ld	r10, -16(ap)
+	ld	r9, -8(ap)
+	ld	r0, -32(ap)
+	mulld	r8, r11, r26		C * B1modb
+	mulhdu	r7, r10, r25		C * B2modb
+	mulhdu	r27, r11, r26
+	mulhdu	r11, r9, r24		C * B3modb
+	mulld	r10, r10, r25
+	mulld	r9, r9, r24
+	addc	r31, r8, r0
+	addze	r0, r27
+	addc	r8, r31, r10
+	adde	r10, r0, r7
+	addc	r0, r9, r8
+	adde	r9, r11, r10		C residue in (r9,r0)
+	addi	ap, ap, -48
+	b	L(6)
+
+	ALIGN(16)
+L(b01):	li	r9, 0			C n mod 4 == 1: residue = top limb
+	ld	r0, -8(ap)
+	addi	ap, ap, -24
+	b	L(6)
+
+	ALIGN(16)
+L(b10):	ld	r9, -8(ap)		C n mod 4 == 2: residue = top 2 limbs
+	ld	r0, -16(ap)
+	addi	ap, ap, -32
+
+	ALIGN(16)
+L(6):	addi	r10, n, 3
+	srdi	r7, r10, 2		C ceil(n/4) iteration count
+	mtctr	r7
+	bdz	L(end)
+
+	ALIGN(16)
+L(top):	ld	r31, -16(ap)		C fold 4 more limbs per iteration
+	ld	r10, -8(ap)
+	ld	r11, 8(ap)
+	ld	r12, 0(ap)
+	mulld	r29, r0, r30	C rl * B4modb
+	mulhdu	r0, r0, r30	C rl * B4modb
+	mulhdu	r27, r10, r26		C limb * B1modb
+	mulld	r10, r10, r26
+	mulhdu	r7, r9, r23	C rh * B5modb
+	mulld	r9, r9, r23	C rh * B5modb
+	mulhdu	r28, r11, r24		C limb * B3modb
+	mulld	r11, r11, r24
+	mulhdu	r4, r12, r25		C limb * B2modb
+	mulld	r12, r12, r25
+	addc	r8, r10, r31		C sum the partial products
+	addze	r10, r27
+	addi	ap, ap, -32
+	addc	r27, r8, r12
+	adde	r12, r10, r4
+	addc	r11, r27, r11
+	adde	r31, r12, r28
+	addc	r12, r11, r29
+	adde	r4, r31, r0
+	addc	r0, r9, r12		C new residue in (r9,r0)
+	adde	r9, r7, r4
+	bdnz	L(top)
+
+L(end):
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+`	lwz	r3, 8(cps)',
+`	lwz	r3, 12(cps)')		C r3 = cnt, normalization shift
+	mulld	r10, r9, r26		C fold rh: rh * B1modb + rl
+	mulhdu	r9, r9, r26
+	addc	r11, r0, r10
+	addze	r9, r9
+	ld	r10, 0(cps)		C r10 = inverse of normalized b
+	subfic	r8, r3, 64
+	sld	r9, r9, r3		C normalize the 2-limb residue
+	srd	r8, r11, r8
+	sld	r11, r11, r3
+	or	r9, r8, r9
+	mulld	r0, r9, r10		C preinv division of {r9,r11} by b:
+	mulhdu	r10, r9, r10
+	addi	r9, r9, 1
+	addc	r8, r0, r11
+	adde	r0, r10, r9		C candidate quotient
+	mulld	r0, r0, d
+	subf	r0, r0, r11		C candidate remainder
+	cmpld	cr7, r8, r0
+	bge	cr7, L(9)
+	add	r0, r0, d		C add b back
+L(9):	cmpld	cr7, r0, d
+	bge-	cr7, L(16)		C final subtraction needed?
+L(10):	srd	r3, r0, r3		C denormalize remainder
+	ld	r23, -72(r1)		C restore callee-saved registers
+	ld	r24, -64(r1)
+	ld	r25, -56(r1)
+	ld	r26, -48(r1)
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+
+L(16):	subf	r0, d, r0		C final subtraction of b
+	b	L(10)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_4p_cps,toc)
+	mflr	r0
+	std	r29, -24(r1)		C save callee-saved registers
+	std	r30, -16(r1)
+	mr	r29, r3			C r29 = cps
+	std	r0, 16(r1)
+	std	r31, -8(r1)
+	stdu	r1, -144(r1)
+	cntlzd	r31, r4			C r31 = cnt = leading zeros of b
+	sld	r30, r4, r31		C r30 = normalized b
+	mr	r3, r30
+	CALL(	mpn_invert_limb)	C r3 = inverse of normalized b
+	subfic	r9, r31, 64
+	li	r10, 1
+	sld	r10, r10, r31
+	srd	r9, r3, r9
+	neg	r0, r30
+	or	r10, r10, r9
+	mulld	r10, r10, r0		C r10 = B1modb
+	mulhdu	r11, r10, r3		C compute B2modb from B1modb
+	nor	r11, r11, r11
+	subf	r11, r10, r11
+	mulld	r11, r11, r30
+	mulld	r0, r10, r3
+	cmpld	cr7, r0, r11
+	bge	cr7, L(18)
+	add	r11, r11, r30		C adjustment
+L(18):	mulhdu	r9, r11, r3		C compute B3modb from B2modb
+	add	r9, r11, r9
+	nor	r9, r9, r9
+	mulld	r9, r9, r30
+	mulld	r0, r11, r3
+	cmpld	cr7, r0, r9
+	bge	cr7, L(19)
+	add	r9, r9, r30		C adjustment
+L(19):	mulhdu	r0, r9, r3		C compute B4modb from B3modb
+	add	r0, r9, r0
+	nor	r0, r0, r0
+	mulld	r0, r0, r30
+	mulld	r8, r9, r3
+	cmpld	cr7, r8, r0
+	bge	cr7, L(20)
+	add	r0, r0, r30		C adjustment
+L(20):	mulhdu	r8, r0, r3		C compute B5modb from B4modb
+	add	r8, r0, r8
+	nor	r8, r8, r8
+	mulld	r8, r8, r30
+	mulld	r7, r0, r3
+	cmpld	cr7, r7, r8
+	bge	cr7, L(21)
+	add	r8, r8, r30		C adjustment
+L(21):	srd	r0, r0, r31		C shift all Bkmodb values right by cnt
+	addi	r1, r1, 144		C pop frame
+	srd	r8, r8, r31
+	srd	r10, r10, r31
+	srd	r11, r11, r31
+	std	r0, 40(r29)		C cps[5] = B4modb >> cnt
+	std	r31, 8(r29)		C cps[1] = cnt
+	srd	r9, r9, r31
+	ld	r0, 16(r1)
+	ld	r30, -16(r1)
+	std	r8, 48(r29)		C cps[6] = B5modb >> cnt
+	std	r3, 0(r29)		C cps[0] = inverse
+	mtlr	r0
+	ld	r31, -8(r1)
+	std	r10, 16(r29)		C cps[2] = B1modb >> cnt
+	std	r11, 24(r29)		C cps[3] = B2modb >> cnt
+	std	r9, 32(r29)		C cps[4] = B3modb >> cnt
+	ld	r29, -24(r1)
+	blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm
new file mode 100644
index 0000000..c35e0e3
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm
@@ -0,0 +1,132 @@
+dnl PowerPC-64 mpn_mod_34lsub1 -- modulo 2^48-1.
+
+dnl Copyright 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 1.33
+C POWER4/PPC970 1.5
+C POWER5 1.32
+C POWER6 2.35
+C POWER7 1
+
+C INPUT PARAMETERS
+define(`up',`r3')
+define(`n',`r4')
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+	li r8, 0			C accumulator, weight 0
+	li r9, 0			C accumulator, weight 1
+	li r10, 0			C accumulator, weight 2
+	li r11, 0			C carry accumulator
+
+	cmpdi cr6, n, 3
+	blt cr6, L(lt3)			C fewer than 3 limbs: skip main loop
+
+	li r0, -0x5556			C 0xFFFFFFFFFFFFAAAA
+	rldimi r0, r0, 16, 32		C 0xFFFFFFFFAAAAAAAA
+	rldimi r0, r0, 32, 63		C 0xAAAAAAAAAAAAAAAB, inverse of 3
+	mulhdu r0, r0, n
+	srdi r0, r0, 1			C r0 = [n / 3]
+	mtctr r0
+
+	ld r5, 0(up)			C prefetch first limb triple
+	ld r6, 8(up)
+	ld r7, 16(up)
+	addi up, up, 24
+	bdz L(end)
+
+	ALIGN(16)
+L(top):	addc r8, r8, r5			C accumulate a limb triple with carry
+	nop
+	ld r5, 0(up)
+	adde r9, r9, r6
+	ld r6, 8(up)
+	adde r10, r10, r7
+	ld r7, 16(up)
+	addi up, up, 48			C advance over both unrolled triples
+	addze r11, r11			C fold carry-out into r11
+	bdz L(endx)
+	addc r8, r8, r5			C second unrolled triple
+	nop
+	ld r5, -24(up)
+	adde r9, r9, r6
+	ld r6, -16(up)
+	adde r10, r10, r7
+	ld r7, -8(up)
+	addze r11, r11
+	bdnz L(top)
+
+	addi up, up, 24			C fall-through: compensate loop's +48
+L(endx):
+	addi up, up, -24		C undo over-advance on mid-loop exit
+
+L(end):	addc r8, r8, r5			C fold the last loaded triple
+	adde r9, r9, r6
+	adde r10, r10, r7
+	addze r11, r11
+
+	sldi r5, r0, 1
+	add r5, r5, r0			C r5 = [n / 3] * 3
+	sub n, n, r5			C n = n mod 3
+L(lt3):	cmpdi cr6, n, 1
+	blt cr6, L(2)			C no leftover limbs
+
+	ld r5, 0(up)			C handle 1 or 2 leftover limbs
+	addc r8, r8, r5
+	li r6, 0
+	beq cr6, L(1)			C exactly one leftover limb
+
+	ld r6, 8(up)
+L(1):	adde r9, r9, r6
+	addze r10, r10
+	addze r11, r11
+
+L(2):	rldicl r0, r8, 0, 16		C r0 = r8 mod 2^48
+	srdi r3, r8, 48			C r3 = r8 div 2^48
+	rldic r4, r9, 16, 16		C r4 = (r9 mod 2^32) << 16
+	srdi r5, r9, 32			C r5 = r9 div 2^32
+	rldic r6, r10, 32, 16		C r6 = (r10 mod 2^16) << 32
+	srdi r7, r10, 16		C r7 = r10 div 2^16
+
+	add r0, r0, r3			C fold the pieces into 48-bit groups
+	add r4, r4, r5
+	add r6, r6, r7
+
+	add r0, r0, r4
+	add r6, r6, r11			C include accumulated carries
+
+	add r3, r0, r6			C return value in r3
+	blr
+EPILOGUE()
+
+C |__r10__|__r9___|__r8___|
+C |-----|-----|-----|-----|
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm
new file mode 100644
index 0000000..726339a
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm
@@ -0,0 +1,117 @@
+dnl PowerPC-64 mpn_modexact_1_odd -- mpn by limb exact remainder.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 13-19
+C POWER4/PPC970 16
+C POWER5 16
+C POWER6 ?
+C POWER7 12
+
+C TODO
+C * Check if n=1 code is really an improvement. It probably isn't.
+C * Make more similar to dive_1.asm.
+
+C INPUT PARAMETERS
+define(`up', `r3')
+define(`n', `r4')
+define(`d', `r5')
+define(`cy', `r6')
+
+
+ASM_START()
+
+EXTERN(binvert_limb_table)
+
+PROLOGUE(mpn_modexact_1c_odd,toc)
+	addic. n, n, -1			C set carry as side effect
+	ld r8, 0(up)
+	bne cr0, L(2)			C more than one limb
+	cmpld cr7, r6, r8		C n = 1: plain remainder suffices
+	bge cr7, L(4)
+	subf r8, r6, r8			C r8 = up[0] - cy
+	divdu r3, r8, d
+	mulld r3, r3, d
+	subf. r3, r3, r8		C r3 = (up[0] - cy) mod d
+	beqlr cr0
+	subf r3, r3, d			C return d - remainder
+	blr
+
+L(4):	subf r3, r8, r6			C cy >= up[0]: r3 = cy - up[0]
+	divdu r8, r3, d
+	mulld r8, r8, d
+	subf r3, r8, r3			C r3 = (cy - up[0]) mod d
+	blr
+
+L(2):	LEA(	r7, binvert_limb_table)	C 8-bit binary inverse lookup table
+	rldicl r9, d, 63, 57		C table index from bits 1..7 of odd d
+	mtctr n
+	lbzx r0, r7, r9			C r0 = inverse of d mod 2^8
+	mulld r7, r0, r0		C Newton step: lift inverse 8 -> 16 bits
+	sldi r0, r0, 1
+	mulld r7, d, r7
+	subf r0, r7, r0			C x = x*(2 - d*x)
+	mulld r9, r0, r0		C Newton step: 16 -> 32 bits
+	sldi r0, r0, 1
+	mulld r9, d, r9
+	subf r0, r9, r0
+	mulld r7, r0, r0		C Newton step: 32 -> 64 bits
+	sldi r0, r0, 1
+	mulld r7, d, r7
+	subf r9, r7, r0			C r9 = 1/d mod 2^64
+
+	ALIGN(16)
+L(loop):
+	subfe r0, r6, r8		C r0 = limb - hi - borrow
+	ld r8, 8(up)
+	addi up, up, 8
+	mulld r0, r9, r0		C q = r0 * (1/d) mod 2^64
+	mulhdu r6, r0, d		C hi = high limb of q * d
+	bdnz L(loop)
+
+	cmpld cr7, d, r8		C top limb: full step needed?
+	blt cr7, L(10)
+
+	subfe r0, r0, r0		C r0 = -borrow from last subfe
+	subf r6, r0, r6			C fold borrow into hi
+	cmpld cr7, r6, r8
+	subf r3, r8, r6			C r3 = hi - top limb
+	bgelr cr7
+	add r3, d, r3			C add back d on underflow
+	blr
+
+L(10):	subfe r0, r6, r8		C top limb >= d: one more exact step
+	mulld r0, r9, r0
+	mulhdu r3, r0, d		C remainder in r3
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm
new file mode 100644
index 0000000..27a8f8f
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm
@@ -0,0 +1,168 @@
+dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 7.25? not updated for last file revision
+C POWER5 7.25
+C POWER6 14
+C POWER7 2.9
+
+C TODO
+C * Try to reduce the number of needed live registers (at least r5 and r10
+C could be combined)
+C * Optimize feed-in code, for speed and size.
+C * Clean up r12/r7 usage in feed-in code.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`vl', `r6')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+	std r27, -40(r1)		C save callee-saved registers
+	std r26, -48(r1)
+	mr r12, r7			C cy_limb = caller-supplied carry
+	b L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+	std r27, -40(r1)		C save callee-saved registers
+	std r26, -48(r1)
+	li r12, 0			C cy_limb = 0
+L(ent):	ld r26, 0(up)
+
+	rldicl. r0, n, 0,62		C r0 = n & 3, set cr0
+	cmpdi cr6, r0, 2
+	addic n, n, 3			C compute count...
+	srdi n, n, 2			C ...for ctr
+	mtctr n				C copy count into ctr
+	beq cr0, L(b00)			C dispatch on n mod 4
+	blt cr6, L(b01)
+	beq cr6, L(b10)
+
+L(b11):	mr r7, r12			C n mod 4 = 3: one feed-in limb
+	mulld r0, r26, r6
+	mulhdu r12, r26, r6
+	addi up, up, 8
+	addc r0, r0, r7			C CA stays live into the main loop
+	std r0, 0(rp)
+	addi rp, rp, 8
+	b L(fic)
+
+L(b00):	ld r27, 8(up)			C n mod 4 = 0: two feed-in limbs
+	addi up, up, 16
+	mulld r0, r26, r6
+	mulhdu r5, r26, r6
+	mulld r7, r27, r6
+	mulhdu r8, r27, r6
+	addc r0, r0, r12
+	adde r7, r7, r5
+	addze r12, r8
+	std r0, 0(rp)
+	std r7, 8(rp)
+	addi rp, rp, 16
+	b L(fic)
+
+	nop				C alignment
+L(b01):	bdnz L(gt1)			C n mod 4 = 1
+	mulld r0, r26, r6		C n = 1: single product, early out
+	mulhdu r8, r26, r6
+	addc r0, r0, r12
+	std r0, 0(rp)
+	b L(ret)
+L(gt1):	ld r27, 8(up)			C n > 1: three feed-in limbs
+	nop
+	mulld r0, r26, r6
+	mulhdu r5, r26, r6
+	ld r26, 16(up)
+	mulld r7, r27, r6
+	mulhdu r8, r27, r6
+	mulld r9, r26, r6
+	mulhdu r10, r26, r6
+	addc r0, r0, r12
+	adde r7, r7, r5
+	adde r9, r9, r8
+	addze r12, r10
+	std r0, 0(rp)
+	std r7, 8(rp)
+	std r9, 16(rp)
+	addi up, up, 24
+	addi rp, rp, 24
+	b L(fic)
+
+	nop
+L(fic):	ld r26, 0(up)			C feed into common loop entry
+L(b10):	ld r27, 8(up)			C n mod 4 = 2: no feed-in needed
+	addi up, up, 16
+	bdz L(end)
+
+L(top):	mulld r0, r26, r6		C main loop: 4 limbs per iteration,
+	mulhdu r5, r26, r6		C CA chained across iterations
+	mulld r7, r27, r6
+	mulhdu r8, r27, r6
+	ld r26, 0(up)
+	ld r27, 8(up)
+	adde r0, r0, r12
+	adde r7, r7, r5
+	mulld r9, r26, r6
+	mulhdu r10, r26, r6
+	mulld r11, r27, r6
+	mulhdu r12, r27, r6
+	ld r26, 16(up)
+	ld r27, 24(up)
+	std r0, 0(rp)
+	adde r9, r9, r8
+	std r7, 8(rp)
+	adde r11, r11, r10
+	std r9, 16(rp)
+	addi up, up, 32
+	std r11, 24(rp)
+
+	addi rp, rp, 32
+	bdnz L(top)
+
+L(end):	mulld r0, r26, r6		C wind down: final two limbs
+	mulhdu r5, r26, r6
+	mulld r7, r27, r6
+	mulhdu r8, r27, r6
+	adde r0, r0, r12
+	adde r7, r7, r5
+	std r0, 0(rp)
+	std r7, 8(rp)
+L(ret):	addze r3, r8			C return the carry-out limb
+	ld r27, -40(r1)			C restore callee-saved registers
+	ld r26, -48(r1)
+	blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm
new file mode 100644
index 0000000..1873187
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm
@@ -0,0 +1,708 @@
+dnl PowerPC-64 mpn_mul_basecase.
+
+dnl Copyright 1999-2001, 2003-2006, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8
+C POWER5 8
+C POWER6 24
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+define(`vp', `r6')
+define(`vn', `r7')
+
+define(`v0', `r25')
+define(`outer_rp', `r22')
+define(`outer_up', `r23')
+
+ASM_START()
+PROLOGUE(mpn_mul_basecase)
+
+C Special code for un <= 2, for efficiency of these important cases,
+C and since it simplifies the default code.
+	cmpdi cr0, un, 2
+	bgt cr0, L(un_gt2)
+	cmpdi cr6, vn, 1
+	ld r7, 0(vp)
+	ld r5, 0(up)
+	mulld r8, r5, r7		C weight 0
+	mulhdu r9, r5, r7		C weight 1
+	std r8, 0(rp)
+	beq cr0, L(2x)			C un = 2
+	std r9, 8(rp)			C un = 1: store high limb, done
+	blr
+	ALIGN(16)
+L(2x):	ld r0, 8(up)
+	mulld r8, r0, r7		C weight 1
+	mulhdu r10, r0, r7		C weight 2
+	addc r9, r9, r8
+	addze r10, r10
+	bne cr6, L(2x2)			C vn = 2
+	std r9, 8(rp)			C un = 2, vn = 1: done
+	std r10, 16(rp)
+	blr
+	ALIGN(16)
+L(2x2):	ld r6, 8(vp)			C un = vn = 2
+	nop
+	mulld r8, r5, r6		C weight 1
+	mulhdu r11, r5, r6		C weight 2
+	addc r9, r9, r8
+	std r9, 8(rp)
+	adde r11, r11, r10
+	mulld r12, r0, r6		C weight 2
+	mulhdu r0, r0, r6		C weight 3
+	addze r0, r0
+	addc r11, r11, r12
+	addze r0, r0
+	std r11, 16(rp)
+	std r0, 24(rp)
+	blr
+
+L(un_gt2):
+	std r31, -8(r1)			C save callee-saved registers
+	std r30, -16(r1)
+	std r29, -24(r1)
+	std r28, -32(r1)
+	std r27, -40(r1)
+	std r26, -48(r1)
+	std r25, -56(r1)
+	std r24, -64(r1)
+	std r23, -72(r1)
+	std r22, -80(r1)
+
+	mr outer_rp, rp			C keep originals for the outer loop
+	mr outer_up, up
+
+	ld v0, 0(vp)			C new v limb
+	addi vp, vp, 8
+	ld r26, 0(up)
+
+	rldicl. r0, un, 0,62		C r0 = un & 3, set cr0
+	cmpdi cr6, r0, 2
+	addi un, un, 1			C compute count...
+	srdi un, un, 2			C ...for ctr
+	mtctr un			C copy inner loop count into ctr
+	beq cr0, L(b0)			C dispatch on un mod 4
+	blt cr6, L(b1)
+	beq cr6, L(b2)
+
+
+	ALIGN(16)
+L(b3):	mulld r0, r26, v0		C un mod 4 = 3: first mul_1 pass
+	mulhdu r12, r26, v0
+	addic r0, r0, 0			C clear CA for the adde chain below
+	std r0, 0(rp)
+	ld r26, 8(up)
+	ld r27, 16(up)
+	bdz L(end_m_3)
+
+	ALIGN(16)
+L(lo_m_3):				C mul_1 inner loop, 4 limbs/iteration
+	mulld r0, r26, v0
+	mulhdu r31, r26, v0
+	ld r26, 24(up)
+	nop
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	ld r27, 32(up)
+	nop
+	adde r0, r0, r12
+	adde r24, r24, r31
+	mulld r9, r26, v0
+	mulhdu r10, r26, v0
+	ld r26, 40(up)
+	nop
+	mulld r11, r27, v0
+	mulhdu r12, r27, v0
+	ld r27, 48(up)
+	std r0, 8(rp)
+	adde r9, r9, r8
+	std r24, 16(rp)
+	adde r11, r11, r10
+	std r9, 24(rp)
+	addi up, up, 32
+	std r11, 32(rp)
+	addi rp, rp, 32
+	bdnz L(lo_m_3)
+
+	ALIGN(16)
+L(end_m_3):				C mul_1 wind-down, last two limbs
+	mulld r0, r26, v0
+	mulhdu r31, r26, v0
+
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+
+	adde r0, r0, r12
+	adde r24, r24, r31
+
+	std r0, 8(rp)
+	std r24, 16(rp)
+	addze r8, r8
+	std r8, 24(rp)
+	addic. vn, vn, -1		C all v limbs consumed?
+	beq L(ret)
+
+	ALIGN(16)
+L(outer_lo_3):				C addmul_1 outer loop, one v limb/pass
+	mtctr un			C copy inner loop count into ctr
+	addi rp, outer_rp, 8
+	mr up, outer_up
+	addi outer_rp, outer_rp, 8
+	ld v0, 0(vp)			C new v limb
+	addi vp, vp, 8
+	ld r26, 0(up)
+	ld r28, 0(rp)
+	mulld r0, r26, v0
+	mulhdu r12, r26, v0
+	addc r0, r0, r28
+	std r0, 0(rp)
+	ld r26, 8(up)
+	ld r27, 16(up)
+	bdz L(end_3)
+
+	ALIGN(16)			C registers dying
+L(lo_3):				C addmul_1 inner loop
+	mulld r0, r26, v0		C
+	mulhdu r10, r26, v0		C 26
+	ld r26, 24(up)			C
+	ld r28, 8(rp)			C
+	mulld r24, r27, v0		C
+	mulhdu r8, r27, v0		C 27
+	ld r27, 32(up)			C
+	ld r29, 16(rp)			C
+	adde r0, r0, r12		C 0 12
+	adde r24, r24, r10		C 24 10
+	mulld r9, r26, v0		C
+	mulhdu r10, r26, v0		C 26
+	ld r26, 40(up)			C
+	ld r30, 24(rp)			C
+	mulld r11, r27, v0		C
+	mulhdu r12, r27, v0		C 27
+	ld r27, 48(up)			C
+	ld r31, 32(rp)			C
+	adde r9, r9, r8			C 8 9
+	adde r11, r11, r10		C 10 11
+	addze r12, r12			C 12
+	addc r0, r0, r28		C 0 28
+	std r0, 8(rp)			C 0
+	adde r24, r24, r29		C 7 29
+	std r24, 16(rp)			C 7
+	adde r9, r9, r30		C 9 30
+	std r9, 24(rp)			C 9
+	adde r11, r11, r31		C 11 31
+	std r11, 32(rp)			C 11
+	addi up, up, 32			C
+	addi rp, rp, 32			C
+	bdnz L(lo_3)			C
+
+	ALIGN(16)
+L(end_3):				C addmul_1 wind-down
+	mulld r0, r26, v0
+	mulhdu r10, r26, v0
+	ld r28, 8(rp)
+	nop
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	ld r29, 16(rp)
+	nop
+	adde r0, r0, r12
+	adde r24, r24, r10
+	addze r8, r8
+	addc r0, r0, r28
+	std r0, 8(rp)
+	adde r24, r24, r29
+	std r24, 16(rp)
+	addze r8, r8
+	std r8, 24(rp)
+
+	addic. vn, vn, -1		C next v limb, if any
+	bne L(outer_lo_3)
+	b L(ret)
+
+
+	ALIGN(16)
+L(b0):	ld r27, 8(up)			C un mod 4 = 0: first mul_1 pass
+	addi up, up, 8
+	mulld r0, r26, v0
+	mulhdu r10, r26, v0
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	addc r24, r24, r10
+	addze r12, r8
+	std r0, 0(rp)
+	std r24, 8(rp)
+	addi rp, rp, 8
+	ld r26, 8(up)
+	ld r27, 16(up)
+	bdz L(end_m_0)
+
+	ALIGN(16)
+L(lo_m_0):				C mul_1 inner loop, 4 limbs/iteration
+	mulld r0, r26, v0
+	mulhdu r31, r26, v0
+	ld r26, 24(up)
+	nop
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	ld r27, 32(up)
+	nop
+	adde r0, r0, r12
+	adde r24, r24, r31
+	mulld r9, r26, v0
+	mulhdu r10, r26, v0
+	ld r26, 40(up)
+	nop
+	mulld r11, r27, v0
+	mulhdu r12, r27, v0
+	ld r27, 48(up)
+	std r0, 8(rp)
+	adde r9, r9, r8
+	std r24, 16(rp)
+	adde r11, r11, r10
+	std r9, 24(rp)
+	addi up, up, 32
+	std r11, 32(rp)
+	addi rp, rp, 32
+	bdnz L(lo_m_0)
+
+	ALIGN(16)
+L(end_m_0):				C mul_1 wind-down
+	mulld r0, r26, v0
+	mulhdu r31, r26, v0
+
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+
+	adde r0, r0, r12
+	adde r24, r24, r31
+
+	std r0, 8(rp)
+	addze r8, r8
+	std r24, 16(rp)
+	addic. vn, vn, -1		C all v limbs consumed?
+	std r8, 24(rp)
+	nop
+	beq L(ret)
+
+	ALIGN(16)
+L(outer_lo_0):				C addmul_1 outer loop, one v limb/pass
+	mtctr un			C copy inner loop count into ctr
+	addi rp, outer_rp, 16
+	addi up, outer_up, 8
+	addi outer_rp, outer_rp, 8
+	ld v0, 0(vp)			C new v limb
+	addi vp, vp, 8
+	ld r26, -8(up)
+	ld r27, 0(up)
+	ld r28, -8(rp)
+	ld r29, 0(rp)
+	nop
+	nop
+	mulld r0, r26, v0
+	mulhdu r10, r26, v0
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	addc r24, r24, r10
+	addze r12, r8
+	addc r0, r0, r28
+	std r0, -8(rp)
+	adde r24, r24, r29
+	std r24, 0(rp)
+	ld r26, 8(up)
+	ld r27, 16(up)
+	bdz L(end_0)
+
+	ALIGN(16)			C registers dying
+L(lo_0):				C addmul_1 inner loop
+	mulld r0, r26, v0		C
+	mulhdu r10, r26, v0		C 26
+	ld r26, 24(up)			C
+	ld r28, 8(rp)			C
+	mulld r24, r27, v0		C
+	mulhdu r8, r27, v0		C 27
+	ld r27, 32(up)			C
+	ld r29, 16(rp)			C
+	adde r0, r0, r12		C 0 12
+	adde r24, r24, r10		C 24 10
+	mulld r9, r26, v0		C
+	mulhdu r10, r26, v0		C 26
+	ld r26, 40(up)			C
+	ld r30, 24(rp)			C
+	mulld r11, r27, v0		C
+	mulhdu r12, r27, v0		C 27
+	ld r27, 48(up)			C
+	ld r31, 32(rp)			C
+	adde r9, r9, r8			C 8 9
+	adde r11, r11, r10		C 10 11
+	addze r12, r12			C 12
+	addc r0, r0, r28		C 0 28
+	std r0, 8(rp)			C 0
+	adde r24, r24, r29		C 7 29
+	std r24, 16(rp)			C 7
+	adde r9, r9, r30		C 9 30
+	std r9, 24(rp)			C 9
+	adde r11, r11, r31		C 11 31
+	std r11, 32(rp)			C 11
+	addi up, up, 32			C
+	addi rp, rp, 32			C
+	bdnz L(lo_0)			C
+
+	ALIGN(16)
+L(end_0):				C addmul_1 wind-down
+	mulld r0, r26, v0
+	mulhdu r10, r26, v0
+	ld r28, 8(rp)
+	nop
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	ld r29, 16(rp)
+	nop
+	adde r0, r0, r12
+	adde r24, r24, r10
+	addze r8, r8
+	addic. vn, vn, -1		C next v limb, if any
+	addc r0, r0, r28
+	std r0, 8(rp)
+	adde r24, r24, r29
+	std r24, 16(rp)
+	addze r8, r8
+	std r8, 24(rp)
+	bne L(outer_lo_0)
+	b L(ret)
+
+
+	ALIGN(16)
+L(b1):	ld r27, 8(up)			C un mod 4 = 1: first mul_1 pass
+	nop
+	mulld r0, r26, v0
+	mulhdu r31, r26, v0
+	ld r26, 16(up)
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	mulld r9, r26, v0
+	mulhdu r10, r26, v0
+	addc r24, r24, r31
+	adde r9, r9, r8
+	addze r12, r10
+	std r0, 0(rp)
+	std r24, 8(rp)
+	std r9, 16(rp)
+	addi up, up, 16
+	addi rp, rp, 16
+	ld r26, 8(up)
+	ld r27, 16(up)
+	bdz L(end_m_1)
+
+	ALIGN(16)
+L(lo_m_1):				C mul_1 inner loop, 4 limbs/iteration
+	mulld r0, r26, v0
+	mulhdu r31, r26, v0
+	ld r26, 24(up)
+	nop
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	ld r27, 32(up)
+	nop
+	adde r0, r0, r12
+	adde r24, r24, r31
+	mulld r9, r26, v0
+	mulhdu r10, r26, v0
+	ld r26, 40(up)
+	nop
+	mulld r11, r27, v0
+	mulhdu r12, r27, v0
+	ld r27, 48(up)
+	std r0, 8(rp)
+	adde r9, r9, r8
+	std r24, 16(rp)
+	adde r11, r11, r10
+	std r9, 24(rp)
+	addi up, up, 32
+	std r11, 32(rp)
+	addi rp, rp, 32
+	bdnz L(lo_m_1)
+
+	ALIGN(16)
+L(end_m_1):				C mul_1 wind-down
+	mulld r0, r26, v0
+	mulhdu r31, r26, v0
+
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+
+	adde r0, r0, r12
+	adde r24, r24, r31
+
+	std r0, 8(rp)
+	addze r8, r8
+	std r24, 16(rp)
+	addic. vn, vn, -1		C all v limbs consumed?
+	std r8, 24(rp)
+	nop
+	beq L(ret)
+
+	ALIGN(16)
+L(outer_lo_1):				C addmul_1 outer loop, one v limb/pass
+	mtctr un			C copy inner loop count into ctr
+	addi rp, outer_rp, 24
+	addi up, outer_up, 16
+	addi outer_rp, outer_rp, 8
+	ld v0, 0(vp)			C new v limb
+	addi vp, vp, 8
+	ld r26, -16(up)
+	ld r27, -8(up)
+	mulld r0, r26, v0
+	mulhdu r31, r26, v0
+	ld r26, 0(up)
+	ld r28, -16(rp)
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	ld r29, -8(rp)
+	ld r30, 0(rp)
+	mulld r9, r26, v0
+	mulhdu r10, r26, v0
+	addc r24, r24, r31
+	adde r9, r9, r8
+	addze r12, r10
+	addc r0, r0, r28
+	std r0, -16(rp)
+	adde r24, r24, r29
+	std r24, -8(rp)
+	adde r9, r9, r30
+	std r9, 0(rp)
+	ld r26, 8(up)
+	ld r27, 16(up)
+	bdz L(end_1)
+
+	ALIGN(16)			C registers dying
+L(lo_1):				C addmul_1 inner loop
+	mulld r0, r26, v0		C
+	mulhdu r10, r26, v0		C 26
+	ld r26, 24(up)			C
+	ld r28, 8(rp)			C
+	mulld r24, r27, v0		C
+	mulhdu r8, r27, v0		C 27
+	ld r27, 32(up)			C
+	ld r29, 16(rp)			C
+	adde r0, r0, r12		C 0 12
+	adde r24, r24, r10		C 24 10
+	mulld r9, r26, v0		C
+	mulhdu r10, r26, v0		C 26
+	ld r26, 40(up)			C
+	ld r30, 24(rp)			C
+	mulld r11, r27, v0		C
+	mulhdu r12, r27, v0		C 27
+	ld r27, 48(up)			C
+	ld r31, 32(rp)			C
+	adde r9, r9, r8			C 8 9
+	adde r11, r11, r10		C 10 11
+	addze r12, r12			C 12
+	addc r0, r0, r28		C 0 28
+	std r0, 8(rp)			C 0
+	adde r24, r24, r29		C 7 29
+	std r24, 16(rp)			C 7
+	adde r9, r9, r30		C 9 30
+	std r9, 24(rp)			C 9
+	adde r11, r11, r31		C 11 31
+	std r11, 32(rp)			C 11
+	addi up, up, 32			C
+	addi rp, rp, 32			C
+	bdnz L(lo_1)			C
+
+	ALIGN(16)
+L(end_1):				C addmul_1 wind-down
+	mulld r0, r26, v0
+	mulhdu r10, r26, v0
+	ld r28, 8(rp)
+	nop
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	ld r29, 16(rp)
+	nop
+	adde r0, r0, r12
+	adde r24, r24, r10
+	addze r8, r8
+	addic. vn, vn, -1		C next v limb, if any
+	addc r0, r0, r28
+	std r0, 8(rp)
+	adde r24, r24, r29
+	std r24, 16(rp)
+	addze r8, r8
+	std r8, 24(rp)
+	bne L(outer_lo_1)
+	b L(ret)
+
+
+	ALIGN(16)
+L(b2):	ld r27, 8(up)			C un mod 4 = 2: bias pointers, enter loop
+	addi up, up, -8
+	addi rp, rp, -8
+	li r12, 0
+	addic r12, r12, 0		C clear CA, cy limb = 0
+
+	ALIGN(16)
+L(lo_m_2):				C mul_1 inner loop, 4 limbs/iteration
+	mulld r0, r26, v0
+	mulhdu r31, r26, v0
+	ld r26, 24(up)
+	nop
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	ld r27, 32(up)
+	nop
+	adde r0, r0, r12
+	adde r24, r24, r31
+	mulld r9, r26, v0
+	mulhdu r10, r26, v0
+	ld r26, 40(up)
+	nop
+	mulld r11, r27, v0
+	mulhdu r12, r27, v0
+	ld r27, 48(up)
+	std r0, 8(rp)
+	adde r9, r9, r8
+	std r24, 16(rp)
+	adde r11, r11, r10
+	std r9, 24(rp)
+	addi up, up, 32
+	std r11, 32(rp)
+
+	addi rp, rp, 32
+	bdnz L(lo_m_2)
+
+	ALIGN(16)
+L(end_m_2):				C mul_1 wind-down
+	mulld r0, r26, v0
+	mulhdu r31, r26, v0
+
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+
+	adde r0, r0, r12
+	adde r24, r24, r31
+
+	std r0, 8(rp)
+	addze r8, r8
+	std r24, 16(rp)
+	addic. vn, vn, -1		C all v limbs consumed?
+	std r8, 24(rp)
+	nop
+	beq L(ret)
+
+	ALIGN(16)
+L(outer_lo_2):				C addmul_1 outer loop, one v limb/pass
+	mtctr un			C copy inner loop count into ctr
+	addi rp, outer_rp, 0
+	addi up, outer_up, -8
+	addi outer_rp, outer_rp, 8
+	ld v0, 0(vp)			C new v limb
+	addi vp, vp, 8
+	ld r26, 8(up)
+	ld r27, 16(up)
+	li r12, 0
+	addic r12, r12, 0		C clear CA, cy limb = 0
+
+	ALIGN(16)			C registers dying
+L(lo_2):				C addmul_1 inner loop
+	mulld r0, r26, v0		C
+	mulhdu r10, r26, v0		C 26
+	ld r26, 24(up)			C
+	ld r28, 8(rp)			C
+	mulld r24, r27, v0		C
+	mulhdu r8, r27, v0		C 27
+	ld r27, 32(up)			C
+	ld r29, 16(rp)			C
+	adde r0, r0, r12		C 0 12
+	adde r24, r24, r10		C 24 10
+	mulld r9, r26, v0		C
+	mulhdu r10, r26, v0		C 26
+	ld r26, 40(up)			C
+	ld r30, 24(rp)			C
+	mulld r11, r27, v0		C
+	mulhdu r12, r27, v0		C 27
+	ld r27, 48(up)			C
+	ld r31, 32(rp)			C
+	adde r9, r9, r8			C 8 9
+	adde r11, r11, r10		C 10 11
+	addze r12, r12			C 12
+	addc r0, r0, r28		C 0 28
+	std r0, 8(rp)			C 0
+	adde r24, r24, r29		C 7 29
+	std r24, 16(rp)			C 7
+	adde r9, r9, r30		C 9 30
+	std r9, 24(rp)			C 9
+	adde r11, r11, r31		C 11 31
+	std r11, 32(rp)			C 11
+	addi up, up, 32			C
+	addi rp, rp, 32			C
+	bdnz L(lo_2)			C
+
+	ALIGN(16)
+L(end_2):				C addmul_1 wind-down
+	mulld r0, r26, v0
+	mulhdu r10, r26, v0
+	ld r28, 8(rp)
+	nop
+	mulld r24, r27, v0
+	mulhdu r8, r27, v0
+	ld r29, 16(rp)
+	nop
+	adde r0, r0, r12
+	adde r24, r24, r10
+	addze r8, r8
+	addic. vn, vn, -1		C next v limb, if any
+	addc r0, r0, r28
+	std r0, 8(rp)
+	adde r24, r24, r29
+	std r24, 16(rp)
+	addze r8, r8
+	std r8, 24(rp)
+	bne L(outer_lo_2)
+	b L(ret)
+
+
+L(ret):	ld r31, -8(r1)			C restore callee-saved registers
+	ld r30, -16(r1)
+	ld r29, -24(r1)
+	ld r28, -32(r1)
+	ld r27, -40(r1)
+	ld r26, -48(r1)
+	ld r25, -56(r1)
+	ld r24, -64(r1)
+	ld r23, -72(r1)
+	ld r22, -80(r1)
+	blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h
new file mode 100644
index 0000000..61a437b
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h
@@ -0,0 +1,179 @@
+/* POWER3/PowerPC630 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+
+#define MUL_TOOM22_THRESHOLD 10
+#define MUL_TOOM33_THRESHOLD 33
+#define MUL_TOOM44_THRESHOLD 46
+#define MUL_TOOM6H_THRESHOLD 77
+#define MUL_TOOM8H_THRESHOLD 139
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 47
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 49
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 49
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 34
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 14
+#define SQR_TOOM3_THRESHOLD 45
+#define SQR_TOOM4_THRESHOLD 64
+#define SQR_TOOM6_THRESHOLD 85
+#define SQR_TOOM8_THRESHOLD 139
+
+#define MULMID_TOOM42_THRESHOLD 22
+
+#define MULMOD_BNM1_THRESHOLD 8
+#define SQRMOD_BNM1_THRESHOLD 10
+
+#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 220, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \
+ { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \
+ { 7, 7}, { 15, 8}, { 13, 9}, { 7, 8}, \
+ { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \
+ { 15, 8}, { 33, 9}, { 23,10}, { 15, 9}, \
+ { 35, 8}, { 71,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \
+ { 79,10}, { 55,11}, { 31,10}, { 63, 9}, \
+ { 127,10}, { 71, 9}, { 143, 8}, { 287,10}, \
+ { 79,11}, { 47,10}, { 95, 9}, { 191,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511,10}, { 143, 9}, { 287,11}, { 79,10}, \
+ { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \
+ { 351,11}, { 95,10}, { 191, 9}, { 383,11}, \
+ { 111,10}, { 223,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
+ { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \
+ { 175,10}, { 351,12}, { 95,11}, { 191,10}, \
+ { 383, 9}, { 767,11}, { 223,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 287,10}, \
+ { 575, 9}, { 1151,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 351,12}, { 191,11}, { 383,10}, \
+ { 767,12}, { 223,11}, { 447,10}, { 895,13}, \
+ { 127,12}, { 255,11}, { 511,12}, { 287,11}, \
+ { 575,10}, { 1151,12}, { 319,11}, { 639,12}, \
+ { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \
+ { 447,11}, { 895,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 120
+#define MUL_FFT_THRESHOLD 2688
+
+#define SQR_FFT_MODF_THRESHOLD 188 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 188, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \
+ { 13, 7}, { 13, 8}, { 7, 7}, { 16, 8}, \
+ { 9, 7}, { 19, 8}, { 13, 9}, { 7, 8}, \
+ { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \
+ { 15, 8}, { 31, 9}, { 19, 8}, { 39, 9}, \
+ { 23,10}, { 15, 9}, { 39,10}, { 23,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79, 8}, { 159,10}, { 47, 9}, { 95, 8}, \
+ { 191,11}, { 31,10}, { 63, 9}, { 127, 8}, \
+ { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \
+ { 79, 9}, { 159,11}, { 47,10}, { 95, 9}, \
+ { 191,12}, { 31,11}, { 63,10}, { 127, 9}, \
+ { 255, 8}, { 511,10}, { 143, 9}, { 287,11}, \
+ { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \
+ { 175,11}, { 95,10}, { 191, 9}, { 383,11}, \
+ { 111,10}, { 223,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
+ { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \
+ { 175,12}, { 95,11}, { 191,10}, { 383, 9}, \
+ { 767,11}, { 223,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 287,10}, { 575,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 767,12}, { 223,11}, \
+ { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
+ { 511,12}, { 287,11}, { 575,10}, { 1151,12}, \
+ { 319,11}, { 639,12}, { 351,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 447,11}, { 895,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 118
+#define SQR_FFT_THRESHOLD 1728
+
+#define MULLO_BASECASE_THRESHOLD 2
+#define MULLO_DC_THRESHOLD 27
+#define MULLO_MUL_N_THRESHOLD 2511
+
+#define DC_DIV_QR_THRESHOLD 23
+#define DC_DIVAPPR_Q_THRESHOLD 87
+#define DC_BDIV_QR_THRESHOLD 27
+#define DC_BDIV_Q_THRESHOLD 60
+
+#define INV_MULMOD_BNM1_THRESHOLD 27
+#define INV_NEWTON_THRESHOLD 91
+#define INV_APPR_THRESHOLD 91
+
+#define BINV_NEWTON_THRESHOLD 115
+#define REDC_1_TO_REDC_N_THRESHOLD 31
+
+#define MU_DIV_QR_THRESHOLD 551
+#define MU_DIVAPPR_Q_THRESHOLD 551
+#define MUPI_DIV_QR_THRESHOLD 42
+#define MU_BDIV_QR_THRESHOLD 483
+#define MU_BDIV_Q_THRESHOLD 492
+
+#define POWM_SEC_TABLE 2,23,140,556,713,746
+
+#define MATRIX22_STRASSEN_THRESHOLD 8
+#define HGCD_THRESHOLD 56
+#define HGCD_APPR_THRESHOLD 51
+#define HGCD_REDUCE_THRESHOLD 688
+#define GCD_DC_THRESHOLD 333
+#define GCDEXT_DC_THRESHOLD 126
+#define JACOBI_BASE_METHOD 1
+
+#define GET_STR_DC_THRESHOLD 17
+#define GET_STR_PRECOMPUTE_THRESHOLD 28
+#define SET_STR_DC_THRESHOLD 375
+#define SET_STR_PRECOMPUTE_THRESHOLD 812
+
+#define FAC_DSC_THRESHOLD 351
+#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h
new file mode 100644
index 0000000..3c40fb9
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* POWER4/PowerPC970 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008-2010, 2014, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1800 MHz PPC970 */
+/* FFT tuning limit = 15 M */
+/* Generated by tuneup.c, 2015-10-09, gcc 4.0 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 35
+
+#define DIV_1_VS_MUL_1_PERCENT 218
+
+#define MUL_TOOM22_THRESHOLD 14
+#define MUL_TOOM33_THRESHOLD 53
+#define MUL_TOOM44_THRESHOLD 136
+#define MUL_TOOM6H_THRESHOLD 197
+#define MUL_TOOM8H_THRESHOLD 272
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 90
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 76
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 22
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 202
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 430
+
+#define MULMID_TOOM42_THRESHOLD 34
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 13
+
+#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 444, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 10, 5}, { 21, 6}, { 13, 5}, { 28, 6}, \
+ { 19, 7}, { 10, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 49, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 55,11}, \
+ { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \
+ { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 63, 9}, { 127,10}, { 87,11}, \
+ { 47,10}, { 103,12}, { 31,11}, { 63,10}, \
+ { 135, 9}, { 271,11}, { 79,10}, { 159, 9}, \
+ { 319,10}, { 167,11}, { 95, 9}, { 383, 8}, \
+ { 767,10}, { 199,11}, { 111,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \
+ { 303, 9}, { 607,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 335, 9}, { 671,10}, { 351,12}, \
+ { 95,10}, { 383, 9}, { 767,10}, { 415, 9}, \
+ { 831,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,10}, { 607,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671,11}, \
+ { 351,10}, { 703,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,12}, { 223,10}, { 895,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 895,14}, { 127,13}, { 255,12}, { 607,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 895,14}, \
+ { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
+ { 1151,13}, { 703,14}, { 383,13}, { 895,15}, \
+ { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \
+ { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1407,14}, { 767,13}, \
+ { 1663,14}, { 895,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 159
+#define MUL_FFT_THRESHOLD 9088
+
+#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 344, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
+ { 9, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 13, 5}, { 28, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 14, 6}, \
+ { 29, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511,10}, { 135, 9}, { 271, 8}, { 543,11}, \
+ { 79, 9}, { 319, 8}, { 639,11}, { 95,10}, \
+ { 191, 9}, { 383, 8}, { 767,10}, { 207, 9}, \
+ { 415,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \
+ { 575,10}, { 303, 9}, { 607,10}, { 319, 9}, \
+ { 639,10}, { 335,11}, { 175,10}, { 351, 9}, \
+ { 703,11}, { 191,10}, { 383, 9}, { 767,11}, \
+ { 207,10}, { 415, 9}, { 831,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 303,10}, \
+ { 607,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,10}, { 703,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,12}, \
+ { 223,10}, { 895,11}, { 479,13}, { 127,12}, \
+ { 255,11}, { 543,12}, { 287,11}, { 607,12}, \
+ { 319,11}, { 671,12}, { 351,11}, { 703,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,10}, { 1663,11}, { 895,12}, { 479,14}, \
+ { 127,13}, { 255,12}, { 607,13}, { 319,12}, \
+ { 703,13}, { 383,12}, { 831,11}, { 1663,12}, \
+ { 927,14}, { 255,13}, { 511,12}, { 1023,13}, \
+ { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \
+ { 703,14}, { 383,13}, { 895,15}, { 255,14}, \
+ { 511,13}, { 1023,12}, { 2175,13}, { 1151,12}, \
+ { 2303,13}, { 1215,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1407,14}, { 767,13}, { 1663,14}, \
+ { 895,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 174
+#define SQR_FFT_THRESHOLD 6272
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 43
+#define MULLO_MUL_N_THRESHOLD 18087
+#define SQRLO_BASECASE_THRESHOLD 2
+#define SQRLO_DC_THRESHOLD 79
+#define SQRLO_SQR_THRESHOLD 12322
+
+#define DC_DIV_QR_THRESHOLD 42
+#define DC_DIVAPPR_Q_THRESHOLD 159
+#define DC_BDIV_QR_THRESHOLD 46
+#define DC_BDIV_Q_THRESHOLD 110
+
+#define INV_MULMOD_BNM1_THRESHOLD 26
+#define INV_NEWTON_THRESHOLD 177
+#define INV_APPR_THRESHOLD 165
+
+#define BINV_NEWTON_THRESHOLD 198
+#define REDC_1_TO_REDC_N_THRESHOLD 56
+
+#define MU_DIV_QR_THRESHOLD 1017
+#define MU_DIVAPPR_Q_THRESHOLD 1142
+#define MUPI_DIV_QR_THRESHOLD 90
+#define MU_BDIV_QR_THRESHOLD 924
+#define MU_BDIV_Q_THRESHOLD 1017
+
+#define POWM_SEC_TABLE 7,17,86,579,1925
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 23
+#define SET_STR_DC_THRESHOLD 788
+#define SET_STR_PRECOMPUTE_THRESHOLD 1713
+
+#define FAC_DSC_THRESHOLD 512
+#define FAC_ODD_THRESHOLD 25
+
+#define MATRIX22_STRASSEN_THRESHOLD 10
+#define HGCD_THRESHOLD 113
+#define HGCD_APPR_THRESHOLD 115
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 330
+#define GCDEXT_DC_THRESHOLD 242
+#define JACOBI_BASE_METHOD 4
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h
new file mode 100644
index 0000000..15b009c
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h
@@ -0,0 +1,219 @@
+/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* POWER5 (friggms.hpc.ntnu.no) */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 40
+
+#define MUL_TOOM22_THRESHOLD 21
+#define MUL_TOOM33_THRESHOLD 24
+#define MUL_TOOM44_THRESHOLD 70
+#define MUL_TOOM6H_THRESHOLD 262
+#define MUL_TOOM8H_THRESHOLD 393
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 126
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 70
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 81
+#define SQR_TOOM4_THRESHOLD 142
+#define SQR_TOOM6_THRESHOLD 189
+#define SQR_TOOM8_THRESHOLD 284
+
+#define MULMID_TOOM42_THRESHOLD 36
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 15
+
+#define MUL_FFT_MODF_THRESHOLD 304 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 348, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \
+ { 135,11}, { 79,10}, { 159, 9}, { 319,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 319,12}, \
+ { 95,11}, { 191,10}, { 383,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575, 9}, { 1151,11}, \
+ { 319,10}, { 639,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \
+ { 287,11}, { 575,10}, { 1151,12}, { 319,11}, \
+ { 639,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,11}, { 895,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \
+ { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
+ { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \
+ { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,10}, { 4863,13}, { 639,12}, { 1343,13}, \
+ { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1663,13}, { 959,12}, \
+ { 1919,11}, { 3839,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,12}, { 2431,11}, \
+ { 4863,14}, { 639,13}, { 1343,12}, { 2687,13}, \
+ { 1407,12}, { 2815,13}, { 1471,12}, { 2943,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1663,14}, \
+ { 895,13}, { 1919,12}, { 3839,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2943,15}, { 767,14}, { 1535,13}, { 3199,14}, \
+ { 1663,13}, { 3327,14}, { 1919,13}, { 3839,16}, \
+ { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,12}, { 11775,15}, { 1535,14}, \
+ { 3327,15}, { 1791,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 208
+#define MUL_FFT_THRESHOLD 4224
+
+#define SQR_FFT_MODF_THRESHOLD 284 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 272, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
+ { 19, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 63,10}, { 47,11}, \
+ { 31,10}, { 71, 9}, { 143,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \
+ { 143,11}, { 79,10}, { 159, 9}, { 319,10}, \
+ { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \
+ { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319,11}, { 175,10}, { 351,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,10}, { 959,12}, { 255,11}, { 511,10}, \
+ { 1023,11}, { 543,12}, { 287,11}, { 575,12}, \
+ { 319,11}, { 639,12}, { 351,11}, { 703,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 447,11}, { 895,12}, { 479,11}, \
+ { 959,13}, { 255,12}, { 511,11}, { 1023,12}, \
+ { 543,11}, { 1087,12}, { 575,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 831,12}, { 1663,13}, \
+ { 959,12}, { 1919,15}, { 255,14}, { 511,13}, \
+ { 1023,12}, { 2047,13}, { 1087,12}, { 2175,13}, \
+ { 1215,14}, { 639,13}, { 1407,12}, { 2815,14}, \
+ { 767,13}, { 1663,14}, { 895,13}, { 1919,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,12}, { 4863,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1663,13}, { 3327,14}, { 1919,13}, \
+ { 3839,16}, { 511,15}, { 1023,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \
+ { 11775,15}, { 1535,14}, { 3327,15}, { 1791,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 190
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 6
+#define MULLO_DC_THRESHOLD 60
+#define MULLO_MUL_N_THRESHOLD 7463
+
+#define DC_DIV_QR_THRESHOLD 58
+#define DC_DIVAPPR_Q_THRESHOLD 232
+#define DC_BDIV_QR_THRESHOLD 78
+#define DC_BDIV_Q_THRESHOLD 238
+
+#define INV_MULMOD_BNM1_THRESHOLD 92
+#define INV_NEWTON_THRESHOLD 155
+#define INV_APPR_THRESHOLD 157
+
+#define BINV_NEWTON_THRESHOLD 155
+#define REDC_1_TO_REDC_N_THRESHOLD 61
+
+#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 79
+#define MU_BDIV_QR_THRESHOLD 823
+#define MU_BDIV_Q_THRESHOLD 942
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD_THRESHOLD 74
+#define HGCD_APPR_THRESHOLD 155
+#define HGCD_REDUCE_THRESHOLD 2479
+#define GCD_DC_THRESHOLD 351
+#define GCDEXT_DC_THRESHOLD 288
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 650
+#define SET_STR_PRECOMPUTE_THRESHOLD 1585
+
+#define FAC_DSC_THRESHOLD 662
+#define FAC_ODD_THRESHOLD 28
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm
new file mode 100644
index 0000000..c572b91
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm
@@ -0,0 +1,185 @@
+dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 ? ?
+C POWER4/PPC970 ? ?
+C POWER5 ? ?
+C POWER6 12.25 12.8
+C POWER7 ? ?
+
+C TODO
+C * Reduce register usage.
+C * Schedule function entry code.
+C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C would bring us to 9 c/l.
+C * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ifdef(`OPERATION_addmul_1',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_addmul_1)
+ define(func_nc, mpn_addmul_1c) C FIXME: not really supported
+ define(AM, `$1')
+ define(SM, `')
+ define(CLRRSC, `addic $1, r0, 0')
+')
+ifdef(`OPERATION_submul_1',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_submul_1)
+ define(func_nc, mpn_submul_1c) C FIXME: not really supported
+ define(AM, `')
+ define(SM, `$1')
+ define(CLRRSC, `subfc $1, r0, r0')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 3 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C copy loop count into ctr
+ beq cr0, L(b0)
+ blt cr6, L(b1)
+ beq cr6, L(b2)
+
+L(b3): ld r8, 0(up)
+ ld r7, 8(up)
+ ld r27, 16(up)
+ addi up, up, 16
+ addi rp, rp, 16
+ mulld r5, r8, v0
+ mulhdu r8, r8, v0
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r29, -16(rp)
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ addc r9, r9, r8
+ adde r11, r11, r7
+ addze r12, r27
+ ADDSUB r5, r5, r29
+ b L(l3)
+
+L(b2): ld r7, 0(up)
+ ld r27, 8(up)
+ addi up, up, 8
+ addi rp, rp, 8
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ addc r11, r11, r7
+ addze r12, r27
+ ADDSUB r9, r9, r30
+ b L(l2)
+
+L(b1): ld r27, 0(up)
+ ld r31, 0(rp)
+ mulld r11, r27, v0
+ mulhdu r12, r27, v0
+ ADDSUB r11, r11, r31
+ b L(l1)
+
+L(b0): addi up, up, -8
+ addi rp, rp, -8
+ CLRRSC( r12) C clear r12 and clr/set cy
+
+ ALIGN(32)
+L(top):
+SM(` subfe r11, r0, r0') C complement...
+SM(` addic r11, r11, 1') C ...carry flag
+ ld r10, 8(up)
+ ld r8, 16(up)
+ ld r7, 24(up)
+ ld r27, 32(up)
+ addi up, up, 32
+ addi rp, rp, 32
+ mulld r0, r10, v0
+ mulhdu r10, r10, v0
+ mulld r5, r8, v0
+ mulhdu r8, r8, v0
+ mulld r9, r7, v0
+ mulhdu r7, r7, v0
+ mulld r11, r27, v0
+ mulhdu r27, r27, v0
+ ld r28, -24(rp)
+ adde r0, r0, r12
+ ld r29, -16(rp)
+ adde r5, r5, r10
+ ld r30, -8(rp)
+ ld r31, 0(rp)
+ adde r9, r9, r8
+ adde r11, r11, r7
+ addze r12, r27
+ ADDSUB r0, r0, r28
+ std r0, -24(rp)
+ ADDSUBC r5, r5, r29
+L(l3): std r5, -16(rp)
+ ADDSUBC r9, r9, r30
+L(l2): std r9, -8(rp)
+ ADDSUBC r11, r11, r31
+L(l1): std r11, 0(rp)
+ bdnz L(top)
+
+AM(` addze r3, r12')
+SM(` subfe r11, r0, r0') C complement...
+ ld r31, -8(r1)
+SM(` subf r3, r11, r12')
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h
new file mode 100644
index 0000000..c7e2f89
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h
@@ -0,0 +1,160 @@
+/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3500 MHz POWER6 (kolga.bibsys.no) */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6
+#define USE_PREINV_DIVREM_1 0
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 50
+#define MUL_TOOM44_THRESHOLD 106
+#define MUL_TOOM6H_THRESHOLD 274
+#define MUL_TOOM8H_THRESHOLD 339
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 88
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 49
+#define SQR_TOOM4_THRESHOLD 130
+#define SQR_TOOM6_THRESHOLD 226
+#define SQR_TOOM8_THRESHOLD 272
+
+#define MULMID_TOOM42_THRESHOLD 36
+
+#define MULMOD_BNM1_THRESHOLD 14
+#define SQRMOD_BNM1_THRESHOLD 14
+
+#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \
+ { 33, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 63,10}, { 47,11}, \
+ { 31,10}, { 71,11}, { 47,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \
+ { 135, 9}, { 271,11}, { 79, 9}, { 319, 8}, \
+ { 639,10}, { 175,11}, { 95,10}, { 191, 9}, \
+ { 383,10}, { 207,12}, { 63,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \
+ { 319, 9}, { 639,11}, { 175,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 79
+#define MUL_FFT_THRESHOLD 3520
+
+#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 280, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \
+ { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \
+ { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \
+ { 63,10}, { 47,11}, { 31,10}, { 71, 9}, \
+ { 143,11}, { 47,12}, { 31,11}, { 63, 9}, \
+ { 255, 8}, { 511, 9}, { 271,10}, { 143,11}, \
+ { 79,10}, { 159, 9}, { 319,10}, { 175, 9}, \
+ { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \
+ { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511, 8}, { 1023,10}, { 271, 9}, { 543,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 80
+#define SQR_FFT_THRESHOLD 2752
+
+#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_DC_THRESHOLD 62
+#define MULLO_MUL_N_THRESHOLD 2995
+
+#define DC_DIV_QR_THRESHOLD 59
+#define DC_DIVAPPR_Q_THRESHOLD 200
+#define DC_BDIV_QR_THRESHOLD 70
+#define DC_BDIV_Q_THRESHOLD 168
+
+#define INV_MULMOD_BNM1_THRESHOLD 53
+#define INV_NEWTON_THRESHOLD 170
+#define INV_APPR_THRESHOLD 166
+
+#define BINV_NEWTON_THRESHOLD 220
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 942
+#define MUPI_DIV_QR_THRESHOLD 57
+#define MU_BDIV_QR_THRESHOLD 889
+#define MU_BDIV_Q_THRESHOLD 1078
+
+#define POWM_SEC_TABLE 4,26,216,804,1731
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 106
+#define HGCD_APPR_THRESHOLD 109
+#define HGCD_REDUCE_THRESHOLD 2205
+#define GCD_DC_THRESHOLD 492
+#define GCDEXT_DC_THRESHOLD 327
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 28
+#define SET_STR_DC_THRESHOLD 537
+#define SET_STR_PRECOMPUTE_THRESHOLD 1576
+
+#define FAC_DSC_THRESHOLD 426
+#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm
new file mode 100644
index 0000000..3d32b46
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm
@@ -0,0 +1,589 @@
+dnl PowerPC-64 mpn_mul_basecase.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 12.25
+
+C TODO
+C  * Reduce register usage.  At least 4 fewer registers could be used.
+C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C would bring us to 9 c/l.
+C  * The bdz insns for b1 and b2 will never branch, so they could be removed.
+C * Align things better, perhaps by moving things like pointer updates from
+C before to after loops.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+define(`vp', `r6')
+define(`vn', `r7')
+
+define(`v0', `r25')
+define(`outer_rp', `r22')
+define(`outer_up', `r23')
+
+ASM_START()
+C mpn_mul_basecase(rp, up, un, vp, vn) -- compute the full (un+vn)-limb
+C product of {up,un} and {vp,vn} into {rp,un+vn}.  The code assumes
+C un >= vn >= 1 (the un <= 2 shortcut also handles vn implicitly).
+PROLOGUE(mpn_mul_basecase)
+
+C Special code for un <= 2, for efficiency of these important cases,
+C and since it simplifies the default code.
+	cmpdi	cr0, un, 2
+	bgt	cr0, L(un_gt2)
+	cmpdi	cr6, vn, 1
+	ld	r7, 0(vp)
+	ld	r5, 0(up)
+	mulld	r8, r5, r7	C weight 0
+	mulhdu	r9, r5, r7	C weight 1
+	std	r8, 0(rp)
+	beq	cr0, L(2x)
+	std	r9, 8(rp)
+	blr
+	ALIGN(16)
+C un = 2: add in the second u limb times v0.
+L(2x):	ld	r0, 8(up)
+	mulld	r8, r0, r7	C weight 1
+	mulhdu	r10, r0, r7	C weight 2
+	addc	r9, r9, r8
+	addze	r10, r10
+	bne	cr6, L(2x2)
+	std	r9, 8(rp)
+	std	r10, 16(rp)
+	blr
+	ALIGN(16)
+C un = 2, vn = 2: finish with the cross and high products of v1.
+L(2x2):	ld	r6, 8(vp)
+	nop
+	mulld	r8, r5, r6	C weight 1
+	mulhdu	r11, r5, r6	C weight 2
+	mulld	r12, r0, r6	C weight 2
+	mulhdu	r0, r0, r6	C weight 3
+	addc	r9, r9, r8
+	std	r9, 8(rp)
+	adde	r11, r11, r10
+	addze	r0, r0
+	addc	r11, r11, r12
+	addze	r0, r0
+	std	r11, 16(rp)
+	std	r0, 24(rp)
+	blr
+
+L(un_gt2):
+C NOTE(review): callee-saved regs r20-r31 are stored below the stack
+C pointer without adjusting r1; this relies on the PowerPC64 ABI's
+C protected area below r1 -- confirm for the target ABI variant.
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+	std	r26, -48(r1)
+	std	r25, -56(r1)
+	std	r24, -64(r1)
+	std	r23, -72(r1)
+	std	r22, -80(r1)
+	std	r21, -88(r1)
+	std	r20, -96(r1)
+
+	mr	outer_rp, rp
+	mr	outer_up, up
+
+	ld	v0, 0(vp)		C new v limb
+	addi	vp, vp, 8
+	ld	r26, 0(up)
+
+C Dispatch on un mod 4: each L(bN) entry processes the first un mod 4
+C limbs, then falls into a common 4-way unrolled loop.
+	rldicl. r0, un, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	un, un, 4	C compute count...
+	srdi	un, un, 2	C ...for ctr
+	mtctr	un		C copy inner loop count into ctr
+	beq	cr0, L(b0)
+	blt	cr6, L(b1)
+	beq	cr6, L(b2)
+
+
+	ALIGN(16)
+C un = 3 (mod 4): handle three leading limbs of the first (mul_1) pass.
+L(b3):
+	ld	r27, 8(up)
+	ld	r20, 16(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r10, r20, v0
+	addc	r24, r24, r31
+	adde	r9, r9, r8
+	addze	r12, r10
+	std	r0, 0(rp)
+	std	r24, 8(rp)
+	std	r9, 16(rp)
+	addi	up, up, 16
+	addi	rp, rp, 16
+	bdz	L(end_m_3)
+
+	ALIGN(32)
+C First pass inner loop: rp[] = up[] * v0, 4 limbs/iteration, carry in r12.
+L(lo_m_3):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)
+	ld	r21, 32(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r27, r20, v0
+	mulld	r11, r21, v0
+	mulhdu	r26, r21, v0
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r27
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	mr	r12, r26
+	bdnz	L(lo_m_3)
+
+	ALIGN(16)
+L(end_m_3):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	beq	L(ret)
+
+	ALIGN(16)
+C Outer loop (un = 3 mod 4): addmul_1 of the next v limb into {rp,un}.
+L(outer_lo_3):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 24
+	addi	up, outer_up, 16
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, -16(up)
+	ld	r27, -8(up)
+	ld	r20, 0(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r10, r20, v0
+	ld	r28, -16(rp)
+	ld	r29, -8(rp)
+	ld	r30, 0(rp)
+	addc	r24, r24, r31
+	adde	r9, r9, r8
+	addze	r12, r10
+	addc	r0, r0, r28
+	std	r0, -16(rp)
+	adde	r24, r24, r29
+	std	r24, -8(rp)
+	adde	r9, r9, r30
+	std	r9, 0(rp)
+	bdz	L(end_3)
+
+	ALIGN(32)	C registers dying
+L(lo_3):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)	C
+	ld	r21, 32(up)	C
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	mulld	r9, r20, v0	C
+	mulhdu	r27, r20, v0	C 26
+	mulld	r11, r21, v0	C
+	mulhdu	r26, r21, v0	C 27
+	ld	r28, -24(rp)	C
+	adde	r0, r0, r12	C 0 12
+	ld	r29, -16(rp)	C
+	adde	r24, r24, r10	C 24 10
+	ld	r30, -8(rp)	C
+	ld	r31, 0(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r27	C 27 11
+	addze	r12, r26	C 26
+	addc	r0, r0, r28	C 0 28
+	std	r0, -24(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, -16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, -8(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 0(rp)	C 11
+	bdnz	L(lo_3)		C
+
+	ALIGN(16)
+L(end_3):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	bne	L(outer_lo_3)
+	b	L(ret)
+
+
+	ALIGN(16)
+C un = 1 (mod 4): one leading limb, then the common 4-way loop.
+L(b1):
+	mulld	r0, r26, v0
+	mulhdu	r12, r26, v0
+	addic	r0, r0, 0
+	std	r0, 0(rp)
+	bdz	L(end_m_1)
+
+	ALIGN(16)
+L(lo_m_1):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)
+	ld	r21, 32(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r27, r20, v0
+	mulld	r11, r21, v0
+	mulhdu	r26, r21, v0
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r27
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	mr	r12, r26
+	bdnz	L(lo_m_1)
+
+	ALIGN(16)
+L(end_m_1):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	beq	L(ret)
+
+	ALIGN(16)
+C Outer loop (un = 1 mod 4): addmul_1 of the next v limb into {rp,un}.
+L(outer_lo_1):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 8
+	mr	up, outer_up
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, 0(up)
+	ld	r28, 0(rp)
+	mulld	r0, r26, v0
+	mulhdu	r12, r26, v0
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	bdz	L(end_1)
+
+	ALIGN(32)	C registers dying
+L(lo_1):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)	C
+	ld	r21, 32(up)	C
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	mulld	r9, r20, v0	C
+	mulhdu	r27, r20, v0	C 26
+	mulld	r11, r21, v0	C
+	mulhdu	r26, r21, v0	C 27
+	ld	r28, -24(rp)	C
+	adde	r0, r0, r12	C 0 12
+	ld	r29, -16(rp)	C
+	adde	r24, r24, r10	C 24 10
+	ld	r30, -8(rp)	C
+	ld	r31, 0(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r27	C 27 11
+	addze	r12, r26	C 26
+	addc	r0, r0, r28	C 0 28
+	std	r0, -24(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, -16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, -8(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 0(rp)	C 11
+	bdnz	L(lo_1)		C
+
+	ALIGN(16)
+L(end_1):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	bne	L(outer_lo_1)
+	b	L(ret)
+
+
+	ALIGN(16)
+C un = 0 (mod 4): bias pointers back one limb and clear the carry chain.
+L(b0):
+	addi	up, up, -8
+	addi	rp, rp, -8
+	li	r12, 0
+	addic	r12, r12, 0
+	bdz	L(end_m_0)
+
+	ALIGN(16)
+L(lo_m_0):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)
+	ld	r21, 32(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r27, r20, v0
+	mulld	r11, r21, v0
+	mulhdu	r26, r21, v0
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r27
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	mr	r12, r26
+	bdnz	L(lo_m_0)
+
+	ALIGN(16)
+L(end_m_0):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	beq	L(ret)
+
+	ALIGN(16)
+C Outer loop (un = 0 mod 4): addmul_1 of the next v limb into {rp,un}.
+L(outer_lo_0):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 0
+	addi	up, outer_up, -8
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	li	r12, 0
+	addic	r12, r12, 0
+	bdz	L(end_0)
+
+	ALIGN(32)	C registers dying
+L(lo_0):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)	C
+	ld	r21, 32(up)	C
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	mulld	r9, r20, v0	C
+	mulhdu	r27, r20, v0	C 26
+	mulld	r11, r21, v0	C
+	mulhdu	r26, r21, v0	C 27
+	ld	r28, -24(rp)	C
+	adde	r0, r0, r12	C 0 12
+	ld	r29, -16(rp)	C
+	adde	r24, r24, r10	C 24 10
+	ld	r30, -8(rp)	C
+	ld	r31, 0(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r27	C 27 11
+	addze	r12, r26	C 26
+	addc	r0, r0, r28	C 0 28
+	std	r0, -24(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, -16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, -8(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 0(rp)	C 11
+	bdnz	L(lo_0)		C
+
+	ALIGN(16)
+L(end_0):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	bne	L(outer_lo_0)
+	b	L(ret)
+
+
+	ALIGN(16)
+C un = 2 (mod 4): two leading limbs, then the common 4-way loop.
+L(b2):	ld	r27, 8(up)
+	addi	up, up, 8
+	mulld	r0, r26, v0
+	mulhdu	r10, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	addc	r24, r24, r10
+	addze	r12, r8
+	std	r0, 0(rp)
+	std	r24, 8(rp)
+	addi	rp, rp, 8
+	bdz	L(end_m_2)
+
+	ALIGN(16)
+L(lo_m_2):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)
+	ld	r21, 32(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r27, r20, v0
+	mulld	r11, r21, v0
+	mulhdu	r26, r21, v0
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r27
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	mr	r12, r26
+	bdnz	L(lo_m_2)
+
+	ALIGN(16)
+L(end_m_2):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	beq	L(ret)
+
+	ALIGN(16)
+C Outer loop (un = 2 mod 4): addmul_1 of the next v limb into {rp,un}.
+L(outer_lo_2):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 16
+	addi	up, outer_up, 8
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, -8(up)
+	ld	r27, 0(up)
+	ld	r28, -8(rp)
+	ld	r29, 0(rp)
+	mulld	r0, r26, v0
+	mulhdu	r10, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	addc	r24, r24, r10
+	addze	r12, r8
+	addc	r0, r0, r28
+	std	r0, -8(rp)
+	adde	r24, r24, r29
+	std	r24, 0(rp)
+	bdz	L(end_2)
+
+	ALIGN(16)	C registers dying
+L(lo_2):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)	C
+	ld	r21, 32(up)	C
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	mulld	r9, r20, v0	C
+	mulhdu	r27, r20, v0	C 26
+	mulld	r11, r21, v0	C
+	mulhdu	r26, r21, v0	C 27
+	ld	r28, -24(rp)	C
+	adde	r0, r0, r12	C 0 12
+	ld	r29, -16(rp)	C
+	adde	r24, r24, r10	C 24 10
+	ld	r30, -8(rp)	C
+	ld	r31, 0(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r27	C 27 11
+	addze	r12, r26	C 26
+	addc	r0, r0, r28	C 0 28
+	std	r0, -24(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, -16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, -8(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 0(rp)	C 11
+	bdnz	L(lo_2)		C
+
+	ALIGN(16)
+L(end_2):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	bne	L(outer_lo_2)
+C	b	L(ret)
+
+C Restore callee-saved registers and return.
+L(ret):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	ld	r26, -48(r1)
+	ld	r25, -56(r1)
+	ld	r24, -64(r1)
+	ld	r23, -72(r1)
+	ld	r22, -80(r1)
+	ld	r21, -88(r1)
+	ld	r20, -96(r1)
+	blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm
new file mode 100644
index 0000000..8731e01
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm
@@ -0,0 +1,135 @@
+dnl PowerPC-64 mpn_mul_2 and mpn_addmul_2.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb
+C mul_2 addmul_2
+C POWER3/PPC630 ? ?
+C POWER4/PPC970 ? ?
+C POWER5 ? ?
+C POWER6 ? ?
+C POWER7-SMT4 3 3
+C POWER7-SMT2 ? ?
+C POWER7-SMT1 ? ?
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`vp', `r6')
+
+define(`cy0', `r10')
+ifdef(`EXTRA_REGISTER',
+` define(`cy1', EXTRA_REGISTER)',
+` define(`cy1', `r31')')
+
+ifdef(`OPERATION_mul_2',`
+ define(`AM', `')
+ define(`ADDX', `addc')
+ define(`func', `mpn_mul_2')
+')
+ifdef(`OPERATION_addmul_2',`
+ define(`AM', `$1')
+ define(`ADDX', `adde')
+ define(`func', `mpn_addmul_2')
+')
+
+MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2)
+
+ASM_START()
+C func(rp, up, n, vp): multiply {up,n} by the two limbs at {vp,2};
+C mpn_mul_2 writes the low n limbs to {rp,n}, mpn_addmul_2 adds them
+C into {rp,n}; the next-higher limb is stored at rp[n] and the topmost
+C limb is returned.  The two-limb running carry lives in cy1:cy0.
+PROLOGUE(func)
+
+ifdef(`EXTRA_REGISTER',,`
+	std	r31, -8(r1)
+')
+	andi.	r12, n, 1
+	addi	r0, n, 1
+	srdi	r0, r0, 1
+	mtctr	r0
+	ld	r11, 0(vp)		C v0
+	li	cy0, 0
+	ld	r12, 8(vp)		C v1
+	li	cy1, 0
+	ld	r5, 0(up)
+	beq	L(lo0)
+	addi	up, up, -8
+	addi	rp, rp, -8
+	b	L(lo1)
+
+	ALIGN(32)
+C Main loop, two limbs per iteration; entry points L(lo0)/L(lo1)
+C according to n mod 2.
+L(top):
+AM(`	ld	r0, -8(rp)')
+	ld	r5, 0(up)
+AM(`	addc	r6, r6, r0')
+	ADDX	r7, r7, r8
+	addze	r9, r9
+	addc	r6, r6, cy0
+	adde	cy0, r7, cy1
+	std	r6, -8(rp)
+	addze	cy1, r9
+L(lo0):	mulld	r6, r11, r5	C v0 * u[i] weight 0
+	mulhdu	r7, r11, r5	C v0 * u[i] weight 1
+	mulld	r8, r12, r5	C v1 * u[i] weight 1
+	mulhdu	r9, r12, r5	C v1 * u[i] weight 2
+AM(`	ld	r0, 0(rp)')
+	ld	r5, 8(up)
+AM(`	addc	r6, r6, r0')
+	ADDX	r7, r7, r8
+	addze	r9, r9
+	addc	r6, r6, cy0
+	adde	cy0, r7, cy1
+	std	r6, 0(rp)
+	addze	cy1, r9
+L(lo1):	mulld	r6, r11, r5	C v0 * u[i] weight 0
+	mulhdu	r7, r11, r5	C v0 * u[i] weight 1
+	addi	up, up, 16
+	addi	rp, rp, 16
+	mulld	r8, r12, r5	C v1 * u[i] weight 1
+	mulhdu	r9, r12, r5	C v1 * u[i] weight 2
+	bdnz	L(top)
+
+C Wind down: fold the last partial products and the carry pair,
+C store the second-highest limb, return the highest in r3.
+L(end):
+AM(`	ld	r0, -8(rp)')
+AM(`	addc	r6, r6, r0')
+	ADDX	r7, r7, r8
+	addze	r9, r9
+	addc	r6, r6, cy0
+	std	r6, -8(rp)
+	adde	cy0, r7, cy1
+	addze	cy1, r9
+	std	cy0, 0(rp)
+	mr	r3, cy1
+
+ifdef(`EXTRA_REGISTER',,`
+	ld	r31, -8(r1)
+')
+	blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm
new file mode 100644
index 0000000..857c701
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm
@@ -0,0 +1,128 @@
+dnl PowerPC-64 mpn_add_n, mpn_sub_n optimised for POWER7.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 ?
+C POWER7 2.18
+
+C This is a tad bit slower than the cnd_aors_n.asm code, which is of course an
+C anomaly.
+
+ifdef(`OPERATION_add_n',`
+ define(ADDSUBC, adde)
+ define(ADDSUB, addc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)
+ define(GENRVAL, `addi r3, r3, 1')
+ define(SETCBR, `addic r0, $1, -1')
+ define(CLRCB, `addic r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+ define(ADDSUBC, subfe)
+ define(ADDSUB, subfc)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)
+ define(GENRVAL, `neg r3, r3')
+ define(SETCBR, `subfic r0, $1, 0')
+ define(CLRCB, `addic r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+
+ASM_START()
+C func_nc(rp, up, vp, n, cy): as func but with carry/borrow-in in r7,
+C installed into CA by the SETCBR macro.
+PROLOGUE(func_nc)
+	SETCBR(r7)
+	b	L(ent)
+EPILOGUE()
+
+C func(rp, up, vp, n): rp[] = up[] +/- vp[]; returns carry (add) or
+C borrow (sub) limb.  CA is kept live across the whole limb loop, so
+C instruction choice below deliberately avoids CA-clobbering forms.
+PROLOGUE(func)
+	CLRCB
+L(ent):
+C Peel one limb when n is odd.
+	andi.	r7, n, 1
+	beq	L(bx0)
+
+L(bx1):	ld	r7, 0(up)
+	ld	r9, 0(vp)
+	ADDSUBC	r11, r9, r7
+	std	r11, 0(rp)
+	cmpldi	cr6, n, 1
+	beq	cr6, L(end)
+	addi	up, up, 8
+	addi	vp, vp, 8
+	addi	rp, rp, 8
+
+L(bx0):	addi	r0, n, 2	C compute branch...
+	srdi	r0, r0, 2	C ...count
+	mtctr	r0
+
+C Four limbs per iteration; enter mid-loop at L(mid) when two limbs
+C remain before the 4-way groups (n & 2 set).
+	andi.	r7, n, 2
+	bne	L(mid)
+
+	addi	up, up, 16
+	addi	vp, vp, 16
+	addi	rp, rp, 16
+
+	ALIGN(32)
+L(top):	ld	r6, -16(up)
+	ld	r7, -8(up)
+	ld	r8, -16(vp)
+	ld	r9, -8(vp)
+	ADDSUBC	r10, r8, r6
+	ADDSUBC	r11, r9, r7
+	std	r10, -16(rp)
+	std	r11, -8(rp)
+L(mid):	ld	r6, 0(up)
+	ld	r7, 8(up)
+	ld	r8, 0(vp)
+	ld	r9, 8(vp)
+	ADDSUBC	r10, r8, r6
+	ADDSUBC	r11, r9, r7
+	std	r10, 0(rp)
+	std	r11, 8(rp)
+	addi	up, up, 32
+	addi	vp, vp, 32
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+L(end):	subfe	r3, r0, r0	C -cy
+	GENRVAL
+	blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..ddf5fd8
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C Shift amount for this variant (x << 1); RSH = 64 - LSH is used to
+C recover the bits shifted out of each limb.
+define(LSH, 1)
+define(RSH, 63)
+
+C Map the OPERATION_* symbol chosen by the build system onto the
+C DO_add/DO_sub/DO_rsb selector consumed by the shared skeleton.
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+C All three entry points share one parameterised implementation.
+include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..3f9d88d
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C Shift amount for this variant (x << 2); RSH = 64 - LSH is used to
+C recover the bits shifted out of each limb.
+define(LSH, 2)
+define(RSH, 62)
+
+C Map the OPERATION_* symbol chosen by the build system onto the
+C DO_add/DO_sub/DO_rsb selector consumed by the shared skeleton.
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+C All three entry points share one parameterised implementation.
+include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm
new file mode 100644
index 0000000..5251202
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm
@@ -0,0 +1,129 @@
+dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 ?
+C POWER7 2.5
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+
+ifdef(`DO_add', `
+ define(`ADDSUBC', `addc $1, $2, $3')
+ define(`ADDSUBE', `adde $1, $2, $3')
+ define(INITCY, `addic $1, r1, 0')
+ define(RETVAL, `addze r3, $1')
+ define(`func', mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+ define(`ADDSUBC', `subfc $1, $2, $3')
+ define(`ADDSUBE', `subfe $1, $2, $3')
+ define(INITCY, `addic $1, r1, -1')
+ define(RETVAL, `subfze r3, $1
+ neg r3, r3')
+ define(`func', mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+ define(`ADDSUBC', `subfc $1, $3, $2')
+ define(`ADDSUBE', `subfe $1, $3, $2')
+ define(INITCY, `addic $1, r1, -1')
+ define(RETVAL, `addme r3, $1')
+ define(`func', mpn_rsblsh`'LSH`'_n)')
+
+define(`s0', `r0') define(`s1', `r9')
+define(`u0', `r6') define(`u1', `r7')
+define(`v0', `r10') define(`v1', `r11')
+
+
+ASM_START()
+C func(rp, up, vp, n): rp[] = up[] op (vp[] << LSH), with op selected
+C by DO_add/DO_sub/DO_rsb; the final carry/borrow is converted to the
+C function's return value by RETVAL.  The shifted stream is rebuilt
+C limbwise: rldimi merges the low LSH bits into the previous limb's
+C spill, srdi saves the bits shifted out for the next limb.
+PROLOGUE(func)
+	rldic	r7, n, 3, 59	C r7 = 8 * (n mod 4), byte bias for entry
+	add	up, up, r7
+	add	vp, vp, r7
+	add	rp, rp, r7
+
+ifdef(`DO_add', `
+	addic	r0, n, 3	C set cy flag as side effect
+',`
+	subfc	r0, r0, r0	C set cy flag
+	addi	r0, n, 3
+')
+	srdi	r0, r0, 2
+	mtctr	r0
+
+C Dispatch on n mod 4 into the 4-way unrolled loop below.
+	andi.	r0, n, 1
+	beq	L(bx0)
+
+L(bx1):	andi.	r0, n, 2
+	li	s0, 0
+	bne	L(lo3)
+	b	L(lo1)
+
+L(bx0):	andi.	r0, n, 2
+	li	s1, 0
+	bne	L(lo2)
+
+	ALIGN(32)
+L(top):	addi	rp, rp, 32
+	ld	v0, 0(vp)
+	addi	vp, vp, 32
+	rldimi	s1, v0, LSH, 0
+	ld	u0, 0(up)
+	addi	up, up, 32
+	srdi	s0, v0, RSH
+	ADDSUBE(s1, s1, u0)
+	std	s1, -32(rp)
+L(lo3):	ld	v1, -24(vp)
+	rldimi	s0, v1, LSH, 0
+	ld	u1, -24(up)
+	srdi	s1, v1, RSH
+	ADDSUBE(s0, s0, u1)
+	std	s0, -24(rp)
+L(lo2):	ld	v0, -16(vp)
+	rldimi	s1, v0, LSH, 0
+	ld	u0, -16(up)
+	srdi	s0, v0, RSH
+	ADDSUBE(s1, s1, u0)
+	std	s1, -16(rp)
+L(lo1):	ld	v1, -8(vp)
+	rldimi	s0, v1, LSH, 0
+	ld	u1, -8(up)
+	srdi	s1, v1, RSH
+	ADDSUBE(s0, s0, u1)
+	std	s0, -8(rp)
+	bdnz	L(top)		C decrement CTR and loop back
+
+	RETVAL(	s1)
+	blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm
new file mode 100644
index 0000000..f04e896
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm
@@ -0,0 +1,67 @@
+dnl PowerPC-64 mpn_gcd_11.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 7.6 obsolete
+C POWER8 ?
+C POWER9 ?
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
+
+C INPUT PARAMETERS
+define(`u0', `r3')
+define(`v0', `r4')
+
+define(`cnt', `r9')dnl
+
+ASM_START()
+C mpn_gcd_11(u0, v0): single-limb binary GCD using isel for the
+C branch-free min/abs-difference step.
+C NOTE(review): per the mpn gcd_11 convention both inputs are presumably
+C odd -- the srd-by-cnt step depends on it; confirm against callers.
+PROLOGUE(mpn_gcd_11)
+	li	r12, 63
+	b	L(odd)
+
+	ALIGN(16)
+C and of (v-u) and (u-v) = x & -x pattern: isolates the lowest set bit
+C of |u-v|, whose position (via cntlzd) gives the shift count.
+L(top):	and	r8, r11, r10	C isolate lsb
+	cntlzd	cnt, r8
+	isel	v0, u0, v0, 29	C v = min(u,v)
+	isel	u0, r10, r11, 29	C u = |u - v|
+	subf	cnt, cnt, r12	C cnt = 63-cnt
+	srd	u0, u0, cnt
+L(odd):	cmpld	cr7, v0, u0
+	subf	r10, u0, v0	C r10 = v - u
+	subf	r11, v0, u0	C r11 = u - v
+	bne	cr7, L(top)
+
+C u0 == v0: gcd found, already in r3.
+L(end):	blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm
new file mode 100644
index 0000000..ade30e4
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm
@@ -0,0 +1,146 @@
+dnl PowerPC-64 mpn_gcd_22 optimised for POWER7 and POWER8.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 12.3
+C POWER8 13.4
+C POWER9 10.6
+
+C We define SLOW if this target uses a slow struct return mechanism, with
+C r3 as an implicit parameter for the struct pointer.
+undefine(`SLOW')dnl
+ifdef(`AIX',`define(`SLOW',`due to AIX')',`
+ ifdef(`DARWIN',,`
+ ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl
+ ')
+')
+
+ifdef(`SLOW',`
+define(`IFSLOW', `$1')
+define(`u1', `r4')
+define(`u0', `r5')
+define(`v1', `r6')
+define(`v0', `r7')
+',`
+define(`IFSLOW', `')
+define(`u1', `r3')
+define(`u0', `r4')
+define(`v1', `r5')
+define(`v0', `r6')
+')
+
+define(`tmp', `r0')
+define(`t0', `r8')
+define(`t1', `r9')
+define(`s0', `r10')
+define(`s1', `r11')
+define(`cnt', `r12')
+
+ASM_START()
+C mpn_gcd_22({u1,u0}, {v1,v0}): GCD of two 2-limb operands.  The
+C 128-bit binary-GCD loop runs until one operand fits in a limb, then
+C control drops into a single-limb loop (same shape as gcd_11).  On
+C SLOW ABIs the 2-limb result is stored through the implicit struct
+C pointer in r3; otherwise it is returned in r3:r4.
+PROLOGUE(mpn_gcd_22)
+C Compute both u-v and v-u over two limbs; cr0 (from the borrow via
+C subfe.) later selects which difference is non-negative.
+L(top):	subfc.	t0, v0, u0	C 0 12
+	beq	cr0, L(lowz)
+	subfe	t1, v1, u1	C 2 14
+	subfe.	tmp, tmp, tmp	C 4 set cr0 from the carry bit
+	subfc	s0, u0, v0	C 0
+	subfe	s1, u1, v1	C 2
+
+C Shared step: count trailing zeros of the difference (via its isolated
+C low bit), select min(u,v) and |u-v|, then right-shift the 128-bit
+C difference by cnt using a srd/sld/srd funnel shift.
+L(bck):	and	tmp, s0, t0	C 2
+	cntlzd	cnt, tmp	C 4
+	addi	tmp, cnt, 1	C 6
+	subfic	cnt, cnt, 63	C 6
+
+	isel	v0, v0, u0, 2	C 6 use condition set by subfe
+	isel	v1, v1, u1, 2	C 6
+	isel	u0, t0, s0, 2	C 6
+	isel	u1, t1, s1, 2	C 6
+
+	srd	u0, u0, cnt	C 8
+	sld	tmp, u1, tmp	C 8
+	srd	u1, u1, cnt	C 8
+	or	u0, u0, tmp	C 10
+
+	or.	r0, u1, v1	C 10
+	bne	L(top)
+
+
+C Both high limbs are zero: finish with a 1x1 gcd loop (cf. gcd_11).
+	li	r0, 63
+	b	L(odd)
+	ALIGN(16)
+L(top1):isel	v0, u0, v0, 29	C v = min(u,v)
+	isel	u0, r10, r11, 29	C u = |u - v|
+	subf	cnt, cnt, r0	C cnt = 63-cnt
+	srd	u0, u0, cnt
+L(odd):	subf	r10, u0, v0	C r10 = v - u
+	subf	r11, v0, u0	C r11 = u - v
+	cmpld	cr7, v0, u0
+	and	r8, r11, r10	C isolate lsb
+	cntlzd	cnt, r8
+	bne	cr7, L(top1)
+
+ifdef(`SLOW',`
+	std	v0, 0(r3)
+	std	r10, 8(r3)	C zero
+',`
+	mr	r3, v0
+	li	r4, 0
+')
+	blr
+
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	subfc.	t0, v1, u1	C 2 8
+	beq	L(end)
+	li	t1, 0
+	subfe.	tmp, tmp, tmp	C 4 set cr0 from the carry bit
+	subf	s0, u1, v1	C 2
+	li	s1, 0
+	b	L(bck)
+
+C u == v over both limbs: that common value is the gcd.
+L(end):
+ifdef(`SLOW',`
+	std	v0, 0(r3)
+	std	v1, 8(r3)
+	blr
+',`
+	mr	r3, v0
+	mr	r4, v1
+	blr
+')
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h
new file mode 100644
index 0000000..9da4080
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h
@@ -0,0 +1,175 @@
+/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3720 MHz POWER7/SMT4 */
+/* FFT tuning limit = 0.5 M */
+/* Generated by tuneup.c, 2019-10-02, gcc 4.8 */
+/* NOTE(review): the threshold values below are machine-generated by */
+/* GMP's tuneup.c for this CPU; regenerate rather than hand-editing. */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13
+#define USE_PREINV_DIVREM_1 0
+/* From gcc110.osuosl.org, 2023-07-27 */
+#define DIV_QR_1N_PI1_METHOD 3 /* 8.45% faster than 4 */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD 1
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 27
+
+#define DIV_1_VS_MUL_1_PERCENT 341
+
+/* Toom-Cook multiplication crossover points (operand sizes in limbs). */
+#define MUL_TOOM22_THRESHOLD 22
+#define MUL_TOOM33_THRESHOLD 71
+#define MUL_TOOM44_THRESHOLD 196
+#define MUL_TOOM6H_THRESHOLD 298
+#define MUL_TOOM8H_THRESHOLD 406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 140
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 139
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 120
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 105
+#define SQR_TOOM4_THRESHOLD 190
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 547
+
+#define MULMID_TOOM42_THRESHOLD 56
+
+#define MULMOD_BNM1_THRESHOLD 18
+#define SQRMOD_BNM1_THRESHOLD 20
+
+#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */
+/* {size, k} step table (presumably: use 2^k-way FFT from that size). */
+#define MUL_FFT_TABLE3 \
+ { { 436, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 33, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \
+ { 29, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \
+ { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 63, 9}, \
+ { 127,10}, { 79,11}, { 47,10}, { 103,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 159,11}, { 95,10}, { 191, 9}, { 383,11}, \
+ { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383, 9}, { 767,11}, { 207,10}, { 415,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 83
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 368 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 368, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,11}, { 111,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
+ { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \
+ { 639,12}, { 95,11}, { 191,10}, { 383, 9}, \
+ { 767,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 84
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 35
+#define MULLO_MUL_N_THRESHOLD 9449
+#define SQRLO_BASECASE_THRESHOLD 3
+#define SQRLO_DC_THRESHOLD 119
+#define SQRLO_SQR_THRESHOLD 6440
+
+#define DC_DIV_QR_THRESHOLD 33
+#define DC_DIVAPPR_Q_THRESHOLD 124
+#define DC_BDIV_QR_THRESHOLD 62
+#define DC_BDIV_Q_THRESHOLD 144
+
+#define INV_MULMOD_BNM1_THRESHOLD 67
+#define INV_NEWTON_THRESHOLD 123
+#define INV_APPR_THRESHOLD 123
+
+#define BINV_NEWTON_THRESHOLD 284
+#define REDC_1_TO_REDC_2_THRESHOLD 18
+#define REDC_2_TO_REDC_N_THRESHOLD 109
+
+#define MU_DIV_QR_THRESHOLD 1387
+#define MU_DIVAPPR_Q_THRESHOLD 1334
+#define MUPI_DIV_QR_THRESHOLD 50
+#define MU_BDIV_QR_THRESHOLD 1308
+#define MU_BDIV_Q_THRESHOLD 1499
+
+#define POWM_SEC_TABLE 1,23,121,579,642
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 18
+#define SET_STR_DC_THRESHOLD 1562
+#define SET_STR_PRECOMPUTE_THRESHOLD 3100
+
+#define FAC_DSC_THRESHOLD 774
+#define FAC_ODD_THRESHOLD 25
+
+#define MATRIX22_STRASSEN_THRESHOLD 18
+#define HGCD2_DIV1_METHOD 5 /* 3.27% faster than 3 */
+#define HGCD_THRESHOLD 118
+#define HGCD_APPR_THRESHOLD 150
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 386
+#define GCDEXT_DC_THRESHOLD 365
+#define JACOBI_BASE_METHOD 4 /* 27.64% faster than 1 */
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h
new file mode 100644
index 0000000..09348e0
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h
@@ -0,0 +1,171 @@
+/* POWER8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 4150 MHz POWER8/SMT4 */
+/* FFT tuning limit = 0.5 M */
+/* Generated by tuneup.c, 2019-09-24, gcc 7.2 */
+/* NOTE(review): the threshold values below are machine-generated by */
+/* GMP's tuneup.c for this CPU; regenerate rather than hand-editing. */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10
+#define USE_PREINV_DIVREM_1 0
+/* From gcc112.osuosl.org, 2023-07-27 */
+#define DIV_QR_1N_PI1_METHOD 3 /* 13.00% faster than 4 */
+#define DIV_QR_1_NORM_THRESHOLD 2
+#define DIV_QR_1_UNNORM_THRESHOLD 1
+#define DIV_QR_2_PI2_THRESHOLD 9
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 34
+
+#define DIV_1_VS_MUL_1_PERCENT 276
+
+/* Toom-Cook multiplication crossover points (operand sizes in limbs). */
+#define MUL_TOOM22_THRESHOLD 18
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 195
+#define MUL_TOOM6H_THRESHOLD 278
+#define MUL_TOOM8H_THRESHOLD 406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 131
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 138
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 97
+#define SQR_TOOM4_THRESHOLD 178
+#define SQR_TOOM6_THRESHOLD 303
+#define SQR_TOOM8_THRESHOLD 454
+
+#define MULMID_TOOM42_THRESHOLD 42
+
+#define MULMOD_BNM1_THRESHOLD 15
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */
+/* {size, k} step table (presumably: use 2^k-way FFT from that size). */
+#define MUL_FFT_TABLE3 \
+ { { 404, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 63,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 131,10}, \
+ { 79,11}, { 47,10}, { 95,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \
+ { 79,10}, { 159,11}, { 95, 8}, { 767, 7}, \
+ { 1599,11}, { 111,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
+ { 575,11}, { 159,12}, { 95,11}, { 191,10}, \
+ { 383,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 80
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,11}, { 79, 9}, { 319,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \
+ { 319,12}, { 95,11}, { 191,10}, { 383,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 71
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 33
+#define MULLO_MUL_N_THRESHOLD 9174
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 114
+#define SQRLO_SQR_THRESHOLD 6461
+
+#define DC_DIV_QR_THRESHOLD 38
+#define DC_DIVAPPR_Q_THRESHOLD 158
+#define DC_BDIV_QR_THRESHOLD 48
+#define DC_BDIV_Q_THRESHOLD 112
+
+#define INV_MULMOD_BNM1_THRESHOLD 74
+#define INV_NEWTON_THRESHOLD 132
+#define INV_APPR_THRESHOLD 131
+
+#define BINV_NEWTON_THRESHOLD 278
+#define REDC_1_TO_REDC_2_THRESHOLD 56
+#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
+
+#define MU_DIV_QR_THRESHOLD 1142
+#define MU_DIVAPPR_Q_THRESHOLD 1142
+#define MUPI_DIV_QR_THRESHOLD 46
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define POWM_SEC_TABLE 3,19,117,672,1867
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 18
+#define SET_STR_DC_THRESHOLD 608
+#define SET_STR_PRECOMPUTE_THRESHOLD 2405
+
+#define FAC_DSC_THRESHOLD 164
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD2_DIV1_METHOD 1 /* 6.88% faster than 3 */
+#define HGCD_THRESHOLD 114
+#define HGCD_APPR_THRESHOLD 118
+#define HGCD_REDUCE_THRESHOLD 2205
+#define GCD_DC_THRESHOLD 440
+#define GCDEXT_DC_THRESHOLD 345
+#define JACOBI_BASE_METHOD 1 /* 0.74% faster than 4 */
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm
new file mode 100644
index 0000000..53ea0e0
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm
@@ -0,0 +1,53 @@
+dnl PowerPC-64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl Copyright 2015, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb (approximate)
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 ?
+C POWER8 32
+
+C This runs on POWER7 and later, but is faster only on later CPUs.
+C We might want to inline this, considering its small footprint.
+
+C mpn_invert_limb -- for a normalized divisor d (high bit set) return
+C the reciprocal limb floor((B^2 - 1)/d) - B, B = 2^64.  divdeu
+C computes floor((RA * B)/RB), so with RA = B - d it yields
+C floor((B - d)*B/d) = floor(B^2/d) - B, which equals the wanted value
+C for every normalized d except d = 2^63 (where divdeu would overflow).
+ASM_START()
+PROLOGUE(mpn_invert_limb)
+ sldi. r4, r3, 1 C cr0.EQ iff d = 2^63 (d is normalized, so d<<1 = 0)
+ neg r5, r3 C r5 = B - d
+ divdeu( r3, r5, r3) C floor((B-d)*B / d)
+ beq- L(1) C d = 2^63: divdeu result undefined, patch it
+ blr
+L(1): li r3, -1 C inverse of 2^63 is B - 1
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm
new file mode 100644
index 0000000..2426a00
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm
@@ -0,0 +1,112 @@
+dnl PowerPC-64 mpn_add_n_sub_n optimised for POWER9.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 2.25
+
+
+C INPUT PARAMETERS
+define(`arp', `r3')
+define(`srp', `r4')
+define(`up', `r5')
+define(`vp', `r6')
+define(`n', `r7')
+
+C mpn_add_n_sub_n -- compute {arp,n} = {up,n} + {vp,n} and
+C {srp,n} = {up,n} - {vp,n} in a single pass.  The add carry lives in
+C the OV bit (addex) and the subtract borrow in CA (subfc/subfe), so
+C the two chains advance independently.  Returns 2*add_carry +
+C sub_borrow, packed by the final subfe/addex/rldicl sequence.
+ASM_START()
+PROLOGUE(mpn_add_n_sub_n)
+ cmpdi cr7, n, 2
+ subfo r0, r0, r0 C clear OV
+ rldicl. r9, n, 0, 63 C n & 1
+ beq cr0, L(bx0)
+
+L(bx1): ld r10, 0(up)
+ ld r11, 0(vp)
+ ble cr7, L(1)
+ srdi r7, r7, 1
+ mtctr r7 C ctr = (n-1)/2 two-limb rounds
+ ld r8, 8(up)
+ ld r9, 8(vp)
+ addex( r0, r10, r11, 0)
+ subfc r12, r11, r10 C start the borrow chain in CA
+ addi up, up, -8
+ addi vp, vp, -8
+ b L(lo1)
+
+L(bx0): ld r8, 0(up)
+ ld r9, 0(vp)
+ ld r10, 8(up)
+ ld r11, 8(vp)
+ addex( r0, r8, r9, 0)
+ subfc r12, r9, r8 C start the borrow chain in CA
+ addi arp, arp, 8
+ addi srp, srp, 8
+ ble cr7, L(end)
+ addi r7, r7, -1
+ srdi r7, r7, 1
+ mtctr r7
+
+L(top): ld r8, 16(up)
+ ld r9, 16(vp)
+ std r0, -8(arp)
+ std r12, -8(srp)
+ addex( r0, r10, r11, 0)
+ subfe r12, r11, r10
+L(lo1): ld r10, 24(up)
+ ld r11, 24(vp)
+ std r0, 0(arp)
+ std r12, 0(srp)
+ addex( r0, r8, r9, 0)
+ subfe r12, r9, r8
+ addi up, up, 16
+ addi vp, vp, 16
+ addi arp, arp, 16
+ addi srp, srp, 16
+ bdnz L(top)
+
+L(end): std r0, -8(arp)
+ std r12, -8(srp)
+C NOTE(review): for n == 1 the L(bx1) path jumps straight here, so the
+C subfe below consumes CA without any preceding subfc having set it --
+C presumably callers guarantee n >= 2; confirm before relying on n == 1.
+L(1): addex( r0, r10, r11, 0)
+ subfe r12, r11, r10
+ std r0, 0(arp)
+ std r12, 0(srp)
+ subfe r3, r3, r3 C r3 = CA - 1 (0 if no borrow, ~0 if borrow)
+ addex( r3, r3, r3, 0) C fold the OV add-carry into bit 0 via rotate
+ rldicl r3, r3, 1, 62 C keep low 2 bits: 2*carry + borrow
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm
new file mode 100644
index 0000000..95b8faa
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm
@@ -0,0 +1,106 @@
+dnl Power9 mpn_addaddmul_1msb0
+
+dnl Copyright 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 1-way 2-way 4-way 8-way 16-way mul_1+addmul_1
+C power9: 4.55 3.87 3.55 3.35 3.25 5.16
+
+C TODO
+C * Only WAYS = 4 currently has proper feed-in code.
+C * Try ldu/stdu to save the explicit updates.
+C * Try using madd in a long dependent chain, only breaking the recurrency
+C once per iteration.
+C * Some cycles could perhaps be saved by scheduling the crX-setting insns.
+
+define(`rp', r3)
+define(`ap', r4)
+define(`bp', r5)
+define(`n', r6)
+define(`u0', r7)
+define(`v0', r8)
+
+define(`BLOCK',`
+L(lo`'eval((WAYS-$1)%4)):
+ ld r10, eval(8*$1)(ap)
+ ld r11, eval(8*$1)(bp)
+ mulld r12, r10, u0
+ mulhdu r10, r10, u0
+ maddld( r6, r11, v0, r12)
+ maddhdu(r11, r11, v0, r12)
+ adde r12, r6, r0
+ std r12, eval(8*$1)(rp)
+ add r0, r10, r11')
+
+ifdef(`WAYS',,`define(`WAYS',4)')
+
+C mpn_addaddmul_1msb0 -- {rp,n} = {ap,n}*u0 + {bp,n}*v0, returning the
+C high limb.  NOTE(review): the name and carry handling suggest the
+C most significant bit of both u0 and v0 must be clear, so the per-limb
+C high parts plus carry cannot overflow one limb -- confirm with callers.
+PROLOGUE(mpn_addaddmul_1msb0)
+ addi r10, n, WAYS-1
+ srdi r10, r10, m4_log2(WAYS) C ctr = ceil(n / WAYS)
+ mtctr r10
+ addic r0, r3, 0 C clear CA for the adde chain (add of 0 cannot carry)
+ li r0, 0 C r0 = running column-carry accumulator
+ifelse(WAYS,4,`
+ rldicl. r9, n, 0, 63 C cr0 from n & 1
+ rldicl r10, n, 63, 63 C r10 = (n >> 1) & 1
+ cmpdi cr7, r10, 0
+ bne cr0, L(bx1)
+
+L(bx0): beq cr7, L(lo0) C n % 4 == 0
+
+L(b10): addi ap, ap, -16
+ addi bp, bp, -16
+ addi rp, rp, -16
+ b L(lo2)
+
+L(bx1): bne cr7, L(b11)
+
+L(b01): addi ap, ap, -24
+ addi bp, bp, -24
+ addi rp, rp, -24
+ b L(lo1)
+
+L(b11): addi ap, ap, -8
+ addi bp, bp, -8
+ addi rp, rp, -8
+ b L(lo3)
+')
+
+L(top): forloop(i,0,eval(WAYS-1),`BLOCK(i)')
+
+ addi ap, ap, eval(8*WAYS)
+ addi bp, bp, eval(8*WAYS)
+ addi rp, rp, eval(8*WAYS)
+ bdnz L(top)
+
+ addze r3, r0 C fold the last CA into the carry limb
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm
new file mode 100644
index 0000000..8f49606
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm
@@ -0,0 +1,130 @@
+dnl Power9 mpn_addmul_1.
+
+dnl Copyright 2017, 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 2.5
+
+C TODO
+C * Schedule for Power9 pipeline.
+C * Unroll 4x if that proves beneficial.
+C * This is marginally faster (but much smaller) than ../aorsmul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+C mpn_addmul_1 -- {rp,n} += {up,n} * v0 using the POWER9 maddld/maddhdu
+C fused multiply-add; returns the carry limb in r3.  The loop is 2x
+C unrolled with the rp[] word folded into the multiply via maddld.
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpdi cr6, n, 2
+ addi r0, n, -1 C FIXME: postpone
+ srdi r0, r0, 1 C FIXME: postpone
+ mtctr r0 C FIXME: postpone
+ rldicl. r0, n, 0,63 C r0 = n & 1, set cr0
+ bne cr0, L(b1)
+
+L(b0): ld r10, 0(rp)
+ ld r12, 0(up)
+ ld r11, 8(rp)
+ ld r0, 8(up)
+ maddld( r9, r12, v0, r10)
+ maddhdu(r7, r12, v0, r10)
+ ble cr6, L(2)
+ ld r10, 16(rp)
+ ld r12, 16(up)
+ maddld( r8, r0, v0, r11)
+ maddhdu(r5, r0, v0, r11)
+ addic up, up, 16 C addic (not addi) also clears CA for the adde chain
+ addi rp, rp, -8
+ b L(mid)
+
+L(b1): ld r11, 0(rp)
+ ld r0, 0(up)
+ ble cr6, L(1)
+ ld r10, 8(rp)
+ ld r12, 8(up)
+ maddld( r8, r0, v0, r11)
+ maddhdu(r5, r0, v0, r11)
+ ld r11, 16(rp)
+ ld r0, 16(up)
+ maddld( r9, r12, v0, r10)
+ maddhdu(r7, r12, v0, r10)
+ addic up, up, 24 C addic (not addi) also clears CA for the adde chain
+ bdz L(end)
+
+ ALIGN(16)
+L(top): ld r10, 24(rp)
+ ld r12, 0(up)
+ std r8, 0(rp)
+ adde r9, r5, r9 C add previous high limb, propagate CA
+ maddld( r8, r0, v0, r11) C W:0,2,4
+ maddhdu(r5, r0, v0, r11) C W:1,3,5
+L(mid): ld r11, 32(rp)
+ ld r0, 8(up)
+ std r9, 8(rp)
+ adde r8, r7, r8
+ maddld( r9, r12, v0, r10) C W:1,3,5
+ maddhdu(r7, r12, v0, r10) C W:2,4,6
+ addi rp, rp, 16
+ addi up, up, 16
+ bdnz L(top)
+
+L(end): std r8, 0(rp)
+ maddld( r8, r0, v0, r11)
+ adde r9, r5, r9
+ maddhdu(r5, r0, v0, r11)
+ std r9, 8(rp)
+ adde r8, r7, r8
+ std r8, 16(rp)
+ addze r3, r5 C return the final carry limb
+ blr
+
+L(2): maddld( r8, r0, v0, r11) C n = 2 tail
+ maddhdu(r5, r0, v0, r11)
+ std r9, 0(rp)
+ addc r8, r7, r8
+ std r8, 8(rp)
+ addze r3, r5
+ blr
+
+L(1): maddld( r8, r0, v0, r11) C n = 1 tail
+ std r8, 0(rp)
+ maddhdu(r3, r0, v0, r11)
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm
new file mode 100644
index 0000000..846a894
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm
@@ -0,0 +1,193 @@
+dnl Power9 mpn_addmul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C power9: 1.62
+
+C STATUS
+C * Not written with any power9 pipeline understanding.
+C * The 4x unrolling was not motivated by any timing tests.
+C * No local scheduling for performance tweaking has been done.
+C * Decrease load scheduling!
+
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5') C Note: Reused as scratch
+define(`vp', `r6') C Note: Reused for v1
+
+define(`v0', `r7')
+define(`v1', `r6')
+
+
+C mpn_addmul_2 -- {rp,n} += {up,n} * {v1,v0}, returning the carry limb.
+C Two carry chains run concurrently: CA (adde) links the v0-column
+C partial sums and OV (addex) links the v1-column partial sums, so
+C neither recurrence needs explicit carry save/restore.
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+C Save callee-saved r26..r31 below the SP (red zone -- ELFv2; confirm
+C for other ABIs).  No stack frame is created.
+ std r26, -48(r1)
+ std r27, -40(r1)
+ std r28, -32(r1)
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+
+ subfic r0, r1, 0 C clear CA
+ subfo r0, r0, r0 C clear OV and r0
+
+ cmpdi cr7, n, 4
+
+ ld v0, 0(vp)
+ ld v1, 8(vp) C vp dead from here; r6 reused as v1
+
+ srdi r10, n, 2
+ mtctr r10 C ctr = n/4 (feed-in handles n mod 4)
+
+ rldicl. r9, n, 0, 63 C n & 1
+ bne cr0, L(bx1)
+
+L(bx0): rldicl. r9, n, 63, 63 C (n >> 1) & 1
+
+ ld r28, 0(rp)
+ ld r8, 0(up)
+ ld r11, 8(rp)
+ ld r9, 8(up)
+ maddld( r26, r8, v0, r28)
+ maddhdu(r31, r8, v0, r28)
+ blt cr7, L(2)
+ ld r28, 16(rp)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ bne cr0, L(b10)
+
+L(b00): addi up, up, -8
+ addi rp, rp, -24
+ b L(lo0)
+
+L(b10): addi up, up, 8
+ addi rp, rp, -8
+ b L(lo2)
+
+L(2): addi rp, rp, -8
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ b L(cj2)
+
+L(bx1): rldicl. r9, n, 63, 63 C (n >> 1) & 1
+
+ ld r29, 0(rp)
+ ld r9, 0(up)
+ ld r10, 8(rp)
+ ld r8, 8(up)
+ maddld( r27, r9, v0, r29)
+ maddhdu(r30, r9, v0, r29)
+ ld r29, 16(rp)
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ bne cr0, L(b11)
+
+L(b01): addi rp, rp, -16
+ b L(lo1)
+L(b11): addi up, up, 16
+ blt cr7, L(end)
+
+L(top): ld r9, 0(up)
+ maddld( r26, r8, v0, r10) C 0 4 -> adde
+ maddhdu(r31, r8, v0, r10) C 1 5
+ adde r0, r27, r0 C 7 11
+ ld r28, 24(rp)
+ std r0, 0(rp)
+ maddld( r5, r8, v1, r29) C 1 5 -> addex
+ maddhdu(r10, r8, v1, r29) C 2 6
+ addex( r0, r12, r30, 0) C 8 12
+L(lo2): ld r8, 8(up)
+ maddld( r27, r9, v0, r11) C 1 5 -> adde
+ maddhdu(r30, r9, v0, r11) C 2 6
+ adde r0, r26, r0 C 8 12
+ ld r29, 32(rp)
+ std r0, 8(rp)
+ maddld( r12, r9, v1, r28) C 2 6 -> addex
+ maddhdu(r11, r9, v1, r28) C 3 7
+ addex( r0, r5, r31, 0) C 5 9 13
+L(lo1): ld r9, 16(up)
+ maddld( r26, r8, v0, r10) C 2 6 -> adde
+ maddhdu(r31, r8, v0, r10) C 3 7
+ adde r0, r27, r0 C 5 9 13
+ ld r28, 40(rp)
+ std r0, 16(rp)
+ maddld( r5, r8, v1, r29) C 3 7 -> addex
+ maddhdu(r10, r8, v1, r29) C 4 8
+ addex( r0, r12, r30, 0) C 6 10
+L(lo0): ld r8, 24(up)
+ maddld( r27, r9, v0, r11) C 3 7 -> adde
+ maddhdu(r30, r9, v0, r11) C 4 8
+ adde r0, r26, r0 C 6 10
+ ld r29, 48(rp)
+ std r0, 24(rp)
+ maddld( r12, r9, v1, r28) C 4 8 -> addex
+ maddhdu(r11, r9, v1, r28) C 5 9
+ addex( r0, r5, r31, 0) C 7 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): ld r9, 0(up)
+ maddld( r26, r8, v0, r10) C 0 4
+ maddhdu(r31, r8, v0, r10) C 1 5
+ adde r0, r27, r0 C 7 11
+ std r0, 0(rp) C -4
+ maddld( r5, r8, v1, r29) C 1 5
+ maddhdu(r10, r8, v1, r29) C 2 6
+ addex( r0, r12, r30, 0) C 8 12
+L(cj2): maddld( r27, r9, v0, r11) C 1 5 -2
+ maddhdu(r30, r9, v0, r11) C 2 6 -1
+ adde r0, r26, r0 C 8 12 -3
+ std r0, 8(rp) C -3
+ mulld r12, r9, v1 C 2 6 -1
+ mulhdu r11, r9, v1 C 3 7 0 = return limb
+ addex( r0, r5, r31, 0) C 5 9 13
+ adde r0, r27, r0 C 5 9 13 -2
+ std r0, 16(rp) C -2
+ addex( r0, r12, r30, 0) C 6 10 -1
+ adde r0, r0, r10 C -1
+ std r0, 24(rp) C -1
+ li r4, 0
+ addze r3, r11 C fold CA into the return limb
+ addex( r3, r3, r4, 0) C fold OV into the return limb
+
+L(ret): ld r26, -48(r1)
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm
new file mode 100644
index 0000000..e4ca3a8
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm
@@ -0,0 +1,179 @@
+dnl POWER9 mpn_addmul_1 and mpn_submul_1.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 - -
+C POWER4/PPC970 - -
+C POWER5 - -
+C POWER6 - -
+C POWER7 - -
+C POWER8 - -
+C POWER9 2.63 2.63
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUBC', adde)
+ define(`ADDSUB', addc)
+ define(`func', mpn_addmul_1)
+ define(`AM', `$1')
+ define(`SM', `')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUBC', subfe)
+ define(`ADDSUB', subfc)
+ define(`func', mpn_submul_1)
+ define(`AM', `')
+ define(`SM', `$1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+C func (mpn_addmul_1 or mpn_submul_1, selected by m4 AM/SM macros):
+C {rp,n} +-= {up,n} * v0, returning the carry/borrow limb in r3.
+C Two carry chains run concurrently: OV (addex) links the multiply
+C columns (low limb + previous high limb) and CA (ADDSUBC, i.e.
+C adde/subfe) links the rp[] updates.
+ASM_START()
+PROLOGUE(func)
+ cmpdi cr7, n, 3
+ srdi r10, n, 2
+ mtctr r10 C ctr = n/4 (feed-in handles n mod 4)
+ rldicl. r9, n, 0, 63 C n & 1, set cr0
+ ld r11, 0(up)
+ bne cr0, L(bx1)
+
+L(bx0): rldicl. r9, n, 63, 63 C (n >> 1) & 1
+AM(` subfzeo r12, n ') C ov = 0, ca = 0
+AM(` li r12, 0 ')
+SM(` subfco r12, r12, r12 ') C r12 = 0, ov = 0, ca = 1
+ ld r9, 8(up)
+ mulld r0, r11, v0
+ mulhdu r5, r11, v0
+ blt cr7, L(2)
+ ld r8, 16(up)
+ bne cr0, L(b10)
+
+L(b00): addi rp, rp, -24
+ b L(lo0)
+L(b10): addi rp, rp, -8
+ addi up, up, 16
+ b L(lo2)
+
+L(2): addi rp, rp, -8
+ b L(cj2)
+
+L(bx1): rldicl. r9, n, 63, 63 C (n >> 1) & 1
+AM(` subfzeo r5, n ') C ov = 0, ca = 0
+AM(` li r5, 0 ')
+SM(` subfco r5, r5, r5 ') C r5 = 0, ov = 0, ca = 1
+ blt cr7, L(1)
+ ld r8, 8(up)
+ mulld r7, r11, v0
+ mulhdu r12, r11, v0
+ ld r9, 16(up)
+ bne cr0, L(b11)
+
+L(b01): addi rp, rp, -16
+ addi up, up, 8
+ b L(lo1)
+
+L(1): mulld r7, r11, v0 C n = 1 tail
+ mulhdu r12, r11, v0
+ ld r11, 0(rp)
+ ADDSUB r10, r7, r11
+ std r10, 0(rp)
+AM(` addze r3, r12 ')
+SM(` subfe r0, r0, r0 ')
+SM(` sub r3, r12, r0 ')
+ blr
+
+L(b11): addi up, up, 24
+ ble cr7, L(end)
+
+ ALIGN(16)
+L(top): ld r11, 0(rp)
+ mulld r0, r8, v0
+ addex( r7, r7, r5, 0) C low limb + previous high limb, OV chain
+ mulhdu r5, r8, v0
+ ld r8, 0(up)
+ ADDSUBC r10, r7, r11 C combine with rp limb, CA chain
+ std r10, 0(rp)
+L(lo2): ld r11, 8(rp)
+ mulld r7, r9, v0
+ addex( r0, r0, r12, 0)
+ mulhdu r12, r9, v0
+ ld r9, 8(up)
+ ADDSUBC r10, r0, r11
+ std r10, 8(rp)
+L(lo1): ld r11, 16(rp)
+ mulld r0, r8, v0
+ addex( r7, r7, r5, 0)
+ mulhdu r5, r8, v0
+ ld r8, 16(up)
+ ADDSUBC r10, r7, r11
+ std r10, 16(rp)
+L(lo0): ld r11, 24(rp)
+ mulld r7, r9, v0
+ addex( r0, r0, r12, 0)
+ mulhdu r12, r9, v0
+ ld r9, 24(up)
+ ADDSUBC r10, r0, r11
+ std r10, 24(rp)
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): ld r11, 0(rp)
+ mulld r0, r8, v0
+ addex( r7, r7, r5, 0)
+ mulhdu r5, r8, v0
+ ADDSUBC r10, r7, r11
+ std r10, 0(rp)
+L(cj2): ld r11, 8(rp)
+ mulld r7, r9, v0
+ addex( r0, r0, r12, 0)
+ mulhdu r12, r9, v0
+ ADDSUBC r10, r0, r11
+ std r10, 8(rp)
+ ld r11, 16(rp)
+ addex( r7, r7, r5, 0)
+ ADDSUBC r10, r7, r11
+ std r10, 16(rp)
+ li r0, 0
+ addex( r3, r12, r0, 0) C fold OV into the return limb
+AM(` addze r3, r3 ') C fold CA (carry) into the return limb
+SM(` subfe r0, r0, r0 ') C fold CA (borrow) into the return limb
+SM(` sub r3, r3, r0 ')
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm
new file mode 100644
index 0000000..2dc982d
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm
@@ -0,0 +1,64 @@
+dnl PowerPC-64 mpn_gcd_11.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 5.75
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
+
+define(`u0', `r3')
+define(`v0', `r4')
+
+define(`cnt', `r9')dnl
+
+ASM_START()
+PROLOGUE(mpn_gcd_11)
+ b L(odd)
+
+ ALIGN(16)
+L(top): isel v0, u0, v0, 29 C v = min(u,v)
+ isel u0, r10, r11, 29 C u = |v - u|
+ srd u0, u0, cnt
+L(odd): subf r10, u0, v0 C r10 = v - u
+ subf r11, v0, u0 C r11 = u - v
+ cmpld cr7, v0, u0
+ cnttzd cnt, r10
+ bne cr7, L(top)
+
+L(end): blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm
new file mode 100644
index 0000000..12d11b0
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm
@@ -0,0 +1,143 @@
+dnl PowerPC-64 mpn_gcd_22 optimised for POWER9.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 9.58
+
+C We define SLOW if this target uses a slow struct return mechanism, with
+C r3 as an implicit parameter for the struct pointer.
+undefine(`SLOW')dnl
+ifdef(`AIX',`define(`SLOW',`due to AIX')',`
+ ifdef(`DARWIN',,`
+ ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl
+ ')
+')
+
+ifdef(`SLOW',`
+define(`IFSLOW', `$1')
+define(`u1', `r4')
+define(`u0', `r5')
+define(`v1', `r6')
+define(`v0', `r7')
+',`
+define(`IFSLOW', `')
+define(`u1', `r3')
+define(`u0', `r4')
+define(`v1', `r5')
+define(`v0', `r6')
+')
+
+define(`tmp', `r0')
+define(`t0', `r8')
+define(`t1', `r9')
+define(`s0', `r10')
+define(`s1', `r11')
+define(`cnt', `r12')
+
+ASM_START()
+PROLOGUE(mpn_gcd_22)
+ cmpld cr7, v0, u0
+L(top): subfc t0, v0, u0 C 0 12
+ beq cr7, L(lowz)
+ subfe t1, v1, u1 C 2 14
+ subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit
+ subfc s0, u0, v0 C 0
+ subfe s1, u1, v1 C 2
+
+L(bck): cnttzd cnt, t0 C 2
+ subfic tmp, cnt, 64 C 4
+
+ isel v0, v0, u0, 2 C 6 use condition set by subfe
+ isel u0, t0, s0, 2 C 6
+ isel v1, v1, u1, 2 C 6
+ isel u1, t1, s1, 2 C 6
+
+ srd u0, u0, cnt C 8
+ sld tmp, u1, tmp C 8
+ srd u1, u1, cnt C 8
+ or u0, u0, tmp C 10
+
+ or. r0, u1, v1 C 10
+ cmpld cr7, v0, u0
+ bne L(top)
+
+
+ b L(odd)
+ ALIGN(16)
+L(top1):isel v0, u0, v0, 29 C v = min(u,v)
+ isel u0, r10, r11, 29 C u = |u - v|
+ srd u0, u0, cnt
+L(odd): subf r10, u0, v0 C r10 = v - u
+ subf r11, v0, u0 C r11 = u - v
+ cmpld cr7, v0, u0
+ cnttzd cnt, r10
+ bne cr7, L(top1)
+
+ifdef(`SLOW',`
+ std v0, 0(r3)
+ std r10, 8(r3)
+',`
+ mr r3, v0
+ li r4, 0
+')
+ blr
+
+
+L(lowz):C We come here when v0 - u0 = 0
+ C 1. If v1 - u1 = 0, then gcd is u = v.
+ C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+ subfc. t0, v1, u1 C 2 8
+ beq L(end)
+ li t1, 0
+ subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit
+ subf s0, u1, v1 C 2
+ li s1, 0
+ b L(bck)
+
+L(end):
+ifdef(`SLOW',`
+ std v0, 0(r3)
+ std v1, 8(r3)
+ blr
+',`
+ mr r3, v0
+ mr r4, v1
+ blr
+')
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h
new file mode 100644
index 0000000..f29a84e
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h
@@ -0,0 +1,254 @@
+/* POWER9 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 2200MHz POWER9 */
+/* FFT tuning limit = 221,245,838 */
+/* Generated by tuneup.c, 2019-10-29, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 7
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 44
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
+#define USE_PREINV_DIVREM_1 0
+/* From gcc120.osuosl.org, 2023-07-27 */
+#define DIV_QR_1N_PI1_METHOD 3 /* 6.48% faster than 4 */
+#define DIV_QR_1_NORM_THRESHOLD 3
+#define DIV_QR_1_UNNORM_THRESHOLD 2
+#define DIV_QR_2_PI2_THRESHOLD 7
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 33
+
+#define DIV_1_VS_MUL_1_PERCENT 365
+
+#define MUL_TOOM22_THRESHOLD 34
+#define MUL_TOOM33_THRESHOLD 109
+#define MUL_TOOM44_THRESHOLD 458
+#define MUL_TOOM6H_THRESHOLD 517
+#define MUL_TOOM8H_THRESHOLD 608
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 292
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 204
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 211
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 178
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 46
+#define SQR_TOOM3_THRESHOLD 158
+#define SQR_TOOM4_THRESHOLD 674
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 898
+
+#define MULMID_TOOM42_THRESHOLD 70
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 25
+
+#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 404, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 13, 5}, { 27, 6}, { 27, 7}, { 14, 6}, \
+ { 29, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 35, 8}, { 71, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 159,11}, { 95,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,12}, { 95,11}, { 191,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,10}, { 703,11}, { 367,10}, \
+ { 735,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,12}, { 223,11}, { 447,10}, \
+ { 895,11}, { 479,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \
+ { 703,10}, { 1407,11}, { 735,13}, { 191,12}, \
+ { 383,11}, { 767,10}, { 1535,11}, { 799,12}, \
+ { 415,11}, { 831,10}, { 1663,11}, { 863,12}, \
+ { 447,11}, { 895,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
+ { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
+ { 1343,12}, { 703,11}, { 1407,12}, { 735,11}, \
+ { 1471,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \
+ { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \
+ { 703,12}, { 1471,14}, { 383,13}, { 767,12}, \
+ { 1599,13}, { 831,12}, { 1727,13}, { 895,11}, \
+ { 3583,12}, { 1919,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \
+ { 1599,12}, { 3199,13}, { 1727,14}, { 895,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1919,16}, { 511,15}, { 1023,14}, \
+ { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
+ { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \
+ { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \
+ { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \
+ { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 243
+#define MUL_FFT_THRESHOLD 3712
+
+#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 404, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 29, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,12}, \
+ { 95,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,11}, { 367,10}, { 735,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,12}, { 223,11}, \
+ { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 671,12}, { 351,11}, { 703,10}, { 1407,11}, \
+ { 735,13}, { 191,12}, { 383,11}, { 767,10}, \
+ { 1535,12}, { 415,11}, { 831,12}, { 447,11}, \
+ { 895,12}, { 479,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \
+ { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
+ { 831,13}, { 447,12}, { 895,11}, { 1791,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1087,13}, { 575,12}, { 1215,13}, \
+ { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \
+ { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \
+ { 1727,13}, { 895,12}, { 1791,13}, { 959,15}, \
+ { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \
+ { 1599,12}, { 3199,13}, { 1727,14}, { 895,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,14}, \
+ { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \
+ { 4479,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \
+ { 6911,15}, { 1791,14}, { 3839,16}, { 1023,15}, \
+ { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \
+ { 2559,14}, { 5119,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \
+ { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \
+ { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \
+ { 3071,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 230
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 39
+#define MULLO_MUL_N_THRESHOLD 7246
+#define SQRLO_BASECASE_THRESHOLD 6
+#define SQRLO_DC_THRESHOLD 40
+#define SQRLO_SQR_THRESHOLD 6440
+
+#define DC_DIV_QR_THRESHOLD 30
+#define DC_DIVAPPR_Q_THRESHOLD 88
+#define DC_BDIV_QR_THRESHOLD 35
+#define DC_BDIV_Q_THRESHOLD 62
+
+#define INV_MULMOD_BNM1_THRESHOLD 79
+#define INV_NEWTON_THRESHOLD 11
+#define INV_APPR_THRESHOLD 11
+
+#define BINV_NEWTON_THRESHOLD 264
+#define REDC_1_TO_REDC_2_THRESHOLD 8
+#define REDC_2_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1442
+#define MU_DIVAPPR_Q_THRESHOLD 1470
+#define MUPI_DIV_QR_THRESHOLD 0 /* always */
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1652
+
+#define POWM_SEC_TABLE 1,16,151,839
+
+#define GET_STR_DC_THRESHOLD 7
+#define GET_STR_PRECOMPUTE_THRESHOLD 15
+#define SET_STR_DC_THRESHOLD 406
+#define SET_STR_PRECOMPUTE_THRESHOLD 885
+
+#define FAC_DSC_THRESHOLD 179
+#define FAC_ODD_THRESHOLD 53
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD2_DIV1_METHOD 1 /* 9.10% faster than 3 */
+#define HGCD_THRESHOLD 45
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 2479
+#define GCD_DC_THRESHOLD 321
+#define GCDEXT_DC_THRESHOLD 258
+#define JACOBI_BASE_METHOD 4 /* 15.45% faster than 1 */
+
+/* Tuneup completed successfully, took 179422 seconds */
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm
new file mode 100644
index 0000000..363f095
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm
@@ -0,0 +1,126 @@
+dnl Power9 mpn_mul_1.
+
+dnl Copyright 2017, 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 ?
+C POWER7 ?
+C POWER8 ?
+C POWER9 2.47
+
+C TODO
+C * Schedule for Power9 pipeline.
+C * Unroll 4x if that proves beneficial.
+C * This is marginally faster (but much smaller) than ../mul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+ b L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+ li r7, 0
+L(ent): ld r11, 0(up)
+ cmpdi cr6, n, 2
+ addi r0, n, -1 C FIXME: postpone
+ srdi r0, r0, 1 C FIXME: postpone
+ mtctr r0 C FIXME: postpone
+ rldicl. r12, n, 0,63 C r0 = n & 3, set cr0
+ bne cr0, L(b1)
+
+L(b0): ld r0, 8(up)
+ maddld( r9, r11, v0, r7)
+ maddhdu(r7, r11, v0, r7)
+ ble cr6, L(2)
+ ld r12, 16(up)
+ mulld r8, r0, v0
+ mulhdu r5, r0, v0
+ addic up, up, 16
+ addi rp, rp, -8
+ b L(mid)
+
+L(b1): ld r0, 0(up)
+ ble cr6, L(1)
+ ld r12, 8(up)
+ maddld( r8, r11, v0, r7)
+ maddhdu(r5, r11, v0, r7)
+ ld r0, 16(up)
+ mulld r9, r12, v0
+ mulhdu r7, r12, v0
+ addic up, up, 24
+ bdz L(end)
+
+ ALIGN(16)
+L(top): ld r12, 0(up)
+ std r8, 0(rp)
+ adde r9, r5, r9
+ mulld r8, r0, v0
+ mulhdu r5, r0, v0
+L(mid): ld r0, 8(up)
+ std r9, 8(rp)
+ adde r8, r7, r8
+ mulld r9, r12, v0
+ mulhdu r7, r12, v0
+ addi rp, rp, 16
+ addi up, up, 16
+ bdnz L(top)
+
+L(end): std r8, 0(rp)
+ mulld r8, r0, v0
+ adde r9, r5, r9
+ mulhdu r5, r0, v0
+ std r9, 8(rp)
+ adde r8, r7, r8
+ std r8, 16(rp)
+ addze r3, r5
+ blr
+
+L(2): mulld r8, r0, v0
+ mulhdu r5, r0, v0
+ std r9, 0(rp)
+ addc r8, r7, r8
+ std r8, 8(rp)
+ addze r3, r5
+ blr
+
+L(1): maddld( r8, r0, v0, r7)
+ std r8, 0(rp)
+ maddhdu(r3, r0, v0, r7)
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm
new file mode 100644
index 0000000..01b50a3
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm
@@ -0,0 +1,181 @@
+dnl Power9 mpn_mul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C power9: 1.58
+
+C STATUS
+C * Not written with any power9 pipeline understanding.
+C * The 4x unrolling was not motivated by any timing tests.
+C * No local scheduling for performance tweaking has been done.
+C * Decrease load scheduling!
+
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5') C Note: Reused as scratch
+define(`vp', `r6') C Note: Reused for v1
+
+define(`v0', `r7')
+define(`v1', `r6')
+
+
+ASM_START()
+PROLOGUE(mpn_mul_2)
+ std r28, -32(r1)
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+
+ subfic r0, n, 0 C clear CA
+ subfo r0, r0, r0 C clear OV and r0
+
+ cmpdi cr7, n, 4
+
+ ld v0, 0(vp)
+ ld v1, 8(vp)
+
+ srdi r10, n, 2
+ mtctr r10
+
+ rldicl. r9, n, 0, 63
+ bne cr0, L(bx1)
+
+L(bx0): rldicl. r9, n, 63, 63
+
+ ld r8, 0(up)
+ ld r9, 8(up)
+ li r11, 0
+ mulld r28, r8, v0
+ mulhdu r31, r8, v0
+ blt cr7, L(2)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ bne cr0, L(b10)
+
+L(b00): addi up, up, -8
+ addi rp, rp, -24
+ b L(lo0)
+
+L(b10): addi up, up, 8
+ addi rp, rp, -8
+ b L(lo2)
+
+L(2): addi rp, rp, -8
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ b L(cj2)
+
+L(bx1): rldicl. r9, n, 63, 63
+
+ ld r9, 0(up)
+ ld r8, 8(up)
+ li r10, 0
+ mulld r29, r9, v0
+ mulhdu r30, r9, v0
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ bne cr0, L(b11)
+
+L(b01): addi rp, rp, -16
+ b L(lo1)
+L(b11): addi up, up, 16
+ blt cr7, L(end)
+
+L(top): ld r9, 0(up)
+ maddld( r28, r8, v0, r10) C 0 4 -> adde
+ maddhdu(r31, r8, v0, r10) C 1 5
+ adde r0, r29, r0 C 7 11
+ std r0, 0(rp)
+ mulld r5, r8, v1 C 1 5 -> addex
+ mulhdu r10, r8, v1 C 2 6
+ addex( r0, r12, r30, 0) C 8 12
+L(lo2): ld r8, 8(up)
+ maddld( r29, r9, v0, r11) C 1 5 -> adde
+ maddhdu(r30, r9, v0, r11) C 2 6
+ adde r0, r28, r0 C 8 12
+ std r0, 8(rp)
+ mulld r12, r9, v1 C 2 6 -> addex
+ mulhdu r11, r9, v1 C 3 7
+ addex( r0, r5, r31, 0) C 5 9 13
+L(lo1): ld r9, 16(up)
+ maddld( r28, r8, v0, r10) C 2 6 -> adde
+ maddhdu(r31, r8, v0, r10) C 3 7
+ adde r0, r29, r0 C 5 9 13
+ std r0, 16(rp)
+ mulld r5, r8, v1 C 3 7 -> addex
+ mulhdu r10, r8, v1 C 4 8
+ addex( r0, r12, r30, 0) C 6 10
+L(lo0): ld r8, 24(up)
+ maddld( r29, r9, v0, r11) C 3 7 -> adde
+ maddhdu(r30, r9, v0, r11) C 4 8
+ adde r0, r28, r0 C 6 10
+ std r0, 24(rp)
+ mulld r12, r9, v1 C 4 8 -> addex
+ mulhdu r11, r9, v1 C 5 9
+ addex( r0, r5, r31, 0) C 7 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): ld r9, 0(up)
+ maddld( r28, r8, v0, r10) C 0 4
+ maddhdu(r31, r8, v0, r10) C 1 5
+ adde r0, r29, r0 C 7 11
+ std r0, 0(rp) C -4
+ mulld r5, r8, v1 C 1 5
+ mulhdu r10, r8, v1 C 2 6
+ addex( r0, r12, r30, 0) C 8 12
+L(cj2): maddld( r29, r9, v0, r11) C 1 5 -2
+ maddhdu(r30, r9, v0, r11) C 2 6 -1
+ adde r0, r28, r0 C 8 12 -3
+ std r0, 8(rp) C -3
+ mulld r12, r9, v1 C 2 6 -1
+ mulhdu r11, r9, v1 C 3 7 0 = return limb
+ addex( r0, r5, r31, 0) C 5 9 13
+ adde r0, r29, r0 C 5 9 13 -2
+ std r0, 16(rp) C -2
+ addex( r0, r12, r30, 0) C 6 10 -1
+ adde r0, r0, r10 C -1
+ std r0, 24(rp) C -1
+ li r4, 0
+ addze r3, r11
+ addex( r3, r3, r4, 0)
+
+L(ret): ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm
new file mode 100644
index 0000000..8f3d322
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm
@@ -0,0 +1,415 @@
+dnl Power9 mpn_mul_basecase.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 1.62
+
+C TODO
+C * Check if (inner) loop alignment affects performance.
+C * Could we schedule loads less in addmul_2/mul_2? That would save some regs
+C and make the tail code more manageable.
+C * Postpone some register saves to main loop.
+C * Perhaps write more small operands (3x1, 3x2, 3x3) code.
+C * Consider restoring rp,up after loop using arithmetic, eliminating rp2, up2.
+C On the other hand, the current rp,up restore register are useful for OSP.
+C * Do OSP. This should save a lot with the current deep addmul_2 pipeline.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+define(`vp', `r6')
+define(`vn', `r7')
+
+define(`v0', `r0')
+define(`v1', `r7')
+define(`rp2', `r24')
+define(`up2', `r25')
+
+ASM_START()
+PROLOGUE(mpn_mul_basecase)
+ cmpdi cr0, un, 2
+ bgt cr0, L(un_gt2)
+ cmpdi cr6, vn, 1
+ ld r7, 0(vp)
+ ld r5, 0(up)
+ mulld r8, r5, r7 C weight 0
+ mulhdu r9, r5, r7 C weight 1
+ std r8, 0(rp)
+ beq cr0, L(2x)
+ std r9, 8(rp)
+ blr
+ ALIGN(16)
+L(2x): ld r0, 8(up)
+ mulld r8, r0, r7 C weight 1
+ mulhdu r10, r0, r7 C weight 2
+ addc r9, r9, r8
+ addze r10, r10
+ bne cr6, L(2x2)
+ std r9, 8(rp)
+ std r10, 16(rp)
+ blr
+ ALIGN(16)
+L(2x2): ld r6, 8(vp)
+ mulld r8, r5, r6 C weight 1
+ mulhdu r11, r5, r6 C weight 2
+ addc r9, r9, r8
+ std r9, 8(rp)
+ adde r11, r11, r10
+ mulld r12, r0, r6 C weight 2
+ mulhdu r0, r0, r6 C weight 3
+ addze r0, r0
+ addc r11, r11, r12
+ addze r0, r0
+ std r11, 16(rp)
+ std r0, 24(rp)
+ blr
+
+L(un_gt2):
+ std r22, -80(r1)
+ std r23, -72(r1)
+ std r24, -64(r1)
+ std r25, -56(r1)
+ std r26, -48(r1)
+ std r27, -40(r1)
+ std r28, -32(r1)
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+ mr rp2, r3 C rp
+ mr up2, r4 C up
+ srdi r22, r5, 2 C un
+ subfic r23, r7, 0 C -vn, clear CA
+ subfo r0, r0, r0 C clear OV (and r0)
+
+ cmpdi cr6, un, 3
+ rldicl r0, un, 0, 63 C r0 = un & 1
+ cmpdi cr7, r0, 0
+ rldicl r0, un, 63, 63 C FIXME: unused for vn = 1
+ cmpdi cr5, r0, 0 C FIXME: unused for vn = 1
+
+ ld v0, 0(vp)
+ rldicl. r9, vn, 0, 63
+ beq cr0, L(vn_evn)
+
+L(vn_odd):
+ addi r10, un, -2
+ ld r5, 0(up)
+ srdi r10, r10, 1
+ mtctr r10
+ bne cr7, L(m1_b1)
+
+L(m1_b0):
+ ld r10, 8(up)
+ mulld r9, r5, v0
+ mulhdu r11, r5, v0
+ ld r12, 16(up)
+ mulld r8, r10, v0
+ mulhdu r5, r10, v0
+ addi rp, rp, -8
+ b L(m1_mid)
+
+L(m1_b1):
+ ld r12, 8(up)
+ mulld r8, r5, v0
+ mulhdu r5, r5, v0
+ ld r10, 16(up)
+ mulld r9, r12, v0
+ mulhdu r11, r12, v0
+ addi up, up, 8
+ beq cr6, L(m1_end) C jump taken means un = 3, vn = {1,3}
+
+ ALIGN(16)
+L(m1_top):
+ ld r12, 16(up)
+ std r8, 0(rp)
+ adde r9, r5, r9
+ mulld r8, r10, v0
+ mulhdu r5, r10, v0
+L(m1_mid):
+ ld r10, 24(up)
+ std r9, 8(rp)
+ adde r8, r11, r8
+ mulld r9, r12, v0
+ mulhdu r11, r12, v0
+ addi rp, rp, 16
+ addi up, up, 16
+ bdnz L(m1_top)
+
+L(m1_end):
+ std r8, 0(rp)
+ mulld r8, r10, v0
+ adde r9, r5, r9
+ mulhdu r5, r10, v0
+ std r9, 8(rp)
+ adde r8, r11, r8
+ std r8, 16(rp)
+ addze r10, r5
+ std r10, 24(rp)
+
+ addi rp2, rp2, 8
+ addi vp, vp, 8
+ addic. r23, r23, 1
+ b L(do_outer)
+
+L(vn_evn):
+ ld v1, 8(vp)
+ addi r23, r23, 2
+ mtctr r22
+ bne cr7, L(m2_bx1)
+
+L(m2_bx0):
+ ld r8, 0(up)
+ ld r9, 8(up)
+ li r11, 0
+ mulld r28, r8, v0
+ mulhdu r31, r8, v0
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ li r12, 0
+ bne cr5, L(m2_b10)
+
+L(m2_b00):
+ addi up, up, -8
+ addi rp, rp, -24
+ b L(m2_lo0)
+
+L(m2_b10):
+ addi up, up, 8
+ addi rp, rp, -8
+ b L(m2_lo2)
+
+L(m2_bx1):
+ ld r9, 0(up)
+ ld r8, 8(up)
+ li r10, 0
+ mulld r29, r9, v0
+ mulhdu r30, r9, v0
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ li r5, 0
+ bne cr5, L(m2_b11)
+
+L(m2_b01):
+ addi rp, rp, -16
+ b L(m2_lo1)
+L(m2_b11):
+ addi up, up, 16
+ beq cr6, L(m2_end) C taken means un = 3, vn = 2. We're done.
+
+L(m2_top):
+ ld r9, 0(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ addex( r12, r12, r30, 0)
+L(m2_lo2):
+ ld r8, 8(up)
+ maddld( r29, r9, v0, r11)
+ maddhdu(r30, r9, v0, r11)
+ adde r12, r28, r12
+ std r12, 8(rp)
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ addex( r5, r5, r31, 0)
+L(m2_lo1):
+ ld r9, 16(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ std r5, 16(rp)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ addex( r12, r12, r30, 0)
+L(m2_lo0):
+ ld r8, 24(up)
+ maddld( r29, r9, v0, r11)
+ maddhdu(r30, r9, v0, r11)
+ adde r12, r28, r12
+ std r12, 24(rp)
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ addex( r5, r5, r31, 0)
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(m2_top)
+
+L(m2_end):
+ ld r9, 0(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ b L(cj)
+
+L(outer):
+ ld v0, 0(vp)
+ ld v1, 8(vp)
+ addi r23, r23, 2
+ mtctr r22
+ bne cr7, L(bx1)
+
+L(bx0): ld r26, 0(rp2)
+ ld r8, 0(up2)
+ ld r11, 8(rp2)
+ ld r9, 8(up2)
+ maddld( r28, r8, v0, r26)
+ maddhdu(r31, r8, v0, r26)
+ ld r26, 16(rp2)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ li r12, 0
+ bne cr5, L(b10)
+
+L(b00): addi up, up2, -8
+ addi rp, rp2, -24
+ b L(lo0)
+
+L(b10): addi up, up2, 8
+ addi rp, rp2, -8
+ b L(lo2)
+
+L(bx1): ld r27, 0(rp2)
+ ld r9, 0(up2)
+ ld r10, 8(rp2)
+ ld r8, 8(up2)
+ maddld( r29, r9, v0, r27)
+ maddhdu(r30, r9, v0, r27)
+ ld r27, 16(rp2)
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ li r5, 0
+ bne cr5, L(b11)
+
+L(b01): addi up, up2, 0
+ addi rp, rp2, -16
+ b L(lo1)
+L(b11): addi up, up2, 16
+ addi rp, rp2, 0
+ beq cr6, L(end) C taken means un = 3, vn = 3. We're done.
+
+L(top): ld r9, 0(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ ld r26, 24(rp)
+ std r5, 0(rp)
+ maddld( r5, r8, v1, r27)
+ maddhdu(r10, r8, v1, r27)
+ addex( r12, r12, r30, 0)
+L(lo2): ld r8, 8(up)
+ maddld( r29, r9, v0, r11)
+ maddhdu(r30, r9, v0, r11)
+ adde r12, r28, r12
+ ld r27, 32(rp)
+ std r12, 8(rp)
+ maddld( r12, r9, v1, r26)
+ maddhdu(r11, r9, v1, r26)
+ addex( r5, r5, r31, 0)
+L(lo1): ld r9, 16(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ ld r26, 40(rp)
+ std r5, 16(rp)
+ maddld( r5, r8, v1, r27)
+ maddhdu(r10, r8, v1, r27)
+ addex( r12, r12, r30, 0)
+L(lo0): ld r8, 24(up)
+ maddld( r29, r9, v0, r11)
+ maddhdu(r30, r9, v0, r11)
+ adde r12, r28, r12
+ ld r27, 48(rp)
+ std r12, 24(rp)
+ maddld( r12, r9, v1, r26)
+ maddhdu(r11, r9, v1, r26)
+ addex( r5, r5, r31, 0)
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): ld r9, 0(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ maddld( r5, r8, v1, r27)
+ maddhdu(r10, r8, v1, r27)
+L(cj): addex( r12, r12, r30, 0)
+ maddld( r29, r9, v0, r11)
+ maddhdu(r30, r9, v0, r11)
+ adde r12, r28, r12
+ std r12, 8(rp)
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ addex( r5, r5, r31, 0)
+ adde r5, r29, r5
+ std r5, 16(rp)
+ addex( r12, r12, r30, 0)
+ adde r12, r12, r10
+ std r12, 24(rp)
+ li r4, 0
+ addze r5, r11
+ addex( r5, r5, r4, 0)
+ std r5, 32(rp)
+
+ cmpdi cr0, r23, 0
+ addi rp2, rp2, 16
+ addi vp, vp, 16
+L(do_outer):
+ bne cr0, L(outer)
+L(ret):
+ ld r22, -80(r1)
+ ld r23, -72(r1)
+ ld r24, -64(r1)
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm
new file mode 100644
index 0000000..2d4fa63
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm
@@ -0,0 +1,555 @@
+dnl Power9 mpn_sqr_basecase.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 1.62
+
+C TODO
+C * Completely separate even and odd code into two outer loops. Also consider
+C unrolling these two outer loops and thereby eliminate all branches.
+C * Avoid the reloading of u1 before every loop start.
+C * Reduce register usage.
+C * Consider getting rid of cy and instead load 3 u limbs, use addc+adde+adde.
+C * Consider skewing conditional adjustments to allow mask creation with subfe
+C like in the un=3 code. It might streamline the adjustments (or not).
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+
+C Register roles: u0/u1 hold the two u limbs currently being processed,
+C rp2/up2 are callee-saved outer-loop copies of rp/up, and cy holds the
+C bit shifted out by the 2*u left-shift macros below.
+define(`u0', `r0')
+define(`u1', `r7')
+define(`rp2', `r24')
+define(`up2', `r25')
+define(`cy', `r6')
+
+C Shift the limb pair u1:u0 left by one bit, producing the next carry in cy.
+C LSHU1U0 is the first (no incoming cy) variant; LSHU1U consumes and
+C regenerates cy; LSHU1UF is the final full variant (no cy output);
+C LSHU1UHF is the final half variant (low limb only, no CA use).
+define(`LSHU1U0',`
+ addc u0, u0, u0
+ adde u1, u1, u1
+ li cy, 0
+ addze cy, cy
+')
+define(`LSHU1U',`
+ addc u0, u0, u0
+ add u0, u0, cy
+ adde u1, u1, u1
+ li cy, 0
+ addze cy, cy
+')
+define(`LSHU1UF',`
+ addc u0, u0, u0
+ add u0, u0, cy
+ adde u1, u1, u1
+')
+define(`LSHU1UHF',`
+ add u0, u0, u0
+ add u0, u0, cy
+')
+C These are cleverer replacements, but they tend to leave CA set, disturbing
+C the main accumulation code! Breaking that false dependency might have a
+C positive performance impact. Note that the subfe here results in a mask for
+C our adjustments.
+C NOTE(review): these x-variants are unused alternates, and the second
+C define of `xLSHU1U' below silently redefines the first (m4 keeps the
+C last definition) — presumably it was meant to be named differently.
+define(`xLSHU1U0',`
+ addc u0, u0, u0
+ adde u1, u1, u1
+ subfe cy, cy, cy
+')
+define(`xLSHU1U',`
+ subfic cy, cy, 0
+ adde u0, u0, u0
+ adde u1, u1, u1
+ subfe cy, cy, cy
+')
+define(`xLSHU1U',`
+ subfic cy, cy, 0
+ adde u0, u0, u0
+')
+
+ASM_START()
+C Square {up,un} into {rp,2*un}.  un = 1, 2 and 3 are handled by
+C straight-line special cases; un >= 4 uses mul_2/addmul_2-style loops
+C built from maddld/maddhdu, with a second carry chain kept in the OV
+C bit via addex (so CA-clobbering insns are flagged CAUTION below).
+PROLOGUE(mpn_sqr_basecase)
+ ld r0, 0(up) C n = 1
+ mulld r8, r0, r0 C weight 0
+ mulhdu r9, r0, r0 C weight 1
+ std r8, 0(rp)
+ cmpdi cr0, un, 2
+ bge cr0, L(ge2)
+ std r9, 8(rp)
+ blr
+
+C un = 2: two squares plus the doubled cross product u1*u0.
+L(ge2): bgt cr0, L(gt2)
+ ld r6, 8(up)
+ mulld r10, r6, r6 C u1 * u1
+ mulhdu r11, r6, r6 C u1 * u1
+ mulld r4, r6, r0 C u1 * u0
+ mulhdu r5, r6, r0 C u1 * u0
+ addc r4, r4, r4
+ adde r5, r5, r5
+ addze r11, r11
+ addc r9, r9, r4
+ adde r10, r10, r5
+ addze r11, r11
+ std r9, 8(rp)
+ std r10, 16(rp)
+ std r11, 24(rp)
+ blr
+
+C un = 3: straight-line code; cross products are doubled by shifting
+C u1:u0 left one bit (the sradi/and pairs compute the adjustment terms
+C for the bit shifted out).
+L(gt2): cmpdi cr0, un, 3
+ bgt cr0, L(gt3)
+ std r30, -16(r1)
+ std r31, -8(r1)
+ subfo r12, r12, r12 C clear OV (and result register)
+ ld r8, 8(r4)
+ mulld r5, r8, r8 C W2
+ mulhdu r10, r8, r8 C W3
+ sradi r11, u0, 63 C CAUTION: clobbers CA
+ and r11, r11, r8 C W3
+ addc u0, u0, u0
+ adde u1, r8, r8
+ subfe r6, r6, r6 C mask
+ ld r4, 16(r4) C W2
+ mulld r12, r8, u0 C W1 u1 x u0
+ mulhdu r8, r8, u0 C W2 u1 x u0
+ maddld( r31, r4, u0, r11) C W2
+ maddhdu(r30, r4, u0, r11) C W3
+ andc r6, r4, r6 C W4
+ addc r9, r12, r9 C W1
+ std r9, 8(rp) C W1
+ mulld r9, r4, u1 C W3
+ mulhdu r11, r4, u1 C W4
+ addex( r5, r5, r8, 0) C W2
+ adde r5, r31, r5 C W2
+ std r5, 16(rp) C W2
+ maddld( r5, r4, r4, r6) C W4 u2^2
+ maddhdu(r6, r4, r4, r6) C W5 u2^2
+ addex( r9, r9, r30, 0) C W3
+ adde r9, r9, r10 C W3
+ std r9, 24(rp) C W3
+ adde r5, r5, r11 C W4
+ addze r6, r6 C W5
+ li r8, 0
+ addex( r5, r5, r8, 0) C W4
+ std r5, 32(rp) C W4
+ addex( r6, r6, r8, 0) C W5
+ std r6, 40(rp) C W5
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+
+C un >= 4: save callee-saved registers below the stack pointer, then run
+C a first mul_2-style pass (L(m2_*)) followed by addmul_2-style outer
+C iterations (L(outer)) and a final corner (L(corner)).
+L(gt3): std r22, -80(r1)
+ std r23, -72(r1)
+ std r24, -64(r1)
+ std r25, -56(r1)
+ std r26, -48(r1)
+ std r27, -40(r1)
+ std r28, -32(r1)
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+
+ mr rp2, rp
+ mr up2, up
+ addi r22, un, -1 C count for loop FIXME: Adjust
+ subfo r0, r0, r0 C clear OV (and r0)
+ rldicl r0, un, 0, 63 C r0 = un & 1
+ cmpdi cr7, r0, 0
+
+ ld u0, 0(up2)
+ ld u1, 8(up2)
+
+ cmpdi cr5, r22, 4
+ srdi r31, r22, 2
+ addi r22, r22, -2
+ mtctr r31
+
+C cr7 selects between the even and odd un variants; within each, the
+C (r22 & 2) test picks one of two loop entry points for alignment.
+ beq cr7, L(m2_evn)
+L(m2_odd):
+ rldicl. r31, r22, 63, 63 C r22 & 2
+ mulld r23, u0, u0
+ mulhdu r12, u0, u0
+ mulld r5, u1, u1
+ mulhdu r10, u1, u1
+
+ sradi r11, u0, 63
+ and r11, r11, u1
+
+ LSHU1U0
+
+ ld r8, 8(up2)
+ ld r9, 16(up2)
+ mulld r28, r8, u0 C W u1 x u0
+ mulhdu r31, r8, u0 C W u1 x u0
+ std r23, 0(rp2)
+
+ bne cr0, L(m2_11)
+L(m2_01):
+ addi up, up2, 16
+ addi rp, rp2, 0
+ b L(m2_lo2)
+L(m2_11):
+ addi up, up2, 0
+ addi rp, rp2, -16
+ b L(m2_lo0)
+
+L(m2_evn):
+ rldicl. r31, r22, 63, 63 C r22 & 2
+ mulld r23, u0, u0
+ mulhdu r5, u0, u0
+ mulld r12, u1, u1
+ mulhdu r11, u1, u1
+
+ sradi r10, u0, 63
+ and r10, r10, u1
+
+ LSHU1U0
+
+ ld r9, 8(up2)
+ ld r8, 16(up2)
+ mulld r29, r9, u0 C W u1 x u0
+ mulhdu r30, r9, u0 C W u1 x u0
+ std r23, 0(rp2)
+
+ beq cr0, L(m2_10)
+L(m2_00):
+ addi up, up2, 8
+ addi rp, rp2, -8
+ b L(m2_lo1)
+L(m2_10):
+ addi up, up2, 24
+ addi rp, rp2, 8
+ ble cr5, L(m2_end)
+
+C First-pass (mul_2-style) inner loop, 4 limbs per iteration; the adde
+C chain carries CA while addex carries OV, so nothing between iterations
+C may clobber either flag.
+L(m2_top):
+ ld r9, 0(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ mulld r5, r8, u1
+ mulhdu r10, r8, u1
+ addex( r12, r12, r30, 0)
+L(m2_lo2):
+ ld r8, 8(up)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ std r12, 8(rp)
+ mulld r12, r9, u1
+ mulhdu r11, r9, u1
+ addex( r5, r5, r31, 0)
+L(m2_lo1):
+ ld r9, 16(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ std r5, 16(rp)
+ mulld r5, r8, u1
+ mulhdu r10, r8, u1
+ addex( r12, r12, r30, 0)
+L(m2_lo0):
+ ld r8, 24(up)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ std r12, 24(rp)
+ mulld r12, r9, u1
+ mulhdu r11, r9, u1
+ addex( r5, r5, r31, 0)
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(m2_top)
+
+L(m2_end):
+ ld r9, 0(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ mulld r5, r8, u1
+ mulhdu r10, r8, u1
+ b L(cj) C jump to addmul_2 tail
+
+C Outer loop: each iteration advances u by two limbs and folds the two
+C diagonal squares u0^2, u1^2 into the accumulated product via maddld
+C against the existing rp limbs (addmul_2-style).
+L(outer):
+ addi up2, up2, 16
+ addi rp2, rp2, 32
+
+ ld u0, 0(up2)
+ ld u1, 8(up2)
+
+ cmpdi cr5, r22, 4
+ srdi r31, r22, 2
+ addi r22, r22, -2
+ mtctr r31
+
+ ld r26, 0(rp2)
+ ld r27, 16(rp2)
+
+ rldicl. r31, r22, 63, 63 C r22 & 2
+ beq cr7, L(evn)
+
+L(odd): maddld( r23, u0, u0, r26) C W u2^2
+ maddhdu(r12, u0, u0, r26) C W u2^2
+ maddld( r5, u1, u1, r27) C W u3^2
+ maddhdu(r10, u1, u1, r27) C W u3^2
+ ld r26, 8(rp2)
+
+ ld r8, -8(up2)
+ sradi r8, r8, 63 C CAUTION: clobbers CA
+ and r8, r8, u0
+ sradi r11, u0, 63 C CAUTION: clobbers CA
+ and r11, r11, u1
+
+ LSHU1U
+
+ addc r23, r23, r8
+
+ ld r8, 8(up2)
+ ld r9, 16(up2)
+ maddld( r28, r8, u0, r26) C W u3 x u2
+ maddhdu(r31, r8, u0, r26) C W u3 x u2
+ ld r26, 24(rp2)
+ std r23, 0(rp2) C W0
+
+ bne cr0, L(11)
+L(01):
+ addi up, up2, 16
+ addi rp, rp2, 0
+ b L(lo2)
+L(11):
+ addi up, up2, 0
+ addi rp, rp2, -16
+ b L(lo0)
+
+L(evn): maddld( r23, u0, u0, r26) C W u2^2
+ maddhdu(r5, u0, u0, r26) C W u2^2
+ maddld( r12, u1, u1, r27) C W u3^2
+ maddhdu(r11, u1, u1, r27) C W u3^2
+ ld r27, 8(rp2)
+
+ ld r9, -8(up2)
+ sradi r9, r9, 63 C CAUTION: clobbers CA
+ and r9, r9, u0
+ sradi r10, u0, 63 C CAUTION: clobbers CA
+ and r10, r10, u1
+
+ LSHU1U
+
+ addc r23, r23, r9
+
+ ld r9, 8(up2)
+ ld r8, 16(up2)
+ maddld( r29, r9, u0, r27) C W u3 x u2
+ maddhdu(r30, r9, u0, r27) C W u3 x u2
+ ld r27, 24(rp2)
+ std r23, 0(rp2) C W0
+
+ beq cr0, L(10)
+L(00):
+ addi up, up2, 8
+ addi rp, rp2, -8
+ b L(lo1)
+L(10):
+ addi up, up2, 24
+ addi rp, rp2, 8
+ ble cr5, L(end)
+
+C Main addmul_2-style inner loop; same CA/OV double carry chain as
+C L(m2_top), but also reading back rp limbs (r26/r27) to accumulate.
+L(top): ld r9, 0(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ ld r26, 24(rp)
+ std r5, 0(rp)
+ maddld( r5, r8, u1, r27)
+ maddhdu(r10, r8, u1, r27)
+ addex( r12, r12, r30, 0)
+L(lo2): ld r8, 8(up)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ ld r27, 32(rp)
+ std r12, 8(rp)
+ maddld( r12, r9, u1, r26)
+ maddhdu(r11, r9, u1, r26)
+ addex( r5, r5, r31, 0)
+L(lo1): ld r9, 16(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ ld r26, 40(rp)
+ std r5, 16(rp)
+ maddld( r5, r8, u1, r27)
+ maddhdu(r10, r8, u1, r27)
+ addex( r12, r12, r30, 0)
+L(lo0): ld r8, 24(up)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ ld r27, 48(rp)
+ std r12, 24(rp)
+ maddld( r12, r9, u1, r26)
+ maddhdu(r11, r9, u1, r26)
+ addex( r5, r5, r31, 0)
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+C Wind-down: flush the remaining four result limbs and both carry chains.
+L(end): ld r9, 0(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ maddld( r5, r8, u1, r27)
+ maddhdu(r10, r8, u1, r27)
+L(cj): addex( r12, r12, r30, 0)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ std r12, 8(rp)
+ mulld r12, r9, u1
+ mulhdu r11, r9, u1
+ addex( r5, r5, r31, 0)
+ adde r5, r29, r5
+ std r5, 16(rp)
+ addex( r12, r12, r30, 0)
+ adde r12, r12, r10
+ std r12, 24(rp)
+ li r4, 0
+ addze r5, r11
+ addex( r5, r5, r4, 0)
+ std r5, 32(rp)
+ bgt cr5, L(outer)
+
+C Final corner: handle the last 2 (even) or 3 (odd) u limbs with
+C straight-line code.
+L(corner):
+ ld u0, 16(up2)
+ ld u1, 24(up2)
+ ld r26, 32(rp2)
+ bne cr7, L(corner_odd)
+
+L(corner_evn):
+ ld r27, 40(rp2)
+ maddld( r23, u0, u0, r26) C W u2^2
+ maddhdu(r5, u0, u0, r26) C W u2^2
+ mulld r12, u1, u1 C W u3^2
+ mulhdu r11, u1, u1 C W u3^2
+
+ ld r9, 8(up2)
+ sradi r9, r9, 63 C CAUTION: clobbers CA
+ and r9, r9, u0
+ sradi r10, u0, 63 C CAUTION: clobbers CA
+ and r10, r10, u1
+
+ LSHU1UHF
+
+ addc r23, r23, r9
+
+ ld r9, 24(up2)
+ maddld( r29, r9, u0, r27) C W u3 x u2
+ maddhdu(r30, r9, u0, r27) C W u3 x u2
+ std r23, 32(rp2)
+ adde r5, r29, r5
+ std r5, 40(rp2)
+ addex( r12, r12, r30, 0)
+ adde r12, r12, r10 C W FIXME: can this carry out?
+ std r12, 48(rp2)
+ li r4, 0
+ addex( r5, r11, r4, 0)
+ addze r5, r5
+ std r5, 56(rp2)
+ b L(ret)
+
+L(corner_odd):
+ ld r27, 48(rp2)
+ maddld( r23, u0, u0, r26) C W u2^2
+ maddhdu(r12, u0, u0, r26) C W u2^2
+ maddld( r5, u1, u1, r27) C W u3^2
+ maddhdu(r10, u1, u1, r27) C W u3^2
+ ld r26, 40(rp2)
+
+ ld r8, 8(up2)
+ sradi r8, r8, 63 C CAUTION: clobbers CA
+ and r8, r8, u0
+ sradi r11, u0, 63 C CAUTION: clobbers CA
+ and r11, r11, u1
+
+ LSHU1UF
+
+ addc r23, r23, r8
+
+ ld r8, 24(up2)
+ ld r9, 32(up2)
+ maddld( r28, r8, u0, r26) C W u3 x u2
+ maddhdu(r31, r8, u0, r26) C W u3 x u2
+ std r23, 32(rp2)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ std r12, 40(rp2)
+ mulld r12, r9, u1
+ mulhdu r11, r9, u1
+ addex( r5, r5, r31, 0)
+ adde r5, r29, r5
+ std r5, 48(rp2)
+ addex( r12, r12, r30, 0)
+ adde r12, r12, r10
+ std r12, 56(rp2)
+ mulld r23, r9, r9 C W u2^2
+ mulhdu r12, r9, r9 C W u2^2
+ adde r23, r23, r11
+ addze r12, r12
+ sradi r4, r8, 63 C CAUTION: clobbers CA
+ and r4, r4, r9
+ addex( r23, r23, r4, 0)
+ std r23, 64(rp2)
+ li r4, 0
+ addex( r12, r12, r4, 0)
+ std r12, 72(rp2)
+
+C Restore callee-saved registers and return.
+L(ret): ld r22, -80(r1)
+ ld r23, -72(r1)
+ ld r24, -64(r1)
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm
new file mode 100644
index 0000000..1f57bdf
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm
@@ -0,0 +1,173 @@
+dnl PowerPC-64 mpn_rsh1add_n, mpn_rsh1sub_n
+
+dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 2.9
+C POWER5 ?
+C POWER6 3.5
+C POWER7 2.25
+
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+
+C Operation selection: ADDSUBC/ADDSUBE are the carry-setting/consuming
+C add or subtract; INITCY seeds CA for the chosen operation.
+C NOTE(review): INITCY appears unused by the code in this file — TODO
+C confirm whether it is vestigial.
+ifdef(`OPERATION_rsh1add_n', `
+ define(`ADDSUBC', `addc')
+ define(`ADDSUBE', `adde')
+ define(INITCY, `addic $1, r1, 0')
+ define(`func', mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(`ADDSUBC', `subfc')
+ define(`ADDSUBE', `subfe')
+ define(INITCY, `addic $1, r1, -1')
+ define(`func', mpn_rsh1sub_n)')
+
+C s0/s1 are the shifted result limbs being assembled; x0/x1 the raw
+C sum/difference limbs; u0/v0 the current input limbs.
+define(`s0', `r9')
+define(`s1', `r7')
+define(`x0', `r0')
+define(`x1', `r12')
+define(`u0', `r8')
+define(`v0', `r10')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+C rp[] = (up[] + vp[]) >> 1 (rsh1add) or (up[] - vp[]) >> 1 (rsh1sub),
+C n limbs; returns the bit shifted out (the low bit of the raw
+C sum/difference).  Result limbs are assembled with srdi + rldimi,
+C merging each limb's high 63 bits with the next limb's low bit.
+C CA is live across the whole inner loop.
+PROLOGUE(func)
+ ld u0, 0(up)
+ ld v0, 0(vp)
+
+ cmpdi cr6, n, 2
+
+ addi r0, n, 1
+ srdi r0, r0, 2
+ mtctr r0 C copy size to count register
+
+C Dispatch on n & 3 into the 4-way unrolled loop.
+ andi. r0, n, 1
+ bne cr0, L(bx1)
+
+L(bx0): ADDSUBC x1, v0, u0
+ ld u0, 8(up)
+ ld v0, 8(vp)
+ ADDSUBE x0, v0, u0
+ ble cr6, L(n2)
+ ld u0, 16(up)
+ ld v0, 16(vp)
+ srdi s0, x1, 1
+ rldicl r11, x1, 0, 63 C return value
+ ADDSUBE x1, v0, u0
+ andi. n, n, 2
+ bne cr0, L(b10)
+L(b00): addi rp, rp, -24
+ b L(lo0)
+L(b10): addi up, up, 16
+ addi vp, vp, 16
+ addi rp, rp, -8
+ b L(lo2)
+
+ ALIGN(16)
+L(bx1): ADDSUBC x0, v0, u0
+ ble cr6, L(n1)
+ ld u0, 8(up)
+ ld v0, 8(vp)
+ ADDSUBE x1, v0, u0
+ ld u0, 16(up)
+ ld v0, 16(vp)
+ srdi s1, x0, 1
+ rldicl r11, x0, 0, 63 C return value
+ ADDSUBE x0, v0, u0
+ andi. n, n, 2
+ bne cr0, L(b11)
+L(b01): addi up, up, 8
+ addi vp, vp, 8
+ addi rp, rp, -16
+ b L(lo1)
+L(b11): addi up, up, 24
+ addi vp, vp, 24
+ bdz L(end)
+
+C Main loop: 4 limbs per iteration.  Each rldimi deposits the low bit of
+C the newer limb into the top of the previous shifted limb.
+ ALIGN(32)
+L(top): ld u0, 0(up)
+ ld v0, 0(vp)
+ srdi s0, x1, 1
+ rldimi s1, x1, 63, 0
+ std s1, 0(rp)
+ ADDSUBE x1, v0, u0
+L(lo2): ld u0, 8(up)
+ ld v0, 8(vp)
+ srdi s1, x0, 1
+ rldimi s0, x0, 63, 0
+ std s0, 8(rp)
+ ADDSUBE x0, v0, u0
+L(lo1): ld u0, 16(up)
+ ld v0, 16(vp)
+ srdi s0, x1, 1
+ rldimi s1, x1, 63, 0
+ std s1, 16(rp)
+ ADDSUBE x1, v0, u0
+L(lo0): ld u0, 24(up)
+ ld v0, 24(vp)
+ srdi s1, x0, 1
+ rldimi s0, x0, 63, 0
+ std s0, 24(rp)
+ ADDSUBE x0, v0, u0
+ addi up, up, 32
+ addi vp, vp, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+C Wind-down: the final ADDSUBE materialises the last carry/borrow as the
+C most significant result bit.
+L(end): srdi s0, x1, 1
+ rldimi s1, x1, 63, 0
+ std s1, 0(rp)
+L(cj2): srdi s1, x0, 1
+ rldimi s0, x0, 63, 0
+ std s0, 8(rp)
+L(cj1): ADDSUBE x1, x1, x1 C pseudo-depends on x1
+ rldimi s1, x1, 63, 0
+ std s1, 16(rp)
+ mr r3, r11
+ blr
+
+L(n1): srdi s1, x0, 1
+ rldicl r11, x0, 0, 63 C return value
+ ADDSUBE x1, x1, x1 C pseudo-depends on x1
+ rldimi s1, x1, 63, 0
+ std s1, 0(rp)
+ mr r3, r11
+ blr
+
+L(n2): addi rp, rp, -8
+ srdi s0, x1, 1
+ rldicl r11, x1, 0, 63 C return value
+ b L(cj2)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm
new file mode 100644
index 0000000..e76bb88
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm
@@ -0,0 +1,863 @@
+dnl PowerPC-64 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 6-18
+C POWER4/PPC970 8
+C POWER5 8
+C POWER6 16.25
+C POWER7 3.77
+
+C NOTES
+C * This is very crude, cleanup!
+C * Try to reduce the number of needed live registers.
+C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The
+C cost will be more live registers.
+C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
+C size a lot and speed things up perhaps 25%.
+C * Use computed goto in order to compress the code.
+C * Implement a larger final corner.
+C * Schedule callee-saves register saves into other insns. This could save
+C about 5 cycles/call. (We cannot analogously optimise the restores, since
+C the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
+C * Should the alternating std/adde sequences be split? Some pipelines handle
+C adde poorly, and might sequentialise all these instructions.
+C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
+C adjacent integer multiply insns. Except for the multiply insns, the code
+C was not carefully optimised for POWER6 or any other CPU.
+C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+
+C rp_outer/up_outer advance one rp pair / one up limb per outer
+C iteration; the _saved copies preserve the original arguments for the
+C final diagonal pass.
+define(`rp_outer', `r25')
+define(`up_outer', `r21')
+define(`rp_saved', `r22')
+define(`up_saved', `r23')
+define(`n_saved', `r24')
+
+ASM_START()
+C Square {up,n} into {rp,2n}.  n = 1 and n = 2 are straight-line
+C special cases.  For n >= 3 the code first accumulates all off-diagonal
+C products (one mul_1 pass from L(b0)..L(b3), then addmul_1 outer
+C iterations via L(outer_loop) and the _ent_* phase entries), and
+C finally — from the L(xb*) labels — doubles that accumulation while
+C adding in the diagonal squares u[i]^2 (sqr_diag_addlsh1).
+PROLOGUE(mpn_sqr_basecase)
+ cmpdi cr0, n, 2
+ bge cr0, L(ge2)
+ ld r5, 0(up) C n = 1
+ nop
+ mulld r8, r5, r5 C weight 0
+ mulhdu r9, r5, r5 C weight 1
+ std r8, 0(rp)
+ std r9, 8(rp)
+ blr
+ ALIGN(16)
+L(ge2): bgt cr0, L(gt2)
+ ld r0, 0(up) C n = 2
+ nop
+ mulld r8, r0, r0 C u0 * u0
+ mulhdu r9, r0, r0 C u0 * u0
+ ld r6, 8(up)
+ mulld r10, r6, r6 C u1 * u1
+ mulhdu r11, r6, r6 C u1 * u1
+ mulld r4, r6, r0 C u1 * u0
+ mulhdu r5, r6, r0 C u1 * u0
+ addc r4, r4, r4
+ adde r5, r5, r5
+ addze r11, r11
+ addc r9, r9, r4
+ adde r10, r10, r5
+ addze r11, r11
+ std r8, 0(rp)
+ std r9, 8(rp)
+ std r10, 16(rp)
+ std r11, 24(rp)
+ blr
+
+C n >= 3: save callee-saved registers below r1, keep copies of the
+C arguments, and dispatch the first mul_1 pass on n & 3.
+ ALIGN(16)
+L(gt2): std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+ std r26, -48(r1)
+ std r25, -56(r1)
+ std r24, -64(r1)
+ std r23, -72(r1)
+ std r22, -80(r1)
+ std r21, -88(r1)
+
+ mr rp_saved, rp
+ mr up_saved, up
+ mr n_saved, n
+ mr rp_outer, rp
+ mr up_outer, up
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addic r7, n, 2 C compute count...
+ srdi r7, r7, 2 C ...for ctr
+ mtctr r7 C copy count into ctr
+ beq- cr0, L(b0)
+ blt- cr6, L(b1)
+ beq- cr6, L(b2)
+
+C First pass, n == 3 (mod 4): rp[1..] = up[1..] * up[0].
+L(b3): ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ addi up, up, 24
+ li r12, 0 C carry limb
+ bdz L(em3)
+
+ ALIGN(16)
+L(tm3): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm3)
+
+L(em3): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop)
+
+C First pass, n == 0 (mod 4).
+L(b0): ld r6, 0(up)
+ ld r27, 8(up)
+ mulld r7, r27, r6
+ mulhdu r12, r27, r6
+ std r7, 8(rp)
+ addi rp, rp, 8
+ ld r9, 16(up)
+ ld r27, 24(up)
+ addi up, up, 32
+ bdz L(em0)
+
+ ALIGN(16)
+L(tm0): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm0)
+
+L(em0): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop_ent_2)
+
+C First pass, n == 1 (mod 4).
+L(b1): ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r12, r27, r6
+ addc r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addi rp, rp, 16
+ ld r9, 24(up)
+ ld r27, 32(up)
+ addi up, up, 40
+ bdz L(em1)
+
+ ALIGN(16)
+L(tm1): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm1)
+
+L(em1): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop_ent_3)
+
+C First pass, n == 2 (mod 4).
+L(b2): addi r7, r7, -1 C FIXME
+ mtctr r7 C FIXME
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 24(up)
+ mulld r11, r9, r6
+ mulhdu r10, r9, r6
+ addc r7, r7, r26
+ adde r11, r11, r8
+ addze r12, r10
+ std r0, 8(rp)
+ std r7, 16(rp)
+ std r11, 24(rp)
+ addi rp, rp, 24
+ ld r9, 32(up)
+ ld r27, 40(up)
+ addi up, up, 48
+ bdz L(em2)
+
+ ALIGN(16)
+L(tm2): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 0(up)
+ ld r27, 8(up)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6
+ ld r9, 16(up)
+ ld r27, 24(up)
+ std r0, 8(rp)
+ adde r26, r26, r8
+ std r7, 16(rp)
+ adde r11, r11, r10
+ std r26, 24(rp)
+ addi up, up, 32
+ std r11, 32(rp)
+ addi rp, rp, 32
+ bdnz L(tm2)
+
+L(em2): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ adde r0, r0, r12
+ adde r7, r7, r26
+ std r0, 8(rp)
+ std r7, 16(rp)
+ addze r8, r8
+ std r8, 24(rp)
+ addi n, n, 2
+ b L(outer_loop_ent_0)
+
+
+C Outer addmul_1 loops: each iteration multiplies the remaining limbs by
+C the next u limb and adds the products into {rp,..}.  The four entry
+C points (L(outer_loop) and _ent_0/_ent_3/_ent_2) handle the four
+C remaining-length phases mod 4, chained to keep the phase rotating.
+L(outer_loop):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ bdz L(outer_end)
+
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r9, 24(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ ld r30, 16(rp)
+ mulld r11, r9, r6
+ mulhdu r10, r9, r6
+ addc r7, r7, r26
+ adde r11, r11, r8
+ addze r12, r10
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ adde r11, r11, r30
+ std r11, 16(rp)
+ addi rp, rp, 24
+ ld r9, 32(up)
+ ld r27, 40(up)
+ addi up, up, 48
+ bdz L(ea1)
+
+ ALIGN(16)
+L(ta1): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta1)
+
+L(ea1): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+L(outer_loop_ent_0):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ addc r0, r0, r28
+ adde r7, r7, r26
+ addze r12, r8
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addi rp, rp, 16
+ ld r9, 24(up)
+ ld r27, 32(up)
+ addi up, up, 40
+ bdz L(ea0)
+
+ ALIGN(16)
+L(ta0): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta0)
+
+L(ea0): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+L(outer_loop_ent_3):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r28, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r12, r9, r6
+ addc r0, r0, r28
+ std r0, 0(rp)
+ addi rp, rp, 8
+ ld r9, 16(up)
+ ld r27, 24(up)
+ addi up, up, 32
+ bdz L(ea3)
+
+ ALIGN(16)
+L(ta3): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta3)
+
+L(ea3): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+
+L(outer_loop_ent_2):
+ addi n, n, -1
+ addi up_outer, up_outer, 8
+ addi rp_outer, rp_outer, 16
+
+ mr up, up_outer
+ addi rp, rp_outer, 8
+
+ srdi r0, n, 2
+ mtctr r0
+
+ addic r0, r0, 0
+ li r12, 0 C cy_limb = 0
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r27, 16(up)
+ bdz L(ea2)
+ addi up, up, 24
+
+ ALIGN(16)
+L(ta2): mulld r0, r9, r6
+ mulhdu r26, r9, r6 C 9
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6 C 27
+ ld r9, 0(up)
+ ld r28, 0(rp)
+ ld r27, 8(up)
+ ld r29, 8(rp)
+ adde r0, r0, r12 C 0 12
+ adde r7, r7, r26 C 5 7
+ mulld r26, r9, r6
+ mulhdu r10, r9, r6 C 9
+ mulld r11, r27, r6
+ mulhdu r12, r27, r6 C 27
+ ld r9, 16(up)
+ ld r30, 16(rp)
+ ld r27, 24(up)
+ ld r31, 24(rp)
+ adde r26, r26, r8 C 8 5
+ adde r11, r11, r10 C 10 11
+ addze r12, r12 C 12
+ addc r0, r0, r28 C 0 28
+ std r0, 0(rp) C 0
+ adde r7, r7, r29 C 7 29
+ std r7, 8(rp) C 7
+ adde r26, r26, r30 C 5 30
+ std r26, 16(rp) C 5
+ adde r11, r11, r31 C 11 31
+ std r11, 24(rp) C 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(ta2)
+
+L(ea2): mulld r0, r9, r6
+ mulhdu r26, r9, r6
+ mulld r7, r27, r6
+ mulhdu r8, r27, r6
+ ld r28, 0(rp)
+ ld r29, 8(rp)
+ adde r0, r0, r12
+ adde r7, r7, r26
+ addze r8, r8
+ addc r0, r0, r28
+ std r0, 0(rp)
+ adde r7, r7, r29
+ std r7, 8(rp)
+ addze r8, r8
+ std r8, 16(rp)
+
+ b L(outer_loop)
+
+C Last off-diagonal product (a single 1x1 addmul).
+L(outer_end):
+ ld r6, 0(up)
+ ld r9, 8(up)
+ ld r11, 0(rp)
+ mulld r0, r9, r6
+ mulhdu r8, r9, r6
+ addc r0, r0, r11
+ std r0, 0(rp)
+ addze r8, r8
+ std r8, 8(rp)
+
+C Final pass (sqr_diag_addlsh1): double the accumulated off-diagonal
+C sum in rp and add the diagonal squares u[i]^2, dispatching on
+C n_saved & 3 into a 4-limb-per-iteration loop.
+define(`rp', `rp_saved')
+define(`up', `r5')
+define(`n', `r6')
+define(`climb', `r0')
+
+ addi r4, rp_saved, 8
+ mr r5, up_saved
+ mr r6, n_saved
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 2 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C put loop count into ctr
+ beq cr0, L(xb0)
+ blt cr6, L(xb1)
+ beq cr6, L(xb2)
+
+L(xb3): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ addi up, up, 24
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ ld r6, 24(rp)
+ ld r7, 32(rp)
+ addc r10, r10, r10
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ addze climb, r29
+ addc r10, r10, r25
+ adde r11, r11, r26
+ adde r6, r6, r27
+ adde r7, r7, r28
+ std r24, 0(rp)
+ std r10, 8(rp)
+ std r11, 16(rp)
+ std r6, 24(rp)
+ std r7, 32(rp)
+ addi rp, rp, 40
+ bdnz L(top)
+ b L(end)
+
+L(xb2): ld r6, 0(up)
+ ld r7, 8(up)
+ addi up, up, 16
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ addc r10, r10, r10
+ adde r11, r11, r11
+ addze climb, r27
+ addc r10, r10, r25
+ adde r11, r11, r26
+ std r24, 0(rp)
+ std r10, 8(rp)
+ std r11, 16(rp)
+ addi rp, rp, 24
+ bdnz L(top)
+ b L(end)
+
+L(xb0): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
+ mulhdu r31, r23, r23
+ ld r10, 8(rp)
+ ld r11, 16(rp)
+ ld r6, 24(rp)
+ ld r7, 32(rp)
+ ld r12, 40(rp)
+ ld r23, 48(rp)
+ addc r10, r10, r10
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze climb, r31
+ std r24, 0(rp)
+ addc r10, r10, r25
+ std r10, 8(rp)
+ adde r11, r11, r26
+ std r11, 16(rp)
+ adde r6, r6, r27
+ std r6, 24(rp)
+ adde r7, r7, r28
+ std r7, 32(rp)
+ adde r12, r12, r29
+ std r12, 40(rp)
+ adde r23, r23, r30
+ std r23, 48(rp)
+ addi rp, rp, 56
+ bdnz L(top)
+ b L(end)
+
+L(xb1): ld r6, 0(up)
+ addi up, up, 8
+ mulld r24, r6, r6
+ mulhdu climb, r6, r6
+ std r24, 0(rp)
+ addic rp, rp, 8 C clear carry as side-effect
+
+C Main diagonal loop: 4 u limbs / 8 rp limbs per iteration.  First the
+C rp limbs are doubled (adde chain), then the squares are added; climb
+C carries the high limb between iterations.
+ ALIGN(32)
+L(top): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
+ mulhdu r31, r23, r23
+ ld r8, 0(rp)
+ ld r9, 8(rp)
+ adde r8, r8, r8
+ adde r9, r9, r9
+ ld r10, 16(rp)
+ ld r11, 24(rp)
+ adde r10, r10, r10
+ adde r11, r11, r11
+ ld r6, 32(rp)
+ ld r7, 40(rp)
+ adde r6, r6, r6
+ adde r7, r7, r7
+ ld r12, 48(rp)
+ ld r23, 56(rp)
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze r31, r31
+ addc r8, r8, climb
+ std r8, 0(rp)
+ adde r9, r9, r24
+ std r9, 8(rp)
+ adde r10, r10, r25
+ std r10, 16(rp)
+ adde r11, r11, r26
+ std r11, 24(rp)
+ adde r6, r6, r27
+ std r6, 32(rp)
+ adde r7, r7, r28
+ std r7, 40(rp)
+ adde r12, r12, r29
+ std r12, 48(rp)
+ adde r23, r23, r30
+ std r23, 56(rp)
+ mr climb, r31
+ addi rp, rp, 64
+ bdnz L(top)
+
+C Store the final high limb and restore callee-saved registers.
+L(end): addze climb, climb
+ std climb, 0(rp)
+
+ ld r31, -8(r1)
+ ld r30, -16(r1)
+ ld r29, -24(r1)
+ ld r28, -32(r1)
+ ld r27, -40(r1)
+ ld r26, -48(r1)
+ ld r25, -56(r1)
+ ld r24, -64(r1)
+ ld r23, -72(r1)
+ ld r22, -80(r1)
+ ld r21, -88(r1)
+ blr
+EPILOGUE()