Diffstat (limited to 'gmp-6.3.0/mpn/powerpc64/mode64/p9')
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm      | 112
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm  | 106
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm         | 130
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm         | 193
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm        | 179
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm           |  64
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm           | 143
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h         | 254
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm            | 126
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm            | 181
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm     | 415
-rw-r--r--  gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm     | 555
12 files changed, 2458 insertions(+), 0 deletions(-)
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm
new file mode 100644
index 0000000..2426a00
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm
@@ -0,0 +1,112 @@
+dnl PowerPC-64 mpn_add_n_sub_n optimised for POWER9.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 2.25
+
+
+C INPUT PARAMETERS
+define(`arp', `r3')
+define(`srp', `r4')
+define(`up', `r5')
+define(`vp', `r6')
+define(`n', `r7')
+
+ASM_START()
+PROLOGUE(mpn_add_n_sub_n)
+ cmpdi cr7, n, 2
+ subfo r0, r0, r0 C clear OV
+ rldicl. r9, n, 0, 63 C n & 1
+ beq cr0, L(bx0)
+
+L(bx1): ld r10, 0(up)
+ ld r11, 0(vp)
+ ble cr7, L(1)
+ srdi r7, r7, 1
+ mtctr r7
+ ld r8, 8(up)
+ ld r9, 8(vp)
+ addex( r0, r10, r11, 0)
+ subfc r12, r11, r10
+ addi up, up, -8
+ addi vp, vp, -8
+ b L(lo1)
+
+L(bx0): ld r8, 0(up)
+ ld r9, 0(vp)
+ ld r10, 8(up)
+ ld r11, 8(vp)
+ addex( r0, r8, r9, 0)
+ subfc r12, r9, r8
+ addi arp, arp, 8
+ addi srp, srp, 8
+ ble cr7, L(end)
+ addi r7, r7, -1
+ srdi r7, r7, 1
+ mtctr r7
+
+L(top): ld r8, 16(up)
+ ld r9, 16(vp)
+ std r0, -8(arp)
+ std r12, -8(srp)
+ addex( r0, r10, r11, 0)
+ subfe r12, r11, r10
+L(lo1): ld r10, 24(up)
+ ld r11, 24(vp)
+ std r0, 0(arp)
+ std r12, 0(srp)
+ addex( r0, r8, r9, 0)
+ subfe r12, r9, r8
+ addi up, up, 16
+ addi vp, vp, 16
+ addi arp, arp, 16
+ addi srp, srp, 16
+ bdnz L(top)
+
+L(end): std r0, -8(arp)
+ std r12, -8(srp)
+L(1): addex( r0, r10, r11, 0)
+ subfe r12, r11, r10
+ std r0, 0(arp)
+ std r12, 0(srp)
+ subfe r3, r3, r3
+ addex( r3, r3, r3, 0)
+ rldicl r3, r3, 1, 62
+ blr
+EPILOGUE()
+ASM_END()
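This routine computes the limb-wise sum and difference of {up,n} and {vp,n} in a single pass, keeping the addition's carry in the OV bit (via the POWER9 addex instruction) and the subtraction's borrow in CA, then packing both flags into the return value in the subfe/addex/rldicl tail. A minimal C sketch of the semantics, assuming the internal prototype from gmp-impl.h and the return convention of mpn/generic/add_n_sub_n.c (twice the addition carry plus the subtraction borrow); ref_add_n_sub_n is a hypothetical name:

#include <gmp.h>

/* Sketch: arp[] = up[] + vp[], srp[] = up[] - vp[];
   returns 2*carry_out + borrow_out.  Two carry chains, one pass. */
mp_limb_t
ref_add_n_sub_n (mp_limb_t *arp, mp_limb_t *srp,
                 const mp_limb_t *up, const mp_limb_t *vp, mp_size_t n)
{
  mp_limb_t cy = 0, bw = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t u = up[i], v = vp[i];
      mp_limb_t t = u + v;
      mp_limb_t s = t + cy;
      cy = (t < u) | (s < t);          /* carry chain (OV in the asm)  */
      arp[i] = s;
      mp_limb_t d = u - v;
      mp_limb_t e = d - bw;
      bw = (u < v) | (e > d);          /* borrow chain (CA in the asm) */
      srp[i] = e;
    }
  return 2 * cy + bw;
}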
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm
new file mode 100644
index 0000000..95b8faa
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm
@@ -0,0 +1,106 @@
+dnl Power9 mpn_addaddmul_1msb0
+
+dnl Copyright 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 1-way 2-way 4-way 8-way 16-way mul_1+addmul_1
+C power9: 4.55 3.87 3.55 3.35 3.25 5.16
+
+C TODO
+C * Only WAYS = 4 currently has proper feed-in code.
+C * Try ldu/stdu to save the explicit updates.
+C * Try using madd in a long dependent chain, only breaking the recurrency
+C once per iteration.
+C * Some cycles could perhaps be saved by scheduling the crX-setting insns.
+
+define(`rp', r3)
+define(`ap', r4)
+define(`bp', r5)
+define(`n', r6)
+define(`u0', r7)
+define(`v0', r8)
+
+define(`BLOCK',`
+L(lo`'eval((WAYS-$1)%4)):
+ ld r10, eval(8*$1)(ap)
+ ld r11, eval(8*$1)(bp)
+ mulld r12, r10, u0
+ mulhdu r10, r10, u0
+ maddld( r6, r11, v0, r12)
+ maddhdu(r11, r11, v0, r12)
+ adde r12, r6, r0
+ std r12, eval(8*$1)(rp)
+ add r0, r10, r11')
+
+ifdef(`WAYS',,`define(`WAYS',4)')
+
+PROLOGUE(mpn_addaddmul_1msb0)
+ addi r10, n, WAYS-1
+ srdi r10, r10, m4_log2(WAYS)
+ mtctr r10
+ addic r0, r3, 0
+ li r0, 0
+ifelse(WAYS,4,`
+ rldicl. r9, n, 0, 63
+ rldicl r10, n, 63, 63
+ cmpdi cr7, r10, 0
+ bne cr0, L(bx1)
+
+L(bx0): beq cr7, L(lo0)
+
+L(b10): addi ap, ap, -16
+ addi bp, bp, -16
+ addi rp, rp, -16
+ b L(lo2)
+
+L(bx1): bne cr7, L(b11)
+
+L(b01): addi ap, ap, -24
+ addi bp, bp, -24
+ addi rp, rp, -24
+ b L(lo1)
+
+L(b11): addi ap, ap, -8
+ addi bp, bp, -8
+ addi rp, rp, -8
+ b L(lo3)
+')
+
+L(top): forloop(i,0,eval(WAYS-1),`BLOCK(i)')
+
+ addi ap, ap, eval(8*WAYS)
+ addi bp, bp, eval(8*WAYS)
+ addi rp, rp, eval(8*WAYS)
+ bdnz L(top)
+
+ addze r3, r0
+ blr
+EPILOGUE()
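mpn_addaddmul_1msb0 computes {rp,n} = {ap,n}*u0 + {bp,n}*v0 under the precondition that u0 and v0 both have a clear most significant bit; that headroom is what lets BLOCK chain maddld/maddhdu and sum the two high halves (add r0, r10, r11) without overflow. A rough C rendering of the contract, assuming the internal gmp-impl.h prototype, 64-bit limbs, and unsigned __int128; ref_addaddmul_1msb0 is a hypothetical name:

#include <gmp.h>

mp_limb_t
ref_addaddmul_1msb0 (mp_limb_t *rp, const mp_limb_t *ap,
                     const mp_limb_t *bp, mp_size_t n,
                     mp_limb_t u0, mp_limb_t v0)
{
  mp_limb_t hi = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      /* u0, v0 < 2^63 keeps this double product sum below 2^128 */
      unsigned __int128 p = (unsigned __int128) ap[i] * u0
                            + (unsigned __int128) bp[i] * v0 + hi;
      rp[i] = (mp_limb_t) p;
      hi = (mp_limb_t) (p >> 64);
    }
  return hi;
}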
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm
new file mode 100644
index 0000000..8f49606
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm
@@ -0,0 +1,130 @@
+dnl Power9 mpn_addmul_1.
+
+dnl Copyright 2017, 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 2.5
+
+C TODO
+C * Schedule for Power9 pipeline.
+C * Unroll 4x if that proves beneficial.
+C * This is marginally faster (but much smaller) than ../aorsmul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpdi cr6, n, 2
+ addi r0, n, -1 C FIXME: postpone
+ srdi r0, r0, 1 C FIXME: postpone
+ mtctr r0 C FIXME: postpone
+ rldicl. r0, n, 0,63 C r0 = n & 1, set cr0
+ bne cr0, L(b1)
+
+L(b0): ld r10, 0(rp)
+ ld r12, 0(up)
+ ld r11, 8(rp)
+ ld r0, 8(up)
+ maddld( r9, r12, v0, r10)
+ maddhdu(r7, r12, v0, r10)
+ ble cr6, L(2)
+ ld r10, 16(rp)
+ ld r12, 16(up)
+ maddld( r8, r0, v0, r11)
+ maddhdu(r5, r0, v0, r11)
+ addic up, up, 16
+ addi rp, rp, -8
+ b L(mid)
+
+L(b1): ld r11, 0(rp)
+ ld r0, 0(up)
+ ble cr6, L(1)
+ ld r10, 8(rp)
+ ld r12, 8(up)
+ maddld( r8, r0, v0, r11)
+ maddhdu(r5, r0, v0, r11)
+ ld r11, 16(rp)
+ ld r0, 16(up)
+ maddld( r9, r12, v0, r10)
+ maddhdu(r7, r12, v0, r10)
+ addic up, up, 24
+ bdz L(end)
+
+ ALIGN(16)
+L(top): ld r10, 24(rp)
+ ld r12, 0(up)
+ std r8, 0(rp)
+ adde r9, r5, r9
+ maddld( r8, r0, v0, r11) C W:0,2,4
+ maddhdu(r5, r0, v0, r11) C W:1,3,5
+L(mid): ld r11, 32(rp)
+ ld r0, 8(up)
+ std r9, 8(rp)
+ adde r8, r7, r8
+ maddld( r9, r12, v0, r10) C W:1,3,5
+ maddhdu(r7, r12, v0, r10) C W:2,4,6
+ addi rp, rp, 16
+ addi up, up, 16
+ bdnz L(top)
+
+L(end): std r8, 0(rp)
+ maddld( r8, r0, v0, r11)
+ adde r9, r5, r9
+ maddhdu(r5, r0, v0, r11)
+ std r9, 8(rp)
+ adde r8, r7, r8
+ std r8, 16(rp)
+ addze r3, r5
+ blr
+
+L(2): maddld( r8, r0, v0, r11)
+ maddhdu(r5, r0, v0, r11)
+ std r9, 0(rp)
+ addc r8, r7, r8
+ std r8, 8(rp)
+ addze r3, r5
+ blr
+
+L(1): maddld( r8, r0, v0, r11)
+ std r8, 0(rp)
+ maddhdu(r3, r0, v0, r11)
+ blr
+EPILOGUE()
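mpn_addmul_1 is a documented low-level mpn entry point: multiply {up,n} by the limb v0, add the product into {rp,n}, and return the most significant limb. A portable sketch of that contract, which is what the CA-based carry chain above implements; ref_addmul_1 is a hypothetical name, and 64-bit limbs with unsigned __int128 are assumed:

#include <gmp.h>

mp_limb_t
ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, mp_limb_t v0)
{
  mp_limb_t cy = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (mp_limb_t) t;
      cy = (mp_limb_t) (t >> 64);      /* never overflows: max is 2^128-1 */
    }
  return cy;
}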
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm
new file mode 100644
index 0000000..846a894
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm
@@ -0,0 +1,193 @@
+dnl Power9 mpn_addmul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C power9: 1.62
+
+C STATUS
+C * Not written with any power9 pipeline understanding.
+C * The 4x unrolling was not motivated by any timing tests.
+C * No local scheduling for performance tweaking has been done.
+C * Decrease load scheduling!
+
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5') C Note: Reused as scratch
+define(`vp', `r6') C Note: Reused for v1
+
+define(`v0', `r7')
+define(`v1', `r6')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+ std r26, -48(r1)
+ std r27, -40(r1)
+ std r28, -32(r1)
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+
+ subfic r0, r1, 0 C clear CA
+ subfo r0, r0, r0 C clear OV and r0
+
+ cmpdi cr7, n, 4
+
+ ld v0, 0(vp)
+ ld v1, 8(vp)
+
+ srdi r10, n, 2
+ mtctr r10
+
+ rldicl. r9, n, 0, 63
+ bne cr0, L(bx1)
+
+L(bx0): rldicl. r9, n, 63, 63
+
+ ld r28, 0(rp)
+ ld r8, 0(up)
+ ld r11, 8(rp)
+ ld r9, 8(up)
+ maddld( r26, r8, v0, r28)
+ maddhdu(r31, r8, v0, r28)
+ blt cr7, L(2)
+ ld r28, 16(rp)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ bne cr0, L(b10)
+
+L(b00): addi up, up, -8
+ addi rp, rp, -24
+ b L(lo0)
+
+L(b10): addi up, up, 8
+ addi rp, rp, -8
+ b L(lo2)
+
+L(2): addi rp, rp, -8
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ b L(cj2)
+
+L(bx1): rldicl. r9, n, 63, 63
+
+ ld r29, 0(rp)
+ ld r9, 0(up)
+ ld r10, 8(rp)
+ ld r8, 8(up)
+ maddld( r27, r9, v0, r29)
+ maddhdu(r30, r9, v0, r29)
+ ld r29, 16(rp)
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ bne cr0, L(b11)
+
+L(b01): addi rp, rp, -16
+ b L(lo1)
+L(b11): addi up, up, 16
+ blt cr7, L(end)
+
+L(top): ld r9, 0(up)
+ maddld( r26, r8, v0, r10) C 0 4 -> adde
+ maddhdu(r31, r8, v0, r10) C 1 5
+ adde r0, r27, r0 C 7 11
+ ld r28, 24(rp)
+ std r0, 0(rp)
+ maddld( r5, r8, v1, r29) C 1 5 -> addex
+ maddhdu(r10, r8, v1, r29) C 2 6
+ addex( r0, r12, r30, 0) C 8 12
+L(lo2): ld r8, 8(up)
+ maddld( r27, r9, v0, r11) C 1 5 -> adde
+ maddhdu(r30, r9, v0, r11) C 2 6
+ adde r0, r26, r0 C 8 12
+ ld r29, 32(rp)
+ std r0, 8(rp)
+ maddld( r12, r9, v1, r28) C 2 6 -> addex
+ maddhdu(r11, r9, v1, r28) C 3 7
+ addex( r0, r5, r31, 0) C 5 9 13
+L(lo1): ld r9, 16(up)
+ maddld( r26, r8, v0, r10) C 2 6 -> adde
+ maddhdu(r31, r8, v0, r10) C 3 7
+ adde r0, r27, r0 C 5 9 13
+ ld r28, 40(rp)
+ std r0, 16(rp)
+ maddld( r5, r8, v1, r29) C 3 7 -> addex
+ maddhdu(r10, r8, v1, r29) C 4 8
+ addex( r0, r12, r30, 0) C 6 10
+L(lo0): ld r8, 24(up)
+ maddld( r27, r9, v0, r11) C 3 7 -> adde
+ maddhdu(r30, r9, v0, r11) C 4 8
+ adde r0, r26, r0 C 6 10
+ ld r29, 48(rp)
+ std r0, 24(rp)
+ maddld( r12, r9, v1, r28) C 4 8 -> addex
+ maddhdu(r11, r9, v1, r28) C 5 9
+ addex( r0, r5, r31, 0) C 7 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): ld r9, 0(up)
+ maddld( r26, r8, v0, r10) C 0 4
+ maddhdu(r31, r8, v0, r10) C 1 5
+ adde r0, r27, r0 C 7 11
+ std r0, 0(rp) C -4
+ maddld( r5, r8, v1, r29) C 1 5
+ maddhdu(r10, r8, v1, r29) C 2 6
+ addex( r0, r12, r30, 0) C 8 12
+L(cj2): maddld( r27, r9, v0, r11) C 1 5 -2
+ maddhdu(r30, r9, v0, r11) C 2 6 -1
+ adde r0, r26, r0 C 8 12 -3
+ std r0, 8(rp) C -3
+ mulld r12, r9, v1 C 2 6 -1
+ mulhdu r11, r9, v1 C 3 7 0 = return limb
+ addex( r0, r5, r31, 0) C 5 9 13
+ adde r0, r27, r0 C 5 9 13 -2
+ std r0, 16(rp) C -2
+ addex( r0, r12, r30, 0) C 6 10 -1
+ adde r0, r0, r10 C -1
+ std r0, 24(rp) C -1
+ li r4, 0
+ addze r3, r11
+ addex( r3, r3, r4, 0)
+
+L(ret): ld r26, -48(r1)
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+EPILOGUE()
+ASM_END()
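mpn_addmul_2 folds a two-limb multiplier into the accumulation, alternating between the CA chain (adde) and the OV chain (addex) so the two recurrences do not stall each other. Judging from the tail code, {rp,n} is read, rp[0..n] is written, and the limb above that is returned; a sketch under those assumptions (ref_addmul_2 is a hypothetical name; internal prototype assumed from gmp-impl.h, 64-bit limbs):

#include <gmp.h>

/* Sketch: R = {rp,n} + {up,n} * {vp,2}; rp[0..n] receive the low n+1
   limbs of R, the top limb is returned.  The two-limb carry window
   (c1,c0) models the asm's interleaved CA and OV chains. */
mp_limb_t
ref_addmul_2 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
              const mp_limb_t *vp)
{
  mp_limb_t c0 = 0, c1 = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 t0 = (unsigned __int128) up[i] * vp[0] + rp[i] + c0;
      unsigned __int128 t1 = (unsigned __int128) up[i] * vp[1]
                             + (mp_limb_t) (t0 >> 64) + c1;
      rp[i] = (mp_limb_t) t0;
      c0 = (mp_limb_t) t1;
      c1 = (mp_limb_t) (t1 >> 64);
    }
  rp[n] = c0;                          /* written, not read */
  return c1;
}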
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm
new file mode 100644
index 0000000..e4ca3a8
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm
@@ -0,0 +1,179 @@
+dnl POWER9 mpn_addmul_1 and mpn_submul_1.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mpn_addmul_1 mpn_submul_1
+C cycles/limb cycles/limb
+C POWER3/PPC630 - -
+C POWER4/PPC970 - -
+C POWER5 - -
+C POWER6 - -
+C POWER7 - -
+C POWER8 - -
+C POWER9 2.63 2.63
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUBC', adde)
+ define(`ADDSUB', addc)
+ define(`func', mpn_addmul_1)
+ define(`AM', `$1')
+ define(`SM', `')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUBC', subfe)
+ define(`ADDSUB', subfc)
+ define(`func', mpn_submul_1)
+ define(`AM', `')
+ define(`SM', `$1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+ cmpdi cr7, n, 3
+ srdi r10, n, 2
+ mtctr r10
+ rldicl. r9, n, 0, 63
+ ld r11, 0(up)
+ bne cr0, L(bx1)
+
+L(bx0): rldicl. r9, n, 63, 63
+AM(` subfzeo r12, n ') C ov = 0, ca = 0
+AM(` li r12, 0 ')
+SM(` subfco r12, r12, r12 ') C r12 = 0, ov = 0, ca = 1
+ ld r9, 8(up)
+ mulld r0, r11, v0
+ mulhdu r5, r11, v0
+ blt cr7, L(2)
+ ld r8, 16(up)
+ bne cr0, L(b10)
+
+L(b00): addi rp, rp, -24
+ b L(lo0)
+L(b10): addi rp, rp, -8
+ addi up, up, 16
+ b L(lo2)
+
+L(2): addi rp, rp, -8
+ b L(cj2)
+
+L(bx1): rldicl. r9, n, 63, 63
+AM(` subfzeo r5, n ') C ov = 0, ca = 0
+AM(` li r5, 0 ')
+SM(` subfco r5, r5, r5 ') C r5 = 0, ov = 0, ca = 1
+ blt cr7, L(1)
+ ld r8, 8(up)
+ mulld r7, r11, v0
+ mulhdu r12, r11, v0
+ ld r9, 16(up)
+ bne cr0, L(b11)
+
+L(b01): addi rp, rp, -16
+ addi up, up, 8
+ b L(lo1)
+
+L(1): mulld r7, r11, v0
+ mulhdu r12, r11, v0
+ ld r11, 0(rp)
+ ADDSUB r10, r7, r11
+ std r10, 0(rp)
+AM(` addze r3, r12 ')
+SM(` subfe r0, r0, r0 ')
+SM(` sub r3, r12, r0 ')
+ blr
+
+L(b11): addi up, up, 24
+ ble cr7, L(end)
+
+ ALIGN(16)
+L(top): ld r11, 0(rp)
+ mulld r0, r8, v0
+ addex( r7, r7, r5, 0)
+ mulhdu r5, r8, v0
+ ld r8, 0(up)
+ ADDSUBC r10, r7, r11
+ std r10, 0(rp)
+L(lo2): ld r11, 8(rp)
+ mulld r7, r9, v0
+ addex( r0, r0, r12, 0)
+ mulhdu r12, r9, v0
+ ld r9, 8(up)
+ ADDSUBC r10, r0, r11
+ std r10, 8(rp)
+L(lo1): ld r11, 16(rp)
+ mulld r0, r8, v0
+ addex( r7, r7, r5, 0)
+ mulhdu r5, r8, v0
+ ld r8, 16(up)
+ ADDSUBC r10, r7, r11
+ std r10, 16(rp)
+L(lo0): ld r11, 24(rp)
+ mulld r7, r9, v0
+ addex( r0, r0, r12, 0)
+ mulhdu r12, r9, v0
+ ld r9, 24(up)
+ ADDSUBC r10, r0, r11
+ std r10, 24(rp)
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): ld r11, 0(rp)
+ mulld r0, r8, v0
+ addex( r7, r7, r5, 0)
+ mulhdu r5, r8, v0
+ ADDSUBC r10, r7, r11
+ std r10, 0(rp)
+L(cj2): ld r11, 8(rp)
+ mulld r7, r9, v0
+ addex( r0, r0, r12, 0)
+ mulhdu r12, r9, v0
+ ADDSUBC r10, r0, r11
+ std r10, 8(rp)
+ ld r11, 16(rp)
+ addex( r7, r7, r5, 0)
+ ADDSUBC r10, r7, r11
+ std r10, 16(rp)
+ li r0, 0
+ addex( r3, r12, r0, 0)
+AM(` addze r3, r3 ')
+SM(` subfe r0, r0, r0 ')
+SM(` sub r3, r3, r0 ')
+ blr
+EPILOGUE()
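This single m4 source expands to both mpn_addmul_1 and mpn_submul_1: ADDSUB/ADDSUBC select addc/adde or subfc/subfe, and the AM/SM macros patch the initial flag setup and the final carry-versus-borrow fixup. The addmul contract is sketched above under addmul_1.asm; mpn_submul_1's documented contract is to subtract {up,n}*v0 from {rp,n} and return the high product limb plus the borrow. A sketch assuming 64-bit limbs (ref_submul_1 is a hypothetical name):

#include <gmp.h>

mp_limb_t
ref_submul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, mp_limb_t v0)
{
  mp_limb_t cy = 0;                    /* product high limb plus borrow */
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
      mp_limb_t lo = (mp_limb_t) p;
      mp_limb_t r = rp[i] - lo;
      cy = (mp_limb_t) (p >> 64) + (rp[i] < lo);   /* fold in the borrow */
      rp[i] = r;
    }
  return cy;
}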
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm
new file mode 100644
index 0000000..2dc982d
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm
@@ -0,0 +1,64 @@
+dnl PowerPC-64 mpn_gcd_11.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 5.75
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
+
+define(`u0', `r3')
+define(`v0', `r4')
+
+define(`cnt', `r9')dnl
+
+ASM_START()
+PROLOGUE(mpn_gcd_11)
+ b L(odd)
+
+ ALIGN(16)
+L(top): isel v0, u0, v0, 29 C v = min(u,v)
+ isel u0, r10, r11, 29 C u = |v - u|
+ srd u0, u0, cnt
+L(odd): subf r10, u0, v0 C r10 = v - u
+ subf r11, v0, u0 C r11 = u - v
+ cmpld cr7, v0, u0
+ cnttzd cnt, r10
+ bne cr7, L(top)
+
+L(end): blr
+EPILOGUE()
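The loop is a right-to-left binary GCD variant for two odd limbs: each step replaces (u,v) with (|u-v| shifted right past its trailing zeros, min(u,v)), computing both differences speculatively and choosing with isel from the cmpld result. The same algorithm in C, assuming odd nonzero inputs, with GCC's __builtin_ctzll standing in for cnttzd (ref_gcd_11 is a hypothetical name):

#include <gmp.h>

mp_limb_t
ref_gcd_11 (mp_limb_t u, mp_limb_t v)
{
  while (u != v)
    {
      mp_limb_t lo = u < v ? u : v;           /* isel: v = min(u,v)  */
      mp_limb_t d  = u < v ? v - u : u - v;   /* isel: |u - v|, even */
      u = d >> __builtin_ctzll (d);           /* cnttzd + srd        */
      v = lo;
    }
  return u;
}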
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm
new file mode 100644
index 0000000..12d11b0
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm
@@ -0,0 +1,143 @@
+dnl PowerPC-64 mpn_gcd_22 optimised for POWER9.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 9.58
+
+C We define SLOW if this target uses a slow struct return mechanism, with
+C r3 as an implicit parameter for the struct pointer.
+undefine(`SLOW')dnl
+ifdef(`AIX',`define(`SLOW',`due to AIX')',`
+ ifdef(`DARWIN',,`
+ ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl
+ ')
+')
+
+ifdef(`SLOW',`
+define(`IFSLOW', `$1')
+define(`u1', `r4')
+define(`u0', `r5')
+define(`v1', `r6')
+define(`v0', `r7')
+',`
+define(`IFSLOW', `')
+define(`u1', `r3')
+define(`u0', `r4')
+define(`v1', `r5')
+define(`v0', `r6')
+')
+
+define(`tmp', `r0')
+define(`t0', `r8')
+define(`t1', `r9')
+define(`s0', `r10')
+define(`s1', `r11')
+define(`cnt', `r12')
+
+ASM_START()
+PROLOGUE(mpn_gcd_22)
+ cmpld cr7, v0, u0
+L(top): subfc t0, v0, u0 C 0 12
+ beq cr7, L(lowz)
+ subfe t1, v1, u1 C 2 14
+ subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit
+ subfc s0, u0, v0 C 0
+ subfe s1, u1, v1 C 2
+
+L(bck): cnttzd cnt, t0 C 2
+ subfic tmp, cnt, 64 C 4
+
+ isel v0, v0, u0, 2 C 6 use condition set by subfe
+ isel u0, t0, s0, 2 C 6
+ isel v1, v1, u1, 2 C 6
+ isel u1, t1, s1, 2 C 6
+
+ srd u0, u0, cnt C 8
+ sld tmp, u1, tmp C 8
+ srd u1, u1, cnt C 8
+ or u0, u0, tmp C 10
+
+ or. r0, u1, v1 C 10
+ cmpld cr7, v0, u0
+ bne L(top)
+
+
+ b L(odd)
+ ALIGN(16)
+L(top1):isel v0, u0, v0, 29 C v = min(u,v)
+ isel u0, r10, r11, 29 C u = |u - v|
+ srd u0, u0, cnt
+L(odd): subf r10, u0, v0 C r10 = v - u
+ subf r11, v0, u0 C r11 = u - v
+ cmpld cr7, v0, u0
+ cnttzd cnt, r10
+ bne cr7, L(top1)
+
+ifdef(`SLOW',`
+ std v0, 0(r3)
+ std r10, 8(r3)
+',`
+ mr r3, v0
+ li r4, 0
+')
+ blr
+
+
+L(lowz):C We come here when v0 - u0 = 0
+ C 1. If v1 - u1 = 0, then gcd is u = v.
+ C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+ subfc. t0, v1, u1 C 2 8
+ beq L(end)
+ li t1, 0
+ subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit
+ subf s0, u1, v1 C 2
+ li s1, 0
+ b L(bck)
+
+L(end):
+ifdef(`SLOW',`
+ std v0, 0(r3)
+ std v1, 8(r3)
+ blr
+',`
+ mr r3, v0
+ mr r4, v1
+ blr
+')
+EPILOGUE()
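mpn_gcd_22 runs the same subtract-and-shift loop on two-limb operands: the srd/sld pair plus or performs the 128-bit right shift across the limb boundary, and L(lowz) covers a zero low difference, where the shift count would otherwise be 64. The SLOW machinery exists because the function returns a two-limb struct (mp_double_limb_t); under the ELFv1 and AIX ABIs such a struct comes back through a hidden pointer in r3, shifting every register argument up by one. A compact C model of the algorithm with unsigned __int128 standing in for the limb pair (ref_gcd_22 is a hypothetical name; inputs odd and nonzero; the asm strips all trailing zeros at once with cnttzd rather than bit by bit):

#include <gmp.h>

unsigned __int128
ref_gcd_22 (unsigned __int128 u, unsigned __int128 v)
{
  while (u != v)
    {
      unsigned __int128 d = u > v ? u - v : v - u;   /* |u - v|, even */
      v = u < v ? u : v;                             /* v = min(u,v)  */
      do
        d >>= 1;                     /* asm: one cnttzd + double shift */
      while ((d & 1) == 0);
      u = d;
    }
  return u;                          /* two limbs; via memory if SLOW */
}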
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h
new file mode 100644
index 0000000..f29a84e
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h
@@ -0,0 +1,254 @@
+/* POWER9 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 2200MHz POWER9 */
+/* FFT tuning limit = 221,245,838 */
+/* Generated by tuneup.c, 2019-10-29, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 7
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 44
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11
+#define USE_PREINV_DIVREM_1 0
+/* From gcc120.osuosl.org, 2023-07-27 */
+#define DIV_QR_1N_PI1_METHOD 3 /* 6.48% faster than 4 */
+#define DIV_QR_1_NORM_THRESHOLD 3
+#define DIV_QR_1_UNNORM_THRESHOLD 2
+#define DIV_QR_2_PI2_THRESHOLD 7
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 33
+
+#define DIV_1_VS_MUL_1_PERCENT 365
+
+#define MUL_TOOM22_THRESHOLD 34
+#define MUL_TOOM33_THRESHOLD 109
+#define MUL_TOOM44_THRESHOLD 458
+#define MUL_TOOM6H_THRESHOLD 517
+#define MUL_TOOM8H_THRESHOLD 608
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 292
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 204
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 211
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 178
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 46
+#define SQR_TOOM3_THRESHOLD 158
+#define SQR_TOOM4_THRESHOLD 674
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 898
+
+#define MULMID_TOOM42_THRESHOLD 70
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 25
+
+#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 404, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 13, 5}, { 27, 6}, { 27, 7}, { 14, 6}, \
+ { 29, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 35, 8}, { 71, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 159,11}, { 95,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,12}, { 95,11}, { 191,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,10}, { 703,11}, { 367,10}, \
+ { 735,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,12}, { 223,11}, { 447,10}, \
+ { 895,11}, { 479,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \
+ { 703,10}, { 1407,11}, { 735,13}, { 191,12}, \
+ { 383,11}, { 767,10}, { 1535,11}, { 799,12}, \
+ { 415,11}, { 831,10}, { 1663,11}, { 863,12}, \
+ { 447,11}, { 895,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
+ { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
+ { 1343,12}, { 703,11}, { 1407,12}, { 735,11}, \
+ { 1471,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \
+ { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \
+ { 703,12}, { 1471,14}, { 383,13}, { 767,12}, \
+ { 1599,13}, { 831,12}, { 1727,13}, { 895,11}, \
+ { 3583,12}, { 1919,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \
+ { 1599,12}, { 3199,13}, { 1727,14}, { 895,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1919,16}, { 511,15}, { 1023,14}, \
+ { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
+ { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \
+ { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \
+ { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \
+ { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 243
+#define MUL_FFT_THRESHOLD 3712
+
+#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 404, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 29, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 29, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,12}, \
+ { 95,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,11}, { 367,10}, { 735,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,12}, { 223,11}, \
+ { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 671,12}, { 351,11}, { 703,10}, { 1407,11}, \
+ { 735,13}, { 191,12}, { 383,11}, { 767,10}, \
+ { 1535,12}, { 415,11}, { 831,12}, { 447,11}, \
+ { 895,12}, { 479,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \
+ { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \
+ { 831,13}, { 447,12}, { 895,11}, { 1791,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1087,13}, { 575,12}, { 1215,13}, \
+ { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \
+ { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \
+ { 1727,13}, { 895,12}, { 1791,13}, { 959,15}, \
+ { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \
+ { 1599,12}, { 3199,13}, { 1727,14}, { 895,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,14}, \
+ { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \
+ { 4479,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \
+ { 6911,15}, { 1791,14}, { 3839,16}, { 1023,15}, \
+ { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \
+ { 2559,14}, { 5119,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \
+ { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \
+ { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \
+ { 3071,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 230
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 3
+#define MULLO_DC_THRESHOLD 39
+#define MULLO_MUL_N_THRESHOLD 7246
+#define SQRLO_BASECASE_THRESHOLD 6
+#define SQRLO_DC_THRESHOLD 40
+#define SQRLO_SQR_THRESHOLD 6440
+
+#define DC_DIV_QR_THRESHOLD 30
+#define DC_DIVAPPR_Q_THRESHOLD 88
+#define DC_BDIV_QR_THRESHOLD 35
+#define DC_BDIV_Q_THRESHOLD 62
+
+#define INV_MULMOD_BNM1_THRESHOLD 79
+#define INV_NEWTON_THRESHOLD 11
+#define INV_APPR_THRESHOLD 11
+
+#define BINV_NEWTON_THRESHOLD 264
+#define REDC_1_TO_REDC_2_THRESHOLD 8
+#define REDC_2_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 1442
+#define MU_DIVAPPR_Q_THRESHOLD 1470
+#define MUPI_DIV_QR_THRESHOLD 0 /* always */
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1652
+
+#define POWM_SEC_TABLE 1,16,151,839
+
+#define GET_STR_DC_THRESHOLD 7
+#define GET_STR_PRECOMPUTE_THRESHOLD 15
+#define SET_STR_DC_THRESHOLD 406
+#define SET_STR_PRECOMPUTE_THRESHOLD 885
+
+#define FAC_DSC_THRESHOLD 179
+#define FAC_ODD_THRESHOLD 53
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD2_DIV1_METHOD 1 /* 9.10% faster than 3 */
+#define HGCD_THRESHOLD 45
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 2479
+#define GCD_DC_THRESHOLD 321
+#define GCDEXT_DC_THRESHOLD 258
+#define JACOBI_BASE_METHOD 4 /* 15.45% faster than 1 */
+
+/* Tuneup completed successfully, took 179422 seconds */
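These constants drive GMP's algorithm selection: each threshold is the operand size, in limbs, at which the next algorithm starts winning on this particular CPU, which is how one binary adapts per machine. A hypothetical sketch of the multiply dispatch, loosely modeled on mpn/generic/mul_n.c; the threshold names are real, while the function body, scratch sizing, and deeper tiers are elided or simplified:

#include <gmp.h>
#include "gmp-impl.h"   /* internal: thresholds, mpn_mul_basecase, ... */

void
mul_n_sketch (mp_limb_t *p, const mp_limb_t *a, const mp_limb_t *b,
              mp_size_t n, mp_limb_t *scratch)
{
  if (n < MUL_TOOM22_THRESHOLD)          /* 34 limbs on this POWER9 */
    mpn_mul_basecase (p, a, n, b, n);    /* the asm later in this diff */
  else if (n < MUL_TOOM33_THRESHOLD)     /* 109 limbs */
    mpn_toom22_mul (p, a, n, b, n, scratch);
  else
    { /* ... toom33/toom44/toom6h/toom8h tiers, then FFT beyond
         MUL_FFT_THRESHOLD (3712 limbs) ... */ }
}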
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm
new file mode 100644
index 0000000..363f095
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm
@@ -0,0 +1,126 @@
+dnl Power9 mpn_mul_1.
+
+dnl Copyright 2017, 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 ?
+C POWER7 ?
+C POWER8 ?
+C POWER9 2.47
+
+C TODO
+C * Schedule for Power9 pipeline.
+C * Unroll 4x if that proves beneficial.
+C * This is marginally faster (but much smaller) than ../mul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+ b L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+ li r7, 0
+L(ent): ld r11, 0(up)
+ cmpdi cr6, n, 2
+ addi r0, n, -1 C FIXME: postpone
+ srdi r0, r0, 1 C FIXME: postpone
+ mtctr r0 C FIXME: postpone
+ rldicl. r12, n, 0,63 C r12 = n & 1, set cr0
+ bne cr0, L(b1)
+
+L(b0): ld r0, 8(up)
+ maddld( r9, r11, v0, r7)
+ maddhdu(r7, r11, v0, r7)
+ ble cr6, L(2)
+ ld r12, 16(up)
+ mulld r8, r0, v0
+ mulhdu r5, r0, v0
+ addic up, up, 16
+ addi rp, rp, -8
+ b L(mid)
+
+L(b1): ld r0, 0(up)
+ ble cr6, L(1)
+ ld r12, 8(up)
+ maddld( r8, r11, v0, r7)
+ maddhdu(r5, r11, v0, r7)
+ ld r0, 16(up)
+ mulld r9, r12, v0
+ mulhdu r7, r12, v0
+ addic up, up, 24
+ bdz L(end)
+
+ ALIGN(16)
+L(top): ld r12, 0(up)
+ std r8, 0(rp)
+ adde r9, r5, r9
+ mulld r8, r0, v0
+ mulhdu r5, r0, v0
+L(mid): ld r0, 8(up)
+ std r9, 8(rp)
+ adde r8, r7, r8
+ mulld r9, r12, v0
+ mulhdu r7, r12, v0
+ addi rp, rp, 16
+ addi up, up, 16
+ bdnz L(top)
+
+L(end): std r8, 0(rp)
+ mulld r8, r0, v0
+ adde r9, r5, r9
+ mulhdu r5, r0, v0
+ std r9, 8(rp)
+ adde r8, r7, r8
+ std r8, 16(rp)
+ addze r3, r5
+ blr
+
+L(2): mulld r8, r0, v0
+ mulhdu r5, r0, v0
+ std r9, 0(rp)
+ addc r8, r7, r8
+ std r8, 8(rp)
+ addze r3, r5
+ blr
+
+L(1): maddld( r8, r0, v0, r7)
+ std r8, 0(rp)
+ maddhdu(r3, r0, v0, r7)
+ blr
+EPILOGUE()
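Note the twin entry points: mpn_mul_1c takes an explicit carry-in limb, while mpn_mul_1 reaches the same code with r7 zeroed. In C terms, with mpn_mul_1 being documented mpn API and mpn_mul_1c an internal entry whose prototype is assumed from gmp-impl.h (ref_mul_1c is a hypothetical name, 64-bit limbs):

#include <gmp.h>

mp_limb_t
ref_mul_1c (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
            mp_limb_t v0, mp_limb_t c)
{
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + c;
      rp[i] = (mp_limb_t) t;
      c = (mp_limb_t) (t >> 64);
    }
  return c;                           /* mpn_mul_1 is the c == 0 entry */
}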
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm
new file mode 100644
index 0000000..01b50a3
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm
@@ -0,0 +1,181 @@
+dnl Power9 mpn_mul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C power9: 1.58
+
+C STATUS
+C * Not written with any power9 pipeline understanding.
+C * The 4x unrolling was not motivated by any timing tests.
+C * No local scheduling for performance tweaking has been done.
+C * Decrease load scheduling!
+
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5') C Note: Reused as scratch
+define(`vp', `r6') C Note: Reused for v1
+
+define(`v0', `r7')
+define(`v1', `r6')
+
+
+ASM_START()
+PROLOGUE(mpn_mul_2)
+ std r28, -32(r1)
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+
+ subfic r0, n, 0 C clear CA
+ subfo r0, r0, r0 C clear OV and r0
+
+ cmpdi cr7, n, 4
+
+ ld v0, 0(vp)
+ ld v1, 8(vp)
+
+ srdi r10, n, 2
+ mtctr r10
+
+ rldicl. r9, n, 0, 63
+ bne cr0, L(bx1)
+
+L(bx0): rldicl. r9, n, 63, 63
+
+ ld r8, 0(up)
+ ld r9, 8(up)
+ li r11, 0
+ mulld r28, r8, v0
+ mulhdu r31, r8, v0
+ blt cr7, L(2)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ bne cr0, L(b10)
+
+L(b00): addi up, up, -8
+ addi rp, rp, -24
+ b L(lo0)
+
+L(b10): addi up, up, 8
+ addi rp, rp, -8
+ b L(lo2)
+
+L(2): addi rp, rp, -8
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ b L(cj2)
+
+L(bx1): rldicl. r9, n, 63, 63
+
+ ld r9, 0(up)
+ ld r8, 8(up)
+ li r10, 0
+ mulld r29, r9, v0
+ mulhdu r30, r9, v0
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ bne cr0, L(b11)
+
+L(b01): addi rp, rp, -16
+ b L(lo1)
+L(b11): addi up, up, 16
+ blt cr7, L(end)
+
+L(top): ld r9, 0(up)
+ maddld( r28, r8, v0, r10) C 0 4 -> adde
+ maddhdu(r31, r8, v0, r10) C 1 5
+ adde r0, r29, r0 C 7 11
+ std r0, 0(rp)
+ mulld r5, r8, v1 C 1 5 -> addex
+ mulhdu r10, r8, v1 C 2 6
+ addex( r0, r12, r30, 0) C 8 12
+L(lo2): ld r8, 8(up)
+ maddld( r29, r9, v0, r11) C 1 5 -> adde
+ maddhdu(r30, r9, v0, r11) C 2 6
+ adde r0, r28, r0 C 8 12
+ std r0, 8(rp)
+ mulld r12, r9, v1 C 2 6 -> addex
+ mulhdu r11, r9, v1 C 3 7
+ addex( r0, r5, r31, 0) C 5 9 13
+L(lo1): ld r9, 16(up)
+ maddld( r28, r8, v0, r10) C 2 6 -> adde
+ maddhdu(r31, r8, v0, r10) C 3 7
+ adde r0, r29, r0 C 5 9 13
+ std r0, 16(rp)
+ mulld r5, r8, v1 C 3 7 -> addex
+ mulhdu r10, r8, v1 C 4 8
+ addex( r0, r12, r30, 0) C 6 10
+L(lo0): ld r8, 24(up)
+ maddld( r29, r9, v0, r11) C 3 7 -> adde
+ maddhdu(r30, r9, v0, r11) C 4 8
+ adde r0, r28, r0 C 6 10
+ std r0, 24(rp)
+ mulld r12, r9, v1 C 4 8 -> addex
+ mulhdu r11, r9, v1 C 5 9
+ addex( r0, r5, r31, 0) C 7 11
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): ld r9, 0(up)
+ maddld( r28, r8, v0, r10) C 0 4
+ maddhdu(r31, r8, v0, r10) C 1 5
+ adde r0, r29, r0 C 7 11
+ std r0, 0(rp) C -4
+ mulld r5, r8, v1 C 1 5
+ mulhdu r10, r8, v1 C 2 6
+ addex( r0, r12, r30, 0) C 8 12
+L(cj2): maddld( r29, r9, v0, r11) C 1 5 -2
+ maddhdu(r30, r9, v0, r11) C 2 6 -1
+ adde r0, r28, r0 C 8 12 -3
+ std r0, 8(rp) C -3
+ mulld r12, r9, v1 C 2 6 -1
+ mulhdu r11, r9, v1 C 3 7 0 = return limb
+ addex( r0, r5, r31, 0) C 5 9 13
+ adde r0, r29, r0 C 5 9 13 -2
+ std r0, 16(rp) C -2
+ addex( r0, r12, r30, 0) C 6 10 -1
+ adde r0, r0, r10 C -1
+ std r0, 24(rp) C -1
+ li r4, 0
+ addze r3, r11
+ addex( r3, r3, r4, 0)
+
+L(ret): ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+EPILOGUE()
+ASM_END()
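mpn_mul_2 is the non-accumulating analogue of addmul_2 above: {rp,n+1} is set to {up,n} times the two-limb multiplier at vp, and the most significant limb is returned; rp is only written, never read. The recurrence matches the earlier ref_addmul_2 sketch minus the rp[i] addend (ref_mul_2 is a hypothetical name, internal prototype assumed):

#include <gmp.h>

mp_limb_t
ref_mul_2 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
           const mp_limb_t *vp)
{
  mp_limb_t c0 = 0, c1 = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 t0 = (unsigned __int128) up[i] * vp[0] + c0;
      unsigned __int128 t1 = (unsigned __int128) up[i] * vp[1]
                             + (mp_limb_t) (t0 >> 64) + c1;
      rp[i] = (mp_limb_t) t0;
      c0 = (mp_limb_t) t1;
      c1 = (mp_limb_t) (t1 >> 64);
    }
  rp[n] = c0;
  return c1;
}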
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm
new file mode 100644
index 0000000..8f3d322
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm
@@ -0,0 +1,415 @@
+dnl Power9 mpn_mul_basecase.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 1.62
+
+C TODO
+C * Check if (inner) loop alignment affects performance.
+C * Could we schedule loads less in addmul_2/mul_2? That would save some regs
+C and make the tail code more manageable.
+C * Postpone some register saves to main loop.
+C * Perhaps write more small operands (3x1, 3x2, 3x3) code.
+C * Consider restoring rp,up after loop using arithmetic, eliminating rp2, up2.
+C On the other hand, the current rp,up restore registers are useful for OSP.
+C * Do OSP. This should save a lot with the current deep addmul_2 pipeline.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+define(`vp', `r6')
+define(`vn', `r7')
+
+define(`v0', `r0')
+define(`v1', `r7')
+define(`rp2', `r24')
+define(`up2', `r25')
+
+ASM_START()
+PROLOGUE(mpn_mul_basecase)
+ cmpdi cr0, un, 2
+ bgt cr0, L(un_gt2)
+ cmpdi cr6, vn, 1
+ ld r7, 0(vp)
+ ld r5, 0(up)
+ mulld r8, r5, r7 C weight 0
+ mulhdu r9, r5, r7 C weight 1
+ std r8, 0(rp)
+ beq cr0, L(2x)
+ std r9, 8(rp)
+ blr
+ ALIGN(16)
+L(2x): ld r0, 8(up)
+ mulld r8, r0, r7 C weight 1
+ mulhdu r10, r0, r7 C weight 2
+ addc r9, r9, r8
+ addze r10, r10
+ bne cr6, L(2x2)
+ std r9, 8(rp)
+ std r10, 16(rp)
+ blr
+ ALIGN(16)
+L(2x2): ld r6, 8(vp)
+ mulld r8, r5, r6 C weight 1
+ mulhdu r11, r5, r6 C weight 2
+ addc r9, r9, r8
+ std r9, 8(rp)
+ adde r11, r11, r10
+ mulld r12, r0, r6 C weight 2
+ mulhdu r0, r0, r6 C weight 3
+ addze r0, r0
+ addc r11, r11, r12
+ addze r0, r0
+ std r11, 16(rp)
+ std r0, 24(rp)
+ blr
+
+L(un_gt2):
+ std r22, -80(r1)
+ std r23, -72(r1)
+ std r24, -64(r1)
+ std r25, -56(r1)
+ std r26, -48(r1)
+ std r27, -40(r1)
+ std r28, -32(r1)
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+ mr rp2, r3 C rp
+ mr up2, r4 C up
+ srdi r22, r5, 2 C un
+ subfic r23, r7, 0 C -vn, clear CA
+ subfo r0, r0, r0 C clear OV (and r0)
+
+ cmpdi cr6, un, 3
+ rldicl r0, un, 0, 63 C r0 = un & 1
+ cmpdi cr7, r0, 0
+ rldicl r0, un, 63, 63 C FIXME: unused for vn = 1
+ cmpdi cr5, r0, 0 C FIXME: unused for vn = 1
+
+ ld v0, 0(vp)
+ rldicl. r9, vn, 0, 63
+ beq cr0, L(vn_evn)
+
+L(vn_odd):
+ addi r10, un, -2
+ ld r5, 0(up)
+ srdi r10, r10, 1
+ mtctr r10
+ bne cr7, L(m1_b1)
+
+L(m1_b0):
+ ld r10, 8(up)
+ mulld r9, r5, v0
+ mulhdu r11, r5, v0
+ ld r12, 16(up)
+ mulld r8, r10, v0
+ mulhdu r5, r10, v0
+ addi rp, rp, -8
+ b L(m1_mid)
+
+L(m1_b1):
+ ld r12, 8(up)
+ mulld r8, r5, v0
+ mulhdu r5, r5, v0
+ ld r10, 16(up)
+ mulld r9, r12, v0
+ mulhdu r11, r12, v0
+ addi up, up, 8
+ beq cr6, L(m1_end) C jump taken means un = 3, vn = {1,3}
+
+ ALIGN(16)
+L(m1_top):
+ ld r12, 16(up)
+ std r8, 0(rp)
+ adde r9, r5, r9
+ mulld r8, r10, v0
+ mulhdu r5, r10, v0
+L(m1_mid):
+ ld r10, 24(up)
+ std r9, 8(rp)
+ adde r8, r11, r8
+ mulld r9, r12, v0
+ mulhdu r11, r12, v0
+ addi rp, rp, 16
+ addi up, up, 16
+ bdnz L(m1_top)
+
+L(m1_end):
+ std r8, 0(rp)
+ mulld r8, r10, v0
+ adde r9, r5, r9
+ mulhdu r5, r10, v0
+ std r9, 8(rp)
+ adde r8, r11, r8
+ std r8, 16(rp)
+ addze r10, r5
+ std r10, 24(rp)
+
+ addi rp2, rp2, 8
+ addi vp, vp, 8
+ addic. r23, r23, 1
+ b L(do_outer)
+
+L(vn_evn):
+ ld v1, 8(vp)
+ addi r23, r23, 2
+ mtctr r22
+ bne cr7, L(m2_bx1)
+
+L(m2_bx0):
+ ld r8, 0(up)
+ ld r9, 8(up)
+ li r11, 0
+ mulld r28, r8, v0
+ mulhdu r31, r8, v0
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ li r12, 0
+ bne cr5, L(m2_b10)
+
+L(m2_b00):
+ addi up, up, -8
+ addi rp, rp, -24
+ b L(m2_lo0)
+
+L(m2_b10):
+ addi up, up, 8
+ addi rp, rp, -8
+ b L(m2_lo2)
+
+L(m2_bx1):
+ ld r9, 0(up)
+ ld r8, 8(up)
+ li r10, 0
+ mulld r29, r9, v0
+ mulhdu r30, r9, v0
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ li r5, 0
+ bne cr5, L(m2_b11)
+
+L(m2_b01):
+ addi rp, rp, -16
+ b L(m2_lo1)
+L(m2_b11):
+ addi up, up, 16
+ beq cr6, L(m2_end) C taken means un = 3, vn = 2. We're done.
+
+L(m2_top):
+ ld r9, 0(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ addex( r12, r12, r30, 0)
+L(m2_lo2):
+ ld r8, 8(up)
+ maddld( r29, r9, v0, r11)
+ maddhdu(r30, r9, v0, r11)
+ adde r12, r28, r12
+ std r12, 8(rp)
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ addex( r5, r5, r31, 0)
+L(m2_lo1):
+ ld r9, 16(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ std r5, 16(rp)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ addex( r12, r12, r30, 0)
+L(m2_lo0):
+ ld r8, 24(up)
+ maddld( r29, r9, v0, r11)
+ maddhdu(r30, r9, v0, r11)
+ adde r12, r28, r12
+ std r12, 24(rp)
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ addex( r5, r5, r31, 0)
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(m2_top)
+
+L(m2_end):
+ ld r9, 0(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ b L(cj)
+
+L(outer):
+ ld v0, 0(vp)
+ ld v1, 8(vp)
+ addi r23, r23, 2
+ mtctr r22
+ bne cr7, L(bx1)
+
+L(bx0): ld r26, 0(rp2)
+ ld r8, 0(up2)
+ ld r11, 8(rp2)
+ ld r9, 8(up2)
+ maddld( r28, r8, v0, r26)
+ maddhdu(r31, r8, v0, r26)
+ ld r26, 16(rp2)
+ mulld r5, r8, v1
+ mulhdu r10, r8, v1
+ li r12, 0
+ bne cr5, L(b10)
+
+L(b00): addi up, up2, -8
+ addi rp, rp2, -24
+ b L(lo0)
+
+L(b10): addi up, up2, 8
+ addi rp, rp2, -8
+ b L(lo2)
+
+L(bx1): ld r27, 0(rp2)
+ ld r9, 0(up2)
+ ld r10, 8(rp2)
+ ld r8, 8(up2)
+ maddld( r29, r9, v0, r27)
+ maddhdu(r30, r9, v0, r27)
+ ld r27, 16(rp2)
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ li r5, 0
+ bne cr5, L(b11)
+
+L(b01): addi up, up2, 0
+ addi rp, rp2, -16
+ b L(lo1)
+L(b11): addi up, up2, 16
+ addi rp, rp2, 0
+ beq cr6, L(end) C taken means un = 3, vn = 3. We're done.
+
+L(top): ld r9, 0(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ ld r26, 24(rp)
+ std r5, 0(rp)
+ maddld( r5, r8, v1, r27)
+ maddhdu(r10, r8, v1, r27)
+ addex( r12, r12, r30, 0)
+L(lo2): ld r8, 8(up)
+ maddld( r29, r9, v0, r11)
+ maddhdu(r30, r9, v0, r11)
+ adde r12, r28, r12
+ ld r27, 32(rp)
+ std r12, 8(rp)
+ maddld( r12, r9, v1, r26)
+ maddhdu(r11, r9, v1, r26)
+ addex( r5, r5, r31, 0)
+L(lo1): ld r9, 16(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ ld r26, 40(rp)
+ std r5, 16(rp)
+ maddld( r5, r8, v1, r27)
+ maddhdu(r10, r8, v1, r27)
+ addex( r12, r12, r30, 0)
+L(lo0): ld r8, 24(up)
+ maddld( r29, r9, v0, r11)
+ maddhdu(r30, r9, v0, r11)
+ adde r12, r28, r12
+ ld r27, 48(rp)
+ std r12, 24(rp)
+ maddld( r12, r9, v1, r26)
+ maddhdu(r11, r9, v1, r26)
+ addex( r5, r5, r31, 0)
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): ld r9, 0(up)
+ maddld( r28, r8, v0, r10)
+ maddhdu(r31, r8, v0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ maddld( r5, r8, v1, r27)
+ maddhdu(r10, r8, v1, r27)
+L(cj): addex( r12, r12, r30, 0)
+ maddld( r29, r9, v0, r11)
+ maddhdu(r30, r9, v0, r11)
+ adde r12, r28, r12
+ std r12, 8(rp)
+ mulld r12, r9, v1
+ mulhdu r11, r9, v1
+ addex( r5, r5, r31, 0)
+ adde r5, r29, r5
+ std r5, 16(rp)
+ addex( r12, r12, r30, 0)
+ adde r12, r12, r10
+ std r12, 24(rp)
+ li r4, 0
+ addze r5, r11
+ addex( r5, r5, r4, 0)
+ std r5, 32(rp)
+
+ cmpdi cr0, r23, 0
+ addi rp2, rp2, 16
+ addi vp, vp, 16
+L(do_outer):
+ bne cr0, L(outer)
+L(ret):
+ ld r22, -80(r1)
+ ld r23, -72(r1)
+ ld r24, -64(r1)
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+EPILOGUE()
+ASM_END()
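The overall structure is schoolbook multiplication arranged for the 2-way kernels: one initial mul_1 or mul_2 pass depending on vn's parity, then addmul_2 passes consuming two v limbs each, with r23 counting the remaining outer iterations. Semantically it reduces to one multiply row per v limb; a sketch via the documented mpn_mul_1/mpn_addmul_1 calls (ref_mul_basecase is a hypothetical name; the internal routine requires un >= vn >= 1):

#include <gmp.h>

/* {rp, un+vn} = {up,un} * {vp,vn}.  One row per v limb; the asm
   above fuses rows in pairs via its mul_2/addmul_2 kernels. */
void
ref_mul_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t un,
                  const mp_limb_t *vp, mp_size_t vn)
{
  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);              /* first row */
  for (mp_size_t i = 1; i < vn; i++)                   /* remaining rows */
    rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
}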
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm
new file mode 100644
index 0000000..2d4fa63
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm
@@ -0,0 +1,555 @@
+dnl Power9 mpn_sqr_basecase.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 1.62
+
+C TODO
+C * Completely separate even and odd code into two outer loops. Also consider
+C unrolling these two outer loops and thereby eliminate all branches.
+C * Avoid the reloading of u1 before every loop start.
+C * Reduce register usage.
+C * Consider getting rid of cy and instead load 3 u limbs, use addc+adde+adde.
+C * Consider skewing conditional adjustments to allow mask creation with subfe
+C like in the un=3 code. It might streamline the adjustments (or not).
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+
+define(`u0', `r0')
+define(`u1', `r7')
+define(`rp2', `r24')
+define(`up2', `r25')
+define(`cy', `r6')
+
+define(`LSHU1U0',`
+ addc u0, u0, u0
+ adde u1, u1, u1
+ li cy, 0
+ addze cy, cy
+')
+define(`LSHU1U',`
+ addc u0, u0, u0
+ add u0, u0, cy
+ adde u1, u1, u1
+ li cy, 0
+ addze cy, cy
+')
+define(`LSHU1UF',`
+ addc u0, u0, u0
+ add u0, u0, cy
+ adde u1, u1, u1
+')
+define(`LSHU1UHF',`
+ add u0, u0, u0
+ add u0, u0, cy
+')
+C These are cleverer replacements, but they tend to leave CA set, disturbing
+C the main accumulation code! Breaking that false dependency might have a
+C positive performance impact. Note that the subfe here results in a mask for
+C our adjustments.
+define(`xLSHU1U0',`
+ addc u0, u0, u0
+ adde u1, u1, u1
+ subfe cy, cy, cy
+')
+define(`xLSHU1U',`
+ subfic cy, cy, 0
+ adde u0, u0, u0
+ adde u1, u1, u1
+ subfe cy, cy, cy
+')
+define(`xLSHU1U',`
+ subfic cy, cy, 0
+ adde u0, u0, u0
+')
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+ ld r0, 0(up) C n = 1
+ mulld r8, r0, r0 C weight 0
+ mulhdu r9, r0, r0 C weight 1
+ std r8, 0(rp)
+ cmpdi cr0, un, 2
+ bge cr0, L(ge2)
+ std r9, 8(rp)
+ blr
+
+L(ge2): bgt cr0, L(gt2)
+ ld r6, 8(up)
+ mulld r10, r6, r6 C u1 * u1
+ mulhdu r11, r6, r6 C u1 * u1
+ mulld r4, r6, r0 C u1 * u0
+ mulhdu r5, r6, r0 C u1 * u0
+ addc r4, r4, r4
+ adde r5, r5, r5
+ addze r11, r11
+ addc r9, r9, r4
+ adde r10, r10, r5
+ addze r11, r11
+ std r9, 8(rp)
+ std r10, 16(rp)
+ std r11, 24(rp)
+ blr
+
+L(gt2): cmpdi cr0, un, 3
+ bgt cr0, L(gt3)
+ std r30, -16(r1)
+ std r31, -8(r1)
+ subfo r12, r12, r12 C clear OV (and result register)
+ ld r8, 8(r4)
+ mulld r5, r8, r8 C W2
+ mulhdu r10, r8, r8 C W3
+ sradi r11, u0, 63 C CAUTION: clobbers CA
+ and r11, r11, r8 C W3
+ addc u0, u0, u0
+ adde u1, r8, r8
+ subfe r6, r6, r6 C mask
+ ld r4, 16(r4) C W2
+ mulld r12, r8, u0 C W1 u1 x u0
+ mulhdu r8, r8, u0 C W2 u1 x u0
+ maddld( r31, r4, u0, r11) C W2
+ maddhdu(r30, r4, u0, r11) C W3
+ andc r6, r4, r6 C W4
+ addc r9, r12, r9 C W1
+ std r9, 8(rp) C W1
+ mulld r9, r4, u1 C W3
+ mulhdu r11, r4, u1 C W4
+ addex( r5, r5, r8, 0) C W2
+ adde r5, r31, r5 C W2
+ std r5, 16(rp) C W2
+ maddld( r5, r4, r4, r6) C W4 u2^2
+ maddhdu(r6, r4, r4, r6) C W5 u2^2
+ addex( r9, r9, r30, 0) C W3
+ adde r9, r9, r10 C W3
+ std r9, 24(rp) C W3
+ adde r5, r5, r11 C W4
+ addze r6, r6 C W5
+ li r8, 0
+ addex( r5, r5, r8, 0) C W4
+ std r5, 32(rp) C W4
+ addex( r6, r6, r8, 0) C W5
+ std r6, 40(rp) C W5
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+
+L(gt3): std r22, -80(r1)
+ std r23, -72(r1)
+ std r24, -64(r1)
+ std r25, -56(r1)
+ std r26, -48(r1)
+ std r27, -40(r1)
+ std r28, -32(r1)
+ std r29, -24(r1)
+ std r30, -16(r1)
+ std r31, -8(r1)
+
+ mr rp2, rp
+ mr up2, up
+ addi r22, un, -1 C count for loop FIXME: Adjust
+ subfo r0, r0, r0 C clear OV (and r0)
+ rldicl r0, un, 0, 63 C r0 = un & 1
+ cmpdi cr7, r0, 0
+
+ ld u0, 0(up2)
+ ld u1, 8(up2)
+
+ cmpdi cr5, r22, 4
+ srdi r31, r22, 2
+ addi r22, r22, -2
+ mtctr r31
+
+ beq cr7, L(m2_evn)
+L(m2_odd):
+ rldicl. r31, r22, 63, 63 C r22 & 2
+ mulld r23, u0, u0
+ mulhdu r12, u0, u0
+ mulld r5, u1, u1
+ mulhdu r10, u1, u1
+
+ sradi r11, u0, 63
+ and r11, r11, u1
+
+ LSHU1U0
+
+ ld r8, 8(up2)
+ ld r9, 16(up2)
+ mulld r28, r8, u0 C W u1 x u0
+ mulhdu r31, r8, u0 C W u1 x u0
+ std r23, 0(rp2)
+
+ bne cr0, L(m2_11)
+L(m2_01):
+ addi up, up2, 16
+ addi rp, rp2, 0
+ b L(m2_lo2)
+L(m2_11):
+ addi up, up2, 0
+ addi rp, rp2, -16
+ b L(m2_lo0)
+
+L(m2_evn):
+ rldicl. r31, r22, 63, 63 C r22 & 2
+ mulld r23, u0, u0
+ mulhdu r5, u0, u0
+ mulld r12, u1, u1
+ mulhdu r11, u1, u1
+
+ sradi r10, u0, 63
+ and r10, r10, u1
+
+ LSHU1U0
+
+ ld r9, 8(up2)
+ ld r8, 16(up2)
+ mulld r29, r9, u0 C W u1 x u0
+ mulhdu r30, r9, u0 C W u1 x u0
+ std r23, 0(rp2)
+
+ beq cr0, L(m2_10)
+L(m2_00):
+ addi up, up2, 8
+ addi rp, rp2, -8
+ b L(m2_lo1)
+L(m2_10):
+ addi up, up2, 24
+ addi rp, rp2, 8
+ ble cr5, L(m2_end)
+
+L(m2_top):
+ ld r9, 0(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ mulld r5, r8, u1
+ mulhdu r10, r8, u1
+ addex( r12, r12, r30, 0)
+L(m2_lo2):
+ ld r8, 8(up)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ std r12, 8(rp)
+ mulld r12, r9, u1
+ mulhdu r11, r9, u1
+ addex( r5, r5, r31, 0)
+L(m2_lo1):
+ ld r9, 16(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ std r5, 16(rp)
+ mulld r5, r8, u1
+ mulhdu r10, r8, u1
+ addex( r12, r12, r30, 0)
+L(m2_lo0):
+ ld r8, 24(up)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ std r12, 24(rp)
+ mulld r12, r9, u1
+ mulhdu r11, r9, u1
+ addex( r5, r5, r31, 0)
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(m2_top)
+
+L(m2_end):
+ ld r9, 0(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ mulld r5, r8, u1
+ mulhdu r10, r8, u1
+ b L(cj) C jump to addmul_2 tail
+
+L(outer):
+ addi up2, up2, 16
+ addi rp2, rp2, 32
+
+ ld u0, 0(up2)
+ ld u1, 8(up2)
+
+ cmpdi cr5, r22, 4
+ srdi r31, r22, 2
+ addi r22, r22, -2
+ mtctr r31
+
+ ld r26, 0(rp2)
+ ld r27, 16(rp2)
+
+ rldicl. r31, r22, 63, 63 C r22 & 2
+ beq cr7, L(evn)
+
+L(odd): maddld( r23, u0, u0, r26) C W u2^2
+ maddhdu(r12, u0, u0, r26) C W u2^2
+ maddld( r5, u1, u1, r27) C W u3^2
+ maddhdu(r10, u1, u1, r27) C W u3^2
+ ld r26, 8(rp2)
+
+ ld r8, -8(up2)
+ sradi r8, r8, 63 C CAUTION: clobbers CA
+ and r8, r8, u0
+ sradi r11, u0, 63 C CAUTION: clobbers CA
+ and r11, r11, u1
+
+ LSHU1U
+
+ addc r23, r23, r8
+
+ ld r8, 8(up2)
+ ld r9, 16(up2)
+ maddld( r28, r8, u0, r26) C W u3 x u2
+ maddhdu(r31, r8, u0, r26) C W u3 x u2
+ ld r26, 24(rp2)
+ std r23, 0(rp2) C W0
+
+ bne cr0, L(11)
+L(01):
+ addi up, up2, 16
+ addi rp, rp2, 0
+ b L(lo2)
+L(11):
+ addi up, up2, 0
+ addi rp, rp2, -16
+ b L(lo0)
+
+L(evn): maddld( r23, u0, u0, r26) C W u2^2
+ maddhdu(r5, u0, u0, r26) C W u2^2
+ maddld( r12, u1, u1, r27) C W u3^2
+ maddhdu(r11, u1, u1, r27) C W u3^2
+ ld r27, 8(rp2)
+
+ ld r9, -8(up2)
+ sradi r9, r9, 63 C CAUTION: clobbers CA
+ and r9, r9, u0
+ sradi r10, u0, 63 C CAUTION: clobbers CA
+ and r10, r10, u1
+
+ LSHU1U
+
+ addc r23, r23, r9
+
+ ld r9, 8(up2)
+ ld r8, 16(up2)
+ maddld( r29, r9, u0, r27) C W u3 x u2
+ maddhdu(r30, r9, u0, r27) C W u3 x u2
+ ld r27, 24(rp2)
+ std r23, 0(rp2) C W0
+
+ beq cr0, L(10)
+L(00):
+ addi up, up2, 8
+ addi rp, rp2, -8
+ b L(lo1)
+L(10):
+ addi up, up2, 24
+ addi rp, rp2, 8
+ ble cr5, L(end)
+
+L(top): ld r9, 0(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ ld r26, 24(rp)
+ std r5, 0(rp)
+ maddld( r5, r8, u1, r27)
+ maddhdu(r10, r8, u1, r27)
+ addex( r12, r12, r30, 0)
+L(lo2): ld r8, 8(up)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ ld r27, 32(rp)
+ std r12, 8(rp)
+ maddld( r12, r9, u1, r26)
+ maddhdu(r11, r9, u1, r26)
+ addex( r5, r5, r31, 0)
+L(lo1): ld r9, 16(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ ld r26, 40(rp)
+ std r5, 16(rp)
+ maddld( r5, r8, u1, r27)
+ maddhdu(r10, r8, u1, r27)
+ addex( r12, r12, r30, 0)
+L(lo0): ld r8, 24(up)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ ld r27, 48(rp)
+ std r12, 24(rp)
+ maddld( r12, r9, u1, r26)
+ maddhdu(r11, r9, u1, r26)
+ addex( r5, r5, r31, 0)
+ addi up, up, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+L(end): ld r9, 0(up)
+ maddld( r28, r8, u0, r10)
+ maddhdu(r31, r8, u0, r10)
+ adde r5, r29, r5
+ std r5, 0(rp)
+ maddld( r5, r8, u1, r27)
+ maddhdu(r10, r8, u1, r27)
+L(cj): addex( r12, r12, r30, 0)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ std r12, 8(rp)
+ mulld r12, r9, u1
+ mulhdu r11, r9, u1
+ addex( r5, r5, r31, 0)
+ adde r5, r29, r5
+ std r5, 16(rp)
+ addex( r12, r12, r30, 0)
+ adde r12, r12, r10
+ std r12, 24(rp)
+ li r4, 0
+ addze r5, r11
+ addex( r5, r5, r4, 0)
+ std r5, 32(rp)
+ bgt cr5, L(outer)
+
+L(corner):
+ ld u0, 16(up2)
+ ld u1, 24(up2)
+ ld r26, 32(rp2)
+ bne cr7, L(corner_odd)
+
+L(corner_evn):
+ ld r27, 40(rp2)
+ maddld( r23, u0, u0, r26) C W u2^2
+ maddhdu(r5, u0, u0, r26) C W u2^2
+ mulld r12, u1, u1 C W u3^2
+ mulhdu r11, u1, u1 C W u3^2
+
+ ld r9, 8(up2)
+ sradi r9, r9, 63 C CAUTION: clobbers CA
+ and r9, r9, u0
+ sradi r10, u0, 63 C CAUTION: clobbers CA
+ and r10, r10, u1
+
+ LSHU1UHF
+
+ addc r23, r23, r9
+
+ ld r9, 24(up2)
+ maddld( r29, r9, u0, r27) C W u3 x u2
+ maddhdu(r30, r9, u0, r27) C W u3 x u2
+ std r23, 32(rp2)
+ adde r5, r29, r5
+ std r5, 40(rp2)
+ addex( r12, r12, r30, 0)
+ adde r12, r12, r10 C W FIXME can this co?
+ std r12, 48(rp2)
+ li r4, 0
+ addex( r5, r11, r4, 0)
+ addze r5, r5
+ std r5, 56(rp2)
+ b L(ret)
+
+L(corner_odd):
+ ld r27, 48(rp2)
+ maddld( r23, u0, u0, r26) C W u2^2
+ maddhdu(r12, u0, u0, r26) C W u2^2
+ maddld( r5, u1, u1, r27) C W u3^2
+ maddhdu(r10, u1, u1, r27) C W u3^2
+ ld r26, 40(rp2)
+
+ ld r8, 8(up2)
+ sradi r8, r8, 63 C CAUTION: clobbers CA
+ and r8, r8, u0
+ sradi r11, u0, 63 C CAUTION: clobbers CA
+ and r11, r11, u1
+
+ LSHU1UF
+
+ addc r23, r23, r8
+
+ ld r8, 24(up2)
+ ld r9, 32(up2)
+ maddld( r28, r8, u0, r26) C W u3 x u2
+ maddhdu(r31, r8, u0, r26) C W u3 x u2
+ std r23, 32(rp2)
+ maddld( r29, r9, u0, r11)
+ maddhdu(r30, r9, u0, r11)
+ adde r12, r28, r12
+ std r12, 40(rp2)
+ mulld r12, r9, u1
+ mulhdu r11, r9, u1
+ addex( r5, r5, r31, 0)
+ adde r5, r29, r5
+ std r5, 48(rp2)
+ addex( r12, r12, r30, 0)
+ adde r12, r12, r10
+ std r12, 56(rp2)
+ mulld r23, r9, r9 C W u2^2
+ mulhdu r12, r9, r9 C W u2^2
+ adde r23, r23, r11
+ addze r12, r12
+ sradi r4, r8, 63 C CAUTION: clobbers CA
+ and r4, r4, r9
+ addex( r23, r23, r4, 0)
+ std r23, 64(rp2)
+ li r4, 0
+ addex( r12, r12, r4, 0)
+ std r12, 72(rp2)
+
+L(ret): ld r22, -80(r1)
+ ld r23, -72(r1)
+ ld r24, -64(r1)
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+EPILOGUE()
+ASM_END()
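Squaring uses the identity that the square equals the diagonal terms u_i^2 plus twice the cross products u_i*u_j for i < j. The asm folds the doubling into the multiplier limbs up front (the LSHU1U0/LSHU1U macros shift the limb pair {u1,u0} left by one) and compensates for the shifted-out top bits with the sradi/and mask terms. A plain C rendering of the identity in the conventional compute-then-shift order, not the asm's fused pass structure (ref_sqr_basecase is a hypothetical name; assumes n >= 2, 64-bit limbs, unsigned __int128; the mpn calls used are documented GMP API):

#include <gmp.h>

void
ref_sqr_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
{
  /* cross products u[i]*u[j], i < j, accumulated into rp[1 .. 2n-2] */
  rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
  for (mp_size_t i = 1; i < n - 1; i++)
    rp[n + i] = mpn_addmul_1 (rp + 2 * i + 1, up + i + 1, n - 1 - i, up[i]);

  /* double them; the shifted-out bit becomes the top limb */
  rp[2 * n - 1] = mpn_lshift (rp + 1, rp + 1, 2 * n - 2, 1);

  /* add the diagonal squares u[i]^2 at limb positions 2i, 2i+1 */
  mp_limb_t cy = 0;
  rp[0] = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 s = (unsigned __int128) up[i] * up[i];
      unsigned __int128 t = (unsigned __int128) rp[2 * i]
                            + (mp_limb_t) s + cy;
      unsigned __int128 h = (unsigned __int128) rp[2 * i + 1]
                            + (mp_limb_t) (s >> 64) + (mp_limb_t) (t >> 64);
      rp[2 * i] = (mp_limb_t) t;
      rp[2 * i + 1] = (mp_limb_t) h;
      cy = (mp_limb_t) (h >> 64);
    }
}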