From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm | 189 +++++ gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm | 225 ++++++ gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm | 43 + gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm | 43 + gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm | 187 +++++ gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm | 132 ++++ gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm | 146 ++++ gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm | 196 +++++ gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm | 135 ++++ gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm | 274 +++++++ gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm | 187 +++++ gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm | 77 ++ gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h | 82 ++ gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm | 88 +++ gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm | 164 ++++ gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm | 270 +++++++ gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm | 132 ++++ gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm | 117 +++ gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm | 168 ++++ gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm | 708 +++++++++++++++++ gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h | 179 +++++ gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h | 214 +++++ gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h | 219 ++++++ gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm | 185 +++++ gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h | 160 ++++ gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm | 589 ++++++++++++++ gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm | 135 ++++ gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm | 128 +++ .../mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm | 43 + .../mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm | 43 + .../mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm | 129 +++ gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm | 67 ++ gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm | 146 ++++ gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h | 175 +++++ gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h | 171 ++++ gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm | 53 ++ gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm | 112 +++ .../mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm | 106 +++ gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm | 130 ++++ gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm | 193 +++++ gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm | 179 +++++ gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm | 64 ++ gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm | 143 ++++ gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h | 254 ++++++ gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm | 126 +++ gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm | 181 +++++ gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm | 415 ++++++++++ gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm | 555 +++++++++++++ gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm | 173 +++++ gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm | 863 +++++++++++++++++++++ 50 files changed, 9693 insertions(+) create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm 
create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm new file mode 100644 index 0000000..0e8474f --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm @@ -0,0 +1,189 @@ +dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1.5 +C POWER4/PPC970 2 +C POWER5 2 +C POWER6 2.63 +C POWER7 2.25-2.87 + +C This code is a little bit slower for POWER3/PPC630 than the simple code used +C previously, but it is much faster for POWER4/PPC970. The reason for the +C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4 +C registers. + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +ifdef(`OPERATION_add_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_sub_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + SETCBR(r7) + b L(ent) +EPILOGUE() + +PROLOGUE(func) + CLRCB +L(ent): std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + + rldicl. r0, r6, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi r6, r6, 3 C compute count... 
+ srdi r6, r6, 2 C ...for ctr + mtctr r6 C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): ld r8, 0(r4) C load s1 limb + ld r9, 0(r5) C load s2 limb + ld r10, 8(r4) C load s1 limb + ld r11, 8(r5) C load s2 limb + ld r12, 16(r4) C load s1 limb + addi r4, r4, 24 + ld r0, 16(r5) C load s2 limb + addi r5, r5, 24 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r29, 0(r3) + std r30, 8(r3) + std r31, 16(r3) + addi r3, r3, 24 + bdnz L(go) + b L(ret) + +L(b01): ld r12, 0(r4) C load s1 limb + addi r4, r4, 8 + ld r0, 0(r5) C load s2 limb + addi r5, r5, 8 + ADDSUBC r31, r0, r12 C add + std r31, 0(r3) + addi r3, r3, 8 + bdnz L(go) + b L(ret) + +L(b10): ld r10, 0(r4) C load s1 limb + ld r11, 0(r5) C load s2 limb + ld r12, 8(r4) C load s1 limb + addi r4, r4, 16 + ld r0, 8(r5) C load s2 limb + addi r5, r5, 16 + ADDSUBC r30, r11, r10 C add + ADDSUBC r31, r0, r12 C add + std r30, 0(r3) + std r31, 8(r3) + addi r3, r3, 16 + bdnz L(go) + b L(ret) + +L(b00): C INITCY C clear/set cy +L(go): ld r6, 0(r4) C load s1 limb + ld r7, 0(r5) C load s2 limb + ld r8, 8(r4) C load s1 limb + ld r9, 8(r5) C load s2 limb + ld r10, 16(r4) C load s1 limb + ld r11, 16(r5) C load s2 limb + ld r12, 24(r4) C load s1 limb + ld r0, 24(r5) C load s2 limb + bdz L(end) + + addi r4, r4, 32 + addi r5, r5, 32 + + ALIGN(16) +L(top): ADDSUBC r28, r7, r6 + ld r6, 0(r4) C load s1 limb + ld r7, 0(r5) C load s2 limb + ADDSUBC r29, r9, r8 + ld r8, 8(r4) C load s1 limb + ld r9, 8(r5) C load s2 limb + ADDSUBC r30, r11, r10 + ld r10, 16(r4) C load s1 limb + ld r11, 16(r5) C load s2 limb + ADDSUBC r31, r0, r12 + ld r12, 24(r4) C load s1 limb + ld r0, 24(r5) C load s2 limb + std r28, 0(r3) + addi r4, r4, 32 + std r29, 8(r3) + addi r5, r5, 32 + std r30, 16(r3) + std r31, 24(r3) + addi r3, r3, 32 + bdnz L(top) C decrement ctr and loop back + +L(end): ADDSUBC r28, r7, r6 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r28, 0(r3) + std r29, 8(r3) + std r30, 16(r3) + std r31, 24(r3) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + + subfe r3, r0, r0 C -cy + GENRVAL + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm new file mode 100644 index 0000000..0c12f9b --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm @@ -0,0 +1,225 @@ +dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 1999-2001, 2003-2006, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 6-18 6-18 +C POWER4/PPC970 8 8.3 +C POWER5 8 8.25 +C POWER6 16.25 16.75 +C POWER7 3.77 4.9 + +C TODO +C * Try to reduce the number of needed live registers +C * Add support for _1c entry points + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`vl', `r6') + +ifdef(`OPERATION_addmul_1',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_addmul_1) + define(func_nc, mpn_addmul_1c) C FIXME: not really supported + define(SM, `') +') +ifdef(`OPERATION_submul_1',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_submul_1) + define(func_nc, mpn_submul_1c) C FIXME: not really supported + define(SM, `$1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + std r30, -16(r1) + cmpdi cr6, r0, 2 + std r29, -24(r1) + addi n, n, 3 C compute count... + std r28, -32(r1) + srdi n, n, 2 C ...for ctr + std r27, -40(r1) + mtctr n C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): ld r9, 0(up) + ld r28, 0(rp) + mulld r0, r9, r6 + mulhdu r12, r9, r6 + ADDSUB r0, r0, r28 + std r0, 0(rp) + addi rp, rp, 8 + ld r9, 8(up) + ld r27, 16(up) + addi up, up, 24 +SM(` subfe r11, r11, r11 ') + b L(bot) + + ALIGN(16) +L(b00): ld r9, 0(up) + ld r27, 8(up) + ld r28, 0(rp) + ld r29, 8(rp) + mulld r0, r9, r6 + mulhdu r5, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + addc r7, r7, r5 + addze r12, r8 + ADDSUB r0, r0, r28 + std r0, 0(rp) + ADDSUBC r7, r7, r29 + std r7, 8(rp) + addi rp, rp, 16 + ld r9, 16(up) + ld r27, 24(up) + addi up, up, 32 +SM(` subfe r11, r11, r11 ') + b L(bot) + + ALIGN(16) +L(b01): bdnz L(gt1) + ld r9, 0(up) + ld r11, 0(rp) + mulld r0, r9, r6 + mulhdu r8, r9, r6 + ADDSUB r0, r0, r11 + std r0, 0(rp) +SM(` subfe r11, r11, r11 ') +SM(` addic r11, r11, 1 ') + addze r3, r8 + blr +L(gt1): ld r9, 0(up) + ld r27, 8(up) + mulld r0, r9, r6 + mulhdu r5, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 16(up) + ld r28, 0(rp) + ld r29, 8(rp) + ld r30, 16(rp) + mulld r11, r9, r6 + mulhdu r10, r9, r6 + addc r7, r7, r5 + adde r11, r11, r8 + addze r12, r10 + ADDSUB r0, r0, r28 + std r0, 0(rp) + ADDSUBC r7, r7, r29 + std r7, 8(rp) + ADDSUBC r11, r11, r30 + std r11, 16(rp) + addi rp, rp, 24 + ld r9, 24(up) + ld r27, 32(up) + addi up, up, 40 +SM(` subfe r11, r11, r11 ') + b L(bot) + +L(b10): addic r0, r0, 0 + li r12, 0 C cy_limb = 0 + ld r9, 0(up) + ld r27, 8(up) + bdz L(end) + addi up, up, 16 + + ALIGN(16) +L(top): mulld r0, r9, r6 + mulhdu r5, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r5 C 5 7 + mulld r5, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r5, r5, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + ADDSUB r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + ADDSUBC r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + ADDSUBC r5, r5, r30 C 5 30 + std r5, 16(rp) C 5 + ADDSUBC r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 +SM(` subfe r11, r11, r11 ') + addi rp, rp, 32 +L(bot): +SM(` addic r11, r11, 1 ') + bdnz L(top) + +L(end): mulld r0, r9, r6 + mulhdu r5, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) 
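+C Wind-down: combine the two products above with the high limb carried
+C out of the loop (r12), apply them to the two remaining limbs at rp,
+C and fold the final carry (borrow for submul) into the return value r3.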
+ adde r0, r0, r12 + adde r7, r7, r5 + addze r8, r8 + ADDSUB r0, r0, r28 + std r0, 0(rp) + ADDSUBC r7, r7, r29 + std r7, 8(rp) +SM(` subfe r11, r11, r11 ') +SM(` addic r11, r11, 1 ') + addze r3, r8 + ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm new file mode 100644 index 0000000..2c5400a --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm @@ -0,0 +1,43 @@ +dnl PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm new file mode 100644 index 0000000..447791a --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm @@ -0,0 +1,43 @@ +dnl PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
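Like aorsorrlsh1_n.asm above, this wrapper only pins the shift constants (RSH is always 64 - LSH) and includes the shared aorsorrlshC_n.asm body. As a reading aid, here is a plain C reference model of the semantics the generated mpn_addlsh2_n must satisfy; the function name is hypothetical and none of GMP's software pipelining is reproduced:

#include <stdint.h>

#define LSH 2
#define RSH (64 - LSH)

/* Computes {rp,n} = {up,n} + ({vp,n} << LSH) limb by limb and returns
   the carry-out, which ranges over 0..(1 << LSH).  */
uint64_t ref_addlsh2_n(uint64_t *rp, const uint64_t *up,
                       const uint64_t *vp, long n)
{
  uint64_t cy = 0;              /* addition carry, 0 or 1 */
  uint64_t vh = 0;              /* bits shifted out of the previous v limb */
  for (long i = 0; i < n; i++)
    {
      uint64_t s = (vp[i] << LSH) | vh;   /* the rldimi merge */
      vh = vp[i] >> RSH;                  /* the srdi shift-down */
      uint64_t r = up[i] + s;
      uint64_t c = r < up[i];             /* carry out of the add */
      rp[i] = r + cy;
      cy = c + (rp[i] < r);
    }
  return vh + cy;
}

The DO_sub and DO_rsb variants differ only in computing u - s or s - u with a borrow chain and in how RETVAL encodes the final carry, as the macro definitions in aorsorrlshC_n.asm below show.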
+ +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm new file mode 100644 index 0000000..6158f54 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm @@ -0,0 +1,187 @@ +dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C POWER3/PPC630 1.83 (1.5 c/l should be possible) +C POWER4/PPC970 3 (2.0 c/l should be possible) +C POWER5 3 +C POWER6 3.5-47 +C POWER7 3 + +C STATUS +C * Try combining upx+up, and vpx+vp. +C * The worst case 47 c/l for POWER6 happens if the 3rd operand for ldx is +C greater than the 2nd operand. Yes, this addition is non-commutative wrt +C performance. 
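One convention worth spelling out before the macro block below: PowerPC keeps a subtraction borrow as an inverted carry in the CA bit, so subfc sets CA exactly when no borrow occurs. That is why the DO_sub/DO_rsb INITCY is `addic $1, r1, -1` (r1 is the stack pointer and never zero, so adding -1 always carries, presetting CA to 1), while DO_add uses `addic $1, r1, 0` (adding 0 never carries, clearing CA). A self-contained C model of subfc/subfe under this convention; the helper names and struct are hypothetical:

#include <stdint.h>
#include <assert.h>

/* subfc rd,ra,rb : rd = rb - ra,        CA = (rb >= ra), i.e. !borrow
   subfe rd,ra,rb : rd = rb + ~ra + CA,  CA = carry out of that sum   */
typedef struct { uint64_t val; int ca; } ca_result;

static ca_result subfc(uint64_t ra, uint64_t rb)
{
  return (ca_result){ rb - ra, rb >= ra };
}

static ca_result subfe(uint64_t ra, uint64_t rb, int ca)
{
  uint64_t t = rb + ~ra;
  int c = t < rb;                 /* carry out of rb + ~ra */
  uint64_t r = t + (uint64_t)ca;
  return (ca_result){ r, c | (r < t) };
}

int main(void)
{
  /* 2-limb borrow chain: (1,0) - (0,1) = (0, ~0), no final borrow */
  ca_result lo = subfc(1, 0);          /* low limbs: 0 - 1 borrows */
  ca_result hi = subfe(0, 1, lo.ca);   /* high limbs consume the borrow */
  assert(lo.val == ~(uint64_t)0 && hi.val == 0 && hi.ca == 1);
  return 0;
}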
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +ifdef(`DO_add', ` + define(`ADDSUBC', `addc $1, $2, $3') + define(`ADDSUBE', `adde $1, $2, $3') + define(INITCY, `addic $1, r1, 0') + define(RETVAL, `addze r3, $1') + define(`func', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADDSUBC', `subfc $1, $2, $3') + define(`ADDSUBE', `subfe $1, $2, $3') + define(INITCY, `addic $1, r1, -1') + define(RETVAL, `subfze r3, $1 + neg r3, r3') + define(`func', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADDSUBC', `subfc $1, $3, $2') + define(`ADDSUBE', `subfe $1, $3, $2') + define(INITCY, `addic $1, r1, -1') + define(RETVAL, `addme r3, $1') + define(`func', mpn_rsblsh`'LSH`'_n)') + +define(`rpx', `r6') +define(`upx', `r7') +define(`vpx', `r12') + +define(`s0', `r0') define(`s1', `r9') +define(`u0', `r8') +define(`v0', `r10') define(`v1', `r11') + + +ASM_START() +PROLOGUE(func) + cmpldi cr0, n, 13 + bgt L(big) + + mtctr n C copy n in ctr + INITCY( r0) C clear cy + + ld v0, 0(vp) C load v limb + ld u0, 0(up) C load u limb + addi up, up, -8 C update up + addi rp, rp, -8 C update rp + sldi s1, v0, LSH + bdz L(ex1) C If done, skip loop + + ALIGN(16) +L(lo0): ld v1, 8(vp) C load v limb + ADDSUBE(s1, s1, u0) C add limbs with cy, set cy + ldu u0, 16(up) C load u limb and update up + srdi s0, v0, RSH C shift down previous v limb + std s1, 8(rp) C store result limb + rldimi s0, v1, LSH, 0 C left shift v limb and merge with prev v limb + bdz L(ex0) C decrement ctr and exit if done + ldu v0, 16(vp) C load v limb and update vp + ADDSUBE(s0, s0, u0) C add limbs with cy, set cy + ld u0, 8(up) C load u limb + srdi s1, v1, RSH C shift down previous v limb + stdu s0, 16(rp) C store result limb and update rp + rldimi s1, v0, LSH, 0 C left shift v limb and merge with prev v limb + bdnz L(lo0) C decrement ctr and loop back + +L(ex1): ADDSUBE(r7, s1, u0) + std r7, 8(rp) C store last result limb + srdi r0, v0, RSH + RETVAL( r0) + blr +L(ex0): ADDSUBE(r7, s0, u0) + std r7, 16(rp) C store last result limb + srdi r0, v1, RSH + RETVAL( r0) + blr + + +L(big): rldicl. 
r0, n, 0,63 C r0 = n & 1, set cr0 + addi r6, n, -1 C ...for ctr + srdi r6, r6, 1 C ...for ctr + mtctr r6 C copy count into ctr + beq cr0, L(b0) + +L(b1): ld v1, 0(vp) + ld u0, 0(up) + sldi s1, v1, LSH + srdi s0, v1, RSH + ld v0, 8(vp) + ADDSUBC(s1, s1, u0) C add limbs without cy, set cy + addi rpx, rp, -16 + addi rp, rp, -8 + sub upx, up, rp + sub vpx, vp, rp + sub up, up, rpx + sub vp, vp, rpx + addi up, up, 8 + addi upx, upx, 16 + addi vp, vp, 16 + addi vpx, vpx, 24 + b L(mid) + +L(b0): ld v0, 0(vp) + ld u0, 0(up) + sldi s0, v0, LSH + srdi s1, v0, RSH + ld v1, 8(vp) + ADDSUBC(s0, s0, u0) C add limbs without cy, set cy + addi rpx, rp, -8 + addi rp, rp, -16 + sub upx, up, rpx + sub vpx, vp, rpx + sub up, up, rp + sub vp, vp, rp + addi up, up, 8 + addi upx, upx, 16 + addi vp, vp, 16 + addi vpx, vpx, 24 + + ALIGN(32) +L(top): ldx u0, rp, up + ldx v0, rp, vp + rldimi s1, v1, LSH, 0 + stdu s0, 16(rp) + srdi s0, v1, RSH + ADDSUBE(s1, s1, u0) C add limbs with cy, set cy +L(mid): ldx u0, rpx, upx + ldx v1, rpx, vpx + rldimi s0, v0, LSH, 0 + stdu s1, 16(rpx) + srdi s1, v0, RSH + ADDSUBE(s0, s0, u0) C add limbs with cy, set cy + bdnz L(top) C decrement CTR and loop back + + ldx u0, rp, up + rldimi s1, v1, LSH, 0 + std s0, 16(rp) + srdi s0, v1, RSH + ADDSUBE(s1, s1, u0) C add limbs with cy, set cy + std s1, 24(rp) + + RETVAL( s0) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm new file mode 100644 index 0000000..45cded9 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm @@ -0,0 +1,132 @@ +dnl PPC64 mpn_bdiv_dbm1c. + +dnl Copyright 2008, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 8.25 +C POWER5 8.5 fluctuating as function of n % 3 +C POWER6 15 +C POWER7 4.75 + +C TODO +C * Nothing to do... + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`bd', `r6') +define(`cy', `r7') + +ASM_START() +PROLOGUE(mpn_bdiv_dbm1c) + ld r0, 0(r4) + + rldicl. 
r12, r5, 0,62 + cmpldi cr6, r12, 2 + cmpldi cr7, r5, 4 + addi r5, r5, 1 + srwi r5, r5, 2 + mtctr r5 + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + + ALIGN(16) +L(b11): mulld r5, r0, r6 + mulhdu r12, r0, r6 + ld r0, 8(r4) + addi r4, r4, -24 + addi r3, r3, -24 + b L(3) + + ALIGN(16) +L(b00): mulld r9, r0, r6 + mulhdu r8, r0, r6 + addi r4, r4, -16 + addi r3, r3, -16 + b L(0) + + ALIGN(16) +L(b01): mulld r5, r0, r6 + mulhdu r12, r0, r6 + addi r3, r3, -8 + ble cr7, L(e1) + ld r0, 8(r4) + addi r4, r4, -8 + b L(1) + + ALIGN(16) +L(b10): mulld r9, r0, r6 + mulhdu r8, r0, r6 + ble cr7, L(e2) + + ALIGN(16) +L(top): subfc r11, r9, r7 + ld r10, 8(r4) + ld r0, 16(r4) + subfe r7, r8, r11 + std r11, 0(r3) + mulld r5, r10, r6 + mulhdu r12, r10, r6 +L(1): mulld r9, r0, r6 + mulhdu r8, r0, r6 + subfc r11, r5, r7 + subfe r7, r12, r11 + std r11, 8(r3) +L(0): subfc r11, r9, r7 + ld r10, 24(r4) + ld r0, 32(r4) + subfe r7, r8, r11 + std r11, 16(r3) + mulld r5, r10, r6 + mulhdu r12, r10, r6 +L(3): mulld r9, r0, r6 + mulhdu r8, r0, r6 + subfc r11, r5, r7 + subfe r7, r12, r11 + std r11, 24(r3) + addi r4, r4, 32 + addi r3, r3, 32 + bdnz L(top) + +L(e2): ld r10, 8(r4) + mulld r5, r10, r6 + mulhdu r12, r10, r6 + subfc r11, r9, r7 + subfe r7, r8, r11 + std r11, 0(r3) +L(e1): subfc r11, r5, r7 + std r11, 8(r3) + subfe r3, r12, r11 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm new file mode 100644 index 0000000..307aafc --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm @@ -0,0 +1,146 @@ +dnl PowerPC-64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb +dnl divisor. + +dnl Copyright 2006, 2010, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
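The routine below seeds an 8-bit inverse from binvert_limb_table and lifts it with three Newton steps of the form x <- 2x - d*x*x, each doubling the number of correct low bits (8 -> 16 -> 32 -> 64). A table-free C sketch of the same Hensel lifting, using the well-known (3d)^2 seed, which is correct to 5 bits and therefore needs four steps; the function name is ours:

#include <stdint.h>
#include <assert.h>

/* Returns di with d * di == 1 (mod 2^64); d must be odd. */
static uint64_t binvert_limb_ref(uint64_t d)
{
  assert(d & 1);
  uint64_t x = (3 * d) ^ 2;   /* 5 correct low bits */
  x = 2 * x - d * x * x;      /* 10 bits */
  x = 2 * x - d * x * x;      /* 20 bits */
  x = 2 * x - d * x * x;      /* 40 bits */
  x = 2 * x - d * x * x;      /* 80 -> all 64 bits */
  return x;
}

Given di, the quotient limbs of the exact division come out low to high as q[i] = di * (u[i] - borrow), which is what the L(tpu)/L(tpn) loops compute, using mulhdu against d to propagate the borrow.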
+ +include(`../config.m4') + +C cycles/limb +C norm unorm +C POWER3/PPC630 13-19 +C POWER4/PPC970 16 +C POWER5 16 16 +C POWER6 37 46 +C POWER7 12 12 +C POWER8 12 12 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`d', `r6') +define(`di', `r7') +define(`cnt',`r8') + +define(`tnc',`r10') + +ASM_START() + +EXTERN(binvert_limb_table) + +PROLOGUE(mpn_bdiv_q_1,toc) + addi r7, n, -1 + cmpdi cr1, n, 1 + ld r12, 0(up) + li cnt, 0 + neg r0, d + and r0, d, r0 + cntlzd r0, r0 + subfic cnt, r0, 63 + srd d, d, cnt +L(7): + mtctr r7 + LEA( r10, binvert_limb_table) + rldicl r11, d, 63, 57 + lbzx r0, r10, r11 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf r0, r9, r0 + mulld r10, r0, r0 + sldi r0, r0, 1 + mulld r10, d, r10 + subf r0, r10, r0 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf di, r9, r0 C di = 1/d mod 2^64 +ifdef(`AIX', +` C For AIX it is not clear how to jump into another function. + b .mpn_pi1_bdiv_q_1 +',` + C For non-AIX, dispatch into the pi1 variant. + bne cr0, L(norm) + b L(unorm) +') +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + cmpdi cr0, cnt, 0 + ld r12, 0(up) + addic r0, n, -1 C set carry as side effect + cmpdi cr1, n, 1 + mtctr r0 + beq cr0, L(norm) + +L(unorm): + subfic tnc, cnt, 64 C set carry as side effect + li r5, 0 + srd r11, r12, cnt + beq cr1, L(ed1) + + ALIGN(16) +L(tpu): ld r12, 8(up) + nop + addi up, up, 8 + sld r0, r12, tnc + or r11, r11, r0 + subfe r9, r5, r11 + srd r11, r12, cnt + mulld r0, di, r9 + mulhdu r5, r0, d + std r0, 0(rp) + addi rp, rp, 8 + bdnz L(tpu) + + subfe r11, r5, r11 +L(ed1): mulld r0, di, r11 + std r0, 0(rp) + blr + + ALIGN(16) +L(norm): + mulld r11, r12, di + mulhdu r5, r11, d + std r11, 0(rp) + beqlr cr1 + + ALIGN(16) +L(tpn): ld r9, 8(up) + addi up, up, 8 + subfe r5, r5, r9 + mulld r11, di, r5 + mulhdu r5, r11, d C result not used in last iteration + std r11, 8(rp) + addi rp, rp, 8 + bdnz L(tpn) + + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm new file mode 100644 index 0000000..24968c1 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm @@ -0,0 +1,196 @@ +dnl PowerPC-64 mpn_cnd_add_n/mpn_cnd_sub_n. + +dnl Copyright 1999-2001, 2003-2005, 2007, 2011, 2012 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 2.25 +C POWER5 ? 
+C POWER6 3 +C POWER7 2 + +C INPUT PARAMETERS +define(`cnd', `r3') +define(`rp', `r4') +define(`up', `r5') +define(`vp', `r6') +define(`n', `r7') + +ifdef(`OPERATION_cnd_add_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_cnd_add_n) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_cnd_sub_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_cnd_sub_n) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + subfic cnd, cnd, 0 + subfe cnd, cnd, cnd + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 3 C compute count... + srdi n, n, 2 C ...for ctr + mtctr n C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): ld r8, 0(up) C load s1 limb + ld r9, 0(vp) C load s2 limb + ld r10, 8(up) C load s1 limb + ld r11, 8(vp) C load s2 limb + ld r12, 16(up) C load s1 limb + addi up, up, 24 + ld r0, 16(vp) C load s2 limb + addi vp, vp, 24 + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + ADDSUB r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r29, 0(rp) + std r30, 8(rp) + std r31, 16(rp) + addi rp, rp, 24 + bdnz L(go) + b L(ret) + +L(b01): ld r12, 0(up) C load s1 limb + addi up, up, 8 + ld r0, 0(vp) C load s2 limb + addi vp, vp, 8 + and r0, r0, cnd + ADDSUB r31, r0, r12 C add + std r31, 0(rp) + addi rp, rp, 8 + bdnz L(go) + b L(ret) + +L(b10): ld r10, 0(up) C load s1 limb + ld r11, 0(vp) C load s2 limb + ld r12, 8(up) C load s1 limb + addi up, up, 16 + ld r0, 8(vp) C load s2 limb + addi vp, vp, 16 + and r11, r11, cnd + and r0, r0, cnd + ADDSUB r30, r11, r10 C add + ADDSUBC r31, r0, r12 C add + std r30, 0(rp) + std r31, 8(rp) + addi rp, rp, 16 + bdnz L(go) + b L(ret) + +L(b00): CLRCB C clear/set cy +L(go): ld r7, 0(up) C load s1 limb + ld r27, 0(vp) C load s2 limb + ld r8, 8(up) C load s1 limb + ld r9, 8(vp) C load s2 limb + ld r10, 16(up) C load s1 limb + ld r11, 16(vp) C load s2 limb + ld r12, 24(up) C load s1 limb + ld r0, 24(vp) C load s2 limb + and r27, r27, cnd + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + bdz L(end) + + addi up, up, 32 + addi vp, vp, 32 + +L(top): ADDSUBC r28, r27, r7 + ld r7, 0(up) C load s1 limb + ld r27, 0(vp) C load s2 limb + ADDSUBC r29, r9, r8 + ld r8, 8(up) C load s1 limb + ld r9, 8(vp) C load s2 limb + ADDSUBC r30, r11, r10 + ld r10, 16(up) C load s1 limb + ld r11, 16(vp) C load s2 limb + ADDSUBC r31, r0, r12 + ld r12, 24(up) C load s1 limb + ld r0, 24(vp) C load s2 limb + std r28, 0(rp) + addi up, up, 32 + std r29, 8(rp) + addi vp, vp, 32 + std r30, 16(rp) + std r31, 24(rp) + addi rp, rp, 32 + and r27, r27, cnd + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + bdnz L(top) C decrement ctr and loop back + +L(end): ADDSUBC r28, r27, r7 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r28, 0(rp) + std r29, 8(rp) + std r30, 16(rp) + std r31, 24(rp) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + + subfe r3, r0, r0 C -cy + GENRVAL + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm new file mode 100644 index 0000000..c2d10bd --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm @@ 
-0,0 +1,135 @@ +dnl PowerPC-64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2006, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C norm unorm +C POWER3/PPC630 13-19 +C POWER4/PPC970 16 +C POWER5 16 16 +C POWER6 37 46 +C POWER7 12 12 +C POWER8 12 12 + +C TODO +C * Check if n=1 code is really an improvement. It probably isn't. +C * Make more similar to mode1o.asm. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`d', `r6') + + +ASM_START() + +EXTERN(binvert_limb_table) + +PROLOGUE(mpn_divexact_1,toc) + addic. n, n, -1 + ld r12, 0(up) + bne cr0, L(2) + divdu r0, r12, d + std r0, 0(rp) + blr +L(2): + rldicl. r0, d, 0, 63 + li r10, 0 + bne cr0, L(7) + neg r0, d + and r0, d, r0 + cntlzd r0, r0 + subfic r0, r0, 63 + rldicl r10, r0, 0, 32 + srd d, d, r0 +L(7): + mtctr n + LEA( r5, binvert_limb_table) + rldicl r11, d, 63, 57 + lbzx r0, r5, r11 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf r0, r9, r0 + mulld r5, r0, r0 + sldi r0, r0, 1 + mulld r5, d, r5 + subf r0, r5, r0 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf r7, r9, r0 C r7 = 1/d mod 2^64 + + bne cr0, L(norm) + subfic r8, r10, 64 C set carry as side effect + li r5, 0 + srd r11, r12, r10 + + ALIGN(16) +L(loop0): + ld r12, 8(up) + nop + addi up, up, 8 + sld r0, r12, r8 + or r11, r11, r0 + subfe r9, r5, r11 + srd r11, r12, r10 + mulld r0, r7, r9 + mulhdu r5, r0, d + std r0, 0(rp) + addi rp, rp, 8 + bdnz L(loop0) + + subfe r0, r5, r11 + mulld r0, r7, r0 + std r0, 0(rp) + blr + + ALIGN(16) +L(norm): + mulld r11, r12, r7 + mulhdu r5, r11, d + std r11, 0(rp) + ALIGN(16) +L(loop1): + ld r9, 8(up) + addi up, up, 8 + subfe r5, r5, r9 + mulld r11, r7, r5 + mulhdu r5, r11, d C result not used in last iteration + std r11, 8(rp) + addi rp, rp, 8 + bdnz L(loop1) + + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm new file mode 100644 index 0000000..b283877 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm @@ -0,0 +1,274 @@ +dnl PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb. + +dnl Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C norm unorm frac +C POWER3/PPC630 16-34 16-34 ~11 outdated figures +C POWER4/PPC970 28 28 19 +C POWER5 29 29 ~19 +C POWER6 49 59 ~42 +C POWER7 24.5 23 ~14 + +C INPUT PARAMETERS +C qp = r3 +C fn = r4 +C up = r5 +C un = r6 +C d = r7 + +C We use a not very predictable branch in the frac code, therefore the cycle +C count wobbles somewhat. With the alternative branch-free code, things run +C considerably slower on POWER4/PPC970 and POWER5. + +C Add preinv entry point. + + +ASM_START() + +EXTERN_FUNC(mpn_invert_limb) + +PROLOGUE(mpn_divrem_1,toc) + + mfcr r12 + add. r10, r6, r4 + std r25, -56(r1) + mr r25, r4 + mflr r0 + std r26, -48(r1) + mr r26, r5 + std r28, -32(r1) + mr r28, r6 + std r29, -24(r1) + mr r29, r3 + li r3, 0 + std r30, -16(r1) + mr r30, r7 + std r31, -8(r1) + li r31, 0 + std r27, -40(r1) + std r0, 16(r1) + stw r12, 8(r1) + stdu r1, -176(r1) + beq- cr0, L(1) + cmpdi cr7, r7, 0 + sldi r0, r10, 3 + add r11, r0, r29 + addi r29, r11, -8 + blt- cr7, L(162) + cmpdi cr4, r6, 0 + beq+ cr4, L(71) +L(163): + sldi r9, r6, 3 + add r9, r9, r5 + ld r7, -8(r9) + cmpld cr7, r7, r30 + bge- cr7, L(71) + cmpdi cr7, r10, 1 + li r0, 0 + mr r31, r7 + std r0, -8(r11) + addi r29, r29, -8 + mr r3, r7 + beq- cr7, L(1) + addi r28, r6, -1 + cmpdi cr4, r28, 0 +L(71): + cntlzd r27, r30 + sld r30, r30, r27 + sld r31, r31, r27 + mr r3, r30 + CALL( mpn_invert_limb) + beq- cr4, L(110) + sldi r9, r28, 3 + addic. r6, r28, -2 + add r9, r9, r26 + subfic r5, r27, 64 + ld r8, -8(r9) + srd r0, r8, r5 + or r31, r31, r0 + sld r7, r8, r27 + blt- cr0, L(154) + addi r28, r28, -1 + mtctr r28 + sldi r6, r6, 3 + ALIGN(16) +L(uloop): + ldx r8, r26, r6 + nop + mulld r0, r31, r3 + mulhdu r10, r31, r3 + addi r11, r31, 1 + srd r9, r8, r5 + addi r6, r6, -8 + or r9, r7, r9 + addc r0, r0, r9 + adde r10, r10, r11 + mulld r31, r10, r30 + subf r31, r31, r9 + subfc r0, r31, r0 C r <= ql + subfe r0, r0, r0 C r0 = -(r <= ql) + and r9, r30, r0 + add r31, r31, r9 + add r10, r0, r10 C qh -= (r >= ql) + cmpld cr7, r31, r30 + bge- cr7, L(164) +L(123): + std r10, 0(r29) + addi r29, r29, -8 + sld r7, r8, r27 + bdnz L(uloop) +L(154): + addi r11, r31, 1 + nop + mulld r0, r31, r3 + mulhdu r8, r31, r3 + addc r0, r0, r7 + adde r8, r8, r11 + mulld r31, r8, r30 + subf r31, r31, r7 + subfc r0, r0, r31 C r >= ql + subfe r0, r0, r0 C r0 = -(r >= ql) + not r7, r0 + add r8, r7, r8 C qh -= (r >= ql) + andc r0, r30, r0 + add r31, r31, r0 + cmpld cr7, r31, r30 + bge- cr7, L(165) +L(134): + std r8, 0(r29) + addi r29, r29, -8 +L(110): + addic. 
r0, r25, -1 + blt- cr0, L(156) + mtctr r25 + neg r9, r30 + ALIGN(16) +L(ufloop): + addi r11, r31, 1 + nop + mulld r0, r3, r31 + mulhdu r10, r3, r31 + add r10, r10, r11 + mulld r31, r9, r10 +ifelse(0,1,` + subfc r0, r0, r31 + subfe r0, r0, r0 C r0 = -(r >= ql) + not r7, r0 + add r10, r7, r10 C qh -= (r >= ql) + andc r0, r30, r0 + add r31, r31, r0 +',` + cmpld cr7, r31, r0 + blt cr7, L(29) + add r31, r30, r31 + addi r10, r10, -1 +L(29): +') + std r10, 0(r29) + addi r29, r29, -8 + bdnz L(ufloop) +L(156): + srd r3, r31, r27 +L(1): + addi r1, r1, 176 + ld r0, 16(r1) + lwz r12, 8(r1) + mtlr r0 + ld r25, -56(r1) + ld r26, -48(r1) + mtcrf 8, r12 + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +L(162): + cmpdi cr7, r6, 0 + beq- cr7, L(8) + sldi r9, r6, 3 + addi r29, r29, -8 + add r9, r9, r5 + addi r28, r6, -1 + ld r31, -8(r9) + subfc r9, r7, r31 + li r9, 0 + adde r9, r9, r9 + neg r0, r9 + std r9, -8(r11) + and r0, r0, r7 + subf r31, r0, r31 +L(8): + mr r3, r30 + CALL( mpn_invert_limb) + li r27, 0 + addic. r6, r28, -1 + blt- cr0, L(110) + mtctr r28 + sldi r6, r6, 3 + ALIGN(16) +L(nloop): + addi r11, r31, 1 + ldx r8, r26, r6 + mulld r0, r31, r3 + mulhdu r10, r31, r3 + addi r6, r6, -8 + addc r0, r0, r8 + adde r10, r10, r11 + mulld r31, r10, r30 + subf r31, r31, r8 C r = nl - qh * d + subfc r0, r31, r0 C r <= ql + subfe r0, r0, r0 C r0 = -(r <= ql) + and r9, r30, r0 + add r31, r31, r9 + add r10, r0, r10 C qh -= (r >= ql) + cmpld cr7, r31, r30 + bge- cr7, L(167) +L(51): + std r10, 0(r29) + addi r29, r29, -8 + bdnz L(nloop) + b L(110) + +L(164): + subf r31, r30, r31 + addi r10, r10, 1 + b L(123) +L(167): + subf r31, r30, r31 + addi r10, r10, 1 + b L(51) +L(165): + subf r31, r30, r31 + addi r8, r8, 1 + b L(134) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm new file mode 100644 index 0000000..752c3d6 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm @@ -0,0 +1,187 @@ +dnl PPC-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. + +dnl Copyright 2007, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C norm frac +C POWER3/PPC630 +C POWER4/PPC970 ? ? +C POWER5 37 ? +C POWER6 62 ? +C POWER7 30.5 ? + +C INPUT PARAMETERS +C qp = r3 +C fn = r4 +C up = r5 +C un = r6 +C dp = r7 + + +ifdef(`DARWIN',,` +define(`r2',`r31')') C FIXME!
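The L(nloop) above and the L(loop) of divrem_2 below both follow the Moller-Granlund scheme: estimate a quotient limb from a precomputed reciprocal, subtract q*d, then apply a masked correction and a rare final one. For divrem_1 the step is 2-by-1; divrem_2 needs the 3-by-2 step, restated here in C with unsigned __int128. The helper name is ours; dinv is the 3/2 reciprocal, floor((B^3-1)/(d1*B+d0)) - B, which the code below derives from mpn_invert_limb(d1) plus the d0 adjustment at L(40):

#include <stdint.h>

typedef unsigned __int128 u128;

/* Divides (n2,n1,n0) by the normalized divisor (d1,d0), assuming the
   quotient fits in one limb; returns q and the 2-limb remainder. */
static uint64_t udiv_qr_3by2(uint64_t *r1, uint64_t *r0,
                             uint64_t n2, uint64_t n1, uint64_t n0,
                             uint64_t d1, uint64_t d0, uint64_t dinv)
{
  u128 q  = (u128)n2 * dinv + (((u128)n2 << 64) | n1);
  uint64_t q1 = (uint64_t)(q >> 64), q0 = (uint64_t)q;
  u128 d  = ((u128)d1 << 64) | d0;
  u128 r  = ((((u128)(n1 - q1 * d1)) << 64) | n0) - (u128)q1 * d0 - d;
  q1 += 1;
  if ((uint64_t)(r >> 64) >= q0)  /* the asm does this with a mask */
    {
      q1 -= 1;
      r += d;
    }
  if (r >= d)                     /* rare second correction */
    {
      q1 += 1;
      r -= d;
    }
  *r1 = (uint64_t)(r >> 64);
  *r0 = (uint64_t)r;
  return q1;
}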
+ +ASM_START() + +EXTERN_FUNC(mpn_invert_limb) + +PROLOGUE(mpn_divrem_2,toc) + mflr r0 + std r23, -72(r1) + std r24, -64(r1) + std r25, -56(r1) + std r26, -48(r1) + std r27, -40(r1) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + std r0, 16(r1) + stdu r1, -192(r1) + mr r24, r3 + mr r25, r4 + sldi r0, r6, 3 + add r26, r5, r0 + addi r26, r26, -24 + ld r30, 8(r7) + ld r28, 0(r7) + ld r29, 16(r26) + ld r31, 8(r26) + +ifelse(0,1,` + li r23, 0 + cmpld cr7, r29, r30 + blt cr7, L(8) + bgt cr7, L(9) + cmpld cr0, r31, r28 + blt cr0, L(8) +L(9): subfc r31, r28, r31 + subfe r29, r30, r29 + li r23, 1 +',` + li r23, 0 + cmpld cr7, r29, r30 + blt cr7, L(8) + mfcr r0 + rlwinm r0, r0, 30, 31, 31 + subfc r9, r28, r31 + addze. r0, r0 + nop + beq cr0, L(8) + subfc r31, r28, r31 + subfe r29, r30, r29 + li r23, 1 +') + +L(8): + add r27, r25, r6 + addic. r27, r27, -3 + blt cr0, L(18) + mr r3, r30 + CALL( mpn_invert_limb) + mulld r10, r3, r30 + mulhdu r0, r3, r28 + addc r8, r10, r28 + subfe r11, r1, r1 + addc r10, r8, r0 + addze. r11, r11 + blt cr0, L(91) +L(40): + subfc r10, r30, r10 + addme. r11, r11 + addi r3, r3, -1 + bge cr0, L(40) +L(91): + addi r5, r27, 1 + mtctr r5 + sldi r0, r27, 3 + add r24, r24, r0 + ALIGN(16) +L(loop): + mulhdu r8, r29, r3 + mulld r6, r29, r3 + addc r6, r6, r31 + adde r8, r8, r29 + cmpd cr7, r27, r25 + mulld r0, r30, r8 + mulhdu r11, r28, r8 + mulld r10, r28, r8 + subf r31, r0, r31 + li r7, 0 + blt cr7, L(60) + ld r7, 0(r26) + addi r26, r26, -8 + nop +L(60): subfc r7, r28, r7 + subfe r31, r30, r31 + subfc r7, r10, r7 + subfe r4, r11, r31 + subfc r9, r6, r4 + subfe r9, r1, r1 + andc r6, r28, r9 + andc r0, r30, r9 + addc r31, r7, r6 + adde r29, r4, r0 + subf r8, r9, r8 + cmpld cr7, r29, r30 + bge- cr7, L(fix) +L(bck): std r8, 0(r24) + addi r24, r24, -8 + addi r27, r27, -1 + bdnz L(loop) +L(18): + std r31, 8(r26) + std r29, 16(r26) + mr r3, r23 + addi r1, r1, 192 + ld r0, 16(r1) + mtlr r0 + ld r23, -72(r1) + ld r24, -64(r1) + ld r25, -56(r1) + ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +L(fix): + mfcr r0 + rlwinm r0, r0, 30, 31, 31 + subfc r9, r28, r31 + addze. r0, r0 + beq cr0, L(bck) + subfc r31, r28, r31 + subfe r29, r30, r29 + addi r8, r8, 1 + b L(bck) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm b/gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm new file mode 100644 index 0000000..f9792e5 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm @@ -0,0 +1,77 @@ +dnl PowerPC-64 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 ? +C POWER4/PPC970 8.5 obsolete +C POWER5 ? +C POWER6 ? +C POWER7 9.4 obsolete +C POWER8 ? +C POWER9 ? +C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 + +define(`u0', `r3') +define(`v0', `r4') + +define(`mask', `r0')dnl +define(`a1', `r4')dnl +define(`a2', `r5')dnl +define(`d1', `r6')dnl +define(`d2', `r7')dnl +define(`cnt', `r9')dnl + +ASM_START() +PROLOGUE(mpn_gcd_11) + li r12, 63 + mr r8, v0 + subf. r10, u0, v0 C r10 = d - a + beq L(end) + + ALIGN(16) +L(top): subfc r11, r8, r3 C r11 = a - d + and d2, r11, r10 + subfe mask, mask, mask + cntlzd cnt, d2 + and a1, r10, mask C d - a + andc a2, r11, mask C a - d + and d1, r3, mask C a + andc d2, r8, mask C d + or r3, a1, a2 C new a + subf cnt, cnt, r12 + or r8, d1, d2 C new d + srd r3, r3, cnt + subf. r10, r3, r8 C r10 = d - a + bne L(top) + +L(end): blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h new file mode 100644 index 0000000..f8305f4 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h @@ -0,0 +1,82 @@ +/* PowerPC-64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1600MHz PPC970 */ + +/* Generated by tuneup.c, 2009-01-14, gcc 4.0 */ + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 93 +#define MUL_TOOM44_THRESHOLD 135 + +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 74 +#define SQR_TOOM4_THRESHOLD 136 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 44 +#define MULLO_MUL_N_THRESHOLD 234 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 33 +#define POWM_THRESHOLD 89 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 93 +#define GCD_DC_THRESHOLD 237 +#define GCDEXT_DC_THRESHOLD 273 +#define JACOBI_BASE_METHOD 1 + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1_THRESHOLD 6 +#define MOD_1_2_THRESHOLD 9 +#define MOD_1_4_THRESHOLD 23 +#define USE_PREINV_DIVREM_1 0 +#define USE_PREINV_MOD_1 0 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */ + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 650 +#define SET_STR_PRECOMPUTE_THRESHOLD 1713 + +#define MUL_FFT_TABLE { 336, 672, 1856, 2816, 7168, 20480, 81920, 327680, 0 } +#define MUL_FFT_MODF_THRESHOLD 304 +#define MUL_FFT_THRESHOLD 4224 + +#define SQR_FFT_TABLE { 272, 672, 1600, 2816, 7168, 20480, 81920, 327680, 786432, 0 } +#define SQR_FFT_MODF_THRESHOLD 272 +#define SQR_FFT_THRESHOLD 2688 diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm b/gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm new file mode 100644 index 0000000..dfdba64 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm @@ -0,0 +1,88 @@ +dnl PowerPC-64 mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright 2004-2006, 2008, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
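What the routine below computes, stated directly: for a normalized divisor d (high bit set), the reciprocal is v = floor((B^2 - 1)/d) - B with B = 2^64. The assembly refines a small table estimate (approx_tab) through the v1, v2, v3 steps visible in its comments; here is a one-line 128-bit reference with our own function name, using the fact that the two-limb value (~d, ~0) is exactly B^2 - 1 - d*B:

#include <stdint.h>

typedef unsigned __int128 u128;

/* d must have its high bit set. */
static uint64_t invert_limb_ref(uint64_t d)
{
  return (uint64_t)((((u128)~d << 64) | ~(uint64_t)0) / d);
}

Callers then divide through udiv_qrnnd_preinv-style steps, as divrem_1.asm and divrem_2.asm above do after calling this function.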
+ +include(`../config.m4') + +C cycles/limb (approximate) +C POWER3/PPC630 80 +C POWER4/PPC970 86 +C POWER5 86 +C POWER6 170 +C POWER7 66 + +ASM_START() +PROLOGUE(mpn_invert_limb,toc) + LEAL( r12, approx_tab) + srdi r9, r3, 32 + rlwinm r9, r9, 10, 23, 30 C (d >> 55) & 0x1fe + srdi r10, r3, 24 C d >> 24 + lis r11, 0x1000 + rldicl r8, r3, 0, 63 C d mod 2 + addi r10, r10, 1 C d40 + sldi r11, r11, 32 C 2^60 + srdi r7, r3, 1 C d/2 + add r7, r7, r8 C d63 = ceil(d/2) + neg r8, r8 C mask = -(d mod 2) + lhzx r0, r9, r12 + mullw r9, r0, r0 C v0*v0 + sldi r6, r0, 11 C v0 << 11 + addi r0, r6, -1 C (v0 << 11) - 1 + mulld r9, r9, r10 C v0*v0*d40 + srdi r9, r9, 40 C v0*v0*d40 >> 40 + subf r9, r9, r0 C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1 + mulld r0, r9, r10 C v1*d40 + sldi r6, r9, 13 C v1 << 13 + subf r0, r0, r11 C 2^60 - v1*d40 + mulld r0, r0, r9 C v1 * (2^60 - v1*d40) + srdi r0, r0, 47 C v1 * (2^60 - v1*d40) >> 47 + add r0, r0, r6 C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47) + mulld r11, r0, r7 C v2 * d63 + srdi r10, r0, 1 C v2 >> 1 + sldi r9, r0, 31 C v2 << 31 + and r8, r10, r8 C (v2 >> 1) & mask + subf r8, r11, r8 C ((v2 >> 1) & mask) - v2 * d63 + mulhdu r0, r8, r0 C p1 = v2 * (((v2 >> 1) & mask) - v2 * d63) + srdi r0, r0, 1 C p1 >> 1 + add r0, r0, r9 C v3 = (v2 << 31) + (p1 >> 1) + nop + mulld r11, r0, r3 + mulhdu r9, r0, r3 + addc r10, r11, r3 + adde r3, r9, r3 + subf r3, r3, r0 + blr +EPILOGUE() + +DEF_OBJECT(approx_tab) +forloop(i,256,512-1,dnl +` .short eval(0x7fd00/i) +')dnl +END_OBJECT(approx_tab) +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm new file mode 100644 index 0000000..8733730 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm @@ -0,0 +1,164 @@ +dnl PowerPC-64 mpn_mod_1_1p + +dnl Copyright 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 17 +C POWER5 16 +C POWER6 30 +C POWER7 10.2 + +C TODO +C * Optimise, in particular the cps function. This was compiler-generated and +C then hand optimised. 
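The main loop below keeps a two-limb residue in r9:r11 and folds in one limb per iteration through the precomputed constants B1modb = B mod b (offset 16 of cps) and B2modb = B^2 mod b (offset 24), using the congruence r1*B^2 + r0*B + a[i] == r1*B2modb + r0*B1modb + a[i] (mod b). A C sketch of just that folding; the final shift-and-multiply reduction against cps[0] is omitted and the names are ours:

#include <stdint.h>

typedef unsigned __int128 u128;

/* Returns a two-limb value (r1,r0) congruent to {ap,n} mod b.
   Requires n >= 2; b1 = B mod b, b2 = B^2 mod b with B = 2^64.
   The u128 sums below are the same 128-bit accumulations the asm
   performs with addc/addze/adde. */
static void mod_1_1_fold(const uint64_t *ap, long n,
                         uint64_t b1, uint64_t b2,
                         uint64_t *r1, uint64_t *r0)
{
  u128 r = (u128)ap[n-1] * b1 + ap[n-2];     /* top two limbs */
  for (long i = n - 3; i >= 0; i--)
    r = (u128)(uint64_t)r * b1 + (u128)(uint64_t)(r >> 64) * b2 + ap[i];
  *r1 = (uint64_t)(r >> 64);
  *r0 = (uint64_t)r;
}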
+ +C INPUT PARAMETERS +define(`ap', `r3') +define(`n', `r4') +define(`d', `r5') +define(`cps', `r6') + +ASM_START() + +EXTERN_FUNC(mpn_invert_limb) + +PROLOGUE(mpn_mod_1_1p) + sldi r10, r4, 3 + addi r4, r4, -1 + add r3, r3, r10 + ld r0, 16(r6) C B1modb + ld r12, 24(r6) C B2modb + ld r9, -8(r3) + ld r10, -16(r3) + mtctr r4 + mulhdu r8, r9, r0 + mulld r7, r9, r0 + addc r11, r7, r10 + addze r9, r8 + bdz L(end) + + ALIGN(16) +L(top): ld r4, -24(r3) + addi r3, r3, -8 + nop + mulld r10, r11, r0 + mulld r8, r9, r12 + mulhdu r11, r11, r0 + mulhdu r9, r9, r12 + addc r7, r10, r4 + addze r10, r11 + addc r11, r8, r7 + adde r9, r9, r10 + bdnz L(top) + +L(end): +ifdef(`HAVE_LIMB_LITTLE_ENDIAN', +` lwz r0, 8(r6)', +` lwz r0, 12(r6)') + ld r3, 0(r6) + cmpdi cr7, r0, 0 + beq- cr7, L(4) + subfic r10, r0, 64 + sld r9, r9, r0 + srd r10, r11, r10 + or r9, r10, r9 +L(4): subfc r10, r5, r9 + subfe r10, r10, r10 + nand r10, r10, r10 + sld r11, r11, r0 + and r10, r10, r5 + subf r9, r10, r9 + mulhdu r10, r9, r3 + mulld r3, r9, r3 + addi r9, r9, 1 + addc r8, r3, r11 + adde r3, r10, r9 + mulld r3, r3, r5 + subf r3, r3, r11 + cmpld cr7, r8, r3 + bge cr7, L(5) C FIXME: Make branch-less + add r3, r3, r5 +L(5): cmpld cr7, r3, r5 + bge- cr7, L(10) + srd r3, r3, r0 + blr + +L(10): subf r3, r5, r3 + srd r3, r3, r0 + blr +EPILOGUE() + +PROLOGUE(mpn_mod_1_1p_cps,toc) + mflr r0 + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + cntlzd r31, r4 + std r0, 16(r1) + extsw r31, r31 + mr r29, r3 + stdu r1, -144(r1) + sld r30, r4, r31 + mr r3, r30 + CALL( mpn_invert_limb) + cmpdi cr7, r31, 0 + neg r0, r30 + beq- cr7, L(13) + subfic r11, r31, 64 + li r0, 1 + neg r9, r30 + srd r11, r3, r11 + sld r0, r0, r31 + or r0, r11, r0 + mulld r0, r0, r9 +L(13): mulhdu r9, r0, r3 + mulld r11, r0, r3 + add r9, r0, r9 + nor r9, r9, r9 + mulld r9, r9, r30 + cmpld cr7, r11, r9 + bge cr7, L(14) + add r9, r9, r30 +L(14): addi r1, r1, 144 + srd r0, r0, r31 + std r31, 8(r29) + std r3, 0(r29) + std r0, 16(r29) + ld r0, 16(r1) + srd r9, r9, r31 + ld r30, -16(r1) + ld r31, -8(r1) + std r9, 24(r29) + ld r29, -24(r1) + mtlr r0 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm new file mode 100644 index 0000000..0b7d6bf --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm @@ -0,0 +1,270 @@ +dnl PowerPC-64 mpn_mod_1s_4p + +dnl Copyright 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? 
+C POWER4/PPC970 9 +C POWER5 9 +C POWER6 13 +C POWER7 3.5 + +C TODO +C * Optimise, in particular the cps function. This was compiler-generated and +C then hand optimised. + +C INPUT PARAMETERS +define(`ap', `r3') +define(`n', `r4') +define(`d', `r5') +define(`cps', `r6') + +ASM_START() + +EXTERN_FUNC(mpn_invert_limb) + +PROLOGUE(mpn_mod_1s_4p) + std r23, -72(r1) + ld r23, 48(cps) + std r24, -64(r1) + std r25, -56(r1) + ld r24, 32(cps) + ld r25, 24(cps) + std r26, -48(r1) + std r27, -40(r1) + ld r26, 16(cps) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + ld r30, 40(cps) + + rldicl. r0, n, 0,62 + sldi r31, n, 3 + add ap, ap, r31 C make ap point at end of operand + + cmpdi cr7, r0, 2 + beq cr0, L(b00) + blt cr7, L(b01) + beq cr7, L(b10) + +L(b11): ld r11, -16(ap) + ld r9, -8(ap) + ld r0, -24(ap) + mulhdu r27, r11, r26 + mulld r8, r11, r26 + mulhdu r11, r9, r25 + mulld r9, r9, r25 + addc r31, r8, r0 + addze r10, r27 + addc r0, r9, r31 + adde r9, r11, r10 + addi ap, ap, -40 + b L(6) + + ALIGN(16) +L(b00): ld r11, -24(ap) + ld r10, -16(ap) + ld r9, -8(ap) + ld r0, -32(ap) + mulld r8, r11, r26 + mulhdu r7, r10, r25 + mulhdu r27, r11, r26 + mulhdu r11, r9, r24 + mulld r10, r10, r25 + mulld r9, r9, r24 + addc r31, r8, r0 + addze r0, r27 + addc r8, r31, r10 + adde r10, r0, r7 + addc r0, r9, r8 + adde r9, r11, r10 + addi ap, ap, -48 + b L(6) + + ALIGN(16) +L(b01): li r9, 0 + ld r0, -8(ap) + addi ap, ap, -24 + b L(6) + + ALIGN(16) +L(b10): ld r9, -8(ap) + ld r0, -16(ap) + addi ap, ap, -32 + + ALIGN(16) +L(6): addi r10, n, 3 + srdi r7, r10, 2 + mtctr r7 + bdz L(end) + + ALIGN(16) +L(top): ld r31, -16(ap) + ld r10, -8(ap) + ld r11, 8(ap) + ld r12, 0(ap) + mulld r29, r0, r30 C rl * B4modb + mulhdu r0, r0, r30 C rl * B4modb + mulhdu r27, r10, r26 + mulld r10, r10, r26 + mulhdu r7, r9, r23 C rh * B5modb + mulld r9, r9, r23 C rh * B5modb + mulhdu r28, r11, r24 + mulld r11, r11, r24 + mulhdu r4, r12, r25 + mulld r12, r12, r25 + addc r8, r10, r31 + addze r10, r27 + addi ap, ap, -32 + addc r27, r8, r12 + adde r12, r10, r4 + addc r11, r27, r11 + adde r31, r12, r28 + addc r12, r11, r29 + adde r4, r31, r0 + addc r0, r9, r12 + adde r9, r7, r4 + bdnz L(top) + +L(end): +ifdef(`HAVE_LIMB_LITTLE_ENDIAN', +` lwz r3, 8(cps)', +` lwz r3, 12(cps)') + mulld r10, r9, r26 + mulhdu r9, r9, r26 + addc r11, r0, r10 + addze r9, r9 + ld r10, 0(cps) + subfic r8, r3, 64 + sld r9, r9, r3 + srd r8, r11, r8 + sld r11, r11, r3 + or r9, r8, r9 + mulld r0, r9, r10 + mulhdu r10, r9, r10 + addi r9, r9, 1 + addc r8, r0, r11 + adde r0, r10, r9 + mulld r0, r0, d + subf r0, r0, r11 + cmpld cr7, r8, r0 + bge cr7, L(9) + add r0, r0, d +L(9): cmpld cr7, r0, d + bge- cr7, L(16) +L(10): srd r3, r0, r3 + ld r23, -72(r1) + ld r24, -64(r1) + ld r25, -56(r1) + ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr + +L(16): subf r0, d, r0 + b L(10) +EPILOGUE() + +PROLOGUE(mpn_mod_1s_4p_cps,toc) + mflr r0 + std r29, -24(r1) + std r30, -16(r1) + mr r29, r3 + std r0, 16(r1) + std r31, -8(r1) + stdu r1, -144(r1) + cntlzd r31, r4 + sld r30, r4, r31 + mr r3, r30 + CALL( mpn_invert_limb) + subfic r9, r31, 64 + li r10, 1 + sld r10, r10, r31 + srd r9, r3, r9 + neg r0, r30 + or r10, r10, r9 + mulld r10, r10, r0 + mulhdu r11, r10, r3 + nor r11, r11, r11 + subf r11, r10, r11 + mulld r11, r11, r30 + mulld r0, r10, r3 + cmpld cr7, r0, r11 + bge cr7, L(18) + add r11, r11, r30 +L(18): mulhdu r9, r11, r3 + add r9, r11, r9 + nor r9, r9, r9 + mulld r9, r9, r30 + mulld r0, r11, r3 + 
cmpld cr7, r0, r9 + bge cr7, L(19) + add r9, r9, r30 +L(19): mulhdu r0, r9, r3 + add r0, r9, r0 + nor r0, r0, r0 + mulld r0, r0, r30 + mulld r8, r9, r3 + cmpld cr7, r8, r0 + bge cr7, L(20) + add r0, r0, r30 +L(20): mulhdu r8, r0, r3 + add r8, r0, r8 + nor r8, r8, r8 + mulld r8, r8, r30 + mulld r7, r0, r3 + cmpld cr7, r7, r8 + bge cr7, L(21) + add r8, r8, r30 +L(21): srd r0, r0, r31 + addi r1, r1, 144 + srd r8, r8, r31 + srd r10, r10, r31 + srd r11, r11, r31 + std r0, 40(r29) + std r31, 8(r29) + srd r9, r9, r31 + ld r0, 16(r1) + ld r30, -16(r1) + std r8, 48(r29) + std r3, 0(r29) + mtlr r0 + ld r31, -8(r1) + std r10, 16(r29) + std r11, 24(r29) + std r9, 32(r29) + ld r29, -24(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm new file mode 100644 index 0000000..c35e0e3 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm @@ -0,0 +1,132 @@ +dnl PowerPC-64 mpn_mod_34lsub1 -- modulo 2^48-1. + +dnl Copyright 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
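The mod_1s_4p code above unrolls the same folding four limbs deep, which is why its cps companion tabulates five constants: offsets 16 through 48 of the cps block hold what the loop comments call B1modb through B5modb, produced by the repeated mulhdu/nor/mulld blocks. A sketch of that precomputation in plain arithmetic, under the assumption that these fields are exactly B^1..B^5 mod the normalized divisor; __int128 assumed, and the names here are illustrative:

/* Tabulate Bmodb[i] = B^(i+1) mod dn, dn = d << cnt normalized,
   the constants the mod_1s_4p main loop multiplies by. */
#include <stdint.h>

void mod_1s_4p_cps_ref(uint64_t Bmodb[5], uint64_t d, unsigned cnt)
{
    uint64_t dn = d << cnt;            /* normalized divisor */
    unsigned __int128 p = 1;
    for (int i = 0; i < 5; i++) {
        p = (p << 64) % dn;            /* B^(i+1) mod dn; p < dn so p<<64 fits */
        Bmodb[i] = (uint64_t)p;
    }
}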
+ +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1.33 +C POWER4/PPC970 1.5 +C POWER5 1.32 +C POWER6 2.35 +C POWER7 1 + +C INPUT PARAMETERS +define(`up',`r3') +define(`n',`r4') + +ASM_START() +PROLOGUE(mpn_mod_34lsub1) + li r8, 0 + li r9, 0 + li r10, 0 + li r11, 0 + + cmpdi cr6, n, 3 + blt cr6, L(lt3) + + li r0, -0x5556 C 0xFFFFFFFFFFFFAAAA + rldimi r0, r0, 16, 32 C 0xFFFFFFFFAAAAAAAA + rldimi r0, r0, 32, 63 C 0xAAAAAAAAAAAAAAAB + mulhdu r0, r0, n + srdi r0, r0, 1 C r0 = [n / 3] + mtctr r0 + + ld r5, 0(up) + ld r6, 8(up) + ld r7, 16(up) + addi up, up, 24 + bdz L(end) + + ALIGN(16) +L(top): addc r8, r8, r5 + nop + ld r5, 0(up) + adde r9, r9, r6 + ld r6, 8(up) + adde r10, r10, r7 + ld r7, 16(up) + addi up, up, 48 + addze r11, r11 + bdz L(endx) + addc r8, r8, r5 + nop + ld r5, -24(up) + adde r9, r9, r6 + ld r6, -16(up) + adde r10, r10, r7 + ld r7, -8(up) + addze r11, r11 + bdnz L(top) + + addi up, up, 24 +L(endx): + addi up, up, -24 + +L(end): addc r8, r8, r5 + adde r9, r9, r6 + adde r10, r10, r7 + addze r11, r11 + + sldi r5, r0, 1 + add r5, r5, r0 C r11 = n / 3 * 3 + sub n, n, r5 C n = n mod 3 +L(lt3): cmpdi cr6, n, 1 + blt cr6, L(2) + + ld r5, 0(up) + addc r8, r8, r5 + li r6, 0 + beq cr6, L(1) + + ld r6, 8(up) +L(1): adde r9, r9, r6 + addze r10, r10 + addze r11, r11 + +L(2): rldicl r0, r8, 0, 16 C r0 = r8 mod 2^48 + srdi r3, r8, 48 C r3 = r8 div 2^48 + rldic r4, r9, 16, 16 C r4 = (r9 mod 2^32) << 16 + srdi r5, r9, 32 C r5 = r9 div 2^32 + rldic r6, r10, 32, 16 C r6 = (r10 mod 2^16) << 32 + srdi r7, r10, 16 C r7 = r10 div 2^16 + + add r0, r0, r3 + add r4, r4, r5 + add r6, r6, r7 + + add r0, r0, r4 + add r6, r6, r11 + + add r3, r0, r6 + blr +EPILOGUE() + +C |__r10__|__r9___|__r8___| +C |-----|-----|-----|-----| diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm new file mode 100644 index 0000000..726339a --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm @@ -0,0 +1,117 @@ +dnl PowerPC-64 mpn_modexact_1_odd -- mpn by limb exact remainder. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 13-19 +C POWER4/PPC970 16 +C POWER5 16 +C POWER6 ? +C POWER7 12 + +C TODO +C * Check if n=1 code is really an improvement. It probably isn't. +C * Make more similar to dive_1.asm. 
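mpn_modexact_1c_odd below first builds the inverse of the odd divisor d modulo 2^64: an 8-bit seed from binvert_limb_table, then three Newton doublings inv <- 2*inv - d*inv^2, visible as the mulld/sldi/mulld/subf groups after L(2). The loop then cancels one low limb per iteration, Montgomery-style, and the result is 0 exactly when d divides the input (for cy = 0), which is how GMP uses it for divisibility tests. A self-contained sketch, seeded with the classic (3*d)^2 trick instead of GMP's table so no external data is needed; the _ref names are illustrative:

/* 2-adic inverse of odd d, then a modexact pass.  Assumes GCC/Clang
   unsigned __int128.  Not GMP's code, same recurrences. */
#include <stdint.h>
#include <stddef.h>

static uint64_t binvert_ref(uint64_t d)          /* d odd */
{
    uint64_t inv = (3 * d) ^ 2;                  /* 5 correct low bits */
    inv *= 2 - d * inv;                          /* 10 bits */
    inv *= 2 - d * inv;                          /* 20 bits */
    inv *= 2 - d * inv;                          /* 40 bits */
    inv *= 2 - d * inv;                          /* 64 bits: d*inv == 1 mod 2^64 */
    return inv;
}

uint64_t modexact_1c_odd_ref(const uint64_t *up, size_t n,
                             uint64_t d, uint64_t cy)
{
    uint64_t dinv = binvert_ref(d);
    for (size_t i = 0; i < n; i++) {
        uint64_t s  = up[i] - cy;
        uint64_t bw = up[i] < cy;                /* borrow out */
        uint64_t q  = s * dinv;                  /* q*d == s (mod 2^64) */
        cy = (uint64_t)(((unsigned __int128)q * d) >> 64) + bw;
    }
    return cy;
}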
+ +C INPUT PARAMETERS +define(`up', `r3') +define(`n', `r4') +define(`d', `r5') +define(`cy', `r6') + + +ASM_START() + +EXTERN(binvert_limb_table) + +PROLOGUE(mpn_modexact_1c_odd,toc) + addic. n, n, -1 C set carry as side effect + ld r8, 0(up) + bne cr0, L(2) + cmpld cr7, r6, r8 + bge cr7, L(4) + subf r8, r6, r8 + divdu r3, r8, d + mulld r3, r3, d + subf. r3, r3, r8 + beqlr cr0 + subf r3, r3, d + blr + +L(4): subf r3, r8, r6 + divdu r8, r3, d + mulld r8, r8, d + subf r3, r8, r3 + blr + +L(2): LEA( r7, binvert_limb_table) + rldicl r9, d, 63, 57 + mtctr n + lbzx r0, r7, r9 + mulld r7, r0, r0 + sldi r0, r0, 1 + mulld r7, d, r7 + subf r0, r7, r0 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf r0, r9, r0 + mulld r7, r0, r0 + sldi r0, r0, 1 + mulld r7, d, r7 + subf r9, r7, r0 + + ALIGN(16) +L(loop): + subfe r0, r6, r8 + ld r8, 8(up) + addi up, up, 8 + mulld r0, r9, r0 + mulhdu r6, r0, d + bdnz L(loop) + + cmpld cr7, d, r8 + blt cr7, L(10) + + subfe r0, r0, r0 + subf r6, r0, r6 + cmpld cr7, r6, r8 + subf r3, r8, r6 + bgelr cr7 + add r3, d, r3 + blr + +L(10): subfe r0, r6, r8 + mulld r0, r9, r0 + mulhdu r3, r0, d + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm new file mode 100644 index 0000000..27a8f8f --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm @@ -0,0 +1,168 @@ +dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 7.25? not updated for last file revision +C POWER5 7.25 +C POWER6 14 +C POWER7 2.9 + +C TODO +C * Try to reduce the number of needed live registers (at least r5 and r10 +C could be combined) +C * Optimize feed-in code, for speed and size. +C * Clean up r12/r7 usage in feed-in code. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`vl', `r6') + +ASM_START() +PROLOGUE(mpn_mul_1c) + std r27, -40(r1) + std r26, -48(r1) + mr r12, r7 + b L(ent) +EPILOGUE() +PROLOGUE(mpn_mul_1) + std r27, -40(r1) + std r26, -48(r1) + li r12, 0 C cy_limb = 0 +L(ent): ld r26, 0(up) + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addic n, n, 3 C compute count... 
+ srdi n, n, 2 C ...for ctr + mtctr n C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): mr r7, r12 + mulld r0, r26, r6 + mulhdu r12, r26, r6 + addi up, up, 8 + addc r0, r0, r7 + std r0, 0(rp) + addi rp, rp, 8 + b L(fic) + +L(b00): ld r27, 8(up) + addi up, up, 16 + mulld r0, r26, r6 + mulhdu r5, r26, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + addc r0, r0, r12 + adde r7, r7, r5 + addze r12, r8 + std r0, 0(rp) + std r7, 8(rp) + addi rp, rp, 16 + b L(fic) + + nop C alignment +L(b01): bdnz L(gt1) + mulld r0, r26, r6 + mulhdu r8, r26, r6 + addc r0, r0, r12 + std r0, 0(rp) + b L(ret) +L(gt1): ld r27, 8(up) + nop + mulld r0, r26, r6 + mulhdu r5, r26, r6 + ld r26, 16(up) + mulld r7, r27, r6 + mulhdu r8, r27, r6 + mulld r9, r26, r6 + mulhdu r10, r26, r6 + addc r0, r0, r12 + adde r7, r7, r5 + adde r9, r9, r8 + addze r12, r10 + std r0, 0(rp) + std r7, 8(rp) + std r9, 16(rp) + addi up, up, 24 + addi rp, rp, 24 + b L(fic) + + nop +L(fic): ld r26, 0(up) +L(b10): ld r27, 8(up) + addi up, up, 16 + bdz L(end) + +L(top): mulld r0, r26, r6 + mulhdu r5, r26, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r26, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r5 + mulld r9, r26, r6 + mulhdu r10, r26, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r26, 16(up) + ld r27, 24(up) + std r0, 0(rp) + adde r9, r9, r8 + std r7, 8(rp) + adde r11, r11, r10 + std r9, 16(rp) + addi up, up, 32 + std r11, 24(rp) + + addi rp, rp, 32 + bdnz L(top) + +L(end): mulld r0, r26, r6 + mulhdu r5, r26, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r5 + std r0, 0(rp) + std r7, 8(rp) +L(ret): addze r3, r8 + ld r27, -40(r1) + ld r26, -48(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm new file mode 100644 index 0000000..1873187 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm @@ -0,0 +1,708 @@ +dnl PowerPC-64 mpn_mul_basecase. + +dnl Copyright 1999-2001, 2003-2006, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
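mpn_mul_1 above computes rp[0..n-1] = {up,n} * vl and returns the high limb (mul_1c additionally seeds the carry from r7); the 4-way feed-in blocks merely enter the unrolled loop in the right phase. mul_basecase below runs one such pass per v limb, the later passes also accumulating into rp. Reference semantics, assuming unsigned __int128; mul_1_ref is illustrative, not GMP's API:

/* rp[] = up[] * vl, returning the carry-out (high) limb. */
#include <stdint.h>
#include <stddef.h>

uint64_t mul_1_ref(uint64_t *rp, const uint64_t *up, size_t n, uint64_t vl)
{
    uint64_t cy = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128)up[i] * vl + cy;
        rp[i] = (uint64_t)p;                /* low limb */
        cy = (uint64_t)(p >> 64);           /* high limb */
    }
    return cy;
}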
+ +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 8 +C POWER5 8 +C POWER6 24 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`un', `r5') +define(`vp', `r6') +define(`vn', `r7') + +define(`v0', `r25') +define(`outer_rp', `r22') +define(`outer_up', `r23') + +ASM_START() +PROLOGUE(mpn_mul_basecase) + +C Special code for un <= 2, for efficiency of these important cases, +C and since it simplifies the default code. + cmpdi cr0, un, 2 + bgt cr0, L(un_gt2) + cmpdi cr6, vn, 1 + ld r7, 0(vp) + ld r5, 0(up) + mulld r8, r5, r7 C weight 0 + mulhdu r9, r5, r7 C weight 1 + std r8, 0(rp) + beq cr0, L(2x) + std r9, 8(rp) + blr + ALIGN(16) +L(2x): ld r0, 8(up) + mulld r8, r0, r7 C weight 1 + mulhdu r10, r0, r7 C weight 2 + addc r9, r9, r8 + addze r10, r10 + bne cr6, L(2x2) + std r9, 8(rp) + std r10, 16(rp) + blr + ALIGN(16) +L(2x2): ld r6, 8(vp) + nop + mulld r8, r5, r6 C weight 1 + mulhdu r11, r5, r6 C weight 2 + addc r9, r9, r8 + std r9, 8(rp) + adde r11, r11, r10 + mulld r12, r0, r6 C weight 2 + mulhdu r0, r0, r6 C weight 3 + addze r0, r0 + addc r11, r11, r12 + addze r0, r0 + std r11, 16(rp) + std r0, 24(rp) + blr + +L(un_gt2): + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + std r26, -48(r1) + std r25, -56(r1) + std r24, -64(r1) + std r23, -72(r1) + std r22, -80(r1) + + mr outer_rp, rp + mr outer_up, up + + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, 0(up) + + rldicl. r0, un, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi un, un, 1 C compute count... + srdi un, un, 2 C ...for ctr + mtctr un C copy inner loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + + + ALIGN(16) +L(b3): mulld r0, r26, v0 + mulhdu r12, r26, v0 + addic r0, r0, 0 + std r0, 0(rp) + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_m_3) + + ALIGN(16) +L(lo_m_3): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 24(up) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r27, 32(up) + nop + adde r0, r0, r12 + adde r24, r24, r31 + mulld r9, r26, v0 + mulhdu r10, r26, v0 + ld r26, 40(up) + nop + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ld r27, 48(up) + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r10 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(lo_m_3) + + ALIGN(16) +L(end_m_3): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + + mulld r24, r27, v0 + mulhdu r8, r27, v0 + + adde r0, r0, r12 + adde r24, r24, r31 + + std r0, 8(rp) + std r24, 16(rp) + addze r8, r8 + std r8, 24(rp) + addic. 
vn, vn, -1 + beq L(ret) + + ALIGN(16) +L(outer_lo_3): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 8 + mr up, outer_up + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, 0(up) + ld r28, 0(rp) + mulld r0, r26, v0 + mulhdu r12, r26, v0 + addc r0, r0, r28 + std r0, 0(rp) + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_3) + + ALIGN(16) C registers dying +L(lo_3): + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 24(up) C + ld r28, 8(rp) C + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + ld r27, 32(up) C + ld r29, 16(rp) C + adde r0, r0, r12 C 0 12 + adde r24, r24, r10 C 24 10 + mulld r9, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 40(up) C + ld r30, 24(rp) C + mulld r11, r27, v0 C + mulhdu r12, r27, v0 C 27 + ld r27, 48(up) C + ld r31, 32(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 8(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, 16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, 24(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 32(rp) C 11 + addi up, up, 32 C + addi rp, rp, 32 C + bdnz L(lo_3) C + + ALIGN(16) +L(end_3): + mulld r0, r26, v0 + mulhdu r10, r26, v0 + ld r28, 8(rp) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r29, 16(rp) + nop + adde r0, r0, r12 + adde r24, r24, r10 + addze r8, r8 + addc r0, r0, r28 + std r0, 8(rp) + adde r24, r24, r29 + std r24, 16(rp) + addze r8, r8 + std r8, 24(rp) + + addic. vn, vn, -1 + bne L(outer_lo_3) + b L(ret) + + + ALIGN(16) +L(b0): ld r27, 8(up) + addi up, up, 8 + mulld r0, r26, v0 + mulhdu r10, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + addc r24, r24, r10 + addze r12, r8 + std r0, 0(rp) + std r24, 8(rp) + addi rp, rp, 8 + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_m_0) + + ALIGN(16) +L(lo_m_0): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 24(up) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r27, 32(up) + nop + adde r0, r0, r12 + adde r24, r24, r31 + mulld r9, r26, v0 + mulhdu r10, r26, v0 + ld r26, 40(up) + nop + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ld r27, 48(up) + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r10 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(lo_m_0) + + ALIGN(16) +L(end_m_0): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + + mulld r24, r27, v0 + mulhdu r8, r27, v0 + + adde r0, r0, r12 + adde r24, r24, r31 + + std r0, 8(rp) + addze r8, r8 + std r24, 16(rp) + addic. 
vn, vn, -1 + std r8, 24(rp) + nop + beq L(ret) + + ALIGN(16) +L(outer_lo_0): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 16 + addi up, outer_up, 8 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, -8(up) + ld r27, 0(up) + ld r28, -8(rp) + ld r29, 0(rp) + nop + nop + mulld r0, r26, v0 + mulhdu r10, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + addc r24, r24, r10 + addze r12, r8 + addc r0, r0, r28 + std r0, -8(rp) + adde r24, r24, r29 + std r24, 0(rp) + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_0) + + ALIGN(16) C registers dying +L(lo_0): + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 24(up) C + ld r28, 8(rp) C + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + ld r27, 32(up) C + ld r29, 16(rp) C + adde r0, r0, r12 C 0 12 + adde r24, r24, r10 C 24 10 + mulld r9, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 40(up) C + ld r30, 24(rp) C + mulld r11, r27, v0 C + mulhdu r12, r27, v0 C 27 + ld r27, 48(up) C + ld r31, 32(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 8(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, 16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, 24(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 32(rp) C 11 + addi up, up, 32 C + addi rp, rp, 32 C + bdnz L(lo_0) C + + ALIGN(16) +L(end_0): + mulld r0, r26, v0 + mulhdu r10, r26, v0 + ld r28, 8(rp) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r29, 16(rp) + nop + adde r0, r0, r12 + adde r24, r24, r10 + addze r8, r8 + addic. vn, vn, -1 + addc r0, r0, r28 + std r0, 8(rp) + adde r24, r24, r29 + std r24, 16(rp) + addze r8, r8 + std r8, 24(rp) + bne L(outer_lo_0) + b L(ret) + + + ALIGN(16) +L(b1): ld r27, 8(up) + nop + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 16(up) + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r26, v0 + mulhdu r10, r26, v0 + addc r24, r24, r31 + adde r9, r9, r8 + addze r12, r10 + std r0, 0(rp) + std r24, 8(rp) + std r9, 16(rp) + addi up, up, 16 + addi rp, rp, 16 + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_m_1) + + ALIGN(16) +L(lo_m_1): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 24(up) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r27, 32(up) + nop + adde r0, r0, r12 + adde r24, r24, r31 + mulld r9, r26, v0 + mulhdu r10, r26, v0 + ld r26, 40(up) + nop + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ld r27, 48(up) + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r10 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(lo_m_1) + + ALIGN(16) +L(end_m_1): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + + mulld r24, r27, v0 + mulhdu r8, r27, v0 + + adde r0, r0, r12 + adde r24, r24, r31 + + std r0, 8(rp) + addze r8, r8 + std r24, 16(rp) + addic. 
vn, vn, -1 + std r8, 24(rp) + nop + beq L(ret) + + ALIGN(16) +L(outer_lo_1): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 24 + addi up, outer_up, 16 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, -16(up) + ld r27, -8(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 0(up) + ld r28, -16(rp) + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r29, -8(rp) + ld r30, 0(rp) + mulld r9, r26, v0 + mulhdu r10, r26, v0 + addc r24, r24, r31 + adde r9, r9, r8 + addze r12, r10 + addc r0, r0, r28 + std r0, -16(rp) + adde r24, r24, r29 + std r24, -8(rp) + adde r9, r9, r30 + std r9, 0(rp) + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_1) + + ALIGN(16) C registers dying +L(lo_1): + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 24(up) C + ld r28, 8(rp) C + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + ld r27, 32(up) C + ld r29, 16(rp) C + adde r0, r0, r12 C 0 12 + adde r24, r24, r10 C 24 10 + mulld r9, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 40(up) C + ld r30, 24(rp) C + mulld r11, r27, v0 C + mulhdu r12, r27, v0 C 27 + ld r27, 48(up) C + ld r31, 32(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 8(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, 16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, 24(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 32(rp) C 11 + addi up, up, 32 C + addi rp, rp, 32 C + bdnz L(lo_1) C + + ALIGN(16) +L(end_1): + mulld r0, r26, v0 + mulhdu r10, r26, v0 + ld r28, 8(rp) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r29, 16(rp) + nop + adde r0, r0, r12 + adde r24, r24, r10 + addze r8, r8 + addic. vn, vn, -1 + addc r0, r0, r28 + std r0, 8(rp) + adde r24, r24, r29 + std r24, 16(rp) + addze r8, r8 + std r8, 24(rp) + bne L(outer_lo_1) + b L(ret) + + + ALIGN(16) +L(b2): ld r27, 8(up) + addi up, up, -8 + addi rp, rp, -8 + li r12, 0 + addic r12, r12, 0 + + ALIGN(16) +L(lo_m_2): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 24(up) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r27, 32(up) + nop + adde r0, r0, r12 + adde r24, r24, r31 + mulld r9, r26, v0 + mulhdu r10, r26, v0 + ld r26, 40(up) + nop + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ld r27, 48(up) + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r10 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + + addi rp, rp, 32 + bdnz L(lo_m_2) + + ALIGN(16) +L(end_m_2): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + + mulld r24, r27, v0 + mulhdu r8, r27, v0 + + adde r0, r0, r12 + adde r24, r24, r31 + + std r0, 8(rp) + addze r8, r8 + std r24, 16(rp) + addic. 
vn, vn, -1 + std r8, 24(rp) + nop + beq L(ret) + + ALIGN(16) +L(outer_lo_2): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 0 + addi up, outer_up, -8 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, 8(up) + ld r27, 16(up) + li r12, 0 + addic r12, r12, 0 + + ALIGN(16) C registers dying +L(lo_2): + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 24(up) C + ld r28, 8(rp) C + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + ld r27, 32(up) C + ld r29, 16(rp) C + adde r0, r0, r12 C 0 12 + adde r24, r24, r10 C 24 10 + mulld r9, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 40(up) C + ld r30, 24(rp) C + mulld r11, r27, v0 C + mulhdu r12, r27, v0 C 27 + ld r27, 48(up) C + ld r31, 32(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 8(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, 16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, 24(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 32(rp) C 11 + addi up, up, 32 C + addi rp, rp, 32 C + bdnz L(lo_2) C + + ALIGN(16) +L(end_2): + mulld r0, r26, v0 + mulhdu r10, r26, v0 + ld r28, 8(rp) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r29, 16(rp) + nop + adde r0, r0, r12 + adde r24, r24, r10 + addze r8, r8 + addic. vn, vn, -1 + addc r0, r0, r28 + std r0, 8(rp) + adde r24, r24, r29 + std r24, 16(rp) + addze r8, r8 + std r8, 24(rp) + bne L(outer_lo_2) + b L(ret) + + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + ld r26, -48(r1) + ld r25, -56(r1) + ld r24, -64(r1) + ld r23, -72(r1) + ld r22, -80(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h new file mode 100644 index 0000000..61a437b --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h @@ -0,0 +1,179 @@ +/* POWER3/PowerPC630 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 33 +#define MUL_TOOM44_THRESHOLD 46 +#define MUL_TOOM6H_THRESHOLD 77 +#define MUL_TOOM8H_THRESHOLD 139 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 47 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 49 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 49 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 34 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 14 +#define SQR_TOOM3_THRESHOLD 45 +#define SQR_TOOM4_THRESHOLD 64 +#define SQR_TOOM6_THRESHOLD 85 +#define SQR_TOOM8_THRESHOLD 139 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 8 +#define SQRMOD_BNM1_THRESHOLD 10 + +#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 220, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \ + { 7, 7}, { 15, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 23,10}, { 15, 9}, \ + { 35, 8}, { 71,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 71, 9}, { 143, 8}, { 287,10}, \ + { 79,11}, { 47,10}, { 95, 9}, { 191,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287,11}, { 79,10}, \ + { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \ + { 351,11}, { 95,10}, { 191, 9}, { 383,11}, \ + { 111,10}, { 223,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \ + { 175,10}, { 351,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 223,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 287,10}, \ + { 575, 9}, { 1151,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 767,12}, { 223,11}, { 447,10}, { 895,13}, \ + { 127,12}, { 255,11}, { 511,12}, { 287,11}, \ + { 575,10}, { 1151,12}, { 319,11}, { 639,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,11}, { 895,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 120 +#define MUL_FFT_THRESHOLD 2688 + +#define SQR_FFT_MODF_THRESHOLD 188 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 188, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \ + { 13, 7}, { 13, 8}, { 7, 7}, { 16, 8}, \ + { 9, 7}, { 19, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \ + { 15, 8}, { 31, 9}, { 19, 8}, { 39, 9}, \ + { 23,10}, { 15, 9}, { 39,10}, { 23,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79, 8}, { 159,10}, { 47, 9}, { 95, 8}, \ + { 191,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \ + { 79, 9}, { 159,11}, { 47,10}, { 95, 9}, \ + { 191,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 511,10}, { 143, 9}, { 287,11}, \ + { 79,10}, { 159, 9}, { 319, 8}, { 
639,10}, \ + { 175,11}, { 95,10}, { 191, 9}, { 383,11}, \ + { 111,10}, { 223,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \ + { 175,12}, { 95,11}, { 191,10}, { 383, 9}, \ + { 767,11}, { 223,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 767,12}, { 223,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 511,12}, { 287,11}, { 575,10}, { 1151,12}, \ + { 319,11}, { 639,12}, { 351,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 447,11}, { 895,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 118 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 27 +#define MULLO_MUL_N_THRESHOLD 2511 + +#define DC_DIV_QR_THRESHOLD 23 +#define DC_DIVAPPR_Q_THRESHOLD 87 +#define DC_BDIV_QR_THRESHOLD 27 +#define DC_BDIV_Q_THRESHOLD 60 + +#define INV_MULMOD_BNM1_THRESHOLD 27 +#define INV_NEWTON_THRESHOLD 91 +#define INV_APPR_THRESHOLD 91 + +#define BINV_NEWTON_THRESHOLD 115 +#define REDC_1_TO_REDC_N_THRESHOLD 31 + +#define MU_DIV_QR_THRESHOLD 551 +#define MU_DIVAPPR_Q_THRESHOLD 551 +#define MUPI_DIV_QR_THRESHOLD 42 +#define MU_BDIV_QR_THRESHOLD 483 +#define MU_BDIV_Q_THRESHOLD 492 + +#define POWM_SEC_TABLE 2,23,140,556,713,746 + +#define MATRIX22_STRASSEN_THRESHOLD 8 +#define HGCD_THRESHOLD 56 +#define HGCD_APPR_THRESHOLD 51 +#define HGCD_REDUCE_THRESHOLD 688 +#define GCD_DC_THRESHOLD 333 +#define GCDEXT_DC_THRESHOLD 126 +#define JACOBI_BASE_METHOD 1 + +#define GET_STR_DC_THRESHOLD 17 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 375 +#define SET_STR_PRECOMPUTE_THRESHOLD 812 + +#define FAC_DSC_THRESHOLD 351 +#define FAC_ODD_THRESHOLD 0 /* always */ diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h new file mode 100644 index 0000000..3c40fb9 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h @@ -0,0 +1,214 @@ +/* POWER4/PowerPC970 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2008-2010, 2014, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1800 MHz PPC970 */ +/* FFT tuning limit = 15 M */ +/* Generated by tuneup.c, 2015-10-09, gcc 4.0 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 35 + +#define DIV_1_VS_MUL_1_PERCENT 218 + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 53 +#define MUL_TOOM44_THRESHOLD 136 +#define MUL_TOOM6H_THRESHOLD 197 +#define MUL_TOOM8H_THRESHOLD 272 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 90 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 76 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 430 + +#define MULMID_TOOM42_THRESHOLD 34 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 13, 5}, { 28, 6}, \ + { 19, 7}, { 10, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 55,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 9}, { 127,10}, { 87,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 135, 9}, { 271,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 167,11}, { 95, 9}, { 383, 8}, \ + { 767,10}, { 199,11}, { 111,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303, 9}, { 607,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,12}, \ + { 95,10}, { 383, 9}, { 767,10}, { 415, 9}, \ + { 831,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,12}, { 223,10}, { 895,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 895,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,13}, { 703,14}, { 383,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,14}, { 767,13}, \ + { 1663,14}, { 895,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 159 +#define MUL_FFT_THRESHOLD 9088 + +#define 
SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 344, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 13, 5}, { 28, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 14, 6}, \ + { 29, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 135, 9}, { 271, 8}, { 543,11}, \ + { 79, 9}, { 319, 8}, { 639,11}, { 95,10}, \ + { 191, 9}, { 383, 8}, { 767,10}, { 207, 9}, \ + { 415,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \ + { 575,10}, { 303, 9}, { 607,10}, { 319, 9}, \ + { 639,10}, { 335,11}, { 175,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,10}, \ + { 607,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,12}, \ + { 223,10}, { 895,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 671,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,10}, { 1663,11}, { 895,12}, { 479,14}, \ + { 127,13}, { 255,12}, { 607,13}, { 319,12}, \ + { 703,13}, { 383,12}, { 831,11}, { 1663,12}, \ + { 927,14}, { 255,13}, { 511,12}, { 1023,13}, \ + { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \ + { 703,14}, { 383,13}, { 895,15}, { 255,14}, \ + { 511,13}, { 1023,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1407,14}, { 767,13}, { 1663,14}, \ + { 895,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 174 +#define SQR_FFT_THRESHOLD 6272 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 43 +#define MULLO_MUL_N_THRESHOLD 18087 +#define SQRLO_BASECASE_THRESHOLD 2 +#define SQRLO_DC_THRESHOLD 79 +#define SQRLO_SQR_THRESHOLD 12322 + +#define DC_DIV_QR_THRESHOLD 42 +#define DC_DIVAPPR_Q_THRESHOLD 159 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 110 + +#define INV_MULMOD_BNM1_THRESHOLD 26 +#define INV_NEWTON_THRESHOLD 177 +#define INV_APPR_THRESHOLD 165 + +#define BINV_NEWTON_THRESHOLD 198 +#define REDC_1_TO_REDC_N_THRESHOLD 56 + +#define MU_DIV_QR_THRESHOLD 1017 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 90 +#define MU_BDIV_QR_THRESHOLD 924 +#define MU_BDIV_Q_THRESHOLD 1017 + +#define POWM_SEC_TABLE 7,17,86,579,1925 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 23 +#define SET_STR_DC_THRESHOLD 788 +#define SET_STR_PRECOMPUTE_THRESHOLD 1713 + +#define FAC_DSC_THRESHOLD 512 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 10 +#define HGCD_THRESHOLD 113 +#define HGCD_APPR_THRESHOLD 115 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 330 +#define GCDEXT_DC_THRESHOLD 242 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h new file mode 100644 index 
0000000..15b009c --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h @@ -0,0 +1,219 @@ +/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* POWER5 (friggms.hpc.ntnu.no) */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 40 + +#define MUL_TOOM22_THRESHOLD 21 +#define MUL_TOOM33_THRESHOLD 24 +#define MUL_TOOM44_THRESHOLD 70 +#define MUL_TOOM6H_THRESHOLD 262 +#define MUL_TOOM8H_THRESHOLD 393 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 126 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 70 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 81 +#define SQR_TOOM4_THRESHOLD 142 +#define SQR_TOOM6_THRESHOLD 189 +#define SQR_TOOM8_THRESHOLD 284 + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 304 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 348, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 135,11}, { 79,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 319,12}, \ + { 95,11}, { 191,10}, { 383,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575, 9}, { 1151,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ 
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \ + { 287,11}, { 575,10}, { 1151,12}, { 319,11}, \ + { 639,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,10}, { 4863,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 959,12}, \ + { 1919,11}, { 3839,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,12}, { 2431,11}, \ + { 4863,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,12}, { 2815,13}, { 1471,12}, { 2943,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1663,14}, \ + { 895,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2943,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3327,14}, { 1919,13}, { 3839,16}, \ + { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,12}, { 11775,15}, { 1535,14}, \ + { 3327,15}, { 1791,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 208 +#define MUL_FFT_THRESHOLD 4224 + +#define SQR_FFT_MODF_THRESHOLD 284 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 272, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 19, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 63,10}, { 47,11}, \ + { 31,10}, { 71, 9}, { 143,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 143,11}, { 79,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319,11}, { 175,10}, { 351,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,10}, { 959,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,12}, { 287,11}, { 575,12}, \ + { 319,11}, { 639,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,11}, { 895,12}, { 479,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,12}, { 575,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 831,12}, { 1663,13}, \ + { 959,12}, { 1919,15}, { 255,14}, { 511,13}, \ + { 1023,12}, { 2047,13}, { 1087,12}, { 2175,13}, \ + { 1215,14}, { 639,13}, { 1407,12}, { 2815,14}, \ + { 767,13}, { 1663,14}, { 895,13}, 
{ 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1663,13}, { 3327,14}, { 1919,13}, \ + { 3839,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \ + { 11775,15}, { 1535,14}, { 3327,15}, { 1791,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 190 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 6 +#define MULLO_DC_THRESHOLD 60 +#define MULLO_MUL_N_THRESHOLD 7463 + +#define DC_DIV_QR_THRESHOLD 58 +#define DC_DIVAPPR_Q_THRESHOLD 232 +#define DC_BDIV_QR_THRESHOLD 78 +#define DC_BDIV_Q_THRESHOLD 238 + +#define INV_MULMOD_BNM1_THRESHOLD 92 +#define INV_NEWTON_THRESHOLD 155 +#define INV_APPR_THRESHOLD 157 + +#define BINV_NEWTON_THRESHOLD 155 +#define REDC_1_TO_REDC_N_THRESHOLD 61 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 79 +#define MU_BDIV_QR_THRESHOLD 823 +#define MU_BDIV_Q_THRESHOLD 942 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD_THRESHOLD 74 +#define HGCD_APPR_THRESHOLD 155 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 351 +#define GCDEXT_DC_THRESHOLD 288 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 650 +#define SET_STR_PRECOMPUTE_THRESHOLD 1585 + +#define FAC_DSC_THRESHOLD 662 +#define FAC_ODD_THRESHOLD 28 diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm new file mode 100644 index 0000000..c572b91 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm @@ -0,0 +1,185 @@ +dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 ? ? +C POWER4/PPC970 ? ? +C POWER5 ? ? +C POWER6 12.25 12.8 +C POWER7 ? ? + +C TODO +C * Reduce register usage. +C * Schedule function entry code. +C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling +C would bring us to 9 c/l. +C * Handle n = 1 and perhaps n = 2 separately, without saving any registers. 
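The aorsmul_1 source below expands, through its OPERATION_addmul_1/OPERATION_submul_1 m4 selection, into mpn_addmul_1 (rp[] += up[]*v0) or mpn_submul_1 (rp[] -= up[]*v0), each returning the final carry or borrow limb. Reference semantics for both, assuming unsigned __int128; the _ref names are illustrative:

/* addmul: rp[] += up[]*v0, returns carry.  submul: rp[] -= up[]*v0,
   returns borrow.  Same contracts as the asm, none of its scheduling. */
#include <stdint.h>
#include <stddef.h>

uint64_t addmul_1_ref(uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
    uint64_t cy = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128)up[i] * v0 + rp[i] + cy;
        rp[i] = (uint64_t)p;
        cy = (uint64_t)(p >> 64);
    }
    return cy;
}

uint64_t submul_1_ref(uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
    uint64_t bw = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128)up[i] * v0 + bw;
        uint64_t lo = (uint64_t)p;
        bw = (uint64_t)(p >> 64) + (rp[i] < lo);   /* borrow chain */
        rp[i] -= lo;
    }
    return bw;
}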
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + +ifdef(`OPERATION_addmul_1',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_addmul_1) + define(func_nc, mpn_addmul_1c) C FIXME: not really supported + define(AM, `$1') + define(SM, `') + define(CLRRSC, `addic $1, r0, 0') +') +ifdef(`OPERATION_submul_1',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_submul_1) + define(func_nc, mpn_submul_1c) C FIXME: not really supported + define(AM, `') + define(SM, `$1') + define(CLRRSC, `subfc $1, r0, r0') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 3 C compute count... + srdi n, n, 2 C ...for ctr + mtctr n C copy loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + +L(b3): ld r8, 0(up) + ld r7, 8(up) + ld r27, 16(up) + addi up, up, 16 + addi rp, rp, 16 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r29, -16(rp) + ld r30, -8(rp) + ld r31, 0(rp) + addc r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + ADDSUB r5, r5, r29 + b L(l3) + +L(b2): ld r7, 0(up) + ld r27, 8(up) + addi up, up, 8 + addi rp, rp, 8 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r30, -8(rp) + ld r31, 0(rp) + addc r11, r11, r7 + addze r12, r27 + ADDSUB r9, r9, r30 + b L(l2) + +L(b1): ld r27, 0(up) + ld r31, 0(rp) + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ADDSUB r11, r11, r31 + b L(l1) + +L(b0): addi up, up, -8 + addi rp, rp, -8 + CLRRSC( r12) C clear r12 and clr/set cy + + ALIGN(32) +L(top): +SM(` subfe r11, r0, r0') C complement... +SM(` addic r11, r11, 1') C ...carry flag + ld r10, 8(up) + ld r8, 16(up) + ld r7, 24(up) + ld r27, 32(up) + addi up, up, 32 + addi rp, rp, 32 + mulld r0, r10, v0 + mulhdu r10, r10, v0 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r28, -24(rp) + adde r0, r0, r12 + ld r29, -16(rp) + adde r5, r5, r10 + ld r30, -8(rp) + ld r31, 0(rp) + adde r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + ADDSUB r0, r0, r28 + std r0, -24(rp) + ADDSUBC r5, r5, r29 +L(l3): std r5, -16(rp) + ADDSUBC r9, r9, r30 +L(l2): std r9, -8(rp) + ADDSUBC r11, r11, r31 +L(l1): std r11, 0(rp) + bdnz L(top) + +AM(` addze r3, r12') +SM(` subfe r11, r0, r0') C complement... + ld r31, -8(r1) +SM(` subf r3, r11, r12') + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h new file mode 100644 index 0000000..c7e2f89 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h @@ -0,0 +1,160 @@ +/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2003, 2009-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3500 MHz POWER6 (kolga.bibsys.no) */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 21 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 50 +#define MUL_TOOM44_THRESHOLD 106 +#define MUL_TOOM6H_THRESHOLD 274 +#define MUL_TOOM8H_THRESHOLD 339 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 49 +#define SQR_TOOM4_THRESHOLD 130 +#define SQR_TOOM6_THRESHOLD 226 +#define SQR_TOOM8_THRESHOLD 272 + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 14 + +#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \ + { 33, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 63,10}, { 47,11}, \ + { 31,10}, { 71,11}, { 47,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 135, 9}, { 271,11}, { 79, 9}, { 319, 8}, \ + { 639,10}, { 175,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207,12}, { 63,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \ + { 319, 9}, { 639,11}, { 175,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 79 +#define MUL_FFT_THRESHOLD 3520 + +#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 280, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 47,11}, { 31,10}, { 71, 9}, \ + { 143,11}, { 47,12}, { 31,11}, { 63, 9}, \ + { 255, 8}, { 511, 9}, { 271,10}, { 143,11}, \ + { 
79,10}, { 159, 9}, { 319,10}, { 175, 9}, \
+ { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \
+ { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511, 8}, { 1023,10}, { 271, 9}, { 543,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 80
+#define SQR_FFT_THRESHOLD 2752
+
+#define MULLO_BASECASE_THRESHOLD 5
+#define MULLO_DC_THRESHOLD 62
+#define MULLO_MUL_N_THRESHOLD 2995
+
+#define DC_DIV_QR_THRESHOLD 59
+#define DC_DIVAPPR_Q_THRESHOLD 200
+#define DC_BDIV_QR_THRESHOLD 70
+#define DC_BDIV_Q_THRESHOLD 168
+
+#define INV_MULMOD_BNM1_THRESHOLD 53
+#define INV_NEWTON_THRESHOLD 170
+#define INV_APPR_THRESHOLD 166
+
+#define BINV_NEWTON_THRESHOLD 220
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 942
+#define MUPI_DIV_QR_THRESHOLD 57
+#define MU_BDIV_QR_THRESHOLD 889
+#define MU_BDIV_Q_THRESHOLD 1078
+
+#define POWM_SEC_TABLE 4,26,216,804,1731
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 106
+#define HGCD_APPR_THRESHOLD 109
+#define HGCD_REDUCE_THRESHOLD 2205
+#define GCD_DC_THRESHOLD 492
+#define GCDEXT_DC_THRESHOLD 327
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 28
+#define SET_STR_DC_THRESHOLD 537
+#define SET_STR_PRECOMPUTE_THRESHOLD 1576
+
+#define FAC_DSC_THRESHOLD 426
+#define FAC_ODD_THRESHOLD 0 /* always */
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm
new file mode 100644
index 0000000..3d32b46
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm
@@ -0,0 +1,589 @@
+dnl PowerPC-64 mpn_mul_basecase.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 12.25
+
+C TODO
+C * Reduce register usage. At least 4 fewer registers could be used.
+C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C would bring us to 9 c/l.
+C * The bdz insns for b1 and b2 will never branch.
+C * Align things better, perhaps by moving things like pointer updates from
+C before to after loops.
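+
+C For orientation, the schoolbook algorithm implemented below can be
+C sketched in C on top of mul_1/addmul_1 (a hedged illustration of the
+C semantics, not the structure of this file, which inlines and 4-way
+C unrolls the inner loops):
+C
+C	void
+C	mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
+C	              mp_srcptr vp, mp_size_t vn)	/* un >= vn >= 1 */
+C	{
+C	  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
+C	  for (mp_size_t i = 1; i < vn; i++)
+C	    rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
+C	}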
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`un', `r5') +define(`vp', `r6') +define(`vn', `r7') + +define(`v0', `r25') +define(`outer_rp', `r22') +define(`outer_up', `r23') + +ASM_START() +PROLOGUE(mpn_mul_basecase) + +C Special code for un <= 2, for efficiency of these important cases, +C and since it simplifies the default code. + cmpdi cr0, un, 2 + bgt cr0, L(un_gt2) + cmpdi cr6, vn, 1 + ld r7, 0(vp) + ld r5, 0(up) + mulld r8, r5, r7 C weight 0 + mulhdu r9, r5, r7 C weight 1 + std r8, 0(rp) + beq cr0, L(2x) + std r9, 8(rp) + blr + ALIGN(16) +L(2x): ld r0, 8(up) + mulld r8, r0, r7 C weight 1 + mulhdu r10, r0, r7 C weight 2 + addc r9, r9, r8 + addze r10, r10 + bne cr6, L(2x2) + std r9, 8(rp) + std r10, 16(rp) + blr + ALIGN(16) +L(2x2): ld r6, 8(vp) + nop + mulld r8, r5, r6 C weight 1 + mulhdu r11, r5, r6 C weight 2 + mulld r12, r0, r6 C weight 2 + mulhdu r0, r0, r6 C weight 3 + addc r9, r9, r8 + std r9, 8(rp) + adde r11, r11, r10 + addze r0, r0 + addc r11, r11, r12 + addze r0, r0 + std r11, 16(rp) + std r0, 24(rp) + blr + +L(un_gt2): + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + std r26, -48(r1) + std r25, -56(r1) + std r24, -64(r1) + std r23, -72(r1) + std r22, -80(r1) + std r21, -88(r1) + std r20, -96(r1) + + mr outer_rp, rp + mr outer_up, up + + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, 0(up) + + rldicl. r0, un, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi un, un, 4 C compute count... + srdi un, un, 2 C ...for ctr + mtctr un C copy inner loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + + + ALIGN(16) +L(b3): + ld r27, 8(up) + ld r20, 16(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r10, r20, v0 + addc r24, r24, r31 + adde r9, r9, r8 + addze r12, r10 + std r0, 0(rp) + std r24, 8(rp) + std r9, 16(rp) + addi up, up, 16 + addi rp, rp, 16 + bdz L(end_m_3) + + ALIGN(32) +L(lo_m_3): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) + ld r21, 32(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r27, r20, v0 + mulld r11, r21, v0 + mulhdu r26, r21, v0 + adde r0, r0, r12 + adde r24, r24, r31 + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r27 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + mr r12, r26 + bdnz L(lo_m_3) + + ALIGN(16) +L(end_m_3): + addze r12, r12 + addic. 
vn, vn, -1 + std r12, 8(rp) + beq L(ret) + + ALIGN(16) +L(outer_lo_3): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 24 + addi up, outer_up, 16 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, -16(up) + ld r27, -8(up) + ld r20, 0(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r10, r20, v0 + ld r28, -16(rp) + ld r29, -8(rp) + ld r30, 0(rp) + addc r24, r24, r31 + adde r9, r9, r8 + addze r12, r10 + addc r0, r0, r28 + std r0, -16(rp) + adde r24, r24, r29 + std r24, -8(rp) + adde r9, r9, r30 + std r9, 0(rp) + bdz L(end_3) + + ALIGN(32) C registers dying +L(lo_3): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) C + ld r21, 32(up) C + addi up, up, 32 C + addi rp, rp, 32 C + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + mulld r9, r20, v0 C + mulhdu r27, r20, v0 C 26 + mulld r11, r21, v0 C + mulhdu r26, r21, v0 C 27 + ld r28, -24(rp) C + adde r0, r0, r12 C 0 12 + ld r29, -16(rp) C + adde r24, r24, r10 C 24 10 + ld r30, -8(rp) C + ld r31, 0(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r27 C 27 11 + addze r12, r26 C 26 + addc r0, r0, r28 C 0 28 + std r0, -24(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, -16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, -8(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 0(rp) C 11 + bdnz L(lo_3) C + + ALIGN(16) +L(end_3): + addze r12, r12 + addic. vn, vn, -1 + std r12, 8(rp) + bne L(outer_lo_3) + b L(ret) + + + ALIGN(16) +L(b1): + mulld r0, r26, v0 + mulhdu r12, r26, v0 + addic r0, r0, 0 + std r0, 0(rp) + bdz L(end_m_1) + + ALIGN(16) +L(lo_m_1): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) + ld r21, 32(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r27, r20, v0 + mulld r11, r21, v0 + mulhdu r26, r21, v0 + adde r0, r0, r12 + adde r24, r24, r31 + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r27 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + mr r12, r26 + bdnz L(lo_m_1) + + ALIGN(16) +L(end_m_1): + addze r12, r12 + addic. vn, vn, -1 + std r12, 8(rp) + beq L(ret) + + ALIGN(16) +L(outer_lo_1): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 8 + mr up, outer_up + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, 0(up) + ld r28, 0(rp) + mulld r0, r26, v0 + mulhdu r12, r26, v0 + addc r0, r0, r28 + std r0, 0(rp) + bdz L(end_1) + + ALIGN(32) C registers dying +L(lo_1): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) C + ld r21, 32(up) C + addi up, up, 32 C + addi rp, rp, 32 C + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + mulld r9, r20, v0 C + mulhdu r27, r20, v0 C 26 + mulld r11, r21, v0 C + mulhdu r26, r21, v0 C 27 + ld r28, -24(rp) C + adde r0, r0, r12 C 0 12 + ld r29, -16(rp) C + adde r24, r24, r10 C 24 10 + ld r30, -8(rp) C + ld r31, 0(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r27 C 27 11 + addze r12, r26 C 26 + addc r0, r0, r28 C 0 28 + std r0, -24(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, -16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, -8(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 0(rp) C 11 + bdnz L(lo_1) C + + ALIGN(16) +L(end_1): + addze r12, r12 + addic. 
vn, vn, -1 + std r12, 8(rp) + bne L(outer_lo_1) + b L(ret) + + + ALIGN(16) +L(b0): + addi up, up, -8 + addi rp, rp, -8 + li r12, 0 + addic r12, r12, 0 + bdz L(end_m_0) + + ALIGN(16) +L(lo_m_0): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) + ld r21, 32(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r27, r20, v0 + mulld r11, r21, v0 + mulhdu r26, r21, v0 + adde r0, r0, r12 + adde r24, r24, r31 + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r27 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + mr r12, r26 + bdnz L(lo_m_0) + + ALIGN(16) +L(end_m_0): + addze r12, r12 + addic. vn, vn, -1 + std r12, 8(rp) + beq L(ret) + + ALIGN(16) +L(outer_lo_0): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 0 + addi up, outer_up, -8 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + li r12, 0 + addic r12, r12, 0 + bdz L(end_0) + + ALIGN(32) C registers dying +L(lo_0): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) C + ld r21, 32(up) C + addi up, up, 32 C + addi rp, rp, 32 C + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + mulld r9, r20, v0 C + mulhdu r27, r20, v0 C 26 + mulld r11, r21, v0 C + mulhdu r26, r21, v0 C 27 + ld r28, -24(rp) C + adde r0, r0, r12 C 0 12 + ld r29, -16(rp) C + adde r24, r24, r10 C 24 10 + ld r30, -8(rp) C + ld r31, 0(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r27 C 27 11 + addze r12, r26 C 26 + addc r0, r0, r28 C 0 28 + std r0, -24(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, -16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, -8(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 0(rp) C 11 + bdnz L(lo_0) C + + ALIGN(16) +L(end_0): + addze r12, r12 + addic. vn, vn, -1 + std r12, 8(rp) + bne L(outer_lo_0) + b L(ret) + + + ALIGN(16) +L(b2): ld r27, 8(up) + addi up, up, 8 + mulld r0, r26, v0 + mulhdu r10, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + addc r24, r24, r10 + addze r12, r8 + std r0, 0(rp) + std r24, 8(rp) + addi rp, rp, 8 + bdz L(end_m_2) + + ALIGN(16) +L(lo_m_2): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) + ld r21, 32(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r27, r20, v0 + mulld r11, r21, v0 + mulhdu r26, r21, v0 + adde r0, r0, r12 + adde r24, r24, r31 + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r27 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + mr r12, r26 + bdnz L(lo_m_2) + + ALIGN(16) +L(end_m_2): + addze r12, r12 + addic. 
vn, vn, -1 + std r12, 8(rp) + beq L(ret) + + ALIGN(16) +L(outer_lo_2): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 16 + addi up, outer_up, 8 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, -8(up) + ld r27, 0(up) + ld r28, -8(rp) + ld r29, 0(rp) + mulld r0, r26, v0 + mulhdu r10, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + addc r24, r24, r10 + addze r12, r8 + addc r0, r0, r28 + std r0, -8(rp) + adde r24, r24, r29 + std r24, 0(rp) + bdz L(end_2) + + ALIGN(16) C registers dying +L(lo_2): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) C + ld r21, 32(up) C + addi up, up, 32 C + addi rp, rp, 32 C + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + mulld r9, r20, v0 C + mulhdu r27, r20, v0 C 26 + mulld r11, r21, v0 C + mulhdu r26, r21, v0 C 27 + ld r28, -24(rp) C + adde r0, r0, r12 C 0 12 + ld r29, -16(rp) C + adde r24, r24, r10 C 24 10 + ld r30, -8(rp) C + ld r31, 0(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r27 C 27 11 + addze r12, r26 C 26 + addc r0, r0, r28 C 0 28 + std r0, -24(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, -16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, -8(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 0(rp) C 11 + bdnz L(lo_2) C + + ALIGN(16) +L(end_2): + addze r12, r12 + addic. vn, vn, -1 + std r12, 8(rp) + bne L(outer_lo_2) +C b L(ret) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + ld r26, -48(r1) + ld r25, -56(r1) + ld r24, -64(r1) + ld r23, -72(r1) + ld r22, -80(r1) + ld r21, -88(r1) + ld r20, -96(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm new file mode 100644 index 0000000..8731e01 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm @@ -0,0 +1,135 @@ +dnl PowerPC-64 mpn_mul_2 and mpn_addmul_2. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C mul_2 addmul_2 +C POWER3/PPC630 ? ? +C POWER4/PPC970 ? ? +C POWER5 ? ? +C POWER6 ? ? +C POWER7-SMT4 3 3 +C POWER7-SMT2 ? ? +C POWER7-SMT1 ? ? 
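+
+C As a reference for the semantics, assuming the conventional GMP addmul_2
+C interface ({rp,n+1} += {up,n} * {vp,2}, most significant limb returned;
+C mpn_mul_2 is analogous with {rp,n+1} taken as zero), a hedged C sketch:
+C
+C	mp_limb_t
+C	addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C	{
+C	  mp_limb_t tp[n + 2];			/* C99 VLA, sketch only */
+C	  tp[n] = mpn_mul_1 (tp, up, n, vp[0]);
+C	  tp[n + 1] = mpn_addmul_1 (tp + 1, up, n, vp[1]);
+C	  mp_limb_t cy = mpn_add_n (rp, rp, tp, n + 1);
+C	  return tp[n + 1] + cy;		/* cannot overflow */
+C	}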
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`vp', `r6') + +define(`cy0', `r10') +ifdef(`EXTRA_REGISTER', +` define(`cy1', EXTRA_REGISTER)', +` define(`cy1', `r31')') + +ifdef(`OPERATION_mul_2',` + define(`AM', `') + define(`ADDX', `addc') + define(`func', `mpn_mul_2') +') +ifdef(`OPERATION_addmul_2',` + define(`AM', `$1') + define(`ADDX', `adde') + define(`func', `mpn_addmul_2') +') + +MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2) + +ASM_START() +PROLOGUE(func) + +ifdef(`EXTRA_REGISTER',,` + std r31, -8(r1) +') + andi. r12, n, 1 + addi r0, n, 1 + srdi r0, r0, 1 + mtctr r0 + ld r11, 0(vp) C v0 + li cy0, 0 + ld r12, 8(vp) C v1 + li cy1, 0 + ld r5, 0(up) + beq L(lo0) + addi up, up, -8 + addi rp, rp, -8 + b L(lo1) + + ALIGN(32) +L(top): +AM(` ld r0, -8(rp)') + ld r5, 0(up) +AM(` addc r6, r6, r0') + ADDX r7, r7, r8 + addze r9, r9 + addc r6, r6, cy0 + adde cy0, r7, cy1 + std r6, -8(rp) + addze cy1, r9 +L(lo0): mulld r6, r11, r5 C v0 * u[i] weight 0 + mulhdu r7, r11, r5 C v0 * u[i] weight 1 + mulld r8, r12, r5 C v1 * u[i] weight 1 + mulhdu r9, r12, r5 C v1 * u[i] weight 2 +AM(` ld r0, 0(rp)') + ld r5, 8(up) +AM(` addc r6, r6, r0') + ADDX r7, r7, r8 + addze r9, r9 + addc r6, r6, cy0 + adde cy0, r7, cy1 + std r6, 0(rp) + addze cy1, r9 +L(lo1): mulld r6, r11, r5 C v0 * u[i] weight 0 + mulhdu r7, r11, r5 C v0 * u[i] weight 1 + addi up, up, 16 + addi rp, rp, 16 + mulld r8, r12, r5 C v1 * u[i] weight 1 + mulhdu r9, r12, r5 C v1 * u[i] weight 2 + bdnz L(top) + +L(end): +AM(` ld r0, -8(rp)') +AM(` addc r6, r6, r0') + ADDX r7, r7, r8 + addze r9, r9 + addc r6, r6, cy0 + std r6, -8(rp) + adde cy0, r7, cy1 + addze cy1, r9 + std cy0, 0(rp) + mr r3, cy1 + +ifdef(`EXTRA_REGISTER',,` + ld r31, -8(r1) +') + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm new file mode 100644 index 0000000..857c701 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm @@ -0,0 +1,128 @@ +dnl PowerPC-64 mpn_add_n, mpn_sub_n optimised for POWER7. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 ? +C POWER7 2.18 + +C This is a tad bit slower than the cnd_aors_n.asm code, which is of course an +C anomaly. 
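+
+C For reference, the carry-in entry point func_nc computes, in C (our
+C hedged sketch; mpn_sub_nc is the same with subtraction and a borrow):
+C
+C	mp_limb_t
+C	add_nc (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n,
+C	        mp_limb_t cy)
+C	{
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t t = up[i] + vp[i];
+C	      mp_limb_t c1 = t < up[i];		/* carry out of u+v */
+C	      mp_limb_t s = t + cy;
+C	      cy = c1 + (s < t);		/* at most one can be set */
+C	      rp[i] = s;
+C	    }
+C	  return cy;
+C	}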
+ +ifdef(`OPERATION_add_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_sub_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +ASM_START() +PROLOGUE(func_nc) + SETCBR(r7) + b L(ent) +EPILOGUE() + +PROLOGUE(func) + CLRCB +L(ent): + andi. r7, n, 1 + beq L(bx0) + +L(bx1): ld r7, 0(up) + ld r9, 0(vp) + ADDSUBC r11, r9, r7 + std r11, 0(rp) + cmpldi cr6, n, 1 + beq cr6, L(end) + addi up, up, 8 + addi vp, vp, 8 + addi rp, rp, 8 + +L(bx0): addi r0, n, 2 C compute branch... + srdi r0, r0, 2 C ...count + mtctr r0 + + andi. r7, n, 2 + bne L(mid) + + addi up, up, 16 + addi vp, vp, 16 + addi rp, rp, 16 + + ALIGN(32) +L(top): ld r6, -16(up) + ld r7, -8(up) + ld r8, -16(vp) + ld r9, -8(vp) + ADDSUBC r10, r8, r6 + ADDSUBC r11, r9, r7 + std r10, -16(rp) + std r11, -8(rp) +L(mid): ld r6, 0(up) + ld r7, 8(up) + ld r8, 0(vp) + ld r9, 8(vp) + ADDSUBC r10, r8, r6 + ADDSUBC r11, r9, r7 + std r10, 0(rp) + std r11, 8(rp) + addi up, up, 32 + addi vp, vp, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): subfe r3, r0, r0 C -cy + GENRVAL + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm new file mode 100644 index 0000000..ddf5fd8 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm @@ -0,0 +1,43 @@ +dnl PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
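+
+dnl  The three entry points compute rp[] = up[] + 2*vp[], up[] - 2*vp[] and
+dnl  2*vp[] - up[], each returning the carry/borrow out of the top limb.
+dnl  A hedged C sketch of the add case (our illustration; 64 = limb bits):
+dnl
+dnl	mp_limb_t
+dnl	addlsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+dnl	{
+dnl	  mp_limb_t cy = 0, sh = 0;
+dnl	  for (mp_size_t i = 0; i < n; i++)
+dnl	    {
+dnl	      mp_limb_t l = (vp[i] << 1) | sh;	/* low part of 2*vp[i] */
+dnl	      sh = vp[i] >> 63;			/* bit shifted out */
+dnl	      mp_limb_t t = up[i] + l;
+dnl	      mp_limb_t c1 = t < up[i];
+dnl	      mp_limb_t s = t + cy;
+dnl	      cy = c1 + (s < t);
+dnl	      rp[i] = s;
+dnl	    }
+dnl	  return cy + sh;			/* in 0..2 */
+dnl	}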
+ +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm new file mode 100644 index 0000000..3f9d88d --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm @@ -0,0 +1,43 @@ +dnl PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm new file mode 100644 index 0000000..5251202 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm @@ -0,0 +1,129 @@ +dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C POWER3/PPC630 ? 
+C POWER4/PPC970 ? +C POWER5 ? +C POWER6 ? +C POWER7 2.5 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +ifdef(`DO_add', ` + define(`ADDSUBC', `addc $1, $2, $3') + define(`ADDSUBE', `adde $1, $2, $3') + define(INITCY, `addic $1, r1, 0') + define(RETVAL, `addze r3, $1') + define(`func', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADDSUBC', `subfc $1, $2, $3') + define(`ADDSUBE', `subfe $1, $2, $3') + define(INITCY, `addic $1, r1, -1') + define(RETVAL, `subfze r3, $1 + neg r3, r3') + define(`func', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADDSUBC', `subfc $1, $3, $2') + define(`ADDSUBE', `subfe $1, $3, $2') + define(INITCY, `addic $1, r1, -1') + define(RETVAL, `addme r3, $1') + define(`func', mpn_rsblsh`'LSH`'_n)') + +define(`s0', `r0') define(`s1', `r9') +define(`u0', `r6') define(`u1', `r7') +define(`v0', `r10') define(`v1', `r11') + + +ASM_START() +PROLOGUE(func) + rldic r7, n, 3, 59 + add up, up, r7 + add vp, vp, r7 + add rp, rp, r7 + +ifdef(`DO_add', ` + addic r0, n, 3 C set cy flag as side effect +',` + subfc r0, r0, r0 C set cy flag + addi r0, n, 3 +') + srdi r0, r0, 2 + mtctr r0 + + andi. r0, n, 1 + beq L(bx0) + +L(bx1): andi. r0, n, 2 + li s0, 0 + bne L(lo3) + b L(lo1) + +L(bx0): andi. r0, n, 2 + li s1, 0 + bne L(lo2) + + ALIGN(32) +L(top): addi rp, rp, 32 + ld v0, 0(vp) + addi vp, vp, 32 + rldimi s1, v0, LSH, 0 + ld u0, 0(up) + addi up, up, 32 + srdi s0, v0, RSH + ADDSUBE(s1, s1, u0) + std s1, -32(rp) +L(lo3): ld v1, -24(vp) + rldimi s0, v1, LSH, 0 + ld u1, -24(up) + srdi s1, v1, RSH + ADDSUBE(s0, s0, u1) + std s0, -24(rp) +L(lo2): ld v0, -16(vp) + rldimi s1, v0, LSH, 0 + ld u0, -16(up) + srdi s0, v0, RSH + ADDSUBE(s1, s1, u0) + std s1, -16(rp) +L(lo1): ld v1, -8(vp) + rldimi s0, v1, LSH, 0 + ld u1, -8(up) + srdi s1, v1, RSH + ADDSUBE(s0, s0, u1) + std s0, -8(rp) + bdnz L(top) C decrement CTR and loop back + + RETVAL( s1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm new file mode 100644 index 0000000..f04e896 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm @@ -0,0 +1,67 @@ +dnl PowerPC-64 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 7.6 obsolete +C POWER8 ? +C POWER9 ? 
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 + +C INPUT PARAMETERS +define(`u0', `r3') +define(`v0', `r4') + +define(`cnt', `r9')dnl + +ASM_START() +PROLOGUE(mpn_gcd_11) + li r12, 63 + b L(odd) + + ALIGN(16) +L(top): and r8, r11, r10 C isolate lsb + cntlzd cnt, r8 + isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |u - v| + subf cnt, cnt, r12 C cnt = 63-cnt + srd u0, u0, cnt +L(odd): cmpld cr7, v0, u0 + subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + bne cr7, L(top) + +L(end): blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm new file mode 100644 index 0000000..ade30e4 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm @@ -0,0 +1,146 @@ +dnl PowerPC-64 mpn_gcd_22 optimised for POWER7 and POWER8. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 12.3 +C POWER8 13.4 +C POWER9 10.6 + +C We define SLOW if this target uses a slow struct return mechanism, with +C r3 as an implicit parameter for the struct pointer. +undefine(`SLOW')dnl +ifdef(`AIX',`define(`SLOW',`due to AIX')',` + ifdef(`DARWIN',,` + ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl + ') +') + +ifdef(`SLOW',` +define(`IFSLOW', `$1') +define(`u1', `r4') +define(`u0', `r5') +define(`v1', `r6') +define(`v0', `r7') +',` +define(`IFSLOW', `') +define(`u1', `r3') +define(`u0', `r4') +define(`v1', `r5') +define(`v0', `r6') +') + +define(`tmp', `r0') +define(`t0', `r8') +define(`t1', `r9') +define(`s0', `r10') +define(`s1', `r11') +define(`cnt', `r12') + +ASM_START() +PROLOGUE(mpn_gcd_22) +L(top): subfc. t0, v0, u0 C 0 12 + beq cr0, L(lowz) + subfe t1, v1, u1 C 2 14 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subfc s0, u0, v0 C 0 + subfe s1, u1, v1 C 2 + +L(bck): and tmp, s0, t0 C 2 + cntlzd cnt, tmp C 4 + addi tmp, cnt, 1 C 6 + subfic cnt, cnt, 63 C 6 + + isel v0, v0, u0, 2 C 6 use condition set by subfe + isel v1, v1, u1, 2 C 6 + isel u0, t0, s0, 2 C 6 + isel u1, t1, s1, 2 C 6 + + srd u0, u0, cnt C 8 + sld tmp, u1, tmp C 8 + srd u1, u1, cnt C 8 + or u0, u0, tmp C 10 + + or. 
r0, u1, v1 C 10 + bne L(top) + + + li r0, 63 + b L(odd) + ALIGN(16) +L(top1):isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |u - v| + subf cnt, cnt, r0 C cnt = 63-cnt + srd u0, u0, cnt +L(odd): subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + cmpld cr7, v0, u0 + and r8, r11, r10 C isolate lsb + cntlzd cnt, r8 + bne cr7, L(top1) + +ifdef(`SLOW',` + std v0, 0(r3) + std r10, 8(r3) C zero +',` + mr r3, v0 + li r4, 0 +') + blr + + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + subfc. t0, v1, u1 C 2 8 + beq L(end) + li t1, 0 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subf s0, u1, v1 C 2 + li s1, 0 + b L(bck) + +L(end): +ifdef(`SLOW',` + std v0, 0(r3) + std v1, 8(r3) + blr +',` + mr r3, v0 + mr r4, v1 + blr +') +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h new file mode 100644 index 0000000..9da4080 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h @@ -0,0 +1,175 @@ +/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3720 MHz POWER7/SMT4 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-10-02, gcc 4.8 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 0 +/* From gcc110.osuosl.org, 2023-07-27 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 8.45% faster than 4 */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 27 + +#define DIV_1_VS_MUL_1_PERCENT 341 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 71 +#define MUL_TOOM44_THRESHOLD 196 +#define MUL_TOOM6H_THRESHOLD 298 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 140 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 139 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 120 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 105 +#define SQR_TOOM4_THRESHOLD 190 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 56 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 20 + +#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 436, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 33, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \ + { 29, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 79,11}, { 47,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,10}, { 191, 9}, { 383,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 83 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 368 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 368, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \ + { 639,12}, { 
95,11}, { 191,10}, { 383, 9}, \ + { 767,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 84 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 35 +#define MULLO_MUL_N_THRESHOLD 9449 +#define SQRLO_BASECASE_THRESHOLD 3 +#define SQRLO_DC_THRESHOLD 119 +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 33 +#define DC_DIVAPPR_Q_THRESHOLD 124 +#define DC_BDIV_QR_THRESHOLD 62 +#define DC_BDIV_Q_THRESHOLD 144 + +#define INV_MULMOD_BNM1_THRESHOLD 67 +#define INV_NEWTON_THRESHOLD 123 +#define INV_APPR_THRESHOLD 123 + +#define BINV_NEWTON_THRESHOLD 284 +#define REDC_1_TO_REDC_2_THRESHOLD 18 +#define REDC_2_TO_REDC_N_THRESHOLD 109 + +#define MU_DIV_QR_THRESHOLD 1387 +#define MU_DIVAPPR_Q_THRESHOLD 1334 +#define MUPI_DIV_QR_THRESHOLD 50 +#define MU_BDIV_QR_THRESHOLD 1308 +#define MU_BDIV_Q_THRESHOLD 1499 + +#define POWM_SEC_TABLE 1,23,121,579,642 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 1562 +#define SET_STR_PRECOMPUTE_THRESHOLD 3100 + +#define FAC_DSC_THRESHOLD 774 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 18 +#define HGCD2_DIV1_METHOD 5 /* 3.27% faster than 3 */ +#define HGCD_THRESHOLD 118 +#define HGCD_APPR_THRESHOLD 150 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 386 +#define GCDEXT_DC_THRESHOLD 365 +#define JACOBI_BASE_METHOD 4 /* 27.64% faster than 1 */ diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h new file mode 100644 index 0000000..09348e0 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h @@ -0,0 +1,171 @@ +/* POWER8 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 4150 MHz POWER8/SMT4 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-09-24, gcc 7.2 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define USE_PREINV_DIVREM_1 0 +/* From gcc112.osuosl.org, 2023-07-27 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 13.00% faster than 4 */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD 9 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 34 + +#define DIV_1_VS_MUL_1_PERCENT 276 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 195 +#define MUL_TOOM6H_THRESHOLD 278 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 131 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 138 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 303 +#define SQR_TOOM8_THRESHOLD 454 + +#define MULMID_TOOM42_THRESHOLD 42 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 404, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 131,10}, \ + { 79,11}, { 47,10}, { 95,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \ + { 79,10}, { 159,11}, { 95, 8}, { 767, 7}, \ + { 1599,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 80 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,11}, { 79, 9}, { 319,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \ + { 319,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } 
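+/* Reading note (ours): each {n,k} pair in the two FFT tables above is a
+   tuning point consumed by mpn_fft_best_k (mpn/generic/mul_fft.c); as
+   operand sizes grow past successive entries, the transform switches to a
+   split into 2^k pieces.  The scalar thresholds in this file are used via
+   the gmp-impl.h helpers, roughly as in this illustrative sketch (not the
+   exact control flow of mpn_mul):
+
+	if (BELOW_THRESHOLD (vn, MUL_TOOM22_THRESHOLD))
+	  mpn_mul_basecase (rp, up, un, vp, vn);
+	else
+	  ...  use Toom-22 or a larger algorithm
+*/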
+#define SQR_FFT_TABLE3_SIZE 71 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 9174 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 114 +#define SQRLO_SQR_THRESHOLD 6461 + +#define DC_DIV_QR_THRESHOLD 38 +#define DC_DIVAPPR_Q_THRESHOLD 158 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 112 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 132 +#define INV_APPR_THRESHOLD 131 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_2_THRESHOLD 56 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 46 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 3,19,117,672,1867 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 608 +#define SET_STR_PRECOMPUTE_THRESHOLD 2405 + +#define FAC_DSC_THRESHOLD 164 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 1 /* 6.88% faster than 3 */ +#define HGCD_THRESHOLD 114 +#define HGCD_APPR_THRESHOLD 118 +#define HGCD_REDUCE_THRESHOLD 2205 +#define GCD_DC_THRESHOLD 440 +#define GCDEXT_DC_THRESHOLD 345 +#define JACOBI_BASE_METHOD 1 /* 0.74% faster than 4 */ diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm new file mode 100644 index 0000000..53ea0e0 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm @@ -0,0 +1,53 @@ +dnl PowerPC-64 mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright 2015, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb (approximate) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 ? +C POWER8 32 + +C This runs on POWER7 and later, but is faster only on later CPUs. +C We might want to inline this, considering its small footprint. + +ASM_START() +PROLOGUE(mpn_invert_limb) + sldi. r4, r3, 1 + neg r5, r3 + divdeu( r3, r5, r3) + beq- L(1) + blr +L(1): li r3, -1 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm new file mode 100644 index 0000000..2426a00 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm @@ -0,0 +1,112 @@ +dnl PowerPC-64 mpn_add_n_sub_n optimised for POWER9. 
+ +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 2.25 + + +C INPUT PARAMETERS +define(`arp', `r3') +define(`srp', `r4') +define(`up', `r5') +define(`vp', `r6') +define(`n', `r7') + +ASM_START() +PROLOGUE(mpn_add_n_sub_n) + cmpdi cr7, n, 2 + subfo r0, r0, r0 C clear OV + rldicl. r9, n, 0, 63 C n & 1 + beq cr0, L(bx0) + +L(bx1): ld r10, 0(up) + ld r11, 0(vp) + ble cr7, L(1) + srdi r7, r7, 1 + mtctr r7 + ld r8, 8(up) + ld r9, 8(vp) + addex( r0, r10, r11, 0) + subfc r12, r11, r10 + addi up, up, -8 + addi vp, vp, -8 + b L(lo1) + +L(bx0): ld r8, 0(up) + ld r9, 0(vp) + ld r10, 8(up) + ld r11, 8(vp) + addex( r0, r8, r9, 0) + subfc r12, r9, r8 + addi arp, arp, 8 + addi srp, srp, 8 + ble cr7, L(end) + addi r7, r7, -1 + srdi r7, r7, 1 + mtctr r7 + +L(top): ld r8, 16(up) + ld r9, 16(vp) + std r0, -8(arp) + std r12, -8(srp) + addex( r0, r10, r11, 0) + subfe r12, r11, r10 +L(lo1): ld r10, 24(up) + ld r11, 24(vp) + std r0, 0(arp) + std r12, 0(srp) + addex( r0, r8, r9, 0) + subfe r12, r9, r8 + addi up, up, 16 + addi vp, vp, 16 + addi arp, arp, 16 + addi srp, srp, 16 + bdnz L(top) + +L(end): std r0, -8(arp) + std r12, -8(srp) +L(1): addex( r0, r10, r11, 0) + subfe r12, r11, r10 + std r0, 0(arp) + std r12, 0(srp) + subfe r3, r3, r3 + addex( r3, r3, r3, 0) + rldicl r3, r3, 1, 62 + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm new file mode 100644 index 0000000..95b8faa --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm @@ -0,0 +1,106 @@ +dnl Power9 mpn_addaddmul_1msb0 + +dnl Copyright 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 1-way 2-way 4-way 8-way 16-way mul_1+addmul_1 +C power9: 4.55 3.87 3.55 3.35 3.25 5.16 + +C TODO +C * Only WAYS = 4 currently has proper feed-in code. +C * Try ldu/stdu to save the explicit updates. +C * Try using madd in a long dependent chain, only breaking the recurrency +C once per iteration. +C * Some cycles could perhaps be saved by scheduling the crX-setting insns. + +define(`rp', r3) +define(`ap', r4) +define(`bp', r5) +define(`n', r6) +define(`u0', r7) +define(`v0', r8) + +define(`BLOCK',` +L(lo`'eval((WAYS-$1)%4)): + ld r10, eval(8*$1)(ap) + ld r11, eval(8*$1)(bp) + mulld r12, r10, u0 + mulhdu r10, r10, u0 + maddld( r6, r11, v0, r12) + maddhdu(r11, r11, v0, r12) + adde r12, r6, r0 + std r12, eval(8*$1)(rp) + add r0, r10, r11') + +ifdef(`WAYS',,`define(`WAYS',4)') + +PROLOGUE(mpn_addaddmul_1msb0) + addi r10, n, WAYS-1 + srdi r10, r10, m4_log2(WAYS) + mtctr r10 + addic r0, r3, 0 + li r0, 0 +ifelse(WAYS,4,` + rldicl. r9, n, 0, 63 + rldicl r10, n, 63, 63 + cmpdi cr7, r10, 0 + bne cr0, L(bx1) + +L(bx0): beq cr7, L(lo0) + +L(b10): addi ap, ap, -16 + addi bp, bp, -16 + addi rp, rp, -16 + b L(lo2) + +L(bx1): bne cr7, L(b11) + +L(b01): addi ap, ap, -24 + addi bp, bp, -24 + addi rp, rp, -24 + b L(lo1) + +L(b11): addi ap, ap, -8 + addi bp, bp, -8 + addi rp, rp, -8 + b L(lo3) +') + +L(top): forloop(i,0,eval(WAYS-1),`BLOCK(i)') + + addi ap, ap, eval(8*WAYS) + addi bp, bp, eval(8*WAYS) + addi rp, rp, eval(8*WAYS) + bdnz L(top) + + addze r3, r0 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm new file mode 100644 index 0000000..8f49606 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm @@ -0,0 +1,130 @@ +dnl Power9 mpn_addmul_1. + +dnl Copyright 2017, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 2.5 + +C TODO +C * Schedule for Power9 pipeline. 
+C * Unroll 4x if that proves beneficial.
+C * This is marginally faster (but much smaller) than ../aorsmul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpdi cr6, n, 2
+ addi r0, n, -1 C FIXME: postpone
+ srdi r0, r0, 1 C FIXME: postpone
+ mtctr r0 C FIXME: postpone
+ rldicl. r0, n, 0,63 C r0 = n & 1, set cr0
+ bne cr0, L(b1)
+
+L(b0): ld r10, 0(rp)
+ ld r12, 0(up)
+ ld r11, 8(rp)
+ ld r0, 8(up)
+ maddld( r9, r12, v0, r10)
+ maddhdu(r7, r12, v0, r10)
+ ble cr6, L(2)
+ ld r10, 16(rp)
+ ld r12, 16(up)
+ maddld( r8, r0, v0, r11)
+ maddhdu(r5, r0, v0, r11)
+ addic up, up, 16
+ addi rp, rp, -8
+ b L(mid)
+
+L(b1): ld r11, 0(rp)
+ ld r0, 0(up)
+ ble cr6, L(1)
+ ld r10, 8(rp)
+ ld r12, 8(up)
+ maddld( r8, r0, v0, r11)
+ maddhdu(r5, r0, v0, r11)
+ ld r11, 16(rp)
+ ld r0, 16(up)
+ maddld( r9, r12, v0, r10)
+ maddhdu(r7, r12, v0, r10)
+ addic up, up, 24
+ bdz L(end)
+
+ ALIGN(16)
+L(top): ld r10, 24(rp)
+ ld r12, 0(up)
+ std r8, 0(rp)
+ adde r9, r5, r9
+ maddld( r8, r0, v0, r11) C W:0,2,4
+ maddhdu(r5, r0, v0, r11) C W:1,3,5
+L(mid): ld r11, 32(rp)
+ ld r0, 8(up)
+ std r9, 8(rp)
+ adde r8, r7, r8
+ maddld( r9, r12, v0, r10) C W:1,3,5
+ maddhdu(r7, r12, v0, r10) C W:2,4,6
+ addi rp, rp, 16
+ addi up, up, 16
+ bdnz L(top)
+
+L(end): std r8, 0(rp)
+ maddld( r8, r0, v0, r11)
+ adde r9, r5, r9
+ maddhdu(r5, r0, v0, r11)
+ std r9, 8(rp)
+ adde r8, r7, r8
+ std r8, 16(rp)
+ addze r3, r5
+ blr
+
+L(2): maddld( r8, r0, v0, r11)
+ maddhdu(r5, r0, v0, r11)
+ std r9, 0(rp)
+ addc r8, r7, r8
+ std r8, 8(rp)
+ addze r3, r5
+ blr
+
+L(1): maddld( r8, r0, v0, r11)
+ std r8, 0(rp)
+ maddhdu(r3, r0, v0, r11)
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm
new file mode 100644
index 0000000..846a894
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm
@@ -0,0 +1,193 @@
+dnl Power9 mpn_addmul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C power9: 1.62
+
+C STATUS
+C * Not written with any power9 pipeline understanding.
+C * The 4x unrolling was not motivated by any timing tests.
+C * No local scheduling for performance tweaking has been done.
+C * Decrease load scheduling!
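+C
+C Operationally (consistent with the feed-in and wind-down code below), this
+C routine adds {up,n} * {vp,2} to {rp,n}, stores the n+1 low limbs of the
+C sum at rp, and returns the most significant limb.  A C-level sketch of
+C that operation in terms of the public mpn_addmul_1 (the name ref_addmul_2
+C is illustrative only, not part of GMP):
+C
+C   mp_limb_t
+C   ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C   {
+C     rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);    /* rp[0..n] = {rp,n} + up*v0 */
+C     return mpn_addmul_1 (rp + 1, up, n, vp[1]); /* add up*v1, one limb up */
+C   }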
+ +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') C Note: Reused as scratch +define(`vp', `r6') C Note: Reused for v1 + +define(`v0', `r7') +define(`v1', `r6') + + +ASM_START() +PROLOGUE(mpn_addmul_2) + std r26, -48(r1) + std r27, -40(r1) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + + subfic r0, r1, 0 C clear CA + subfo r0, r0, r0 C clear OV and r0 + + cmpdi cr7, n, 4 + + ld v0, 0(vp) + ld v1, 8(vp) + + srdi r10, n, 2 + mtctr r10 + + rldicl. r9, n, 0, 63 + bne cr0, L(bx1) + +L(bx0): rldicl. r9, n, 63, 63 + + ld r28, 0(rp) + ld r8, 0(up) + ld r11, 8(rp) + ld r9, 8(up) + maddld( r26, r8, v0, r28) + maddhdu(r31, r8, v0, r28) + blt cr7, L(2) + ld r28, 16(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + bne cr0, L(b10) + +L(b00): addi up, up, -8 + addi rp, rp, -24 + b L(lo0) + +L(b10): addi up, up, 8 + addi rp, rp, -8 + b L(lo2) + +L(2): addi rp, rp, -8 + mulld r5, r8, v1 + mulhdu r10, r8, v1 + b L(cj2) + +L(bx1): rldicl. r9, n, 63, 63 + + ld r29, 0(rp) + ld r9, 0(up) + ld r10, 8(rp) + ld r8, 8(up) + maddld( r27, r9, v0, r29) + maddhdu(r30, r9, v0, r29) + ld r29, 16(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + bne cr0, L(b11) + +L(b01): addi rp, rp, -16 + b L(lo1) +L(b11): addi up, up, 16 + blt cr7, L(end) + +L(top): ld r9, 0(up) + maddld( r26, r8, v0, r10) C 0 4 -> adde + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r27, r0 C 7 11 + ld r28, 24(rp) + std r0, 0(rp) + maddld( r5, r8, v1, r29) C 1 5 -> addex + maddhdu(r10, r8, v1, r29) C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(lo2): ld r8, 8(up) + maddld( r27, r9, v0, r11) C 1 5 -> adde + maddhdu(r30, r9, v0, r11) C 2 6 + adde r0, r26, r0 C 8 12 + ld r29, 32(rp) + std r0, 8(rp) + maddld( r12, r9, v1, r28) C 2 6 -> addex + maddhdu(r11, r9, v1, r28) C 3 7 + addex( r0, r5, r31, 0) C 5 9 13 +L(lo1): ld r9, 16(up) + maddld( r26, r8, v0, r10) C 2 6 -> adde + maddhdu(r31, r8, v0, r10) C 3 7 + adde r0, r27, r0 C 5 9 13 + ld r28, 40(rp) + std r0, 16(rp) + maddld( r5, r8, v1, r29) C 3 7 -> addex + maddhdu(r10, r8, v1, r29) C 4 8 + addex( r0, r12, r30, 0) C 6 10 +L(lo0): ld r8, 24(up) + maddld( r27, r9, v0, r11) C 3 7 -> adde + maddhdu(r30, r9, v0, r11) C 4 8 + adde r0, r26, r0 C 6 10 + ld r29, 48(rp) + std r0, 24(rp) + maddld( r12, r9, v1, r28) C 4 8 -> addex + maddhdu(r11, r9, v1, r28) C 5 9 + addex( r0, r5, r31, 0) C 7 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r26, r8, v0, r10) C 0 4 + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r27, r0 C 7 11 + std r0, 0(rp) C -4 + maddld( r5, r8, v1, r29) C 1 5 + maddhdu(r10, r8, v1, r29) C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(cj2): maddld( r27, r9, v0, r11) C 1 5 -2 + maddhdu(r30, r9, v0, r11) C 2 6 -1 + adde r0, r26, r0 C 8 12 -3 + std r0, 8(rp) C -3 + mulld r12, r9, v1 C 2 6 -1 + mulhdu r11, r9, v1 C 3 7 0 = return limb + addex( r0, r5, r31, 0) C 5 9 13 + adde r0, r27, r0 C 5 9 13 -2 + std r0, 16(rp) C -2 + addex( r0, r12, r30, 0) C 6 10 -1 + adde r0, r0, r10 C -1 + std r0, 24(rp) C -1 + li r4, 0 + addze r3, r11 + addex( r3, r3, r4, 0) + +L(ret): ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm new file mode 100644 index 0000000..e4ca3a8 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm @@ -0,0 +1,179 @@ +dnl POWER9 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 2018 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 - - +C POWER4/PPC970 - - +C POWER5 - - +C POWER6 - - +C POWER7 - - +C POWER8 - - +C POWER9 2.63 2.63 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUBC', adde) + define(`ADDSUB', addc) + define(`func', mpn_addmul_1) + define(`AM', `$1') + define(`SM', `') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUBC', subfe) + define(`ADDSUB', subfc) + define(`func', mpn_submul_1) + define(`AM', `') + define(`SM', `$1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + cmpdi cr7, n, 3 + srdi r10, n, 2 + mtctr r10 + rldicl. r9, n, 0, 63 + ld r11, 0(up) + bne cr0, L(bx1) + +L(bx0): rldicl. r9, n, 63, 63 +AM(` subfzeo r12, n ') C ov = 0, ca = 0 +AM(` li r12, 0 ') +SM(` subfco r12, r12, r12 ') C r12 = 0, ov = 0, ca = 1 + ld r9, 8(up) + mulld r0, r11, v0 + mulhdu r5, r11, v0 + blt cr7, L(2) + ld r8, 16(up) + bne cr0, L(b10) + +L(b00): addi rp, rp, -24 + b L(lo0) +L(b10): addi rp, rp, -8 + addi up, up, 16 + b L(lo2) + +L(2): addi rp, rp, -8 + b L(cj2) + +L(bx1): rldicl. 
r9, n, 63, 63 +AM(` subfzeo r5, n ') C ov = 0, ca = 0 +AM(` li r5, 0 ') +SM(` subfco r5, r5, r5 ') C r5 = 0, ov = 0, ca = 1 + blt cr7, L(1) + ld r8, 8(up) + mulld r7, r11, v0 + mulhdu r12, r11, v0 + ld r9, 16(up) + bne cr0, L(b11) + +L(b01): addi rp, rp, -16 + addi up, up, 8 + b L(lo1) + +L(1): mulld r7, r11, v0 + mulhdu r12, r11, v0 + ld r11, 0(rp) + ADDSUB r10, r7, r11 + std r10, 0(rp) +AM(` addze r3, r12 ') +SM(` subfe r0, r0, r0 ') +SM(` sub r3, r12, r0 ') + blr + +L(b11): addi up, up, 24 + ble cr7, L(end) + + ALIGN(16) +L(top): ld r11, 0(rp) + mulld r0, r8, v0 + addex( r7, r7, r5, 0) + mulhdu r5, r8, v0 + ld r8, 0(up) + ADDSUBC r10, r7, r11 + std r10, 0(rp) +L(lo2): ld r11, 8(rp) + mulld r7, r9, v0 + addex( r0, r0, r12, 0) + mulhdu r12, r9, v0 + ld r9, 8(up) + ADDSUBC r10, r0, r11 + std r10, 8(rp) +L(lo1): ld r11, 16(rp) + mulld r0, r8, v0 + addex( r7, r7, r5, 0) + mulhdu r5, r8, v0 + ld r8, 16(up) + ADDSUBC r10, r7, r11 + std r10, 16(rp) +L(lo0): ld r11, 24(rp) + mulld r7, r9, v0 + addex( r0, r0, r12, 0) + mulhdu r12, r9, v0 + ld r9, 24(up) + ADDSUBC r10, r0, r11 + std r10, 24(rp) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r11, 0(rp) + mulld r0, r8, v0 + addex( r7, r7, r5, 0) + mulhdu r5, r8, v0 + ADDSUBC r10, r7, r11 + std r10, 0(rp) +L(cj2): ld r11, 8(rp) + mulld r7, r9, v0 + addex( r0, r0, r12, 0) + mulhdu r12, r9, v0 + ADDSUBC r10, r0, r11 + std r10, 8(rp) + ld r11, 16(rp) + addex( r7, r7, r5, 0) + ADDSUBC r10, r7, r11 + std r10, 16(rp) + li r0, 0 + addex( r3, r12, r0, 0) +AM(` addze r3, r3 ') +SM(` subfe r0, r0, r0 ') +SM(` sub r3, r3, r0 ') + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm new file mode 100644 index 0000000..2dc982d --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm @@ -0,0 +1,64 @@ +dnl PowerPC-64 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
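+
+dnl The loop below is a right-shift binary GCD on two odd limbs: each step
+dnl replaces (u,v) by (|u-v| with trailing zeros stripped, min(u,v)).  A C
+dnl sketch of the same recurrence (assuming odd nonzero inputs and GCC-style
+dnl __builtin_ctzll; the name ref_gcd_11 is illustrative only):
+dnl
+dnl   mp_limb_t
+dnl   ref_gcd_11 (mp_limb_t u, mp_limb_t v)
+dnl   {
+dnl     while (u != v)
+dnl       {
+dnl         mp_limb_t lo = u < v ? u : v;         /* min(u,v) */
+dnl         mp_limb_t d = u < v ? v - u : u - v;  /* |u-v|, even and nonzero */
+dnl         u = d >> __builtin_ctzll (d);         /* strip factors of 2 */
+dnl         v = lo;
+dnl       }
+dnl     return u;
+dnl   }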
+ +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 5.75 +C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 + +define(`u0', `r3') +define(`v0', `r4') + +define(`cnt', `r9')dnl + +ASM_START() +PROLOGUE(mpn_gcd_11) + b L(odd) + + ALIGN(16) +L(top): isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |v - u| + srd u0, u0, cnt +L(odd): subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + cmpld cr7, v0, u0 + cnttzd cnt, r10 + bne cr7, L(top) + +L(end): blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm new file mode 100644 index 0000000..12d11b0 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm @@ -0,0 +1,143 @@ +dnl PowerPC-64 mpn_gcd_22 optimised for POWER9. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 9.58 + +C We define SLOW if this target uses a slow struct return mechanism, with +C r3 as an implicit parameter for the struct pointer. +undefine(`SLOW')dnl +ifdef(`AIX',`define(`SLOW',`due to AIX')',` + ifdef(`DARWIN',,` + ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl + ') +') + +ifdef(`SLOW',` +define(`IFSLOW', `$1') +define(`u1', `r4') +define(`u0', `r5') +define(`v1', `r6') +define(`v0', `r7') +',` +define(`IFSLOW', `') +define(`u1', `r3') +define(`u0', `r4') +define(`v1', `r5') +define(`v0', `r6') +') + +define(`tmp', `r0') +define(`t0', `r8') +define(`t1', `r9') +define(`s0', `r10') +define(`s1', `r11') +define(`cnt', `r12') + +ASM_START() +PROLOGUE(mpn_gcd_22) + cmpld cr7, v0, u0 +L(top): subfc t0, v0, u0 C 0 12 + beq cr7, L(lowz) + subfe t1, v1, u1 C 2 14 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subfc s0, u0, v0 C 0 + subfe s1, u1, v1 C 2 + +L(bck): cnttzd cnt, t0 C 2 + subfic tmp, cnt, 64 C 4 + + isel v0, v0, u0, 2 C 6 use condition set by subfe + isel u0, t0, s0, 2 C 6 + isel v1, v1, u1, 2 C 6 + isel u1, t1, s1, 2 C 6 + + srd u0, u0, cnt C 8 + sld tmp, u1, tmp C 8 + srd u1, u1, cnt C 8 + or u0, u0, tmp C 10 + + or. 
r0, u1, v1 C 10 + cmpld cr7, v0, u0 + bne L(top) + + + b L(odd) + ALIGN(16) +L(top1):isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |u - v| + srd u0, u0, cnt +L(odd): subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + cmpld cr7, v0, u0 + cnttzd cnt, r10 + bne cr7, L(top1) + +ifdef(`SLOW',` + std v0, 0(r3) + std r10, 8(r3) +',` + mr r3, v0 + li r4, 0 +') + blr + + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + subfc. t0, v1, u1 C 2 8 + beq L(end) + li t1, 0 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subf s0, u1, v1 C 2 + li s1, 0 + b L(bck) + +L(end): +ifdef(`SLOW',` + std v0, 0(r3) + std v1, 8(r3) + blr +',` + mr r3, v0 + mr r4, v1 + blr +') +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h new file mode 100644 index 0000000..f29a84e --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h @@ -0,0 +1,254 @@ +/* POWER9 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 2200MHz POWER9 */ +/* FFT tuning limit = 221,245,838 */ +/* Generated by tuneup.c, 2019-10-29, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 7 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 44 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 0 +/* From gcc120.osuosl.org, 2023-07-27 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 6.48% faster than 4 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD 2 +#define DIV_QR_2_PI2_THRESHOLD 7 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 33 + +#define DIV_1_VS_MUL_1_PERCENT 365 + +#define MUL_TOOM22_THRESHOLD 34 +#define MUL_TOOM33_THRESHOLD 109 +#define MUL_TOOM44_THRESHOLD 458 +#define MUL_TOOM6H_THRESHOLD 517 +#define MUL_TOOM8H_THRESHOLD 608 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 292 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 204 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 211 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 178 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 46 +#define SQR_TOOM3_THRESHOLD 158 +#define SQR_TOOM4_THRESHOLD 674 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 898 + +#define MULMID_TOOM42_THRESHOLD 70 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 25 + +#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 404, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 13, 5}, { 27, 6}, { 27, 7}, { 14, 6}, \ + { 29, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 35, 8}, { 71, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,11}, { 191,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,11}, { 367,10}, \ + { 735,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,12}, { 223,11}, { 447,10}, \ + { 895,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,13}, { 191,12}, \ + { 383,11}, { 767,10}, { 1535,11}, { 799,12}, \ + { 415,11}, { 831,10}, { 1663,11}, { 863,12}, \ + { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,12}, { 735,11}, \ + { 1471,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \ + { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1471,14}, { 383,13}, { 767,12}, \ + { 1599,13}, { 
831,12}, { 1727,13}, { 895,11}, \ + { 3583,12}, { 1919,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1727,14}, { 895,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1919,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \ + { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 243 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 404, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 29, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,12}, \ + { 95,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,10}, { 735,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,12}, { 223,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 671,12}, { 351,11}, { 703,10}, { 1407,11}, \ + { 735,13}, { 191,12}, { 383,11}, { 767,10}, \ + { 1535,12}, { 415,11}, { 831,12}, { 447,11}, \ + { 895,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \ + { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,13}, { 447,12}, { 895,11}, { 1791,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,13}, { 575,12}, { 1215,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \ + { 1727,13}, { 895,12}, { 1791,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1727,14}, { 895,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4479,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,15}, { 
1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,16}, { 1023,15}, \ + { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5119,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 230 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 7246 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 40 +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 30 +#define DC_DIVAPPR_Q_THRESHOLD 88 +#define DC_BDIV_QR_THRESHOLD 35 +#define DC_BDIV_Q_THRESHOLD 62 + +#define INV_MULMOD_BNM1_THRESHOLD 79 +#define INV_NEWTON_THRESHOLD 11 +#define INV_APPR_THRESHOLD 11 + +#define BINV_NEWTON_THRESHOLD 264 +#define REDC_1_TO_REDC_2_THRESHOLD 8 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1442 +#define MU_DIVAPPR_Q_THRESHOLD 1470 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1652 + +#define POWM_SEC_TABLE 1,16,151,839 + +#define GET_STR_DC_THRESHOLD 7 +#define GET_STR_PRECOMPUTE_THRESHOLD 15 +#define SET_STR_DC_THRESHOLD 406 +#define SET_STR_PRECOMPUTE_THRESHOLD 885 + +#define FAC_DSC_THRESHOLD 179 +#define FAC_ODD_THRESHOLD 53 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 9.10% faster than 3 */ +#define HGCD_THRESHOLD 45 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 321 +#define GCDEXT_DC_THRESHOLD 258 +#define JACOBI_BASE_METHOD 4 /* 15.45% faster than 1 */ + +/* Tuneup completed successfully, took 179422 seconds */ diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm new file mode 100644 index 0000000..363f095 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm @@ -0,0 +1,126 @@ +dnl Power9 mpn_mul_1. + +dnl Copyright 2017, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 ? +C POWER7 ? +C POWER8 ? +C POWER9 2.47 + +C TODO +C * Schedule for Power9 pipeline. +C * Unroll 4x if that proves beneficial. 
+C * This is marginally faster (but much smaller) than ../mul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+ b L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+ li r7, 0
+L(ent): ld r11, 0(up)
+ cmpdi cr6, n, 2
+ addi r0, n, -1 C FIXME: postpone
+ srdi r0, r0, 1 C FIXME: postpone
+ mtctr r0 C FIXME: postpone
+ rldicl. r12, n, 0,63 C r12 = n & 1, set cr0
+ bne cr0, L(b1)
+
+L(b0): ld r0, 8(up)
+ maddld( r9, r11, v0, r7)
+ maddhdu(r7, r11, v0, r7)
+ ble cr6, L(2)
+ ld r12, 16(up)
+ mulld r8, r0, v0
+ mulhdu r5, r0, v0
+ addic up, up, 16
+ addi rp, rp, -8
+ b L(mid)
+
+L(b1): ld r0, 0(up)
+ ble cr6, L(1)
+ ld r12, 8(up)
+ maddld( r8, r11, v0, r7)
+ maddhdu(r5, r11, v0, r7)
+ ld r0, 16(up)
+ mulld r9, r12, v0
+ mulhdu r7, r12, v0
+ addic up, up, 24
+ bdz L(end)
+
+ ALIGN(16)
+L(top): ld r12, 0(up)
+ std r8, 0(rp)
+ adde r9, r5, r9
+ mulld r8, r0, v0
+ mulhdu r5, r0, v0
+L(mid): ld r0, 8(up)
+ std r9, 8(rp)
+ adde r8, r7, r8
+ mulld r9, r12, v0
+ mulhdu r7, r12, v0
+ addi rp, rp, 16
+ addi up, up, 16
+ bdnz L(top)
+
+L(end): std r8, 0(rp)
+ mulld r8, r0, v0
+ adde r9, r5, r9
+ mulhdu r5, r0, v0
+ std r9, 8(rp)
+ adde r8, r7, r8
+ std r8, 16(rp)
+ addze r3, r5
+ blr
+
+L(2): mulld r8, r0, v0
+ mulhdu r5, r0, v0
+ std r9, 0(rp)
+ addc r8, r7, r8
+ std r8, 8(rp)
+ addze r3, r5
+ blr
+
+L(1): maddld( r8, r0, v0, r7)
+ std r8, 0(rp)
+ maddhdu(r3, r0, v0, r7)
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm
new file mode 100644
index 0000000..01b50a3
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm
@@ -0,0 +1,181 @@
+dnl Power9 mpn_mul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C power9: 1.58
+
+C STATUS
+C * Not written with any power9 pipeline understanding.
+C * The 4x unrolling was not motivated by any timing tests.
+C * No local scheduling for performance tweaking has been done.
+C * Decrease load scheduling!
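+C
+C Operationally, mpn_mul_2 forms {up,n} * {vp,2}, stores the n+1 low limbs
+C of the product at rp, and returns the most significant limb.  A C-level
+C sketch in terms of the public mpn_mul_1 and mpn_addmul_1 (the name
+C ref_mul_2 is illustrative only, not part of GMP):
+C
+C   mp_limb_t
+C   ref_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C   {
+C     rp[n] = mpn_mul_1 (rp, up, n, vp[0]);       /* up * v0, n+1 limbs */
+C     return mpn_addmul_1 (rp + 1, up, n, vp[1]); /* add up * v1, one limb up */
+C   }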
+ +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') C Note: Reused as scratch +define(`vp', `r6') C Note: Reused for v1 + +define(`v0', `r7') +define(`v1', `r6') + + +ASM_START() +PROLOGUE(mpn_mul_2) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + + subfic r0, n, 0 C clear CA + subfo r0, r0, r0 C clear OV and r0 + + cmpdi cr7, n, 4 + + ld v0, 0(vp) + ld v1, 8(vp) + + srdi r10, n, 2 + mtctr r10 + + rldicl. r9, n, 0, 63 + bne cr0, L(bx1) + +L(bx0): rldicl. r9, n, 63, 63 + + ld r8, 0(up) + ld r9, 8(up) + li r11, 0 + mulld r28, r8, v0 + mulhdu r31, r8, v0 + blt cr7, L(2) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + bne cr0, L(b10) + +L(b00): addi up, up, -8 + addi rp, rp, -24 + b L(lo0) + +L(b10): addi up, up, 8 + addi rp, rp, -8 + b L(lo2) + +L(2): addi rp, rp, -8 + mulld r5, r8, v1 + mulhdu r10, r8, v1 + b L(cj2) + +L(bx1): rldicl. r9, n, 63, 63 + + ld r9, 0(up) + ld r8, 8(up) + li r10, 0 + mulld r29, r9, v0 + mulhdu r30, r9, v0 + mulld r12, r9, v1 + mulhdu r11, r9, v1 + bne cr0, L(b11) + +L(b01): addi rp, rp, -16 + b L(lo1) +L(b11): addi up, up, 16 + blt cr7, L(end) + +L(top): ld r9, 0(up) + maddld( r28, r8, v0, r10) C 0 4 -> adde + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r29, r0 C 7 11 + std r0, 0(rp) + mulld r5, r8, v1 C 1 5 -> addex + mulhdu r10, r8, v1 C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(lo2): ld r8, 8(up) + maddld( r29, r9, v0, r11) C 1 5 -> adde + maddhdu(r30, r9, v0, r11) C 2 6 + adde r0, r28, r0 C 8 12 + std r0, 8(rp) + mulld r12, r9, v1 C 2 6 -> addex + mulhdu r11, r9, v1 C 3 7 + addex( r0, r5, r31, 0) C 5 9 13 +L(lo1): ld r9, 16(up) + maddld( r28, r8, v0, r10) C 2 6 -> adde + maddhdu(r31, r8, v0, r10) C 3 7 + adde r0, r29, r0 C 5 9 13 + std r0, 16(rp) + mulld r5, r8, v1 C 3 7 -> addex + mulhdu r10, r8, v1 C 4 8 + addex( r0, r12, r30, 0) C 6 10 +L(lo0): ld r8, 24(up) + maddld( r29, r9, v0, r11) C 3 7 -> adde + maddhdu(r30, r9, v0, r11) C 4 8 + adde r0, r28, r0 C 6 10 + std r0, 24(rp) + mulld r12, r9, v1 C 4 8 -> addex + mulhdu r11, r9, v1 C 5 9 + addex( r0, r5, r31, 0) C 7 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r28, r8, v0, r10) C 0 4 + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r29, r0 C 7 11 + std r0, 0(rp) C -4 + mulld r5, r8, v1 C 1 5 + mulhdu r10, r8, v1 C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(cj2): maddld( r29, r9, v0, r11) C 1 5 -2 + maddhdu(r30, r9, v0, r11) C 2 6 -1 + adde r0, r28, r0 C 8 12 -3 + std r0, 8(rp) C -3 + mulld r12, r9, v1 C 2 6 -1 + mulhdu r11, r9, v1 C 3 7 0 = return limb + addex( r0, r5, r31, 0) C 5 9 13 + adde r0, r29, r0 C 5 9 13 -2 + std r0, 16(rp) C -2 + addex( r0, r12, r30, 0) C 6 10 -1 + adde r0, r0, r10 C -1 + std r0, 24(rp) C -1 + li r4, 0 + addze r3, r11 + addex( r3, r3, r4, 0) + +L(ret): ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm new file mode 100644 index 0000000..8f3d322 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm @@ -0,0 +1,415 @@ +dnl Power9 mpn_mul_basecase. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 1.62 + +C TODO +C * Check if (inner) loop alignment affects performance. +C * Could we schedule loads less in addmul_2/mul_2? That would save some regs +C and make the tail code more manageable. +C * Postpone some register saves to main loop. +C * Perhaps write more small operands (3x1, 3x2, 3x3) code. +C * Consider restoring rp,up after loop using arithmetic, eliminating rp2, up2. +C On the other hand, the current rp,up restore register are useful for OSP. +C * Do OSP. This should save a lot with the current deep addmul_2 pipeline. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`un', `r5') +define(`vp', `r6') +define(`vn', `r7') + +define(`v0', `r0') +define(`v1', `r7') +define(`rp2', `r24') +define(`up2', `r25') + +ASM_START() +PROLOGUE(mpn_mul_basecase) + cmpdi cr0, un, 2 + bgt cr0, L(un_gt2) + cmpdi cr6, vn, 1 + ld r7, 0(vp) + ld r5, 0(up) + mulld r8, r5, r7 C weight 0 + mulhdu r9, r5, r7 C weight 1 + std r8, 0(rp) + beq cr0, L(2x) + std r9, 8(rp) + blr + ALIGN(16) +L(2x): ld r0, 8(up) + mulld r8, r0, r7 C weight 1 + mulhdu r10, r0, r7 C weight 2 + addc r9, r9, r8 + addze r10, r10 + bne cr6, L(2x2) + std r9, 8(rp) + std r10, 16(rp) + blr + ALIGN(16) +L(2x2): ld r6, 8(vp) + mulld r8, r5, r6 C weight 1 + mulhdu r11, r5, r6 C weight 2 + addc r9, r9, r8 + std r9, 8(rp) + adde r11, r11, r10 + mulld r12, r0, r6 C weight 2 + mulhdu r0, r0, r6 C weight 3 + addze r0, r0 + addc r11, r11, r12 + addze r0, r0 + std r11, 16(rp) + std r0, 24(rp) + blr + +L(un_gt2): + std r22, -80(r1) + std r23, -72(r1) + std r24, -64(r1) + std r25, -56(r1) + std r26, -48(r1) + std r27, -40(r1) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + mr rp2, r3 C rp + mr up2, r4 C up + srdi r22, r5, 2 C un + subfic r23, r7, 0 C -vn, clear CA + subfo r0, r0, r0 C clear OV (and r0) + + cmpdi cr6, un, 3 + rldicl r0, un, 0, 63 C r0 = un & 1 + cmpdi cr7, r0, 0 + rldicl r0, un, 63, 63 C FIXME: unused for vn = 1 + cmpdi cr5, r0, 0 C FIXME: unused for vn = 1 + + ld v0, 0(vp) + rldicl. 
r9, vn, 0, 63 + beq cr0, L(vn_evn) + +L(vn_odd): + addi r10, un, -2 + ld r5, 0(up) + srdi r10, r10, 1 + mtctr r10 + bne cr7, L(m1_b1) + +L(m1_b0): + ld r10, 8(up) + mulld r9, r5, v0 + mulhdu r11, r5, v0 + ld r12, 16(up) + mulld r8, r10, v0 + mulhdu r5, r10, v0 + addi rp, rp, -8 + b L(m1_mid) + +L(m1_b1): + ld r12, 8(up) + mulld r8, r5, v0 + mulhdu r5, r5, v0 + ld r10, 16(up) + mulld r9, r12, v0 + mulhdu r11, r12, v0 + addi up, up, 8 + beq cr6, L(m1_end) C jump taken means un = 3, vn = {1,3} + + ALIGN(16) +L(m1_top): + ld r12, 16(up) + std r8, 0(rp) + adde r9, r5, r9 + mulld r8, r10, v0 + mulhdu r5, r10, v0 +L(m1_mid): + ld r10, 24(up) + std r9, 8(rp) + adde r8, r11, r8 + mulld r9, r12, v0 + mulhdu r11, r12, v0 + addi rp, rp, 16 + addi up, up, 16 + bdnz L(m1_top) + +L(m1_end): + std r8, 0(rp) + mulld r8, r10, v0 + adde r9, r5, r9 + mulhdu r5, r10, v0 + std r9, 8(rp) + adde r8, r11, r8 + std r8, 16(rp) + addze r10, r5 + std r10, 24(rp) + + addi rp2, rp2, 8 + addi vp, vp, 8 + addic. r23, r23, 1 + b L(do_outer) + +L(vn_evn): + ld v1, 8(vp) + addi r23, r23, 2 + mtctr r22 + bne cr7, L(m2_bx1) + +L(m2_bx0): + ld r8, 0(up) + ld r9, 8(up) + li r11, 0 + mulld r28, r8, v0 + mulhdu r31, r8, v0 + mulld r5, r8, v1 + mulhdu r10, r8, v1 + li r12, 0 + bne cr5, L(m2_b10) + +L(m2_b00): + addi up, up, -8 + addi rp, rp, -24 + b L(m2_lo0) + +L(m2_b10): + addi up, up, 8 + addi rp, rp, -8 + b L(m2_lo2) + +L(m2_bx1): + ld r9, 0(up) + ld r8, 8(up) + li r10, 0 + mulld r29, r9, v0 + mulhdu r30, r9, v0 + mulld r12, r9, v1 + mulhdu r11, r9, v1 + li r5, 0 + bne cr5, L(m2_b11) + +L(m2_b01): + addi rp, rp, -16 + b L(m2_lo1) +L(m2_b11): + addi up, up, 16 + beq cr6, L(m2_end) C taken means un = 3, vn = 2. We're done. + +L(m2_top): + ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 0(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + addex( r12, r12, r30, 0) +L(m2_lo2): + ld r8, 8(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + std r12, 8(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + addex( r5, r5, r31, 0) +L(m2_lo1): + ld r9, 16(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 16(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + addex( r12, r12, r30, 0) +L(m2_lo0): + ld r8, 24(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + std r12, 24(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + addex( r5, r5, r31, 0) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(m2_top) + +L(m2_end): + ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 0(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + b L(cj) + +L(outer): + ld v0, 0(vp) + ld v1, 8(vp) + addi r23, r23, 2 + mtctr r22 + bne cr7, L(bx1) + +L(bx0): ld r26, 0(rp2) + ld r8, 0(up2) + ld r11, 8(rp2) + ld r9, 8(up2) + maddld( r28, r8, v0, r26) + maddhdu(r31, r8, v0, r26) + ld r26, 16(rp2) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + li r12, 0 + bne cr5, L(b10) + +L(b00): addi up, up2, -8 + addi rp, rp2, -24 + b L(lo0) + +L(b10): addi up, up2, 8 + addi rp, rp2, -8 + b L(lo2) + +L(bx1): ld r27, 0(rp2) + ld r9, 0(up2) + ld r10, 8(rp2) + ld r8, 8(up2) + maddld( r29, r9, v0, r27) + maddhdu(r30, r9, v0, r27) + ld r27, 16(rp2) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + li r5, 0 + bne cr5, L(b11) + +L(b01): addi up, up2, 0 + addi rp, rp2, -16 + b L(lo1) +L(b11): addi up, up2, 16 + addi rp, rp2, 0 + beq cr6, L(end) C taken means un = 3, vn = 3. We're done. 
+ +L(top): ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + ld r26, 24(rp) + std r5, 0(rp) + maddld( r5, r8, v1, r27) + maddhdu(r10, r8, v1, r27) + addex( r12, r12, r30, 0) +L(lo2): ld r8, 8(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + ld r27, 32(rp) + std r12, 8(rp) + maddld( r12, r9, v1, r26) + maddhdu(r11, r9, v1, r26) + addex( r5, r5, r31, 0) +L(lo1): ld r9, 16(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + ld r26, 40(rp) + std r5, 16(rp) + maddld( r5, r8, v1, r27) + maddhdu(r10, r8, v1, r27) + addex( r12, r12, r30, 0) +L(lo0): ld r8, 24(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + ld r27, 48(rp) + std r12, 24(rp) + maddld( r12, r9, v1, r26) + maddhdu(r11, r9, v1, r26) + addex( r5, r5, r31, 0) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 0(rp) + maddld( r5, r8, v1, r27) + maddhdu(r10, r8, v1, r27) +L(cj): addex( r12, r12, r30, 0) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + std r12, 8(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + addex( r5, r5, r31, 0) + adde r5, r29, r5 + std r5, 16(rp) + addex( r12, r12, r30, 0) + adde r12, r12, r10 + std r12, 24(rp) + li r4, 0 + addze r5, r11 + addex( r5, r5, r4, 0) + std r5, 32(rp) + + cmpdi cr0, r23, 0 + addi rp2, rp2, 16 + addi vp, vp, 16 +L(do_outer): + bne cr0, L(outer) +L(ret): + ld r22, -80(r1) + ld r23, -72(r1) + ld r24, -64(r1) + ld r25, -56(r1) + ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm new file mode 100644 index 0000000..2d4fa63 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm @@ -0,0 +1,555 @@ +dnl Power9 mpn_sqr_basecase. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 1.62 + +C TODO +C * Completely separate evn and odd code into two outer loops. Also consider +C unrolling these two outer loops and thereby eliminate all branches. +C * Avoid the reloading of u1 before every loop start. +C * Reduce register usage. 
+C * Consider getting rid of cy and instead load 3 u limbs, use addc+adde+adde. +C * Consider skewing conditional adjustments to allow mask creation with subfe +C like in the un=3 code. It might streamline the adjustments (or not). + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`un', `r5') + +define(`u0', `r0') +define(`u1', `r7') +define(`rp2', `r24') +define(`up2', `r25') +define(`cy', `r6') + +define(`LSHU1U0',` + addc u0, u0, u0 + adde u1, u1, u1 + li cy, 0 + addze cy, cy +') +define(`LSHU1U',` + addc u0, u0, u0 + add u0, u0, cy + adde u1, u1, u1 + li cy, 0 + addze cy, cy +') +define(`LSHU1UF',` + addc u0, u0, u0 + add u0, u0, cy + adde u1, u1, u1 +') +define(`LSHU1UHF',` + add u0, u0, u0 + add u0, u0, cy +') +C These are cleverer replacements, but they tend to leave CA set, disturbing +C the main accumulation code! Breaking that false dependency might have a +C positive performance impact. Note that the subfe here results in a mask for +C our adjustments. +define(`xLSHU1U0',` + addc u0, u0, u0 + adde u1, u1, u1 + subfe cy, cy, cy +') +define(`xLSHU1U',` + subfic cy, cy, 0 + adde u0, u0, u0 + adde u1, u1, u1 + subfe cy, cy, cy +') +define(`xLSHU1U',` + subfic cy, cy, 0 + adde u0, u0, u0 +') + +ASM_START() +PROLOGUE(mpn_sqr_basecase) + ld r0, 0(up) C n = 1 + mulld r8, r0, r0 C weight 0 + mulhdu r9, r0, r0 C weight 1 + std r8, 0(rp) + cmpdi cr0, un, 2 + bge cr0, L(ge2) + std r9, 8(rp) + blr + +L(ge2): bgt cr0, L(gt2) + ld r6, 8(up) + mulld r10, r6, r6 C u1 * u1 + mulhdu r11, r6, r6 C u1 * u1 + mulld r4, r6, r0 C u1 * u0 + mulhdu r5, r6, r0 C u1 * u0 + addc r4, r4, r4 + adde r5, r5, r5 + addze r11, r11 + addc r9, r9, r4 + adde r10, r10, r5 + addze r11, r11 + std r9, 8(rp) + std r10, 16(rp) + std r11, 24(rp) + blr + +L(gt2): cmpdi cr0, un, 3 + bgt cr0, L(gt3) + std r30, -16(r1) + std r31, -8(r1) + subfo r12, r12, r12 C clear OV (and result register) + ld r8, 8(r4) + mulld r5, r8, r8 C W2 + mulhdu r10, r8, r8 C W3 + sradi r11, u0, 63 C CAUTION: clobbers CA + and r11, r11, r8 C W3 + addc u0, u0, u0 + adde u1, r8, r8 + subfe r6, r6, r6 C mask + ld r4, 16(r4) C W2 + mulld r12, r8, u0 C W1 u1 x u0 + mulhdu r8, r8, u0 C W2 u1 x u0 + maddld( r31, r4, u0, r11) C W2 + maddhdu(r30, r4, u0, r11) C W3 + andc r6, r4, r6 C W4 + addc r9, r12, r9 C W1 + std r9, 8(rp) C W1 + mulld r9, r4, u1 C W3 + mulhdu r11, r4, u1 C W4 + addex( r5, r5, r8, 0) C W2 + adde r5, r31, r5 C W2 + std r5, 16(rp) C W2 + maddld( r5, r4, r4, r6) C W4 u2^2 + maddhdu(r6, r4, r4, r6) C W5 u2^2 + addex( r9, r9, r30, 0) C W3 + adde r9, r9, r10 C W3 + std r9, 24(rp) C W3 + adde r5, r5, r11 C W4 + addze r6, r6 C W5 + li r8, 0 + addex( r5, r5, r8, 0) C W4 + std r5, 32(rp) C W4 + addex( r6, r6, r8, 0) C W5 + std r6, 40(rp) C W5 + ld r30, -16(r1) + ld r31, -8(r1) + blr + +L(gt3): std r22, -80(r1) + std r23, -72(r1) + std r24, -64(r1) + std r25, -56(r1) + std r26, -48(r1) + std r27, -40(r1) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + + mr rp2, rp + mr up2, up + addi r22, un, -1 C count for loop FIXME: Adjust + subfo r0, r0, r0 C clear OV (and r0) + rldicl r0, un, 0, 63 C r0 = un & 1 + cmpdi cr7, r0, 0 + + ld u0, 0(up2) + ld u1, 8(up2) + + cmpdi cr5, r22, 4 + srdi r31, r22, 2 + addi r22, r22, -2 + mtctr r31 + + beq cr7, L(m2_evn) +L(m2_odd): + rldicl. 
r31, r22, 63, 63 C r22 & 2 + mulld r23, u0, u0 + mulhdu r12, u0, u0 + mulld r5, u1, u1 + mulhdu r10, u1, u1 + + sradi r11, u0, 63 + and r11, r11, u1 + + LSHU1U0 + + ld r8, 8(up2) + ld r9, 16(up2) + mulld r28, r8, u0 C W u1 x u0 + mulhdu r31, r8, u0 C W u1 x u0 + std r23, 0(rp2) + + bne cr0, L(m2_11) +L(m2_01): + addi up, up2, 16 + addi rp, rp2, 0 + b L(m2_lo2) +L(m2_11): + addi up, up2, 0 + addi rp, rp2, -16 + b L(m2_lo0) + +L(m2_evn): + rldicl. r31, r22, 63, 63 C r22 & 2 + mulld r23, u0, u0 + mulhdu r5, u0, u0 + mulld r12, u1, u1 + mulhdu r11, u1, u1 + + sradi r10, u0, 63 + and r10, r10, u1 + + LSHU1U0 + + ld r9, 8(up2) + ld r8, 16(up2) + mulld r29, r9, u0 C W u1 x u0 + mulhdu r30, r9, u0 C W u1 x u0 + std r23, 0(rp2) + + beq cr0, L(m2_10) +L(m2_00): + addi up, up2, 8 + addi rp, rp2, -8 + b L(m2_lo1) +L(m2_10): + addi up, up2, 24 + addi rp, rp2, 8 + ble cr5, L(m2_end) + +L(m2_top): + ld r9, 0(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + std r5, 0(rp) + mulld r5, r8, u1 + mulhdu r10, r8, u1 + addex( r12, r12, r30, 0) +L(m2_lo2): + ld r8, 8(up) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + std r12, 8(rp) + mulld r12, r9, u1 + mulhdu r11, r9, u1 + addex( r5, r5, r31, 0) +L(m2_lo1): + ld r9, 16(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + std r5, 16(rp) + mulld r5, r8, u1 + mulhdu r10, r8, u1 + addex( r12, r12, r30, 0) +L(m2_lo0): + ld r8, 24(up) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + std r12, 24(rp) + mulld r12, r9, u1 + mulhdu r11, r9, u1 + addex( r5, r5, r31, 0) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(m2_top) + +L(m2_end): + ld r9, 0(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + std r5, 0(rp) + mulld r5, r8, u1 + mulhdu r10, r8, u1 + b L(cj) C jump to addmul_2 tail + +L(outer): + addi up2, up2, 16 + addi rp2, rp2, 32 + + ld u0, 0(up2) + ld u1, 8(up2) + + cmpdi cr5, r22, 4 + srdi r31, r22, 2 + addi r22, r22, -2 + mtctr r31 + + ld r26, 0(rp2) + ld r27, 16(rp2) + + rldicl. 
r31, r22, 63, 63 C r22 & 2 + beq cr7, L(evn) + +L(odd): maddld( r23, u0, u0, r26) C W u2^2 + maddhdu(r12, u0, u0, r26) C W u2^2 + maddld( r5, u1, u1, r27) C W u3^2 + maddhdu(r10, u1, u1, r27) C W u3^2 + ld r26, 8(rp2) + + ld r8, -8(up2) + sradi r8, r8, 63 C CAUTION: clobbers CA + and r8, r8, u0 + sradi r11, u0, 63 C CAUTION: clobbers CA + and r11, r11, u1 + + LSHU1U + + addc r23, r23, r8 + + ld r8, 8(up2) + ld r9, 16(up2) + maddld( r28, r8, u0, r26) C W u3 x u2 + maddhdu(r31, r8, u0, r26) C W u3 x u2 + ld r26, 24(rp2) + std r23, 0(rp2) C W0 + + bne cr0, L(11) +L(01): + addi up, up2, 16 + addi rp, rp2, 0 + b L(lo2) +L(11): + addi up, up2, 0 + addi rp, rp2, -16 + b L(lo0) + +L(evn): maddld( r23, u0, u0, r26) C W u2^2 + maddhdu(r5, u0, u0, r26) C W u2^2 + maddld( r12, u1, u1, r27) C W u3^2 + maddhdu(r11, u1, u1, r27) C W u3^2 + ld r27, 8(rp2) + + ld r9, -8(up2) + sradi r9, r9, 63 C CAUTION: clobbers CA + and r9, r9, u0 + sradi r10, u0, 63 C CAUTION: clobbers CA + and r10, r10, u1 + + LSHU1U + + addc r23, r23, r9 + + ld r9, 8(up2) + ld r8, 16(up2) + maddld( r29, r9, u0, r27) C W u3 x u2 + maddhdu(r30, r9, u0, r27) C W u3 x u2 + ld r27, 24(rp2) + std r23, 0(rp2) C W0 + + beq cr0, L(10) +L(00): + addi up, up2, 8 + addi rp, rp2, -8 + b L(lo1) +L(10): + addi up, up2, 24 + addi rp, rp2, 8 + ble cr5, L(end) + +L(top): ld r9, 0(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + ld r26, 24(rp) + std r5, 0(rp) + maddld( r5, r8, u1, r27) + maddhdu(r10, r8, u1, r27) + addex( r12, r12, r30, 0) +L(lo2): ld r8, 8(up) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + ld r27, 32(rp) + std r12, 8(rp) + maddld( r12, r9, u1, r26) + maddhdu(r11, r9, u1, r26) + addex( r5, r5, r31, 0) +L(lo1): ld r9, 16(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + ld r26, 40(rp) + std r5, 16(rp) + maddld( r5, r8, u1, r27) + maddhdu(r10, r8, u1, r27) + addex( r12, r12, r30, 0) +L(lo0): ld r8, 24(up) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + ld r27, 48(rp) + std r12, 24(rp) + maddld( r12, r9, u1, r26) + maddhdu(r11, r9, u1, r26) + addex( r5, r5, r31, 0) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + std r5, 0(rp) + maddld( r5, r8, u1, r27) + maddhdu(r10, r8, u1, r27) +L(cj): addex( r12, r12, r30, 0) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + std r12, 8(rp) + mulld r12, r9, u1 + mulhdu r11, r9, u1 + addex( r5, r5, r31, 0) + adde r5, r29, r5 + std r5, 16(rp) + addex( r12, r12, r30, 0) + adde r12, r12, r10 + std r12, 24(rp) + li r4, 0 + addze r5, r11 + addex( r5, r5, r4, 0) + std r5, 32(rp) + bgt cr5, L(outer) + +L(corner): + ld u0, 16(up2) + ld u1, 24(up2) + ld r26, 32(rp2) + bne cr7, L(corner_odd) + +L(corner_evn): + ld r27, 40(rp2) + maddld( r23, u0, u0, r26) C W u2^2 + maddhdu(r5, u0, u0, r26) C W u2^2 + mulld r12, u1, u1 C W u3^2 + mulhdu r11, u1, u1 C W u3^2 + + ld r9, 8(up2) + sradi r9, r9, 63 C CAUTION: clobbers CA + and r9, r9, u0 + sradi r10, u0, 63 C CAUTION: clobbers CA + and r10, r10, u1 + + LSHU1UHF + + addc r23, r23, r9 + + ld r9, 24(up2) + maddld( r29, r9, u0, r27) C W u3 x u2 + maddhdu(r30, r9, u0, r27) C W u3 x u2 + std r23, 32(rp2) + adde r5, r29, r5 + std r5, 40(rp2) + addex( r12, r12, r30, 0) + adde r12, r12, r10 C W FIXME can this co? 
+ std r12, 48(rp2) + li r4, 0 + addex( r5, r11, r4, 0) + addze r5, r5 + std r5, 56(rp2) + b L(ret) + +L(corner_odd): + ld r27, 48(rp2) + maddld( r23, u0, u0, r26) C W u2^2 + maddhdu(r12, u0, u0, r26) C W u2^2 + maddld( r5, u1, u1, r27) C W u3^2 + maddhdu(r10, u1, u1, r27) C W u3^2 + ld r26, 40(rp2) + + ld r8, 8(up2) + sradi r8, r8, 63 C CAUTION: clobbers CA + and r8, r8, u0 + sradi r11, u0, 63 C CAUTION: clobbers CA + and r11, r11, u1 + + LSHU1UF + + addc r23, r23, r8 + + ld r8, 24(up2) + ld r9, 32(up2) + maddld( r28, r8, u0, r26) C W u3 x u2 + maddhdu(r31, r8, u0, r26) C W u3 x u2 + std r23, 32(rp2) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + std r12, 40(rp2) + mulld r12, r9, u1 + mulhdu r11, r9, u1 + addex( r5, r5, r31, 0) + adde r5, r29, r5 + std r5, 48(rp2) + addex( r12, r12, r30, 0) + adde r12, r12, r10 + std r12, 56(rp2) + mulld r23, r9, r9 C W u2^2 + mulhdu r12, r9, r9 C W u2^2 + adde r23, r23, r11 + addze r12, r12 + sradi r4, r8, 63 C CAUTION: clobbers CA + and r4, r4, r9 + addex( r23, r23, r4, 0) + std r23, 64(rp2) + li r4, 0 + addex( r12, r12, r4, 0) + std r12, 72(rp2) + +L(ret): ld r22, -80(r1) + ld r23, -72(r1) + ld r24, -64(r1) + ld r25, -56(r1) + ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm new file mode 100644 index 0000000..1f57bdf --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm @@ -0,0 +1,173 @@ +dnl PowerPC-64 mpn_rsh1add_n, mpn_rsh1sub_n + +dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 2.9 +C POWER5 ? 
+C POWER6 3.5 +C POWER7 2.25 + +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +ifdef(`OPERATION_rsh1add_n', ` + define(`ADDSUBC', `addc') + define(`ADDSUBE', `adde') + define(INITCY, `addic $1, r1, 0') + define(`func', mpn_rsh1add_n)') +ifdef(`OPERATION_rsh1sub_n', ` + define(`ADDSUBC', `subfc') + define(`ADDSUBE', `subfe') + define(INITCY, `addic $1, r1, -1') + define(`func', mpn_rsh1sub_n)') + +define(`s0', `r9') +define(`s1', `r7') +define(`x0', `r0') +define(`x1', `r12') +define(`u0', `r8') +define(`v0', `r10') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() +PROLOGUE(func) + ld u0, 0(up) + ld v0, 0(vp) + + cmpdi cr6, n, 2 + + addi r0, n, 1 + srdi r0, r0, 2 + mtctr r0 C copy size to count register + + andi. r0, n, 1 + bne cr0, L(bx1) + +L(bx0): ADDSUBC x1, v0, u0 + ld u0, 8(up) + ld v0, 8(vp) + ADDSUBE x0, v0, u0 + ble cr6, L(n2) + ld u0, 16(up) + ld v0, 16(vp) + srdi s0, x1, 1 + rldicl r11, x1, 0, 63 C return value + ADDSUBE x1, v0, u0 + andi. n, n, 2 + bne cr0, L(b10) +L(b00): addi rp, rp, -24 + b L(lo0) +L(b10): addi up, up, 16 + addi vp, vp, 16 + addi rp, rp, -8 + b L(lo2) + + ALIGN(16) +L(bx1): ADDSUBC x0, v0, u0 + ble cr6, L(n1) + ld u0, 8(up) + ld v0, 8(vp) + ADDSUBE x1, v0, u0 + ld u0, 16(up) + ld v0, 16(vp) + srdi s1, x0, 1 + rldicl r11, x0, 0, 63 C return value + ADDSUBE x0, v0, u0 + andi. n, n, 2 + bne cr0, L(b11) +L(b01): addi up, up, 8 + addi vp, vp, 8 + addi rp, rp, -16 + b L(lo1) +L(b11): addi up, up, 24 + addi vp, vp, 24 + bdz L(end) + + ALIGN(32) +L(top): ld u0, 0(up) + ld v0, 0(vp) + srdi s0, x1, 1 + rldimi s1, x1, 63, 0 + std s1, 0(rp) + ADDSUBE x1, v0, u0 +L(lo2): ld u0, 8(up) + ld v0, 8(vp) + srdi s1, x0, 1 + rldimi s0, x0, 63, 0 + std s0, 8(rp) + ADDSUBE x0, v0, u0 +L(lo1): ld u0, 16(up) + ld v0, 16(vp) + srdi s0, x1, 1 + rldimi s1, x1, 63, 0 + std s1, 16(rp) + ADDSUBE x1, v0, u0 +L(lo0): ld u0, 24(up) + ld v0, 24(vp) + srdi s1, x0, 1 + rldimi s0, x0, 63, 0 + std s0, 24(rp) + ADDSUBE x0, v0, u0 + addi up, up, 32 + addi vp, vp, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): srdi s0, x1, 1 + rldimi s1, x1, 63, 0 + std s1, 0(rp) +L(cj2): srdi s1, x0, 1 + rldimi s0, x0, 63, 0 + std s0, 8(rp) +L(cj1): ADDSUBE x1, x1, x1 C pseudo-depends on x1 + rldimi s1, x1, 63, 0 + std s1, 16(rp) + mr r3, r11 + blr + +L(n1): srdi s1, x0, 1 + rldicl r11, x0, 0, 63 C return value + ADDSUBE x1, x1, x1 C pseudo-depends on x1 + rldimi s1, x1, 63, 0 + std s1, 0(rp) + mr r3, r11 + blr + +L(n2): addi rp, rp, -8 + srdi s0, x1, 1 + rldicl r11, x1, 0, 63 C return value + b L(cj2) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm new file mode 100644 index 0000000..e76bb88 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm @@ -0,0 +1,863 @@ +dnl PowerPC-64 mpn_sqr_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 8 +C POWER5 8 +C POWER6 16.25 +C POWER7 3.77 + +C NOTES +C * This is very crude, cleanup! +C * Try to reduce the number of needed live registers. +C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The +C cost will be more live registers. +C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code +C size a lot and speed things up perhaps 25%. +C * Use computed goto in order to compress the code. +C * Implement a larger final corner. +C * Schedule callee-saves register saves into other insns. This could save +C about 5 cycles/call. (We cannot analogously optimise the restores, since +C the sqr_diag_addlsh1 loop has no wind-down code as currently written.) +C * Should the alternating std/adde sequences be split? Some pipelines handle +C adde poorly, and might sequentialise all these instructions. +C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for +C adjacent integer multiply insns. Except for the multiply insns, the code +C was not carefully optimised for POWER6 or any other CPU. +C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +define(`rp_outer', `r25') +define(`up_outer', `r21') +define(`rp_saved', `r22') +define(`up_saved', `r23') +define(`n_saved', `r24') + +ASM_START() +PROLOGUE(mpn_sqr_basecase) + cmpdi cr0, n, 2 + bge cr0, L(ge2) + ld r5, 0(up) C n = 1 + nop + mulld r8, r5, r5 C weight 0 + mulhdu r9, r5, r5 C weight 1 + std r8, 0(rp) + std r9, 8(rp) + blr + ALIGN(16) +L(ge2): bgt cr0, L(gt2) + ld r0, 0(up) C n = 2 + nop + mulld r8, r0, r0 C u0 * u0 + mulhdu r9, r0, r0 C u0 * u0 + ld r6, 8(up) + mulld r10, r6, r6 C u1 * u1 + mulhdu r11, r6, r6 C u1 * u1 + mulld r4, r6, r0 C u1 * u0 + mulhdu r5, r6, r0 C u1 * u0 + addc r4, r4, r4 + adde r5, r5, r5 + addze r11, r11 + addc r9, r9, r4 + adde r10, r10, r5 + addze r11, r11 + std r8, 0(rp) + std r9, 8(rp) + std r10, 16(rp) + std r11, 24(rp) + blr + + ALIGN(16) +L(gt2): std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + std r26, -48(r1) + std r25, -56(r1) + std r24, -64(r1) + std r23, -72(r1) + std r22, -80(r1) + std r21, -88(r1) + + mr rp_saved, rp + mr up_saved, up + mr n_saved, n + mr rp_outer, rp + mr up_outer, up + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addic r7, n, 2 C compute count... 
+ srdi r7, r7, 2 C ...for ctr + mtctr r7 C copy count into ctr + beq- cr0, L(b0) + blt- cr6, L(b1) + beq- cr6, L(b2) + +L(b3): ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + addi up, up, 24 + li r12, 0 C carry limb + bdz L(em3) + + ALIGN(16) +L(tm3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm3) + +L(em3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop) + +L(b0): ld r6, 0(up) + ld r27, 8(up) + mulld r7, r27, r6 + mulhdu r12, r27, r6 + std r7, 8(rp) + addi rp, rp, 8 + ld r9, 16(up) + ld r27, 24(up) + addi up, up, 32 + bdz L(em0) + + ALIGN(16) +L(tm0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm0) + +L(em0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_2) + +L(b1): ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r12, r27, r6 + addc r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addi rp, rp, 16 + ld r9, 24(up) + ld r27, 32(up) + addi up, up, 40 + bdz L(em1) + + ALIGN(16) +L(tm1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm1) + +L(em1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_3) + +L(b2): addi r7, r7, -1 C FIXME + mtctr r7 C FIXME + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 24(up) + mulld r11, r9, r6 + mulhdu r10, r9, r6 + addc r7, r7, r26 + adde r11, r11, r8 + addze r12, r10 + std r0, 8(rp) + std r7, 16(rp) + std r11, 24(rp) + addi rp, rp, 24 + ld r9, 32(up) + ld r27, 40(up) + addi up, up, 48 + bdz L(em2) + + ALIGN(16) +L(tm2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) 
+ addi rp, rp, 32 + bdnz L(tm2) + +L(em2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_0) + + +L(outer_loop): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + bdz L(outer_end) + + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 24(up) + ld r28, 0(rp) + ld r29, 8(rp) + ld r30, 16(rp) + mulld r11, r9, r6 + mulhdu r10, r9, r6 + addc r7, r7, r26 + adde r11, r11, r8 + addze r12, r10 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + adde r11, r11, r30 + std r11, 16(rp) + addi rp, rp, 24 + ld r9, 32(up) + ld r27, 40(up) + addi up, up, 48 + bdz L(ea1) + + ALIGN(16) +L(ta1): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta1) + +L(ea1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + +L(outer_loop_ent_0): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + ld r28, 0(rp) + ld r29, 8(rp) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + addc r0, r0, r28 + adde r7, r7, r26 + addze r12, r8 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addi rp, rp, 16 + ld r9, 24(up) + ld r27, 32(up) + addi up, up, 40 + bdz L(ea0) + + ALIGN(16) +L(ta0): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta0) + +L(ea0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + +L(outer_loop_ent_3): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, 
rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + ld r6, 0(up) + ld r9, 8(up) + ld r28, 0(rp) + mulld r0, r9, r6 + mulhdu r12, r9, r6 + addc r0, r0, r28 + std r0, 0(rp) + addi rp, rp, 8 + ld r9, 16(up) + ld r27, 24(up) + addi up, up, 32 + bdz L(ea3) + + ALIGN(16) +L(ta3): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta3) + +L(ea3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + + +L(outer_loop_ent_2): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + addic r0, r0, 0 + li r12, 0 C cy_limb = 0 + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + bdz L(ea2) + addi up, up, 24 + + ALIGN(16) +L(ta2): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta2) + +L(ea2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + + b L(outer_loop) + +L(outer_end): + ld r6, 0(up) + ld r9, 8(up) + ld r11, 0(rp) + mulld r0, r9, r6 + mulhdu r8, r9, r6 + addc r0, r0, r11 + std r0, 0(rp) + addze r8, r8 + std r8, 8(rp) + +define(`rp', `rp_saved') +define(`up', `r5') +define(`n', `r6') +define(`climb', `r0') + + addi r4, rp_saved, 8 + mr r5, up_saved + mr r6, n_saved + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 2 C compute count... 
+ srdi n, n, 2 C ...for ctr + mtctr n C put loop count into ctr + beq cr0, L(xb0) + blt cr6, L(xb1) + beq cr6, L(xb2) + +L(xb3): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + addi up, up, 24 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + ld r10, 8(rp) + ld r11, 16(rp) + ld r6, 24(rp) + ld r7, 32(rp) + addc r10, r10, r10 + adde r11, r11, r11 + adde r6, r6, r6 + adde r7, r7, r7 + addze climb, r29 + addc r10, r10, r25 + adde r11, r11, r26 + adde r6, r6, r27 + adde r7, r7, r28 + std r24, 0(rp) + std r10, 8(rp) + std r11, 16(rp) + std r6, 24(rp) + std r7, 32(rp) + addi rp, rp, 40 + bdnz L(top) + b L(end) + +L(xb2): ld r6, 0(up) + ld r7, 8(up) + addi up, up, 16 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + ld r10, 8(rp) + ld r11, 16(rp) + addc r10, r10, r10 + adde r11, r11, r11 + addze climb, r27 + addc r10, r10, r25 + adde r11, r11, r26 + std r24, 0(rp) + std r10, 8(rp) + std r11, 16(rp) + addi rp, rp, 24 + bdnz L(top) + b L(end) + +L(xb0): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + ld r23, 24(up) + addi up, up, 32 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + mulld r30, r23, r23 + mulhdu r31, r23, r23 + ld r10, 8(rp) + ld r11, 16(rp) + ld r6, 24(rp) + ld r7, 32(rp) + ld r12, 40(rp) + ld r23, 48(rp) + addc r10, r10, r10 + adde r11, r11, r11 + adde r6, r6, r6 + adde r7, r7, r7 + adde r12, r12, r12 + adde r23, r23, r23 + addze climb, r31 + std r24, 0(rp) + addc r10, r10, r25 + std r10, 8(rp) + adde r11, r11, r26 + std r11, 16(rp) + adde r6, r6, r27 + std r6, 24(rp) + adde r7, r7, r28 + std r7, 32(rp) + adde r12, r12, r29 + std r12, 40(rp) + adde r23, r23, r30 + std r23, 48(rp) + addi rp, rp, 56 + bdnz L(top) + b L(end) + +L(xb1): ld r6, 0(up) + addi up, up, 8 + mulld r24, r6, r6 + mulhdu climb, r6, r6 + std r24, 0(rp) + addic rp, rp, 8 C clear carry as side-effect + + ALIGN(32) +L(top): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + ld r23, 24(up) + addi up, up, 32 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + mulld r30, r23, r23 + mulhdu r31, r23, r23 + ld r8, 0(rp) + ld r9, 8(rp) + adde r8, r8, r8 + adde r9, r9, r9 + ld r10, 16(rp) + ld r11, 24(rp) + adde r10, r10, r10 + adde r11, r11, r11 + ld r6, 32(rp) + ld r7, 40(rp) + adde r6, r6, r6 + adde r7, r7, r7 + ld r12, 48(rp) + ld r23, 56(rp) + adde r12, r12, r12 + adde r23, r23, r23 + addze r31, r31 + addc r8, r8, climb + std r8, 0(rp) + adde r9, r9, r24 + std r9, 8(rp) + adde r10, r10, r25 + std r10, 16(rp) + adde r11, r11, r26 + std r11, 24(rp) + adde r6, r6, r27 + std r6, 32(rp) + adde r7, r7, r28 + std r7, 40(rp) + adde r12, r12, r29 + std r12, 48(rp) + adde r23, r23, r30 + std r23, 56(rp) + mr climb, r31 + addi rp, rp, 64 + bdnz L(top) + +L(end): addze climb, climb + std climb, 0(rp) + + ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + ld r26, -48(r1) + ld r25, -56(r1) + ld r24, -64(r1) + ld r23, -72(r1) + ld r22, -80(r1) + ld r21, -88(r1) + blr +EPILOGUE() -- cgit v1.2.3
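
For orientation, the p9/sqr_basecase.asm code in this patch leans on three POWER9 (ISA 3.0) primitives: maddld and maddhdu fuse a 64x64->128-bit multiply with a 64-bit addend, and addex adds through the OV flag, a carry chain independent of the CA flag used by adde, which is why its loops can keep two carry chains in flight at once. A minimal C model of the multiply primitives follows; it is an illustrative sketch, not GMP code, the _model names are invented here, and unsigned __int128 is assumed available (GCC/Clang extension).

    #include <stdint.h>

    /* Low 64 bits of a*b + c, as computed by maddld rt,ra,rb,rc.
       a*b + c with 64-bit operands cannot overflow 128 bits. */
    static inline uint64_t maddld_model(uint64_t a, uint64_t b, uint64_t c)
    {
        return (uint64_t)((unsigned __int128)a * b + c);
    }

    /* High 64 bits of a*b + c, as computed by maddhdu rt,ra,rb,rc. */
    static inline uint64_t maddhdu_model(uint64_t a, uint64_t b, uint64_t c)
    {
        return (uint64_t)(((unsigned __int128)a * b + c) >> 64);
    }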
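
The rsh1aors_n routine computes rp[] = (up[] +- vp[]) >> 1 over n 64-bit limbs and returns the bit shifted out (the "return value" lines extract it with rldicl). A minimal C sketch of the add variant's arithmetic, assuming n >= 1; names are illustrative and this is a model of the semantics, not GMP's implementation:

    #include <stdint.h>

    /* Sketch of mpn_rsh1add_n semantics: rp[] = (up[] + vp[]) >> 1,
       returning the bit shifted out.  64-bit limbs, n >= 1. */
    uint64_t rsh1add_n_model(uint64_t *rp, const uint64_t *up,
                             const uint64_t *vp, long n)
    {
        uint64_t prev = up[0] + vp[0];             /* limb 0 of the full sum */
        uint64_t cy   = prev < up[0];              /* carry out of limb 0 */
        uint64_t ret  = prev & 1;                  /* bit shifted out; returned */
        for (long i = 1; i < n; i++) {
            uint64_t s = up[i] + vp[i] + cy;
            cy = (s < up[i]) || (cy && s == up[i]);  /* carry propagation */
            rp[i-1] = (prev >> 1) | (s << 63);     /* next limb's low bit on top */
            prev = s;
        }
        rp[n-1] = (prev >> 1) | (cy << 63);        /* final carry is the top bit */
        return ret;
    }

The asm reaches the same result without explicit masking: srdi shifts each sum limb right by one, and rldimi deposits the following limb's low bit into the vacated top position.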
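
Finally, both sqr_basecase variants in this patch share one structure: the outer loops accumulate the off-diagonal products u[i]*u[j] (i < j) into rp, and a last pass (the sqr_diag_addlsh1 phase) doubles that triangle and adds the diagonal squares, since u^2 = 2*sum_{i<j} u[i]*u[j]*B^(i+j) + sum_i u[i]^2*B^(2i) with B = 2^64. A structural C sketch under the same assumptions as above; GMP's asm interleaves and schedules these steps in registers, so this only pins down the arithmetic being computed:

    #include <stdint.h>

    /* Sketch of mpn_sqr_basecase semantics: rp[0..2n-1] = up[0..n-1]^2.
       rp must not overlap up.  Illustrative only. */
    void sqr_basecase_model(uint64_t *rp, const uint64_t *up, long n)
    {
        for (long i = 0; i < 2*n; i++)
            rp[i] = 0;

        /* Phase 1: triangle of cross products u[i]*u[j], i < j. */
        for (long i = 0; i < n; i++) {
            uint64_t cy = 0;
            for (long j = i + 1; j < n; j++) {
                unsigned __int128 t = (unsigned __int128)up[i] * up[j]
                                    + rp[i+j] + cy;
                rp[i+j] = (uint64_t)t;
                cy = (uint64_t)(t >> 64);
            }
            rp[i+n] = cy;                 /* carry-out limb of this row */
        }

        /* Phase 2 (sqr_diag_addlsh1): rp = 2*rp + u[i]^2 terms,
           one limb pair per i. */
        uint64_t sh = 0, cy = 0;          /* doubling bit and addition carry */
        for (long i = 0; i < n; i++) {
            unsigned __int128 sq = (unsigned __int128)up[i] * up[i];
            uint64_t d0 = (rp[2*i] << 1) | sh;       /* double low limb */
            uint64_t b0 = rp[2*i] >> 63;
            uint64_t d1 = (rp[2*i+1] << 1) | b0;     /* double high limb */
            sh = rp[2*i+1] >> 63;
            unsigned __int128 s = (unsigned __int128)d0 + (uint64_t)sq + cy;
            rp[2*i] = (uint64_t)s;
            s = (unsigned __int128)d1 + (uint64_t)(sq >> 64)
                                      + (uint64_t)(s >> 64);
            rp[2*i+1] = (uint64_t)s;
            cy = (uint64_t)(s >> 64);
        }
    }

The final sh and cy both end up zero because the full square fits in 2n limbs; the asm relies on the same fact when it stores the last climb with addze and returns.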