From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/powerpc32/750/com.asm | 79 ++++++ gmp-6.3.0/mpn/powerpc32/750/gmp-mparam.h | 192 +++++++++++++ gmp-6.3.0/mpn/powerpc32/750/lshift.asm | 155 +++++++++++ gmp-6.3.0/mpn/powerpc32/750/rshift.asm | 153 +++++++++++ gmp-6.3.0/mpn/powerpc32/README | 180 +++++++++++++ gmp-6.3.0/mpn/powerpc32/addlsh1_n.asm | 100 +++++++ gmp-6.3.0/mpn/powerpc32/addmul_1.asm | 159 +++++++++++ gmp-6.3.0/mpn/powerpc32/aix.m4 | 82 ++++++ gmp-6.3.0/mpn/powerpc32/aors_n.asm | 157 +++++++++++ gmp-6.3.0/mpn/powerpc32/bdiv_dbm1c.asm | 131 +++++++++ gmp-6.3.0/mpn/powerpc32/darwin.m4 | 91 +++++++ gmp-6.3.0/mpn/powerpc32/diveby3.asm | 93 +++++++ gmp-6.3.0/mpn/powerpc32/divrem_2.asm | 182 +++++++++++++ gmp-6.3.0/mpn/powerpc32/eabi.m4 | 86 ++++++ gmp-6.3.0/mpn/powerpc32/elf.m4 | 100 +++++++ gmp-6.3.0/mpn/powerpc32/gmp-mparam.h | 222 +++++++++++++++ gmp-6.3.0/mpn/powerpc32/invert_limb.asm | 142 ++++++++++ gmp-6.3.0/mpn/powerpc32/lshift.asm | 168 ++++++++++++ gmp-6.3.0/mpn/powerpc32/lshiftc.asm | 170 ++++++++++++ gmp-6.3.0/mpn/powerpc32/mod_34lsub1.asm | 145 ++++++++++ gmp-6.3.0/mpn/powerpc32/mode1o.asm | 127 +++++++++ gmp-6.3.0/mpn/powerpc32/mul_1.asm | 101 +++++++ gmp-6.3.0/mpn/powerpc32/p3-p7/aors_n.asm | 187 +++++++++++++ gmp-6.3.0/mpn/powerpc32/p3/gmp-mparam.h | 155 +++++++++++ gmp-6.3.0/mpn/powerpc32/p4/gmp-mparam.h | 209 +++++++++++++++ gmp-6.3.0/mpn/powerpc32/p5/gmp-mparam.h | 156 +++++++++++ gmp-6.3.0/mpn/powerpc32/p6/gmp-mparam.h | 165 ++++++++++++ gmp-6.3.0/mpn/powerpc32/p7/gmp-mparam.h | 170 ++++++++++++ gmp-6.3.0/mpn/powerpc32/powerpc-defs.m4 | 128 +++++++++ gmp-6.3.0/mpn/powerpc32/rshift.asm | 166 ++++++++++++ gmp-6.3.0/mpn/powerpc32/sec_tabselect.asm | 143 ++++++++++ gmp-6.3.0/mpn/powerpc32/sqr_diag_addlsh1.asm | 80 ++++++ gmp-6.3.0/mpn/powerpc32/sublsh1_n.asm | 101 +++++++ gmp-6.3.0/mpn/powerpc32/submul_1.asm | 151 
+++++++++++ gmp-6.3.0/mpn/powerpc32/umul.asm | 50 ++++ gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm | 203 ++++++++++++++ gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm | 198 ++++++++++++++ gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm | 310 +++++++++++++++++++++ gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm | 388 +++++++++++++++++++++++++++ gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm | 34 +++ 40 files changed, 6009 insertions(+) create mode 100644 gmp-6.3.0/mpn/powerpc32/750/com.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/750/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/750/lshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/750/rshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/README create mode 100644 gmp-6.3.0/mpn/powerpc32/addlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/aix.m4 create mode 100644 gmp-6.3.0/mpn/powerpc32/aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/darwin.m4 create mode 100644 gmp-6.3.0/mpn/powerpc32/diveby3.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/divrem_2.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/eabi.m4 create mode 100644 gmp-6.3.0/mpn/powerpc32/elf.m4 create mode 100644 gmp-6.3.0/mpn/powerpc32/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/lshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/mode1o.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/mul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/p3-p7/aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/p3/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/p4/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/p5/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/p6/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/p7/gmp-mparam.h create mode 100644 
gmp-6.3.0/mpn/powerpc32/powerpc-defs.m4 create mode 100644 gmp-6.3.0/mpn/powerpc32/rshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/sqr_diag_addlsh1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/submul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/umul.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm (limited to 'gmp-6.3.0/mpn/powerpc32') diff --git a/gmp-6.3.0/mpn/powerpc32/750/com.asm b/gmp-6.3.0/mpn/powerpc32/750/com.asm new file mode 100644 index 0000000..1b8b574 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/750/com.asm @@ -0,0 +1,79 @@ +dnl PowerPC 750 mpn_com -- mpn bitwise one's complement + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C 603e: ? +C 604e: 3.0 +C 75x (G3): 2.0 +C 7400,7410 (G4): 2.0 +C 744x,745x (G4+): 3.0 + +C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C This loop form is necessary for the claimed speed. + +ASM_START() +PROLOGUE(mpn_com) + + C r3 dst + C r4 src + C r5 size + + mtctr r5 C size + lwz r5, 0(r4) C src low limb + + sub r4, r4, r3 C src-dst + subi r3, r3, 4 C dst-4 + + addi r4, r4, 8 C src-dst+8 + bdz L(one) + +L(top): + C r3 &dst[i-1] + C r4 src-dst+8 + C r5 src[i] + C r6 scratch + + not r6, r5 C ~src[i] + lwzx r5, r4,r3 C src[i+1] + + stwu r6, 4(r3) C dst[i] + bdnz L(top) + +L(one): + not r6, r5 + + stw r6, 4(r3) C dst[size-1] + blr + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/750/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/750/gmp-mparam.h new file mode 100644 index 0000000..3667e85 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/750/gmp-mparam.h @@ -0,0 +1,192 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2002, 2004, 2009, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* This file is used for 75x (G3) and for 7400/7410 (G4), both of which have + much slower multiply instructions. */ + +/* 450 MHz PPC 7400 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 11 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 38 +#define USE_PREINV_DIVREM_1 1 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 38 +#define MUL_TOOM44_THRESHOLD 99 +#define MUL_TOOM6H_THRESHOLD 141 +#define MUL_TOOM8H_THRESHOLD 212 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 57 +#define SQR_TOOM4_THRESHOLD 142 +#define SQR_TOOM6_THRESHOLD 173 +#define SQR_TOOM8_THRESHOLD 309 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 11 + +#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 220, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 8, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \ + { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ + { 39, 8}, { 23, 9}, { 15, 8}, { 39, 9}, \ + { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \ + 
{ 67, 9}, { 55,10}, { 31, 9}, { 63, 8}, \ + { 127, 7}, { 255, 9}, { 71, 8}, { 143, 7}, \ + { 287, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255, 9}, \ + { 143, 8}, { 287,10}, { 79, 9}, { 159, 8}, \ + { 319, 9}, { 175, 8}, { 351, 7}, { 703,10}, \ + { 95, 9}, { 191, 8}, { 383, 9}, { 207,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351, 8}, { 703,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415, 8}, { 831,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 351, 9}, \ + { 703, 8}, { 1407,11}, { 191,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447, 9}, { 895,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 351,10}, { 703, 9}, { 1407,12}, \ + { 191,11}, { 415,10}, { 831,11}, { 447,10}, \ + { 895,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 575,12}, { 319,11}, { 703,10}, \ + { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 895,10}, { 1791,11}, { 959,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \ + { 703,11}, { 1407,13}, { 383,12}, { 895,11}, \ + { 1791,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1407,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1151,12}, { 2303,13}, { 1407,14}, { 767,13}, \ + { 1919,10}, { 15359,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 154 +#define MUL_FFT_THRESHOLD 2688 + +#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 184, 5}, { 6, 4}, { 13, 5}, { 13, 6}, \ + { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ + { 31, 8}, { 19, 7}, { 39, 8}, { 27, 9}, \ + { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95,10}, { 31, 9}, { 
63, 8}, \ + { 127, 7}, { 255, 9}, { 71, 8}, { 143, 7}, \ + { 287, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 143, 8}, { 287, 7}, { 575,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175, 8}, \ + { 351,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207,10}, { 111,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 159, 9}, { 319,10}, { 175, 9}, { 351,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415, 8}, { 831,10}, { 223,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 287,10}, { 575,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703, 9}, { 1407,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,11}, { 447,10}, { 895,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 575,12}, \ + { 319,11}, { 703,10}, { 1407,12}, { 383,11}, \ + { 831,12}, { 447,11}, { 895,10}, { 1791,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 575,11}, { 1215,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1535,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1151,12}, { 2431,13}, \ + { 1407,14}, { 767,13}, { 1919,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 152 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 5240 + +#define DC_DIV_QR_THRESHOLD 31 +#define DC_DIVAPPR_Q_THRESHOLD 108 +#define DC_BDIV_QR_THRESHOLD 35 +#define DC_BDIV_Q_THRESHOLD 88 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 149 +#define INV_APPR_THRESHOLD 125 + +#define BINV_NEWTON_THRESHOLD 156 +#define REDC_1_TO_REDC_N_THRESHOLD 39 + +#define MU_DIV_QR_THRESHOLD 807 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define 
MUPI_DIV_QR_THRESHOLD 66 +#define MU_BDIV_QR_THRESHOLD 667 +#define MU_BDIV_Q_THRESHOLD 807 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD_THRESHOLD 87 +#define GCD_DC_THRESHOLD 233 +#define GCDEXT_DC_THRESHOLD 198 +#define JACOBI_BASE_METHOD 1 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 390 +#define SET_STR_PRECOMPUTE_THRESHOLD 814 diff --git a/gmp-6.3.0/mpn/powerpc32/750/lshift.asm b/gmp-6.3.0/mpn/powerpc32/750/lshift.asm new file mode 100644 index 0000000..3a1c1a7 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/750/lshift.asm @@ -0,0 +1,155 @@ +dnl PowerPC 750 mpn_lshift -- mpn left shift. + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C cycles/limb +C 750: 3.0 +C 7400: 3.0 + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C This code is the same per-limb speed as mpn/powerpc32/lshift.asm, but +C smaller and saving about 30 or so cycles of overhead. + +ASM_START() +PROLOGUE(mpn_lshift) + + C r3 dst + C r4 src + C r5 size + C r6 shift + + mtctr r5 C size + slwi r5, r5, 2 C 4*size + + subfic r7, r6, 32 C 32-shift + add r4, r4, r5 C &src[size] + + add r5, r3, r5 C &dst[size] + lwz r8, -4(r4) C src[size-1] + bdz L(one) + + lwzu r9, -8(r4) C src[size-2] + + srw r3, r8, r7 C return value + slw r8, r8, r6 C src[size-1] << shift + bdz L(two) + + +L(top): + C r3 return value + C r4 src, decrementing + C r5 dst, decrementing + C r6 shift + C r7 32-shift + C r8 src[i+1] << shift + C r9 src[i] + C r10 + + lwzu r10, -4(r4) + srw r11, r9, r7 + + or r8, r8, r11 + stwu r8, -4(r5) + + slw r8, r9, r6 + bdz L(odd) + + C r8 src[i+1] << shift + C r9 + C r10 src[i] + + lwzu r9, -4(r4) + srw r11, r10, r7 + + or r8, r8, r11 + stwu r8, -4(r5) + + slw r8, r10, r6 + bdnz L(top) + + +L(two): + C r3 return value + C r4 + C r5 &dst[2] + C r6 shift + C r7 32-shift + C r8 src[1] << shift + C r9 src[0] + C r10 + + srw r11, r9, r7 + slw r12, r9, r6 C src[0] << shift + + or r8, r8, r11 + stw r12, -8(r5) C dst[0] + + stw r8, -4(r5) C dst[1] + blr + + +L(odd): + C r3 return value + C r4 + C r5 &dst[2] + C r6 shift + C r7 32-shift + C r8 src[1] << shift + C r9 + C r10 src[0] + + srw r11, r10, r7 + slw r12, r10, r6 + + or r8, r8, r11 + stw r12, -8(r5) C dst[0] + + stw r8, -4(r5) C dst[1] + blr + + +L(one): + C r5 &dst[1] + C r6 shift + C r7 32-shift + C r8 src[0] + + srw r3, r8, r7 C return value + slw r8, r8, r6 C src[size-1] << shift + + stw r8, -4(r5) C dst[0] + blr + +EPILOGUE(mpn_lshift) diff --git a/gmp-6.3.0/mpn/powerpc32/750/rshift.asm b/gmp-6.3.0/mpn/powerpc32/750/rshift.asm new file mode 100644 index 0000000..4825fee --- /dev/null +++ 
b/gmp-6.3.0/mpn/powerpc32/750/rshift.asm @@ -0,0 +1,153 @@ +dnl PowerPC 750 mpn_rshift -- mpn right shift. + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C 750: 3.0 +C 7400: 3.0 + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C This code is the same per-limb speed as mpn/powerpc32/rshift.asm, but +C smaller and saving about 30 or so cycles of overhead. 
+ +ASM_START() +PROLOGUE(mpn_rshift) + + C r3 dst + C r4 src + C r5 size + C r6 shift + + mtctr r5 C size + lwz r8, 0(r4) C src[0] + + subfic r7, r6, 32 C 32-shift + addi r5, r3, -4 C dst-4 + + slw r3, r8, r7 C return value + bdz L(one) + + lwzu r9, 4(r4) C src[1] + srw r8, r8, r6 C src[0] >> shift + bdz L(two) + + +L(top): + C r3 return value + C r4 src, incrementing + C r5 dst, incrementing + C r6 shift + C r7 32-shift + C r8 src[i-1] >> shift + C r9 src[i] + C r10 + + lwzu r10, 4(r4) + slw r11, r9, r7 + + or r8, r8, r11 + stwu r8, 4(r5) + + srw r8, r9, r6 + bdz L(odd) + + C r8 src[i-1] >> shift + C r9 + C r10 src[i] + + lwzu r9, 4(r4) + slw r11, r10, r7 + + or r8, r8, r11 + stwu r8, 4(r5) + + srw r8, r10, r6 + bdnz L(top) + + +L(two): + C r3 return value + C r4 + C r5 &dst[size-3] + C r6 shift + C r7 32-shift + C r8 src[size-2] >> shift + C r9 src[size-1] + C r10 + + slw r11, r9, r7 + srw r12, r9, r6 C src[size-1] >> shift + + or r8, r8, r11 + stw r12, 8(r5) C dst[size-1] + + stw r8, 4(r5) C dst[size-2] + blr + + +L(odd): + C r3 return value + C r4 + C r5 &dst[size-3] + C r6 shift + C r7 32-shift + C r8 src[size-2] >> shift + C r9 + C r10 src[size-1] + + slw r11, r10, r7 + srw r12, r10, r6 + + or r8, r8, r11 + stw r12, 8(r5) C dst[size-1] + + stw r8, 4(r5) C dst[size-2] + blr + + +L(one): + C r3 return value + C r4 + C r5 dst-4 + C r6 shift + C r7 + C r8 src[0] + + srw r8, r8, r6 + + stw r8, 4(r5) C dst[0] + blr + +EPILOGUE(mpn_rshift) diff --git a/gmp-6.3.0/mpn/powerpc32/README b/gmp-6.3.0/mpn/powerpc32/README new file mode 100644 index 0000000..887e78b --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/README @@ -0,0 +1,180 @@ +Copyright 2002, 2005 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + POWERPC 32-BIT MPN SUBROUTINES + + +This directory contains mpn functions for various 32-bit PowerPC chips. + + +CODE ORGANIZATION + + directory used for + ================================================ + powerpc generic, 604, 604e, 744x, 745x + powerpc/750 740, 750, 7400, 7410 + + +The top-level powerpc directory is currently mostly aimed at 604/604e but +should be reasonable on all powerpcs. + + + +STATUS + +The code is quite well optimized for the 604e, other chips have had less +attention. + +Altivec SIMD available in 74xx might hold some promise, but unfortunately +GMP only guarantees 32-bit data alignment, so there's lots of fiddling +around with partial operations at the start and end of limb vectors. A +128-bit limb would be a novel idea, but is unlikely to be practical, since +it would have to work with ordinary +, -, * etc in the C code. + +Also, Altivec isn't very well suited for the GMP multiplication needs. 
+Using floating-point based multiplication has much better performance +potential for all current powerpcs, both the ones with slow integer multiply +units (603, 740, 750, 7400, 7410) and those with fast (604, 604e, 744x, +745x). This is because all powerpcs do some level of pipelining in the FPU: + +603 and 750 can sustain one fmadd every 2nd cycle. +604 and 604e can sustain one fmadd per cycle. +7400 and 7410 can sustain 3 fmadd in 4 cycles. +744x and 745x can sustain 4 fmadd in 5 cycles. + + + +REGISTER NAMES + +The normal powerpc convention is to give registers as plain numbers, like +"mtctr 6", but on Apple MacOS X (powerpc*-*-rhapsody* and +powerpc*-*-darwin*) the assembler demands an "r" like "mtctr r6". Note +however when register 0 in an instruction means a literal zero the "r" is +omitted, for instance "lwzx r6,0,r7". + +The GMP code uses the "r" forms, powerpc-defs.m4 transforms them to plain +numbers according to what GMP_ASM_POWERPC_R_REGISTERS finds is needed. +(Note that this style isn't fully general, as the identifier r4 and the +register r4 will not be distinguishable on some systems. However, this is +not a problem for the limited GMP assembly usage.) 
+ + + +GLOBAL REFERENCES + +Linux non-PIC + lis 9, __gmp_binvert_limb_table@ha + rlwinm 11, 5, 31, 25, 31 + la 9, __gmp_binvert_limb_table@l(9) + lbzx 11, 9, 11 + +Linux PIC (FIXME) +.LCL0: + .long .LCTOC1-.LCF0 + bcl 20, 31, .LCF0 +.LCF0: + mflr 30 + lwz 7, .LCL0-.LCF0(30) + add 30, 7, 30 + lwz 11, .LC0-.LCTOC1(30) + rlwinm 3, 5, 31, 25, 31 + lbzx 7, 11, 3 + +AIX (always PIC) +LC..0: + .tc __gmp_binvert_limb_table[TC],__gmp_binvert_limb_table[RW] + lwz 9, LC..0(2) + rlwinm 0, 5, 31, 25, 31 + lbzx 0, 9, 0 + +Darwin (non-PIC) + lis r2, ha16(___gmp_binvert_limb_table) + rlwinm r9, r5, 31, 25, 31 + la r2, lo16(___gmp_binvert_limb_table)(r2) + lbzx r0, r2, r9 +Darwin (PIC) + mflr r0 + bcl 20, 31, L0001$pb +L0001$pb: + mflr r7 + mtlr r0 + addis r2, r7, ha16(L___gmp_binvert_limb_table$non_lazy_ptr-L0001$pb) + rlwinm r9, r5, 31, 25, 31 + lwz r2, lo16(L___gmp_binvert_limb_table$non_lazy_ptr-L0001$pb)(r2) + lbzx r0, r2, r9 +------ + .non_lazy_symbol_pointer +L___gmp_binvert_limb_table$non_lazy_ptr: + .indirect_symbol ___gmp_binvert_limb_table + .long 0 + .subsections_via_symbols + + +For GNU/Linux and Darwin, we might want to duplicate __gmp_binvert_limb_table +into the text section in this file. We should thus be able to reach it like +this: + + bcl 20, 31, L0 +L0: mflr r2 + rlwinm r9, r5, 31, 25, 31 + addi r9, r9, lo16(local_binvert_table-L0) + lbzx r0, r2, r9 + + + +REFERENCES + +PowerPC Microprocessor Family: The Programming Environments for 32-bit +Microprocessors, IBM document G522-0290-01, 2000. + +PowerPC 604e RISC Microprocessor User's Manual with Supplement for PowerPC +604 Microprocessor, IBM document G552-0330-00, Freescale document +MPC604EUM/AD, 3/1998. + +MPC7410/MPC7400 RISC Microprocessor User's Manual, Freescale document +MPC7400UM/D, rev 1, 11/2002. + +MPC7450 RISC Microprocessor Family Reference Manual, Freescale document +MPC7450UM, rev 5, 1/2005. 
+ +The above are available online from + + http://www.ibm.com/chips/techlib/techlib.nsf/productfamilies/PowerPC + http://www.freescale.com/PowerPC + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/mpn/powerpc32/addlsh1_n.asm b/gmp-6.3.0/mpn/powerpc32/addlsh1_n.asm new file mode 100644 index 0000000..71645c3 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/addlsh1_n.asm @@ -0,0 +1,100 @@ +dnl PowerPC-32 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) + +dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? 
+C 604e: 4.0 +C 75x (G3): 5.0 +C 7400,7410 (G4): 5.0 +C 744x,745x (G4+): 5.0 +C power4/ppc970: 4.25 +C power5: 5.0 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +define(`rp',`r3') +define(`up',`r4') +define(`vp',`r5') + +define(`s0',`r6') +define(`s1',`r7') +define(`u0',`r8') +define(`v0',`r10') +define(`v1',`r11') + +ASM_START() +PROLOGUE(mpn_addlsh1_n) + mtctr r6 C copy n in ctr + addic r31, r31, 0 C clear cy + + lwz v0, 0(vp) C load v limb + lwz u0, 0(up) C load u limb + addi up, up, -4 C update up + addi rp, rp, -4 C update rp + slwi s1, v0, 1 + bdz L(end) C If done, skip loop + +L(loop): + lwz v1, 4(vp) C load v limb + adde s1, s1, u0 C add limbs with cy, set cy + srwi s0, v0, 31 C shift down previous v limb + stw s1, 4(rp) C store result limb + lwzu u0, 8(up) C load u limb and update up + rlwimi s0, v1, 1, 0,30 C left shift v limb and merge with prev v limb + + bdz L(exit) C decrement ctr and exit if done + + lwzu v0, 8(vp) C load v limb and update vp + adde s0, s0, u0 C add limbs with cy, set cy + srwi s1, v1, 31 C shift down previous v limb + stwu s0, 8(rp) C store result limb and update rp + lwz u0, 4(up) C load u limb + rlwimi s1, v0, 1, 0,30 C left shift v limb and merge with prev v limb + + bdnz L(loop) C decrement ctr and loop back + +L(end): adde r7, s1, u0 + srwi r4, v0, 31 + stw r7, 4(rp) C store last result limb + addze r3, r4 + blr +L(exit): + adde r7, s0, u0 + srwi r4, v1, 31 + stw r7, 8(rp) C store last result limb + addze r3, r4 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/addmul_1.asm b/gmp-6.3.0/mpn/powerpc32/addmul_1.asm new file mode 100644 index 0000000..07486df --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/addmul_1.asm @@ -0,0 +1,159 @@ +dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright 1995, 1997, 1998, 2000-2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 6.75 +C 75x (G3): 8.7-14.3 +C 7400,7410 (G4): 8.7-14.3 +C 744x,745x (G4+): 9.5 +C power4/ppc970: 6.25 +C power5: 6.25 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C vl r6 + +C This is optimized for the PPC604. It has not been tuned for other +C PowerPC processors. +C +C Loop Analysis for the 604: +C 12 mem insn +C 8 serializing insn +C 8 int multiply +C 25 int reg write +C 9 int ops (8 of which serialize) +C +C The multiply insns need 16 cycles/4limb. +C The integer register writes will need 13 cycles/4limb. +C All-in-all, it should be possible to get to 4 or 5 cycles/limb on PPC604, +C but that will require some clever FPNOPS and BNOPS for exact +C issue control. + + +ASM_START() +PROLOGUE(mpn_addmul_1) + cmpwi cr0,r5,9 C more than 9 limbs? 
+ bgt cr0,L(big) C branch if more than 9 limbs + + mtctr r5 + lwz r0,0(r4) + mullw r7,r0,r6 + mulhwu r10,r0,r6 + lwz r9,0(r3) + addc r8,r7,r9 + addi r3,r3,-4 + bdz L(end) +L(loop): + lwzu r0,4(r4) + stwu r8,4(r3) + mullw r8,r0,r6 + adde r7,r8,r10 + mulhwu r10,r0,r6 + lwz r9,4(r3) + addze r10,r10 + addc r8,r7,r9 + bdnz L(loop) +L(end): stw r8,4(r3) + addze r3,r10 + blr + +L(big): stwu r1,-16(r1) + addi r5,r5,-1 + stw r30,8(r1) + srwi r0,r5,2 + stw r31,12(r1) + mtctr r0 + + lwz r7,0(r4) + mullw r8,r7,r6 + mulhwu r0,r7,r6 + lwz r7,0(r3) + addc r8,r8,r7 + stw r8,0(r3) + +L(loopU): + lwz r7,4(r4) + lwz r12,8(r4) + lwz r30,12(r4) + lwzu r31,16(r4) + mullw r8,r7,r6 + mullw r9,r12,r6 + mullw r10,r30,r6 + mullw r11,r31,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + adde r9,r9,r0 + mulhwu r0,r12,r6 + lwz r12,8(r3) + adde r10,r10,r0 + mulhwu r0,r30,r6 + lwz r30,12(r3) + adde r11,r11,r0 + mulhwu r0,r31,r6 + lwz r31,16(r3) + addze r0,r0 C new cy_limb + addc r8,r8,r7 + stw r8,4(r3) + adde r9,r9,r12 + stw r9,8(r3) + adde r10,r10,r30 + stw r10,12(r3) + adde r11,r11,r31 + stwu r11,16(r3) + bdnz L(loopU) + + andi. r31,r5,3 + mtctr r31 + beq cr0,L(endx) + +L(loopE): + lwzu r7,4(r4) + mullw r8,r7,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + addze r0,r0 C new cy_limb + addc r8,r8,r7 + stwu r8,4(r3) + bdnz L(loopE) +L(endx): + addze r3,r0 + lwz r30,8(r1) + lwz r31,12(r1) + addi r1,r1,16 + blr +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/powerpc32/aix.m4 b/gmp-6.3.0/mpn/powerpc32/aix.m4 new file mode 100644 index 0000000..fde2020 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/aix.m4 @@ -0,0 +1,82 @@ +divert(-1) +dnl m4 macros for AIX 32-bit assembly. + +dnl Copyright 2000-2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`ASM_START',
+` .toc')
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl Don't want ELF style .size in the epilogue.
+dnl PROLOGUE_cpu emits a [DS] csect entry for the symbol (code address, TOC[tc0], 0) and then the dot-prefixed code label in a [PR] csect.
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+ `
+ .globl $1
+ .globl .$1
+ .csect [DS], 2
+$1:
+ .long .$1, TOC[tc0], 0
+ .csect [PR]
+ .align 2
+.$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`')
+dnl TOC_ENTRY starts out empty; LEA below redefines it.
+define(`TOC_ENTRY', `')
+dnl LEA(reg,sym): redefines TOC_ENTRY as a .toc entry named tc<sym> for sym, and expands to a lwz of reg from tc<sym>(2). Each use replaces the previous TOC_ENTRY definition.
+define(`LEA',
+m4_assert_numargs(2)
+`define(`TOC_ENTRY',
+` .toc
+tc$2:
+ .tc $2[TC], $2')'
+` lwz $1, tc$2(2)')
+
+define(`EXTERN',
+m4_assert_numargs(1)
+` .globl $1')
+dnl DEF_OBJECT(name[,align]): starts name in an [RO] csect; the ALIGN argument defaults to 2 when only the name is given.
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+` .csect [RO], 3
+ ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1))
+dnl ASM_END expands to whatever TOC_ENTRY currently holds.
+define(`ASM_END', `TOC_ENTRY')
+
+divert
diff --git a/gmp-6.3.0/mpn/powerpc32/aors_n.asm b/gmp-6.3.0/mpn/powerpc32/aors_n.asm
new file mode 100644
index 0000000..25ece09
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/aors_n.asm
@@ -0,0 +1,157 @@
+dnl PowerPC-32 mpn_add_n and mpn_sub_n.
+
+dnl Copyright 2002, 2005, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 603e: ?
+C 604e: ? old: 3.25
+C 75x (G3): ? old: 3.5
+C 7400,7410 (G4): 3.25
+C 744x,745x (G4+): 4
+C POWER3/PPC630 2
+C POWER4/PPC970 2.4
+C POWER5 2.75
+C POWER6 40-140
+C POWER7 3
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+define(`cy', `r7')
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBC, adde)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)
+ define(IFADD, `$1')
+ define(IFSUB, `')')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBC, subfe)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)
+ define(IFADD, `')
+ define(IFSUB, `$1')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+C func_nc seeds the carry bit from cy and joins func at L(ent). func combines up[] and vp[] limb by limb with ADCSBC (adde or subfe), stores the results to rp[], and leaves the carry (add) or borrow (sub) in r3.
+PROLOGUE(func_nc)
+IFADD(` addic r0, cy, -1') C set carry from argument
+IFSUB(` subfic r0, cy, 0') C set carry from argument
+ b L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+IFADD(` addic r0, n, 0') C clear carry
+IFSUB(` addic r0, n, -1') C set carry
+L(ent): andi. r0, n, 3  C cr0: n mod 4 compared with 0
+ addi r3, r3, -12  C bias rp for the 4(r3) and 8(r3) stores below
+ addi n, n, 1
+ cmpwi cr7, r0, 2  C cr7: n mod 4 compared with 2
+ srwi r0, n, 2  C loop count = (n+1) / 4
+ sub r4, r4, r3  C r4 = up - biased rp, so lwzx with r3 follows the store pointer
+ sub r5, r5, r3  C r5 = vp - biased rp, likewise
+ mtctr r0
+ bne cr0, L(n00)  C n mod 4 != 0
+
+ lwzx r7, r4, r3 C n = 4, 8, 12, ...
+ lwzx r8, r5, r3
+ addi r3, r3, 4
+ lwzx r9, r4, r3
+ ADCSBC r7, r8, r7
+ lwzx r10, r5, r3
+ addi r3, r3, 4
+ b L(00)
+
+L(n00): bge cr7, L(n01)
+ cmpwi cr0, r0, 0 C n = 1, 5, 9, 13, ...
+ lwzx r0, r4, r3
+ lwzx r6, r5, r3
+ addi r3, r3, 4
+ ADCSBC r0, r6, r0
+ ble L(ret)  C loop count was 0: single limb only
+L(gt1): lwzx r7, r4, r3
+ lwzx r8, r5, r3
+ addi r3, r3, 4
+ b L(01)
+
+L(n10):
+ lwzx r9, r4, r3 C n = 3, 7, 11, 15, ...
+ lwzx r10, r5, r3
+ addi r3, r3, 4
+ lwzx r11, r4, r3
+ ADCSBC r9, r10, r9
+ lwzx r12, r5, r3
+ addi r3, r3, 4
+ b L(11)
+
+L(n01): bne cr7, L(n10)
+ cmpwi cr0, r0, 0 C n = 2, 6, 10, 14, ...
+ lwzx r11, r4, r3
+ lwzx r12, r5, r3
+ addi r3, r3, 4
+ lwzx r0, r4, r3
+ ADCSBC r11, r12, r11
+ lwzx r6, r5, r3
+ addi r3, r3, 4
+ ble cr0, L(end)  C loop count was 0: skip the main loop
+
+
+L(lp): lwzx r7, r4, r3
+ ADCSBC r0, r6, r0
+ lwzx r8, r5, r3
+ stwu r11, 4(r3)
+L(01): lwzx r9, r4, r3
+ ADCSBC r7, r8, r7
+ lwzx r10, r5, r3
+ stwu r0, 4(r3)
+L(00): lwzx r11, r4, r3
+ ADCSBC r9, r10, r9
+ lwzx r12, r5, r3
+ stwu r7, 4(r3)
+L(11): lwzx r0, r4, r3
+ ADCSBC r11, r12, r11
+ lwzx r6, r5, r3
+ stwu r9, 4(r3)
+ bdnz L(lp)
+
+L(end): ADCSBC r0, r6, r0
+ stw r11, 4(r3)
+L(ret): stw r0, 8(r3)  C last result limb
+IFADD(` li r3, 0 ')
+IFADD(` addze r3, r3 ') C r3 = carry out
+IFSUB(` subfe r3, r0, r0') C r3 = 0 if the carry bit is set, else -1
+IFSUB(` neg r3, r3') C r3 = borrow out (0 or 1)
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc32/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/powerpc32/bdiv_dbm1c.asm
new file mode 100644
index 0000000..72b2c48
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/bdiv_dbm1c.asm
@@ -0,0 +1,131 @@
+dnl PPC32 mpn_bdiv_dbm1c.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.
If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 603e: ?
+C 604e: ?
+C 75x (G3): ?
+C 7400,7410 (G4): 9.43
+C 744x,745x (G4+): 6.28
+C power4/ppc970: ?
+C power5: ?
+
+C TODO
+C * Nothing to do...
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`bd', `r6')
+define(`cy', `r7')
+C Per limb: t = cy - LO(up[i] * bd) is stored to rp[i], then cy = t - HI(up[i] * bd) - borrow. The value after the last limb is left in r3.
+ASM_START()
+PROLOGUE(mpn_bdiv_dbm1c)
+ lwz r0, 0(r4)
+
+ rlwinm. r12, r5, 0,30,31  C r12 = n mod 4, cr0 set from it
+ cmplwi cr6, r12, 2  C cr6: n mod 4 compared with 2
+ cmplwi cr7, r5, 4  C cr7: n compared with 4, tested by L(b01) and L(b10)
+ addi r5, r5, 1
+ srwi r5, r5, 2
+ mtctr r5  C loop count = (n+1) / 4
+ beq cr0, L(b00)  C n mod 4 = 0
+ blt cr6, L(b01)  C n mod 4 = 1
+ beq cr6, L(b10)  C n mod 4 = 2
+
+L(b11): mullw r5, r0, r6  C n mod 4 = 3: enter the loop at L(3)
+ mulhwu r12, r0, r6
+ lwz r0, 4(r4)
+ addi r4, r4, -12
+ addi r3, r3, -12
+ b L(3)
+
+L(b00): mullw r9, r0, r6
+ mulhwu r8, r0, r6
+ lwz r0, 4(r4)
+ addi r4, r4, -8
+ addi r3, r3, -8
+ b L(0)
+
+L(b01): mullw r5, r0, r6
+ mulhwu r12, r0, r6
+ addi r3, r3, -4
+ ble cr7, L(e1)  C n = 1: only the final limb remains
+ lwz r0, 4(r4)
+ addi r4, r4, -4
+ b L(1)
+
+L(b10): mullw r9, r0, r6
+ mulhwu r8, r0, r6
+ lwz r0, 4(r4)
+ ble cr7, L(e2)  C n = 2: skip the loop
+
+ ALIGN(16)
+L(top): mullw r5, r0, r6
+ mulhwu r12, r0, r6
+ subfc r11, r9, r7  C t = cy - low product
+ lwz r0, 8(r4)
+ subfe r7, r8, r11  C cy = t - high product - borrow
+ stw r11, 0(r3)
+L(1): mullw r9, r0, r6
+ mulhwu r8, r0, r6
+ subfc r11, r5, r7
+ lwz r0, 12(r4)
+ subfe r7, r12, r11
+ stw r11, 4(r3)
+L(0): mullw r5, r0, r6
+ mulhwu r12, r0, r6
+ subfc r11, r9, r7
+ lwz r0, 16(r4)
+ subfe r7, r8, r11
+ stw r11, 8(r3)
+L(3): mullw r9, r0, r6
+ mulhwu r8, r0, r6
+ subfc r11, r5, r7
+ lwz r0, 20(r4)
+ subfe r7, r12, r11
+ stw r11, 12(r3)
+ addi r4, r4, 16
+ addi r3, r3, 16
+ bdnz L(top)
+
+L(e2): mullw r5, r0, r6
+ mulhwu r12, r0, r6
+ subfc r11, r9, r7
+ subfe r7, r8, r11
+ stw r11, 0(r3)
+L(e1): subfc r11, r5, r7
+ stw r11, 4(r3)  C last result limb
+ subfe r3, r12, r11  C r3 = final cy
+ blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc32/darwin.m4 b/gmp-6.3.0/mpn/powerpc32/darwin.m4
new file mode 100644
index 0000000..db42268
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/darwin.m4
@@ -0,0 +1,91 @@
+divert(-1)
+dnl m4 macros for Mac OS 32-bit
assembly.
+
+dnl Copyright 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`ASM_START',`')
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl The optional second argument must be empty or toc; any other value raises m4_error. Neither accepted value changes the emitted text.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+ .text
+ .globl $1
+ .align 3
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1))
+
+
+dnl LEA -- Load Effective Address. LEA(reg,sym): with PIC the address is formed relative to local label 1, whose address is obtained through bcl/mflr while LR is kept in r0; otherwise a lis/la pair with ha16/lo16 is used.
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',
+` mflr r0 C save return address
+ bcl 20, 31, 1f
+1: mflr $1
+ addis $1, $1, ha16($2-1b)
+ la $1, lo16($2-1b)($1)
+ mtlr r0 C restore return address
+',`
+ lis $1, ha16($2)
+ la $1, lo16($2)($1)
+')')
+dnl LEAL is an alias: it expands to LEA with the same two arguments.
+define(`LEAL',
+m4_assert_numargs(2)
+`LEA($1,$2)')
+
+dnl EXTERN and ASM_END expand to dnl, so they contribute no assembler text.
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+dnl DEF_OBJECT(name[,align]): starts name in the .const section; the ALIGN argument defaults to 2 when only the name is given.
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+` .const
+ ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1))
+
+define(`ASM_END', `dnl')
+dnl PIC builds additionally define PIC_SLOW.
+ifdef(`PIC',`
+define(`PIC_SLOW')')
+
+divert
diff --git a/gmp-6.3.0/mpn/powerpc32/diveby3.asm b/gmp-6.3.0/mpn/powerpc32/diveby3.asm
new file mode 100644
index 0000000..288a7d3
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/diveby3.asm
@@ -0,0 +1,93 @@
+dnl PowerPC-32 mpn_divexact_by3 -- mpn by 3 exact division
+
+dnl Copyright 2002, 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 603e: ?
+C 604e: 5
+C 75x (G3): ?
+C 7400,7410 (G4): 8
+C 744x,745x (G4+): 6
+C power4/ppc970: 12
+C power5: ?
+
+C void mpn_divexact_by3 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C NOTE(review): the entry point defined below is mpn_divexact_by3c; it also reads a carry-in from cy (r6) and leaves a value in r3.
+C We avoid the slow subfe instruction and instead rely on an extremely unlikely
+C branch.
+C
+C The mullw has the inverse in the first operand, since 0xAA..AB won't allow
+C any early-out. The src[] data normally won't either, but there's at least
+C a chance, whereas 0xAA..AB never will. If, for instance, src[] is all
+C zeros (not a sensible input of course) we run at 7.0 c/l on ppc750.
+C
+C The mulhwu has the "3" multiplier in the second operand, which lets 750 and
+C 7400 use an early-out.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`cy', `r6')
+
+ASM_START()
+PROLOGUE(mpn_divexact_by3c)
+ lwz r11, 0(up)
+ mtctr n
+ lis r12, 0xAAAA
+ ori r12, r12, 0xAAAB  C r12 = 0xAAAAAAAB, the inverse referred to above
+ li r10, 3
+
+ cmplw cr7, cy, r11  C cr7 gt: cy > up[0], so the subtract below borrows
+ subf r11, cy, r11  C r11 = up[0] - cy
+
+ mullw r0, r11, r12  C first quotient limb
+ stw r0, 0(rp)
+ bdz L(one)  C taken when n = 1
+
+L(top): lwzu r9, 4(up)
+ mulhwu r7, r0, r10  C r7 = HI(previous quotient limb * 3)
+ bgt- cr7, L(adj) C very unlikely branch
+L(bko): cmplw cr7, r7, r9  C cr7 gt: r7 > up[i], so the subtract below borrows
+ subf r0, r7, r9  C r0 = up[i] - r7
+ mullw r0, r12, r0  C next quotient limb
+ stwu r0, 4(rp)
+ bdnz L(top)
+
+L(one): mulhwu r3, r0, r10  C r3 = HI(last quotient limb * 3)
+ blelr+ cr7  C return now unless the last subtract borrowed
+ addi r3, r3, 1  C otherwise add the pending borrow
+ blr
+
+L(adj): addi r7, r7, 1  C add the pending borrow from the previous limb
+ b L(bko)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc32/divrem_2.asm b/gmp-6.3.0/mpn/powerpc32/divrem_2.asm
new file mode 100644
index 0000000..74423f4
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/divrem_2.asm
@@ -0,0 +1,182 @@
+dnl PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008, 2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm frac
+C 7410 ~36.5 ~36.5
+C 744x, 745x 29 29
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C d = r7
+
+C TODO
+C * Decrease register usage.
+C * Make sure mul operands are optimal for early-out.
+C * Check that things work well for a shared library build.
+C * Write an invert_limb, perhaps inline, perhaps as a private call. Or at
+C least vastly improve the current __udiv_qrnnd_c based code.
+
+C mpn_divrem_2: divides the un limbs at up, followed by fn zero limbs, by the two-limb divisor at d. Quotient limbs are stored through qp, the two remainder limbs are written back at up, and the most significant quotient limb (0 or 1) is left in r3.
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+ stwu r1, -32(r1)  C 32-byte stack frame
+ slwi r0, r6, 2
+ add r5, r5, r0
+ stmw r28, 8(r1)  C save r28-r31
+ addi r29, r5, -8 C up = up_param + un - 2
+ lwz r10, 4(r7)  C d1 = d[1]
+ lwz r12, 4(r29)  C n2 = most significant limb of up
+ addi r8, r3, -12  C qp biased down by 3 limbs; un + fn limbs are added below
+ lwz r7, 0(r7)  C d0 = d[0]
+ cmplw cr7, r12, r10
+ lwz r28, 0(r29)  C n1 = next limb of up
+ blt- cr7, L(2)  C n2 < d1: top two limbs are below d
+ bgt+ cr7, L(4)  C n2 > d1: top two limbs are above d
+ cmplw cr7, r28, r7  C n2 = d1: decide on n1 versus d0
+ blt- cr7, L(2)
+L(4): subfc r28, r7, r28  C top two limbs >= d: subtract d once
+ subfe r12, r10, r12
+ li r3, 1  C most significant quotient limb = 1
+ b L(6)
+L(2): li r3, 0  C most significant quotient limb = 0
+
+L(6): add r0, r4, r6
+ addic. r30, r0, -2  C r30 = un + fn - 2 quotient limbs still to produce
+ ble- cr0, L(ret)
+
+ slwi r9, r0, 2
+ add r8, r8, r9 C rp += un + fn
+ mtctr r30
+
+C Compute di from d1
+ srwi r11, r10, 16
+ nor r0, r10, r10
+ divwu r31, r0, r11
+ rlwinm r5, r10, 0, 16, 31
+ mullw r9, r11, r31
+ mullw r6, r5, r31
+ subf r0, r9, r0
+ slwi r0, r0, 16
+ ori r0, r0, 65535
+ cmplw cr7, r0, r6
+ bge- cr7, L(9)
+ add r0, r0, r10
+ cmplw cr7, r0, r10
+ cmplw cr6, r6, r0
+ addi r31, r31, -1 C q1--
+ crorc 28, 28, 25
+ blt+ cr7, L(9)
+ addi r31, r31, -1 C q1--
+ add r0, r0, r10
+L(9): subf r0, r6, r0
+ divwu r6, r0, r11
+ mullw r9, r11, r6
+ mullw r11, r5, r6
+ subf r0, r9, r0
+ slwi r0, r0, 16
+ ori r0, r0, 65535
+ cmplw cr7, r0, r11
+ bge- cr7, L(13)
+ add r0, r0, r10
+ cmplw cr7, r0, r10
+ cmplw cr6, r11, r0
+ addi r6, r6, -1 C q0--
+ crorc 28, 28, 25
+ blt+ cr7, L(13)
+C add r0, r0, r10 C final remainder
+ addi r6, r6, -1 C q0--
+L(13): rlwimi r6, r31, 16, 0, 15 C assemble final quotient
+
+C Adjust di by including d0
+ mullw r9, r10, r6 C t0 = LO(di * d1)
+ addc r11, r9, r7
+ subfe r0, r1, r1  C r0 = carry - 1; r1 is only a convenient equal-operand source here
+ mulhwu r9, r6, r7 C s1 = HI(di * d0)
+ addc r9, r11, r9
+ addze. r0, r0
+ blt cr0, L(17)
+L(18): subfc r9, r10, r9
+ addi r6, r6, -1
+ addme. r0, r0
+ bge+ cr0, L(18)
+L(17):
+C Loop registers as used below: r3 msl, r4 fn, r5 quotient limb, r6 di, r7 d0, r8 qp, r10 d1, r12 and r28 high and low remainder limbs, r29 up, r30 limb countdown.
+C r0 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r28 r29 r30 r31
+C msl di d0 qp d1 fn up un
+L(loop):
+ mullw r0, r12, r6 C q0 = LO(n2 * di)
+ cmpw cr7, r30, r4  C are there still limbs of up to read, or only fraction zeros?
+ addc r31, r0, r28 C q0 += n1
+ mulhwu r9, r12, r6 C q = HI(n2 * di)
+ adde r12, r9, r12 C q += n2
+ addi r30, r30, -1
+ mullw r0, r10, r12 C d1 * q
+ li r9, 0  C next dividend limb defaults to 0 (fraction part)
+ subf r0, r0, r28 C n1 -= d1 * q
+ addi r5, r12, 1  C quotient limb candidate = q + 1
+ ble- cr7, L(23)
+ lwzu r9, -4(r29)  C fetch the next lower limb of up
+L(23): mullw r11, r12, r7 C t0 = LO(d0 * q)
+ subfc r28, r7, r9 C n0 -= d0
+ subfe r0, r10, r0 C n1 -= d1
+ mulhwu r12, r12, r7 C t1 = HI(d0 * q)
+ subfc r28, r11, r28 C n0 -= t0
+ subfe r12, r12, r0 C n1 -= t1
+ cmplw cr7, r12, r31
+ blt+ cr7, L(24)
+ addc r28, r28, r7  C add d back and decrement the quotient limb
+ adde r12, r12, r10
+ addi r5, r5, -1
+L(24): cmplw cr7, r12, r10
+ bge- cr7, L(fix)  C high remainder limb >= d1: may need one more correction
+L(bck): stw r5, 0(r8)  C store the quotient limb, moving downward
+ addi r8, r8, -4
+ bdnz L(loop)
+
+L(ret): stw r28, 0(r29)  C remainder, low limb
+ stw r12, 4(r29)  C remainder, high limb
+ lmw r28, 8(r1)  C restore r28-r31
+ addi r1, r1, 32
+ blr
+
+L(fix): cmplw cr6, r28, r7
+ bgt+ cr7, L(28)  C high limb > d1: remainder exceeds d
+ blt- cr6, L(bck)  C high limb = d1 and low limb < d0: remainder is below d
+L(28): subfc r28, r7, r28  C subtract d once more
+ subfe r12, r10, r12
+ addi r5, r5, 1  C and increment the quotient limb
+ b L(bck)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc32/eabi.m4 b/gmp-6.3.0/mpn/powerpc32/eabi.m4
new file mode 100644
index 0000000..cd7633c
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/eabi.m4
@@ -0,0 +1,86 @@
+divert(-1)
+dnl m4 macros for powerpc32 eABI assembly.
+
+dnl Copyright 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`ASM_START',`')
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl PROLOGUE_cpu emits the symbol as a global @function label in .text; EPILOGUE_cpu emits its .size.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+ `
+ .section ".text"
+ .align 3
+ .globl $1
+ .type $1, @function
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+` .size $1, .-$1')
+
+dnl This ought to support PIC, but it is unclear how that is done for eABI, so LEA always uses the absolute lis/la sequence with @ha and @l.
+define(`LEA',
+m4_assert_numargs(2)
+`
+ lis $1, $2@ha
+ la $1, $2@l($1)
+')
+dnl EXTERN expands to dnl, so it contributes no assembler text.
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+dnl DEF_OBJECT(name[,align]): starts name as an @object in .rodata; the ALIGN argument defaults to 2 when only the name is given. END_OBJECT emits its .size.
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`
+ .section .rodata
+ ALIGN(ifelse($#,1,2,$2))
+ .type $1, @object
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+` .size $1, .-$1')
+
+define(`ASM_END', `dnl')
+dnl PIC builds additionally define PIC_SLOW.
+ifdef(`PIC',`
+define(`PIC_SLOW')')
+
+dnl 64-bit "long long" parameters are put in an even-odd pair, skipping an
+dnl even register if that was in turn. I wish somebody could explain why that
+dnl is a good idea.
+define(`BROKEN_LONGLONG_PARAM')
+
+divert
diff --git a/gmp-6.3.0/mpn/powerpc32/elf.m4 b/gmp-6.3.0/mpn/powerpc32/elf.m4
new file mode 100644
index 0000000..1ed9c12
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/elf.m4
@@ -0,0 +1,100 @@
+divert(-1)
+dnl m4 macros for powerpc32 GNU/Linux assembly.
+
+dnl Copyright 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`ASM_START',`')
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl The optional second argument must be empty or toc; any other value raises m4_error. Neither accepted value changes the emitted text.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+ .section ".text"
+ .align 3
+ .globl $1
+ .type $1, @function
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+` .size $1, .-$1')
+dnl LEA(reg,sym): with PIC, reg is first pointed at _GLOBAL_OFFSET_TABLE_ through bcl/mflr (LR is kept in r0, so r0 is overwritten) and the address is then loaded from the sym@got slot; otherwise a lis/la pair with @ha and @l is used.
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+ mflr r0
+ bcl 20, 31, 1f
+1: mflr $1
+ addis $1, $1, (_GLOBAL_OFFSET_TABLE_-1b)@ha
+ addi $1, $1, (_GLOBAL_OFFSET_TABLE_-1b)@l
+ mtlr r0
+ lwz $1, $2@got($1)
+',`
+ lis $1, $2@ha
+ la $1, $2@l($1)
+')')
+
+dnl LEAL is an alias: it expands to LEA with the same two arguments.
+define(`LEAL',
+m4_assert_numargs(2)
+`LEA($1,$2)')
+
+dnl EXTERN and ASM_END expand to dnl, so they contribute no assembler text.
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+dnl DEF_OBJECT(name[,align]): starts name as an @object in .rodata; the ALIGN argument defaults to 2 when only the name is given. END_OBJECT emits its .size.
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`
+ .section .rodata
+ ALIGN(ifelse($#,1,2,$2))
+ .type $1, @object
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+` .size $1, .-$1')
+
+define(`ASM_END', `dnl')
+
+ifdef(`PIC',`
+define(`PIC_SLOW')')
+
+dnl 64-bit "long long" parameters are put in an even-odd pair, skipping an
+dnl even register if that was in turn. I wish somebody could explain why that
+dnl is a good idea.
+define(`BROKEN_LONGLONG_PARAM')
+
+divert
diff --git a/gmp-6.3.0/mpn/powerpc32/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/gmp-mparam.h
new file mode 100644
index 0000000..e835a39
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004, 2008-2010, 2014, 2015 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* This file is supposed to be used for 604, 604e, 744x/745x/747x (G4+), i.e.,
+ 32-bit PowerPC processors with reasonably fast integer multiply insns. The
+ values below are chosen to be best for the latter processors, since 604 is
+ largely irrelevant today.
+ + In mpn/powerpc32/750/gmp-mparam.h there are values for 75x (G3) and for + 7400/7410 (G4), both which have much slower multiply instructions. */ + +/* 1417 MHz PPC 7447A */ +/* FFT tuning limit = 15 M */ +/* Generated by tuneup.c, 2015-10-08, gcc 4.6 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 45 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 69 + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 106 +#define MUL_TOOM6H_THRESHOLD 156 +#define MUL_TOOM8H_THRESHOLD 236 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 71 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 74 +#define SQR_TOOM4_THRESHOLD 142 +#define SQR_TOOM6_THRESHOLD 190 +#define SQR_TOOM8_THRESHOLD 333 + +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 284 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 284, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \ + { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ + { 39, 8}, { 23, 7}, { 47, 8}, { 27, 9}, \ 
+ { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \ + { 71, 8}, { 143, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135, 8}, { 271, 9}, { 143,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175,10}, \ + { 95, 9}, { 191, 8}, { 383, 9}, { 207, 8}, \ + { 415,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575,10}, { 159, 9}, { 319,10}, { 175,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415, 8}, { 831,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895,10}, { 479, 9}, { 959,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 639,11}, \ + { 351,10}, { 703, 9}, { 1407,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,11}, \ + { 447,10}, { 895,11}, { 479,10}, { 959,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \ + { 703,10}, { 1407,12}, { 383,11}, { 831,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \ + { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,13}, \ + { 767,12}, { 1599,13}, { 895,12}, { 1919,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 2431,13}, { 1407,14}, { 767,13}, { 1535,12}, \ + { 3071,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 164 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 248 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 248, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ 
+ { 33, 8}, { 19, 7}, { 39, 8}, { 27, 9}, \ + { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 7}, { 511, 9}, { 143,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175, 8}, \ + { 351,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207, 8}, { 415, 7}, { 831,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415, 8}, { 831,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 415, 9}, { 831,11}, { 223,10}, { 447, 9}, \ + { 895,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 639,11}, \ + { 351,10}, { 703, 9}, { 1407,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,11}, \ + { 447,10}, { 895,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 703,10}, \ + { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1279,12}, \ + { 703,11}, { 1407,13}, { 383,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1599,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3199,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 157 +#define SQR_FFT_THRESHOLD 2688 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 50 +#define MULLO_MUL_N_THRESHOLD 6633 +#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 115 +#define SQRLO_SQR_THRESHOLD 5274 + +#define DC_DIV_QR_THRESHOLD 43 
+#define DC_DIVAPPR_Q_THRESHOLD 141 +#define DC_BDIV_QR_THRESHOLD 51 +#define DC_BDIV_Q_THRESHOLD 120 + +#define INV_MULMOD_BNM1_THRESHOLD 43 +#define INV_NEWTON_THRESHOLD 173 +#define INV_APPR_THRESHOLD 156 + +#define BINV_NEWTON_THRESHOLD 204 +#define REDC_1_TO_REDC_N_THRESHOLD 51 + +#define MU_DIV_QR_THRESHOLD 1017 +#define MU_DIVAPPR_Q_THRESHOLD 1078 +#define MUPI_DIV_QR_THRESHOLD 84 +#define MU_BDIV_QR_THRESHOLD 872 +#define MU_BDIV_Q_THRESHOLD 1078 + +#define POWM_SEC_TABLE 1,16,102,428,1378 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 781 +#define SET_STR_PRECOMPUTE_THRESHOLD 1505 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 12 +#define HGCD_THRESHOLD 118 +#define HGCD_APPR_THRESHOLD 161 +#define HGCD_REDUCE_THRESHOLD 1679 +#define GCD_DC_THRESHOLD 351 +#define GCDEXT_DC_THRESHOLD 273 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/powerpc32/invert_limb.asm b/gmp-6.3.0/mpn/powerpc32/invert_limb.asm new file mode 100644 index 0000000..612bfe5 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/invert_limb.asm @@ -0,0 +1,142 @@ +dnl PowerPC-32 mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: ? +C 75x (G3): ? +C 7400,7410 (G4): ? +C 744x,745x (G4+): 32 +C power4/ppc970: ? +C power5: ? + +EXTERN(approx_tab) + +ASM_START() +PROLOGUE(mpn_invert_limb) + rlwinm r6, r3, 11, 22, 30 C extract bits 30..22 to pos 2^1 + srwi r10, r3, 11 C extract bits 31..11 + LEA( r9, approx_tab) C N.B. clobbers r0 for ELF and Darwin + lhzx r9, r9, r6 C w2 + addi r0, r10, 1 C bits 31..11 of d, plus one + mullw r11, r9, r9 C w2 * w2 + slwi r9, r9, 4 C w2 << 4 + mulhwu r7, r11, r0 C high word of w2 * w2 * r0 + rlwinm r11, r3, 0, 31, 31 C extract bit 0 + addi r0, r9, -1 C (w2 << 4) - 1 + srwi r9, r3, 1 C d >> 1 + subf r0, r7, r0 C w1 + add r9, r9, r11 C d31 + mullw r9, r0, r9 C w1 * d31 + srwi r10, r0, 1 C w1 >> 1 + neg r11, r11 C 0 or -1 mask from bit 0 of d + and r11, r10, r11 C w1 >> 1 if bit 0 of d is set, else 0 + subf r11, r9, r11 C that value minus w1 * d31 + mulhwu r9, r11, r0 C high word of r11 * w1 + slwi r0, r0, 15 C w1 << 15 + srwi r9, r9, 1 + add r0, r9, r0 C w0 + mullw r10, r0, r3 C low word of w0 * d + mulhwu r9, r0, r3 C high word of w0 * d + addc r11, r10, r3 C low word plus d, only the carry out is used + adde r3, r9, r3 C high word plus d plus carry + subf r3, r3, r0 C r3 = w0 minus that sum + blr +EPILOGUE() + +DEF_OBJECT(approx_tab) + .short 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27 + .short 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d + .short 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61 + .short 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894 + .short 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3 + .short 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520 + .short 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379 + .short 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de + .short 
0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e + .short 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8 + .short 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e + .short 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd + .short 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76 + .short 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918 + .short 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3 + .short 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676 + .short 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532 + .short 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5 + .short 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1 + .short 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193 + .short 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d + .short 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d + .short 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35 + .short 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22 + .short 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16 + .short 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10 + .short 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f + .short 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914 + .short 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f + .short 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e + .short 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643 + .short 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d + .short 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b + .short 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e + .short 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6 + .short 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1 + .short 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121 + .short 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056 + .short 
0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e + .short 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca + .short 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09 + .short 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d + .short 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93 + .short 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde + .short 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b + .short 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c + .short 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0 + .short 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927 + .short 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881 + .short 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de + .short 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e + .short 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1 + .short 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606 + .short 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e + .short 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8 + .short 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445 + .short 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5 + .short 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327 + .short 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b + .short 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211 + .short 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a + .short 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104 + .short 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081 + .short 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000 +END_OBJECT(approx_tab) +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc32/lshift.asm b/gmp-6.3.0/mpn/powerpc32/lshift.asm new file mode 100644 index 0000000..ce85d4d --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/lshift.asm @@ -0,0 +1,168 @@ +dnl PowerPC-32 mpn_lshift -- Shift a number left. 
+ +dnl Copyright 1995, 1998, 2000, 2002-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 3.0 +C 75x (G3): 3.0 +C 7400,7410 (G4): 3.0 +C 7445,7455 (G4+): 2.5 +C 7447,7457 (G4+): 2.25 +C power4/ppc970: 2.5 +C power5: 2.5 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C cnt r6 + +ASM_START() +PROLOGUE(mpn_lshift) + cmpwi cr0, r5, 30 C more than 30 limbs? 
+ slwi r0, r5, 2 C r0 = 4*n, operand size in bytes + add r4, r4, r0 C make r4 point at end of s1 + add r7, r3, r0 C make r7 point at end of res + bgt L(BIG) C branch if more than 30 limbs + + mtctr r5 C copy size into CTR + subfic r8, r6, 32 C r8 = 32 - cnt + lwzu r11, -4(r4) C load first s1 limb + srw r3, r11, r8 C compute function return value + bdz L(end1) + +L(oop): lwzu r10, -4(r4) + slw r9, r11, r6 + srw r12, r10, r8 + or r9, r9, r12 + stwu r9, -4(r7) + bdz L(end2) + lwzu r11, -4(r4) + slw r9, r10, r6 + srw r12, r11, r8 + or r9, r9, r12 + stwu r9, -4(r7) + bdnz L(oop) + +L(end1): + slw r0, r11, r6 C lowest result limb, last limb was in r11 + stw r0, -4(r7) + blr +L(end2): + slw r0, r10, r6 C lowest result limb, last limb was in r10 + stw r0, -4(r7) + blr + +L(BIG): + stwu r1, -48(r1) C allocate a 48-byte stack frame + stmw r24, 8(r1) C save registers we are supposed to preserve + lwzu r9, -4(r4) + subfic r8, r6, 32 C r8 = 32 - cnt + srw r3, r9, r8 C compute function return value + slw r0, r9, r6 C r0 = high part of the next result limb + addi r5, r5, -1 C one limb has been loaded already + + andi. r10, r5, 3 C count for spill loop + beq L(e) + mtctr r10 + lwzu r28, -4(r4) + bdz L(xe0) + +L(loop0): + slw r12, r28, r6 + srw r24, r28, r8 + lwzu r28, -4(r4) + or r24, r0, r24 + stwu r24, -4(r7) + mr r0, r12 + bdnz L(loop0) C taken at most once! 
+ +L(xe0): slw r12, r28, r6 + srw r24, r28, r8 + or r24, r0, r24 + stwu r24, -4(r7) + mr r0, r12 + +L(e): srwi r5, r5, 2 C count for unrolled loop + addi r5, r5, -1 C the last group of four is done after the loop + mtctr r5 + lwz r28, -4(r4) + lwz r29, -8(r4) + lwz r30, -12(r4) + lwzu r31, -16(r4) + +L(loopU): + slw r9, r28, r6 + srw r24, r28, r8 + lwz r28, -4(r4) + slw r10, r29, r6 + srw r25, r29, r8 + lwz r29, -8(r4) + slw r11, r30, r6 + srw r26, r30, r8 + lwz r30, -12(r4) + slw r12, r31, r6 + srw r27, r31, r8 + lwzu r31, -16(r4) + or r24, r0, r24 + stw r24, -4(r7) + or r25, r9, r25 + stw r25, -8(r7) + or r26, r10, r26 + stw r26, -12(r7) + or r27, r11, r27 + stwu r27, -16(r7) + mr r0, r12 + bdnz L(loopU) + + slw r9, r28, r6 + srw r24, r28, r8 + slw r10, r29, r6 + srw r25, r29, r8 + slw r11, r30, r6 + srw r26, r30, r8 + slw r12, r31, r6 + srw r27, r31, r8 + or r24, r0, r24 + stw r24, -4(r7) + or r25, r9, r25 + stw r25, -8(r7) + or r26, r10, r26 + stw r26, -12(r7) + or r27, r11, r27 + stw r27, -16(r7) + + stw r12, -20(r7) C lowest result limb + lmw r24, 8(r1) C restore registers + addi r1, r1, 48 C release the stack frame + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/lshiftc.asm b/gmp-6.3.0/mpn/powerpc32/lshiftc.asm new file mode 100644 index 0000000..b683def --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/lshiftc.asm @@ -0,0 +1,170 @@ +dnl PowerPC-32 mpn_lshiftc. + +dnl Copyright 1995, 1998, 2000, 2002-2005, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 3.0 +C 75x (G3): 3.0 +C 7400,7410 (G4): 3.0 +C 7445,7455 (G4+): 2.5 +C 7447,7457 (G4+): 2.25 +C power4/ppc970: 2.5 +C power5: 2.5 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C cnt r6 + +ASM_START() +PROLOGUE(mpn_lshiftc) + cmpwi cr0, r5, 30 C more than 30 limbs? + slwi r0, r5, 2 C r0 = 4*n, operand size in bytes + add r4, r4, r0 C make r4 point at end of s1 + add r7, r3, r0 C make r7 point at end of res + bgt L(BIG) C branch if more than 30 limbs + + mtctr r5 C copy size into CTR + subfic r8, r6, 32 C r8 = 32 - cnt + lwzu r11, -4(r4) C load first s1 limb + srw r3, r11, r8 C compute function return value + bdz L(end1) + +L(oop): lwzu r10, -4(r4) + slw r9, r11, r6 + srw r12, r10, r8 + nor r9, r9, r12 C complement of the combined limb + stwu r9, -4(r7) + bdz L(end2) + lwzu r11, -4(r4) + slw r9, r10, r6 + srw r12, r11, r8 + nor r9, r9, r12 C complement of the combined limb + stwu r9, -4(r7) + bdnz L(oop) + +L(end1): + slw r0, r11, r6 + nor r0, r0, r0 C complement the lowest result limb + stw r0, -4(r7) + blr +L(end2): + slw r0, r10, r6 + nor r0, r0, r0 C complement the lowest result limb + stw r0, -4(r7) + blr + +L(BIG): + stwu r1, -48(r1) C allocate a 48-byte stack frame + stmw r24, 8(r1) C save registers we are supposed to preserve + lwzu r9, -4(r4) + subfic r8, r6, 32 C r8 = 32 - cnt + srw r3, r9, r8 C compute function return value + slw r0, r9, r6 C r0 = high part of the next result limb + addi r5, r5, -1 C one limb has been loaded already + + andi. r10, r5, 3 C count for spill loop + beq L(e) + mtctr r10 + lwzu r28, -4(r4) + bdz L(xe0) + +L(loop0): + slw r12, r28, r6 + srw r24, r28, r8 + lwzu r28, -4(r4) + nor r24, r0, r24 + stwu r24, -4(r7) + mr r0, r12 + bdnz L(loop0) C taken at most once! 
+ +L(xe0): slw r12, r28, r6 + srw r24, r28, r8 + nor r24, r0, r24 + stwu r24, -4(r7) + mr r0, r12 + +L(e): srwi r5, r5, 2 C count for unrolled loop + addi r5, r5, -1 C the last group of four is done after the loop + mtctr r5 + lwz r28, -4(r4) + lwz r29, -8(r4) + lwz r30, -12(r4) + lwzu r31, -16(r4) + +L(loopU): + slw r9, r28, r6 + srw r24, r28, r8 + lwz r28, -4(r4) + slw r10, r29, r6 + srw r25, r29, r8 + lwz r29, -8(r4) + slw r11, r30, r6 + srw r26, r30, r8 + lwz r30, -12(r4) + slw r12, r31, r6 + srw r27, r31, r8 + lwzu r31, -16(r4) + nor r24, r0, r24 + stw r24, -4(r7) + nor r25, r9, r25 + stw r25, -8(r7) + nor r26, r10, r26 + stw r26, -12(r7) + nor r27, r11, r27 + stwu r27, -16(r7) + mr r0, r12 + bdnz L(loopU) + + slw r9, r28, r6 + srw r24, r28, r8 + slw r10, r29, r6 + srw r25, r29, r8 + slw r11, r30, r6 + srw r26, r30, r8 + slw r12, r31, r6 + srw r27, r31, r8 + nor r24, r0, r24 + stw r24, -4(r7) + nor r25, r9, r25 + stw r25, -8(r7) + nor r26, r10, r26 + stw r26, -12(r7) + nor r27, r11, r27 + stw r27, -16(r7) + nor r12, r12, r12 C complement the lowest result limb + stw r12, -20(r7) + lmw r24, 8(r1) C restore registers + addi r1, r1, 48 C release the stack frame + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/mod_34lsub1.asm b/gmp-6.3.0/mpn/powerpc32/mod_34lsub1.asm new file mode 100644 index 0000000..6d7fe4d --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/mod_34lsub1.asm @@ -0,0 +1,145 @@ +dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1. + +dnl Copyright 2002, 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C 603e: ? +C 604e: 3 +C 75x (G3): 3 +C 7400,7410 (G4): 3 +C 744x,745x (G4+): 3 +C power4/ppc970: 2.5 +C power5: 2.5 + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C +C There seems to be no need to schedule the loads back; the code is still +C 3.0 c/l on 750/7400 no matter where they're placed. +C +C Alternatives: +C +C Fetching halfwords would allow plain add for accumulating, instead of +C adde and its serialization. An outer loop would be required though, since +C 2^16 halfwords can overflow. lhz+add would be 2.0 c/l, but if there's +C also a bdz or bdnz for each and a pointer update say every three limbs +C then the total would be 2.67 c/l which isn't much faster than the current +C simpler code. 
+ +ASM_START() +PROLOGUE(mpn_mod_34lsub1) + + C r3 src + C r4 size + + mtctr r4 C ctr = size + addic r6, r3, 8 C &src[2], and clear CA + + lwz r3, 0(r3) C acc0 = src[0] + bdz L(done) C size 1: result is src[0] as loaded + + lwz r4, -4(r6) C acc1 = src[1] + bdz L(two) C size 2 + + lwz r5, 0(r6) C acc2 = src[2] + lis r7, 0 C no carry if just three limbs + + bdz L(three) + lis r7, 1 C 0x10000 carry pos + +L(top): + C r3 acc0 + C r4 acc1 + C r5 acc2 + C r6 src, incrementing + C r7 carry pos + + lwz r0, 4(r6) + adde r3, r3, r0 + bdz L(end0) + + lwz r0, 8(r6) + adde r4, r4, r0 + bdz L(end1) + + lwzu r0, 12(r6) + adde r5, r5, r0 + bdnz L(top) + + + srwi r7, r7, 8 C fell out after acc2: both shifts, carry pos 1 +L(end0): + srwi r7, r7, 8 C from end0, after acc0: one shift, carry pos 0x100 +L(end1): + subfe r0, r0, r0 C -1 if not CA + + andc r7, r7, r0 C final carry, 0x10000, 0x100, 1 or 0 +L(three): + rlwinm r6, r3, 0,8,31 C acc0 low + + add r7, r7, r6 + rlwinm r6, r3, 8,24,31 C acc0 high + + add r7, r7, r6 + rlwinm r6, r4, 8,8,23 C acc1 low + + add r7, r7, r6 + rlwinm r6, r4, 16,16,31 C acc1 high + + add r7, r7, r6 + rlwinm r6, r5, 16,8,15 C acc2 low + + add r7, r7, r6 + rlwinm r6, r5, 24,8,31 C acc2 high + + add r3, r7, r6 + +L(done): + blr + +L(two): + C r3 acc0 + C r4 acc1 + + rlwinm r5, r3, 8,24,31 C acc0 high + rlwinm r3, r3, 0,8,31 C acc0 low + + add r3, r3, r5 C acc0 high + low + rlwinm r5, r4, 16,16,31 C acc1 high + + add r3, r3, r5 C add acc1 high + rlwinm r5, r4, 8,8,23 C acc1 low + + add r3, r3, r5 C add acc1 low + + blr + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/mode1o.asm b/gmp-6.3.0/mpn/powerpc32/mode1o.asm new file mode 100644 index 0000000..e8a6b5e --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/mode1o.asm @@ -0,0 +1,127 @@ +dnl PowerPC-32 mpn_modexact_1_odd -- mpn by limb exact remainder. + +dnl Copyright 2002, 2003, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C 603e: ? +C 604e: 6.0 +C 75x (G3): 6.0-13.0, depending on divisor +C 7400,7410 (G4): 6.0-13.0, depending on divisor +C 744x,745x (G4+): 8.0-10.0, depending on divisor +C power4/ppc970: 12.0 +C power5: 12.0 + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C For PIC, the inverse is established arithmetically since it measures about +C 5 cycles faster than the nonsense needed to access binvert_limb_table in +C SVR4 or Darwin style PIC. AIX might be better, since it avoids bl/mflr to +C get at the GOT/TOC/whatever. +C +C Using divwu for size==1 measured about 10 cycles slower on 604e, or about +C 3-5 cycles faster on 750. For now it doesn't seem worth bothering with. +C +C The loop allows an early-out on mullw for the inverse, and on mulhwu for +C the divisor. 
So the fastest is for instance divisor==1 (inverse==-1), and +C the slowest is anything giving a full 32-bits in both, such as +C divisor==0xDEADBEEF (inverse==0x904B300F). These establish the stated +C range above for 750 and 7400. + + +ASM_START() + +EXTERN(binvert_limb_table) + +PROLOGUE(mpn_modexact_1_odd) + li r6, 0 C carry = 0, then fall into the 1c entry below + +PROLOGUE(mpn_modexact_1c_odd) + + mtctr r4 C size + +ifdef(`PIC_SLOW',` +C Load from our table with PIC is so slow on Linux and Darwin that we avoid it + rlwinm r7, r5, 1,28,28 C (divisor << 1) & 8 + rlwinm r8, r5, 2,28,28 C (divisor << 2) & 8 + xor r7, r7, r8 C ((divisor << 1) ^ (divisor << 2)) & 8 + rlwinm r4, r5, 0,28,31 C divisor low 4 bits, speedup mullw + xor r4, r4, r7 C inverse, 4 bits + mullw r7, r4, r4 C i*i + slwi r4, r4, 1 C 2*i + rlwinm r8, r5, 0,24,31 C divisor low 8 bits, speedup mullw + mullw r7, r7, r8 C i*i*d + sub r4, r4, r7 C inverse, 8 bits +',` + LEA( r7, binvert_limb_table) + rlwinm r4, r5, 31,25,31 C (divisor/2) & 0x7F + lbzx r4, r4,r7 C inverse, 8 bits +') + + mullw r7, r4, r4 C i*i + slwi r4, r4, 1 C 2*i + mullw r7, r5, r7 C i*i*d [i*i is 16 bits, so second operand] + sub r4, r4, r7 C inverse, 16 bits + mullw r7, r4, r4 C i*i + slwi r4, r4, 1 C 2*i + mullw r7, r7, r5 C i*i*d + lwz r0, 0(r3) C src[0] + sub r4, r4, r7 C inverse, 32 bits + subfc r7, r6, r0 C l = src[0] - carry + + mullw r7, r7, r4 C q = l * inverse + bdz L(one) C size 1 + + lwzu r0, 4(r3) C src[1] + mulhwu r6, r7, r5 C carry = high(q*divisor) + subfe r7, r6, r0 C l = src[1] - carry + bdz L(two) C size 2 + +L(top): + mullw r7, r7, r4 C q = l * inverse + lwzu r0, 4(r3) C src[i] + mulhwu r6, r7, r5 C carry = high(q*divisor) + subfe r7, r6, r0 C l = src[i] - carry + bdnz L(top) + +L(two): mullw r7, r7, r4 C q = l * inverse +L(one): subfe r3, r3, r3 C ca 0 or -1 + mulhwu r6, r7, r5 C carry = high(q*divisor) + subf r3, r3, r6 C carry + ca + blr + +EPILOGUE(mpn_modexact_1c_odd) +EPILOGUE(mpn_modexact_1_odd) +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc32/mul_1.asm 
b/gmp-6.3.0/mpn/powerpc32/mul_1.asm new file mode 100644 index 0000000..e42087c --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/mul_1.asm @@ -0,0 +1,101 @@ +dnl PowerPC-32 mpn_mul_1 -- Multiply a limb vector with a limb and store the +dnl result in a second limb vector. + +dnl Copyright 1995, 1997, 2000, 2002, 2003, 2005 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? 
+C 604e: 4.0 +C 75x (G3): 4.5-11 +C 7400,7410 (G4): 4.5-11 +C 744x,745x (G4+): 6.0 +C power4/ppc970: 6.0 +C power5: 5.63 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C vl r6 + +ASM_START() +PROLOGUE(mpn_mul_1) + mtctr r5 C ctr = n + addi r3,r3,-4 C adjust res_ptr, it's offset before it's used + li r12,0 C clear upper product reg + addic r0,r0,0 C clear cy +C Start software pipeline + lwz r8,0(r4) + bdz L(end3) C single limb + lwzu r9,4(r4) + mullw r11,r8,r6 C low word of up[0] * vl + mulhwu r0,r8,r6 C high word of up[0] * vl + bdz L(end1) +C Software pipelined main loop +L(loop): + lwz r8,4(r4) + mullw r10,r9,r6 + adde r5,r11,r12 + mulhwu r12,r9,r6 + stw r5,4(r3) + bdz L(end2) + lwzu r9,8(r4) + mullw r11,r8,r6 + adde r7,r10,r0 + mulhwu r0,r8,r6 + stwu r7,8(r3) + bdnz L(loop) +C Finish software pipeline +L(end1): + mullw r10,r9,r6 + adde r5,r11,r12 + mulhwu r12,r9,r6 + stw r5,4(r3) + adde r7,r10,r0 + stwu r7,8(r3) + addze r3,r12 C r3 = last high word plus carry + blr +L(end2): + mullw r11,r8,r6 + adde r7,r10,r0 + mulhwu r0,r8,r6 + stwu r7,8(r3) + adde r5,r11,r12 + stw r5,4(r3) + addze r3,r0 C r3 = last high word plus carry + blr +L(end3): + mullw r11,r8,r6 C single limb: low word of the product + stw r11,4(r3) + mulhwu r3,r8,r6 C r3 = high word of the product + blr +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/powerpc32/p3-p7/aors_n.asm b/gmp-6.3.0/mpn/powerpc32/p3-p7/aors_n.asm new file mode 100644 index 0000000..3b6685e --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p3-p7/aors_n.asm @@ -0,0 +1,187 @@ +dnl PowerPC-32 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1.5 +C POWER4/PPC970 2 +C POWER5 2 +C POWER6 2.78 +C POWER7 2.15-2.87 + +C This code is based on powerpc64/aors_n.asm. + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +ifdef(`OPERATION_add_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_sub_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + SETCBR(r7) C CA from r7, presumably the carry-in argument -- TODO confirm + b L(ent) +EPILOGUE() + +PROLOGUE(func) + CLRCB +L(ent): stwu r1, -32(r1) C allocate a 32-byte stack frame + rlwinm. r0, r6, 0,30,31 C r0 = n & 3, set cr0 + cmpwi cr6, r0, 2 + stw r28, 8(r1) C save r28..r31, restored at the ret label + addi r6, r6, 3 C compute count... 
+ stw r29, 12(r1) + srwi r6, r6, 2 C ...for ctr + stw r30, 16(r1) + mtctr r6 C copy count into ctr + stw r31, 20(r1) + beq cr0, L(b00) C n mod 4 = 0 + blt cr6, L(b01) C n mod 4 = 1 + beq cr6, L(b10) C n mod 4 = 2 + +L(b11): lwz r8, 0(r4) C load s1 limb + lwz r9, 0(r5) C load s2 limb + lwz r10, 4(r4) C load s1 limb + lwz r11, 4(r5) C load s2 limb + lwz r12, 8(r4) C load s1 limb + addi r4, r4, 12 + lwz r0, 8(r5) C load s2 limb + addi r5, r5, 12 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + stw r29, 0(r3) + stw r30, 4(r3) + stw r31, 8(r3) + addi r3, r3, 12 + bdnz L(go) + b L(ret) + +L(b01): lwz r12, 0(r4) C load s1 limb + addi r4, r4, 4 + lwz r0, 0(r5) C load s2 limb + addi r5, r5, 4 + ADDSUBC r31, r0, r12 C add + stw r31, 0(r3) + addi r3, r3, 4 + bdnz L(go) + b L(ret) + +L(b10): lwz r10, 0(r4) C load s1 limb + lwz r11, 0(r5) C load s2 limb + lwz r12, 4(r4) C load s1 limb + addi r4, r4, 8 + lwz r0, 4(r5) C load s2 limb + addi r5, r5, 8 + ADDSUBC r30, r11, r10 C add + ADDSUBC r31, r0, r12 C add + stw r30, 0(r3) + stw r31, 4(r3) + addi r3, r3, 8 + bdnz L(go) + b L(ret) + +L(b00): C INITCY C clear/set cy +L(go): lwz r6, 0(r4) C load s1 limb + lwz r7, 0(r5) C load s2 limb + lwz r8, 4(r4) C load s1 limb + lwz r9, 4(r5) C load s2 limb + lwz r10, 8(r4) C load s1 limb + lwz r11, 8(r5) C load s2 limb + lwz r12, 12(r4) C load s1 limb + lwz r0, 12(r5) C load s2 limb + bdz L(end) + + addi r4, r4, 16 + addi r5, r5, 16 + + ALIGN(16) +L(top): ADDSUBC r28, r7, r6 + lwz r6, 0(r4) C load s1 limb + lwz r7, 0(r5) C load s2 limb + ADDSUBC r29, r9, r8 + lwz r8, 4(r4) C load s1 limb + lwz r9, 4(r5) C load s2 limb + ADDSUBC r30, r11, r10 + lwz r10, 8(r4) C load s1 limb + lwz r11, 8(r5) C load s2 limb + ADDSUBC r31, r0, r12 + lwz r12, 12(r4) C load s1 limb + lwz r0, 12(r5) C load s2 limb + stw r28, 0(r3) + addi r4, r4, 16 + stw r29, 4(r3) + addi r5, r5, 16 + stw r30, 8(r3) + stw r31, 12(r3) + addi r3, r3, 16 + bdnz L(top) C decrement ctr and loop back + +L(end): ADDSUBC r28, r7, r6 + ADDSUBC r29, r9, r8 + 
ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + stw r28, 0(r3) + stw r29, 4(r3) + stw r30, 8(r3) + stw r31, 12(r3) + +L(ret): + lwz r28, 8(r1) C restore r28..r31 + lwz r29, 12(r1) + subfe r3, r0, r0 C CA - 1, i.e. 0 or -1 + lwz r30, 16(r1) + GENRVAL + lwz r31, 20(r1) + addi r1, r1, 32 C release the stack frame + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/p3/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/p3/gmp-mparam.h new file mode 100644 index 0000000..3382695 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p3/gmp-mparam.h @@ -0,0 +1,155 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 450 MHz POWER3 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 38 +#define MUL_TOOM44_THRESHOLD 58 +#define MUL_TOOM6H_THRESHOLD 129 +#define MUL_TOOM8H_THRESHOLD 212 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 63 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 59 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 64 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 14 +#define SQR_TOOM3_THRESHOLD 53 +#define SQR_TOOM4_THRESHOLD 76 +#define SQR_TOOM6_THRESHOLD 106 +#define SQR_TOOM8_THRESHOLD 284 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 220, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 9, 5}, { 19, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 23, 9}, { 7, 8}, { 15, 7}, \ + { 33, 8}, { 23, 9}, { 15, 8}, { 35, 9}, \ + { 23,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \ + { 63, 8}, { 127, 9}, { 71, 8}, { 143, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 9}, { 143,10}, { 79, 9}, \ + { 159, 8}, { 319, 9}, { 175, 8}, { 351,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 
95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703, 8}, \ + { 1407,11}, { 191,10}, { 415,11}, { 223,10}, \ + { 447, 9}, { 895,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 82 +#define MUL_FFT_THRESHOLD 2688 + +#define SQR_FFT_MODF_THRESHOLD 176 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 176, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 16, 7}, { 9, 6}, \ + { 19, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \ + { 7, 8}, { 15, 7}, { 31, 8}, { 23, 9}, \ + { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 7}, { 287, 6}, \ + { 575, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 143, 8}, { 287, 7}, { 575,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 111, 9}, \ + { 223,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \ + { 319,10}, { 175,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 223,12}, { 63,11}, { 127,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 351, 9}, \ + { 703, 8}, { 1407,11}, { 191,10}, { 383,11}, \ + { 223,10}, { 447, 9}, { 895,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 87 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 5240 + +#define DC_DIV_QR_THRESHOLD 32 +#define DC_DIVAPPR_Q_THRESHOLD 123 +#define DC_BDIV_QR_THRESHOLD 34 +#define DC_BDIV_Q_THRESHOLD 84 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 129 +#define INV_APPR_THRESHOLD 124 + +#define BINV_NEWTON_THRESHOLD 148 +#define REDC_1_TO_REDC_N_THRESHOLD 38 + +#define MU_DIV_QR_THRESHOLD 748 +#define MU_DIVAPPR_Q_THRESHOLD 748 
+#define MUPI_DIV_QR_THRESHOLD 59 +#define MU_BDIV_QR_THRESHOLD 562 +#define MU_BDIV_Q_THRESHOLD 654 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD_THRESHOLD 76 +#define GCD_DC_THRESHOLD 205 +#define GCDEXT_DC_THRESHOLD 174 +#define JACOBI_BASE_METHOD 1 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 181 +#define SET_STR_PRECOMPUTE_THRESHOLD 525 diff --git a/gmp-6.3.0/mpn/powerpc32/p4/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/p4/gmp-mparam.h new file mode 100644 index 0000000..7ac59f5 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p4/gmp-mparam.h @@ -0,0 +1,209 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004, 2008-2011, 2014 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +/* 1800 MHz PowerPC-970 */ +/* FFT tuning limit = 10000000 */ +/* Generated by tuneup.c, 2014-03-12, gcc 4.0 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 42 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 45 + +#define DIV_1_VS_MUL_1_PERCENT 225 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 107 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 108 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 92 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100 + +#define SQR_BASECASE_THRESHOLD 5 +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 85 +#define SQR_TOOM4_THRESHOLD 160 +#define SQR_TOOM6_THRESHOLD 197 +#define SQR_TOOM8_THRESHOLD 357 + +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 17, 6}, { 9, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 24, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 
95,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 167,10}, { 95, 9}, { 191, 8}, { 383,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287, 8}, { 575, 9}, \ + { 303,10}, { 159, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 335, 9}, { 671, 8}, { 1343,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671, 9}, { 1343,11}, { 351,10}, \ + { 703, 9}, { 1407,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 703,10}, { 1407,11}, { 735,12}, \ + { 383,11}, { 767,10}, { 1535,11}, { 831,12}, \ + { 447,10}, { 1791,11}, { 959,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \ + { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,11}, { 1727,10}, { 3455,11}, { 1791,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1727,11}, { 3455,12}, { 1791,14}, { 511,13}, \ + { 1151,12}, { 2431,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 157 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 28, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47, 
9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135,10}, { 79, 9}, { 159, 8}, \ + { 319,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303, 8}, { 607,10}, { 159, 9}, \ + { 319,10}, { 175,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303, 9}, \ + { 607,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,12}, { 127,11}, \ + { 255,10}, { 543, 9}, { 1087,11}, { 287,10}, \ + { 607, 9}, { 1215,11}, { 319,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 703,10}, { 1407,11}, { 735,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \ + { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 150 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 55 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 169 +#define SQRLO_SQR_THRESHOLD 9335 + +#define DC_DIV_QR_THRESHOLD 50 +#define DC_DIVAPPR_Q_THRESHOLD 196 +#define DC_BDIV_QR_THRESHOLD 51 +#define DC_BDIV_Q_THRESHOLD 166 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 226 +#define INV_APPR_THRESHOLD 202 + +#define 
BINV_NEWTON_THRESHOLD 228 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1187 +#define MU_DIVAPPR_Q_THRESHOLD 1308 +#define MUPI_DIV_QR_THRESHOLD 114 +#define MU_BDIV_QR_THRESHOLD 998 +#define MU_BDIV_Q_THRESHOLD 1142 + +#define POWM_SEC_TABLE 3,28,78,480,1099 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1002 + +#define FAC_DSC_THRESHOLD 179 +#define FAC_ODD_THRESHOLD 28 + +#define MATRIX22_STRASSEN_THRESHOLD 9 +#define HGCD_THRESHOLD 93 +#define HGCD_APPR_THRESHOLD 109 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 379 +#define GCDEXT_DC_THRESHOLD 273 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/powerpc32/p5/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/p5/gmp-mparam.h new file mode 100644 index 0000000..faa1e81 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p5/gmp-mparam.h @@ -0,0 +1,156 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1650 MHz POWER5 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 50 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 61 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 189 +#define MUL_TOOM8H_THRESHOLD 309 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 83 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_TOOM2_THRESHOLD 40 +#define SQR_TOOM3_THRESHOLD 77 +#define SQR_TOOM4_THRESHOLD 124 +#define SQR_TOOM6_THRESHOLD 140 +#define SQR_TOOM8_THRESHOLD 238 + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define POWM_SEC_TABLE 4,29,252,840,2080 + +#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 412, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 55,10}, { 31, 9}, \ + { 
79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 335, 9}, { 671,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 415, 9}, { 831,11}, { 223,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 71 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 27, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 47,10}, { 31, 9}, \ + { 71,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415,11}, { 223,10}, { 447,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 76 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 68 +#define MULLO_MUL_N_THRESHOLD 9236 + +#define DC_DIV_QR_THRESHOLD 69 +#define DC_DIVAPPR_Q_THRESHOLD 220 +#define DC_BDIV_QR_THRESHOLD 75 +#define DC_BDIV_Q_THRESHOLD 188 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 230 +#define INV_APPR_THRESHOLD 230 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_N_THRESHOLD 87 + +#define 
MU_DIV_QR_THRESHOLD 1210 +#define MU_DIVAPPR_Q_THRESHOLD 1308 +#define MUPI_DIV_QR_THRESHOLD 106 +#define MU_BDIV_QR_THRESHOLD 1017 +#define MU_BDIV_Q_THRESHOLD 1210 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD_THRESHOLD 110 +#define HGCD_APPR_THRESHOLD 138 +#define HGCD_REDUCE_THRESHOLD 2578 +#define GCD_DC_THRESHOLD 408 +#define GCDEXT_DC_THRESHOLD 298 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 527 +#define SET_STR_PRECOMPUTE_THRESHOLD 1090 diff --git a/gmp-6.3.0/mpn/powerpc32/p6/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/p6/gmp-mparam.h new file mode 100644 index 0000000..c9504b6 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p6/gmp-mparam.h @@ -0,0 +1,165 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3500 MHz POWER6 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 55 +#define MUL_TOOM44_THRESHOLD 88 +#define MUL_TOOM6H_THRESHOLD 137 +#define MUL_TOOM8H_THRESHOLD 181 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 56 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 57 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 56 +#define SQR_TOOM4_THRESHOLD 130 +#define SQR_TOOM6_THRESHOLD 189 +#define SQR_TOOM8_THRESHOLD 296 + +#define MULMID_TOOM42_THRESHOLD 26 + +#define MULMOD_BNM1_THRESHOLD 7 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define POWM_SEC_TABLE 2,26,127,453,1068 + +#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 212, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 16, 7}, { 9, 6}, \ + { 19, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ + { 31, 8}, { 19, 7}, { 39, 8}, { 23, 9}, \ + { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 7}, { 287, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 7}, { 511, 9}, { 143, 8}, \ + { 
287,10}, { 79, 9}, { 159, 8}, { 319, 9}, \ + { 175, 8}, { 351,10}, { 95, 9}, { 191, 8}, \ + { 383, 9}, { 207,10}, { 111,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 89 +#define MUL_FFT_THRESHOLD 1728 + +#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 184, 5}, { 6, 4}, { 13, 5}, { 13, 6}, \ + { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 23, 9}, { 7, 8}, { 23, 9}, \ + { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 63, 8}, { 127, 7}, \ + { 255, 9}, { 71, 8}, { 143, 7}, { 287, 6}, \ + { 575, 9}, { 79,10}, { 47,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 9}, { 143, 8}, \ + { 287, 7}, { 575,10}, { 79, 9}, { 159, 8}, \ + { 319, 9}, { 175, 8}, { 351,10}, { 95, 9}, \ + { 191, 8}, { 383, 9}, { 207,10}, { 111, 9}, \ + { 223,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 351, 9}, { 703, 8}, { 1407,11}, { 191,10}, \ + { 415,11}, { 223,10}, { 447, 9}, { 895,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 92 +#define SQR_FFT_THRESHOLD 1600 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 57 +#define MULLO_MUL_N_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 187 +#define 
DC_BDIV_QR_THRESHOLD 64 +#define DC_BDIV_Q_THRESHOLD 146 + +#define INV_MULMOD_BNM1_THRESHOLD 68 +#define INV_NEWTON_THRESHOLD 182 +#define INV_APPR_THRESHOLD 182 + +#define BINV_NEWTON_THRESHOLD 186 +#define REDC_1_TO_REDC_N_THRESHOLD 60 + +#define MU_DIV_QR_THRESHOLD 924 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define MUPI_DIV_QR_THRESHOLD 73 +#define MU_BDIV_QR_THRESHOLD 667 +#define MU_BDIV_Q_THRESHOLD 823 + +#define MATRIX22_STRASSEN_THRESHOLD 8 +#define HGCD_THRESHOLD 61 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 974 +#define GCD_DC_THRESHOLD 195 +#define GCDEXT_DC_THRESHOLD 134 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 9 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 190 +#define SET_STR_PRECOMPUTE_THRESHOLD 411 diff --git a/gmp-6.3.0/mpn/powerpc32/p7/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/p7/gmp-mparam.h new file mode 100644 index 0000000..ad48dac --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p7/gmp-mparam.h @@ -0,0 +1,170 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 4150 MHz POWER8/T4 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2017-02-18, gcc 6.1 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 2 +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD 3 +#define DIV_QR_2_PI2_THRESHOLD 15 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 39 + +#define DIV_1_VS_MUL_1_PERCENT 343 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 202 +#define MUL_TOOM6H_THRESHOLD 286 +#define MUL_TOOM8H_THRESHOLD 430 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 140 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 128 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 121 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 26 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 236 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 34 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 18 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 21, 6}, { 12, 5}, { 25, 6}, \ + { 13, 5}, { 27, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 
7}, { 17, 6}, \ + { 35, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 63, 9}, \ + { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351,11}, { 191,10}, { 415, 9}, \ + { 831,11}, { 223,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 70 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 332 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 332, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 47,10}, { 31, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 9}, { 135,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303, 8}, { 607,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 415,11}, { 223,10}, { 447,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 75 +#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 36 +#define MULLO_MUL_N_THRESHOLD 8648 +#define SQRLO_BASECASE_THRESHOLD 5 +#define SQRLO_DC_THRESHOLD 193 +#define SQRLO_SQR_THRESHOLD 6675 + +#define 
DC_DIV_QR_THRESHOLD 33 +#define DC_DIVAPPR_Q_THRESHOLD 134 +#define DC_BDIV_QR_THRESHOLD 51 +#define DC_BDIV_Q_THRESHOLD 134 + +#define INV_MULMOD_BNM1_THRESHOLD 66 +#define INV_NEWTON_THRESHOLD 132 +#define INV_APPR_THRESHOLD 131 + +#define BINV_NEWTON_THRESHOLD 292 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1334 +#define MUPI_DIV_QR_THRESHOLD 62 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 3,25,114,480,1486 + +#define GET_STR_DC_THRESHOLD 8 +#define GET_STR_PRECOMPUTE_THRESHOLD 14 +#define SET_STR_DC_THRESHOLD 644 +#define SET_STR_PRECOMPUTE_THRESHOLD 1365 + +#define FAC_DSC_THRESHOLD 107 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 95 +#define HGCD_APPR_THRESHOLD 121 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 456 +#define GCDEXT_DC_THRESHOLD 386 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/powerpc32/powerpc-defs.m4 b/gmp-6.3.0/mpn/powerpc32/powerpc-defs.m4 new file mode 100644 index 0000000..6a61278 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/powerpc-defs.m4 @@ -0,0 +1,128 @@ +divert(-1) + +dnl m4 macros for PowerPC assembler (32 and 64 bit). + +dnl Copyright 2000, 2002, 2003, 2017, 2018, 2020 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl This is the same as the default in mpn/asm-defs.m4, but with ALIGN(4) +dnl not 8. +dnl +dnl 4-byte alignment is normally enough, certainly it's what gcc gives. We +dnl don't want bigger alignment within PROLOGUE since it can introduce +dnl padding into multiple-entrypoint routines, and with gas such padding is +dnl zero words, which are not valid instructions. + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) +` TEXT + ALIGN(4) + GLOBL `$1' GLOBL_ATTR + TYPE(`$1',`function') +`$1'LABEL_SUFFIX') + + +dnl Usage: r0 ... r31, cr0 ... cr7 +dnl +dnl Register names, either left as "r0" etc or mapped to plain 0 etc, +dnl according to the result of the GMP_ASM_POWERPC_REGISTERS configure +dnl test. + +ifelse(WANT_R_REGISTERS,no,` +forloop(i,0,31,`deflit(`r'i,i)') +forloop(i,0,31,`deflit(`v'i,i)') +forloop(i,0,31,`deflit(`f'i,i)') +forloop(i,0,7, `deflit(`cr'i,i)') +') + + +dnl Usage: ASSERT(cond,instructions) +dnl +dnl If WANT_ASSERT is 1, output the given instructions and expect the given +dnl flags condition to then be satisfied. For example, +dnl +dnl ASSERT(eq, `cmpwi r6, 123') +dnl +dnl The instructions can be omitted to just assert a flags condition with +dnl no extra calculation. For example, +dnl +dnl ASSERT(ne) +dnl +dnl The condition can be omitted to just output the given instructions when +dnl assertion checking is wanted.
For example, +dnl +dnl ASSERT(, `mr r11, r0') +dnl +dnl Using a zero word for an illegal instruction is probably not ideal, +dnl since it marks the beginning of a traceback table in the 64-bit ABI. +dnl But assertions are only for development, so it doesn't matter too much. + +define(ASSERT, +m4_assert_numargs_range(1,2) +m4_assert_defined(`WANT_ASSERT') +`ifelse(WANT_ASSERT,1, + `C ASSERT + $2 +ifelse(`$1',,, +` b$1 L(ASSERT_ok`'ASSERT_counter) + W32 0 C assertion failed +L(ASSERT_ok`'ASSERT_counter): +define(`ASSERT_counter',incr(ASSERT_counter)) +')')') + +define(ASSERT_counter,1) + +dnl Manually assemble some new instructions +dnl + +define(`maddld',m4_assert_numargs(4)`dnl +.long eval(0x10000033+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11)+m4_lshift($4,6))') + +define(`maddhdu',m4_assert_numargs(4)`dnl +.long eval(0x10000031+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11)+m4_lshift($4,6))') + +define(`popcntd',m4_assert_numargs(2)`dnl +.long eval(0x7c0003f4+m4_lshift($2,21)+m4_lshift($1,16))') + +define(`divdeu',m4_assert_numargs(3)`dnl +.long eval(0x7c000312+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11))') + +define(`addex',m4_assert_numargs(4)`dnl +.long eval(0x7c000154+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11)+m4_lshift($4,9))') + +define(`aese',m4_assert_numargs(3)`dnl +.long eval(0x10000508+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11))') + +define(`aeselst',m4_assert_numargs(3)`dnl +.long eval(0x10000509+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11))') + +divert diff --git a/gmp-6.3.0/mpn/powerpc32/rshift.asm b/gmp-6.3.0/mpn/powerpc32/rshift.asm new file mode 100644 index 0000000..d86cdcb --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/rshift.asm @@ -0,0 +1,166 @@ +dnl PowerPC-32 mpn_rshift -- Shift a number right. + +dnl Copyright 1995, 1998, 2000, 2002-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 3.0 +C 75x (G3): 3.0 +C 7400,7410 (G4): 3.0 +C 7445,7455 (G4+): 2.5 +C 7447,7457 (G4+): 2.25 +C power4/ppc970: 2.5 +C power5: 2.5 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C cnt r6 + +ASM_START() +PROLOGUE(mpn_rshift) + cmpwi cr0, r5, 30 C more than 30 limbs?
+ addi r7, r3, -4 C dst-4 + bgt L(BIG) C branch if more than 30 limbs + + mtctr r5 C copy size into CTR + subfic r8, r6, 32 C r8 = 32 - cnt + lwz r11, 0(r4) C load first up limb + slw r3, r11, r8 C compute function return value (first limb << (32-cnt)) + bdz L(end1) + +L(oop): lwzu r10, 4(r4) + srw r9, r11, r6 + slw r12, r10, r8 + or r9, r9, r12 + stwu r9, 4(r7) + bdz L(end2) + lwzu r11, 4(r4) + srw r9, r10, r6 + slw r12, r11, r8 + or r9, r9, r12 + stwu r9, 4(r7) + bdnz L(oop) + +L(end1): + srw r0, r11, r6 + stw r0, 4(r7) + blr +L(end2): + srw r0, r10, r6 + stw r0, 4(r7) + blr + +L(BIG): + stwu r1, -48(r1) + stmw r24, 8(r1) C save registers we are supposed to preserve + lwz r9, 0(r4) + subfic r8, r6, 32 + slw r3, r9, r8 C compute function return value + srw r0, r9, r6 + addi r5, r5, -1 + + andi. r10, r5, 3 C count for spill loop + beq L(e) + mtctr r10 + lwzu r28, 4(r4) + bdz L(xe0) + +L(loop0): + srw r12, r28, r6 + slw r24, r28, r8 + lwzu r28, 4(r4) + or r24, r0, r24 + stwu r24, 4(r7) + mr r0, r12 + bdnz L(loop0) C taken at most once!
+ +L(xe0): srw r12, r28, r6 + slw r24, r28, r8 + or r24, r0, r24 + stwu r24, 4(r7) + mr r0, r12 + +L(e): srwi r5, r5, 2 C count for unrolled loop + addi r5, r5, -1 + mtctr r5 + lwz r28, 4(r4) + lwz r29, 8(r4) + lwz r30, 12(r4) + lwzu r31, 16(r4) + +L(loopU): + srw r9, r28, r6 + slw r24, r28, r8 + lwz r28, 4(r4) + srw r10, r29, r6 + slw r25, r29, r8 + lwz r29, 8(r4) + srw r11, r30, r6 + slw r26, r30, r8 + lwz r30, 12(r4) + srw r12, r31, r6 + slw r27, r31, r8 + lwzu r31, 16(r4) + or r24, r0, r24 + stw r24, 4(r7) + or r25, r9, r25 + stw r25, 8(r7) + or r26, r10, r26 + stw r26, 12(r7) + or r27, r11, r27 + stwu r27, 16(r7) + mr r0, r12 + bdnz L(loopU) + + srw r9, r28, r6 + slw r24, r28, r8 + srw r10, r29, r6 + slw r25, r29, r8 + srw r11, r30, r6 + slw r26, r30, r8 + srw r12, r31, r6 + slw r27, r31, r8 + or r24, r0, r24 + stw r24, 4(r7) + or r25, r9, r25 + stw r25, 8(r7) + or r26, r10, r26 + stw r26, 12(r7) + or r27, r11, r27 + stw r27, 16(r7) + + stw r12, 20(r7) C store the final result limb + lmw r24, 8(r1) C restore registers + addi r1, r1, 48 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/sec_tabselect.asm b/gmp-6.3.0/mpn/powerpc32/sec_tabselect.asm new file mode 100644 index 0000000..d50718e --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/sec_tabselect.asm @@ -0,0 +1,143 @@ +dnl PowerPC-32 mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: ? +C 75x (G3): ? +C 7400,7410 (G4): 2.5 +C 744x,745x (G4+): 2.0 +C power4/ppc970: 2.0 +C power5: ? + +define(`rp', `r3') +define(`tp', `r4') +define(`n', `r5') +define(`nents', `r6') +define(`which', `r7') + +define(`i', `r8') +define(`j', `r9') +define(`stride', `r12') +define(`mask', `r11') + + +ASM_START() +PROLOGUE(mpn_sec_tabselect) + stwu r1, -32(r1) + addic. j, n, -4 C outer loop induction variable + stmw r27, 8(r1) + slwi stride, n, 2 C stride = 4*n bytes, added to tp once per table entry + + blt cr0, L(outer_end) +L(outer_top): + mtctr nents + mr r10, tp + li r28, 0 + li r29, 0 + li r30, 0 + li r31, 0 + addic. j, j, -4 C outer loop induction variable + mr i, which + + ALIGN(16) +L(top): addic i, i, -1 C set carry iff i != 0 + subfe mask, mask, mask C mask = all ones iff i was 0 before the addic, else 0 + lwz r0, 0(tp) + lwz r27, 4(tp) + and r0, r0, mask + and r27, r27, mask + or r28, r28, r0 + or r29, r29, r27 + lwz r0, 8(tp) + lwz r27, 12(tp) + and r0, r0, mask + and r27, r27, mask + or r30, r30, r0 + or r31, r31, r27 + add tp, tp, stride + bdnz L(top) + + stw r28, 0(rp) + stw r29, 4(rp) + stw r30, 8(rp) + stw r31, 12(rp) + addi tp, r10, 16 + addi rp, rp, 16 + bge cr0, L(outer_top) +L(outer_end): + + andi.
r0, n, 2 + beq cr0, L(b0x) +L(b1x): mtctr nents + mr r10, tp + li r28, 0 + li r29, 0 + mr i, which + ALIGN(16) +L(tp2): addic i, i, -1 C set carry iff i != 0 + subfe mask, mask, mask C mask = all ones iff i was 0 before the addic, else 0 + lwz r0, 0(tp) + lwz r27, 4(tp) + and r0, r0, mask + and r27, r27, mask + or r28, r28, r0 + or r29, r29, r27 + add tp, tp, stride + bdnz L(tp2) + stw r28, 0(rp) + stw r29, 4(rp) + addi tp, r10, 8 + addi rp, rp, 8 + +L(b0x): andi. r0, n, 1 + beq cr0, L(b00) +L(b01): mtctr nents + mr r10, tp + li r28, 0 + mr i, which + ALIGN(16) +L(tp1): addic i, i, -1 C set carry iff i != 0 + subfe mask, mask, mask C mask = all ones iff i was 0 before the addic, else 0 + lwz r0, 0(tp) + and r0, r0, mask + or r28, r28, r0 + add tp, tp, stride + bdnz L(tp1) + stw r28, 0(rp) + +L(b00): lmw r27, 8(r1) + addi r1, r1, 32 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/sqr_diag_addlsh1.asm b/gmp-6.3.0/mpn/powerpc32/sqr_diag_addlsh1.asm new file mode 100644 index 0000000..f7aba33 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/sqr_diag_addlsh1.asm @@ -0,0 +1,80 @@ +dnl PowerPC-32 mpn_sqr_diag_addlsh1. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details.
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e ? +C 604e ? +C 75x (G3) ? +C 7400,7410 (G4) ? +C 744x,745x (G4+) 6 +C power4/ppc970 ? +C power5 ? + +C This has been feebly optimised for 7447 but not for any other CPU. + +define(`rp', r3) +define(`tp', r4) +define(`up', r5) +define(`n', r6) + +ASM_START() +PROLOGUE(mpn_sqr_diag_addlsh1) + addi n, n, -1 + addi tp, tp, -4 + mtctr n + lwz r0, 0(up) + li r10, 0 C bit carried into the first shifted tp limb is 0 + mullw r7, r0, r0 + stw r7, 0(rp) + mulhwu r6, r0, r0 + addic r31, r31, 0 C clear CF + + ALIGN(16) +L(top): lwzu r0, 4(up) + mullw r7, r0, r0 + lwz r8, 4(tp) + lwzu r9, 8(tp) + rlwimi r10, r8, 1,0,30 C r10 = (r8 << 1) | bit carried from the previous tp limb + srwi r11, r8, 31 + rlwimi r11, r9, 1,0,30 + adde r10, r10, r6 + adde r11, r11, r7 + stw r10, 4(rp) + srwi r10, r9, 31 C top bit of r9, carried into the next iteration + mulhwu r6, r0, r0 + stwu r11, 8(rp) + bdnz L(top) + + adde r10, r10, r6 + stw r10, 4(rp) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/sublsh1_n.asm b/gmp-6.3.0/mpn/powerpc32/sublsh1_n.asm new file mode 100644 index 0000000..6dc6460 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/sublsh1_n.asm @@ -0,0 +1,101 @@ +dnl PowerPC-32 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1) + +dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 4.0 +C 75x (G3): 5.0 +C 7400,7410 (G4): 5.0 +C 744x,745x (G4+): 5.0 +C power4/ppc970: 4.25 +C power5: 5.0 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +define(`rp',`r3') +define(`up',`r4') +define(`vp',`r5') + +define(`s0',`r6') +define(`s1',`r7') +define(`u0',`r8') +define(`v0',`r10') +define(`v1',`r11') + +ASM_START() +PROLOGUE(mpn_sublsh1_n) + mtctr r6 C copy n in ctr + + lwz v0, 0(vp) C load v limb + lwz u0, 0(up) C load u limb + addic up, up, -4 C update up; set cy + addi rp, rp, -4 C update rp + slwi s1, v0, 1 + bdz L(end) C If done, skip loop + +L(loop): + lwz v1, 4(vp) C load v limb + subfe s1, s1, u0 C subtract limbs with cy, set cy + srwi s0, v0, 31 C shift down previous v limb + stw s1, 4(rp) C store result limb + lwzu u0, 8(up) C load u limb and update up + rlwimi s0, v1, 1, 0,30 C left shift v limb and merge with prev v limb + + bdz L(exit) C decrement ctr and exit if done + + lwzu v0, 8(vp) C load v limb and update vp + subfe s0, s0, u0 C subtract limbs with cy, set cy + srwi s1, v1, 31 C shift down previous v limb + stwu s0, 8(rp) C store result limb and update rp + lwz u0, 4(up) C load u limb + rlwimi s1, v0, 1, 0,30 C left shift v limb and merge with prev v limb + + bdnz L(loop) C decrement ctr and loop back + +L(end): subfe r7, s1, u0 + srwi r4, v0, 31 + stw r7, 4(rp) C store last result limb + subfze r3, r4 + neg r3, r3 C return bit shifted out of last v limb plus borrow + blr +L(exit): + subfe r7, s0, u0 + srwi r4, v1, 31 + stw r7, 8(rp) C store last result limb + subfze
r3, r4 + neg r3, r3 C return bit shifted out of last v limb plus borrow + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/submul_1.asm b/gmp-6.3.0/mpn/powerpc32/submul_1.asm new file mode 100644 index 0000000..8ef37b0 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/submul_1.asm @@ -0,0 +1,151 @@ +dnl PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright 1995, 1997, 1998, 2000, 2002, 2005 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 7.5 +C 75x (G3): 9.3-15 +C 7400,7410 (G4): 9.3-15 +C 744x,745x (G4+): 10.5 +C power4/ppc970: 6.75 +C power5: 6.5 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C vl r6 + +C This is optimized for the PPC604. See addmul_1.asm for additional comments. + +ASM_START() +PROLOGUE(mpn_submul_1) + cmpwi cr0,r5,9 C more than 9 limbs?
+ bgt cr0,L(big) C branch if more than 9 limbs + + mtctr r5 + lwz r0,0(r4) + mullw r7,r0,r6 + mulhwu r10,r0,r6 + lwz r9,0(r3) + subfc r8,r7,r9 + addc r7,r7,r8 C invert cy (r7 is junk) + addi r3,r3,-4 + bdz L(end) +L(loop): + lwzu r0,4(r4) + stwu r8,4(r3) + mullw r8,r0,r6 + adde r7,r8,r10 + mulhwu r10,r0,r6 + lwz r9,4(r3) + addze r10,r10 + subfc r8,r7,r9 + addc r7,r7,r8 C invert cy (r7 is junk) + bdnz L(loop) +L(end): stw r8,4(r3) + addze r3,r10 C return cy_limb plus pending carry + blr + +L(big): stwu r1,-16(r1) + addi r5,r5,-1 + stw r30,8(r1) + srwi r0,r5,2 + stw r31,12(r1) + mtctr r0 + + lwz r7,0(r4) + mullw r8,r7,r6 + mulhwu r0,r7,r6 + lwz r7,0(r3) + subfc r7,r8,r7 + addc r8,r8,r7 + stw r7,0(r3) + +L(loopU): + lwz r7,4(r4) + lwz r12,8(r4) + lwz r30,12(r4) + lwzu r31,16(r4) + mullw r8,r7,r6 + mullw r9,r12,r6 + mullw r10,r30,r6 + mullw r11,r31,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + adde r9,r9,r0 + mulhwu r0,r12,r6 + lwz r12,8(r3) + adde r10,r10,r0 + mulhwu r0,r30,r6 + lwz r30,12(r3) + adde r11,r11,r0 + mulhwu r0,r31,r6 + lwz r31,16(r3) + addze r0,r0 C new cy_limb + subfc r7,r8,r7 + stw r7,4(r3) + subfe r12,r9,r12 + stw r12,8(r3) + subfe r30,r10,r30 + stw r30,12(r3) + subfe r31,r11,r31 + stwu r31,16(r3) + subfe r11,r11,r11 C invert ... + addic r11,r11,1 C ... carry + bdnz L(loopU) + + andi. r31,r5,3 + mtctr r31 + beq cr0,L(endx) + +L(loopE): + lwzu r7,4(r4) + mullw r8,r7,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + addze r0,r0 C new cy_limb + subfc r7,r8,r7 + addc r8,r8,r7 + stwu r7,4(r3) + bdnz L(loopE) +L(endx): + addze r3,r0 C return cy_limb plus pending carry + lwz r30,8(r1) + lwz r31,12(r1) + addi r1,r1,16 + blr +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/powerpc32/umul.asm b/gmp-6.3.0/mpn/powerpc32/umul.asm new file mode 100644 index 0000000..a5811e1 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/umul.asm @@ -0,0 +1,50 @@ +dnl PowerPC-32 umul_ppmm -- support for longlong.h + +dnl Copyright 2000, 2001 Free Software Foundation, Inc.
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2); +C Stores the low word of m1*m2 at lowptr and returns the high word in r3. + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + + C r3 lowptr + C r4 m1 + C r5 m2 + + mullw r0, r4, r5 + mulhwu r9, r4, r5 + stw r0, 0(r3) + mr r3, r9 + blr + +EPILOGUE(mpn_umul_ppmm) diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm b/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm new file mode 100644 index 0000000..dee7266 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm @@ -0,0 +1,203 @@ +dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C 16-byte coaligned unaligned +C cycles/limb cycles/limb +C 7400,7410 (G4): 0.5 0.64 +C 744x,745x (G4+): 0.75 0.82 +C 970 (G5): 0.78 1.02 (64-bit limbs) + +C STATUS +C * Works for all sizes and alignments. + +C TODO +C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling +C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80 +C c/l for 970. +C * Consider using VMX instructions also for head and tail, by using some +C read-modify-write tricks. +C * The VMX code is used from the smallest sizes it handles, but measurements +C show a large speed bump at the cutoff points. Small copying (perhaps +C using some read-modify-write technique) should be optimized. +C * Make an mpn_com based on this code. + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) + + +ifelse(GMP_LIMB_BITS,32,` + define(`LIMB32',` $1') + define(`LIMB64',`') +',` + define(`LIMB32',`') + define(`LIMB64',` $1') +') + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +define(`us', `v4') + + +ASM_START() +PROLOGUE(mpn_copyd) + +LIMB32(`slwi. r0, n, 2 ') C r0 = size in bytes +LIMB64(`sldi.
r0, n, 3 ') + add rp, rp, r0 + add up, up, r0 + +LIMB32(`cmpi cr7, n, 11 ') +LIMB64(`cmpdi cr7, n, 5 ') + bge cr7, L(big) + + beqlr cr0 + +C Handle small cases with plain operations + mtctr n +L(topS): +LIMB32(`lwz r0, -4(up) ') +LIMB64(`ld r0, -8(up) ') + addi up, up, -GMP_LIMB_BYTES +LIMB32(`stw r0, -4(rp) ') +LIMB64(`std r0, -8(rp) ') + addi rp, rp, -GMP_LIMB_BYTES + bdnz L(topS) + blr + +C Handle large cases with VMX operations +L(big): + addi rp, rp, -16 + addi up, up, -16 + mfspr r12, 256 C save VRSAVE + oris r0, r12, 0xf800 C Set VRSAVE bit 0-4 + mtspr 256, r0 + +LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4 +LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2 + beq L(rp_aligned) + + subf n, r7, n +L(top0): +LIMB32(`lwz r0, 12(up) ') +LIMB64(`ld r0, 8(up) ') + addi up, up, -GMP_LIMB_BYTES +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stw r0, 12(rp) ') +LIMB64(`std r0, 8(rp) ') + addi rp, rp, -GMP_LIMB_BYTES +LIMB32(`bne L(top0) ') + +L(rp_aligned): + +LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4 +LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2 + +LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n +LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n + mtctr r7 C copy loop count to count register + + li r10, -16 + + beq L(up_aligned) + + lvsl us, 0, up + + addi up, up, 16 +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. r0, n, 0x2 ') + beq L(1) + lvx v0, 0, up + lvx v2, r10, up + vperm v3, v2, v0, us + stvx v3, 0, rp + addi up, up, -32 + addi rp, rp, -16 + b L(lpu) +L(1): lvx v2, 0, up + addi up, up, -16 + b L(lpu) + + ALIGN(32) +L(lpu): lvx v0, 0, up + vperm v3, v0, v2, us + stvx v3, 0, rp + lvx v2, r10, up + addi up, up, -32 + vperm v3, v2, v0, us + stvx v3, r10, rp + addi rp, rp, -32 + bdnz L(lpu) + + b L(tail) + +L(up_aligned): + +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi.
r0, n, 0x2 ') + beq L(lpa) + lvx v0, 0, up + stvx v0, 0, rp + addi up, up, -16 + addi rp, rp, -16 + b L(lpa) + + ALIGN(32) +L(lpa): lvx v0, 0, up + lvx v1, r10, up + addi up, up, -32 + nop + stvx v0, 0, rp + stvx v1, r10, rp + addi rp, rp, -32 + bdnz L(lpa) + +L(tail): +LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4 +LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2 + beq L(ret) +LIMB32(`li r10, 12 ') +L(top2): +LIMB32(`lwzx r0, r10, up ') +LIMB64(`ld r0, 8(up) ') +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 8(rp) ') +LIMB32(`addi r10, r10, -GMP_LIMB_BYTES') +LIMB32(`bne L(top2) ') + +L(ret): mtspr 256, r12 C restore VRSAVE + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm b/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm new file mode 100644 index 0000000..992b468 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm @@ -0,0 +1,198 @@ +dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library.
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C 16-byte coaligned unaligned +C cycles/limb cycles/limb +C 7400,7410 (G4): 0.5 0.64 +C 744x,745x (G4+): 0.75 0.82 +C 970 (G5): 0.78 1.02 (64-bit limbs) + +C STATUS +C * Works for all sizes and alignments. + +C TODO +C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling +C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80 +C c/l for 970. +C * Consider using VMX instructions also for head and tail, by using some +C read-modify-write tricks. +C * The VMX code is used from the smallest sizes it handles, but measurements +C show a large speed bump at the cutoff points. Small copying (perhaps +C using some read-modify-write technique) should be optimized. +C * Make an mpn_com based on this code. + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) + + +ifelse(GMP_LIMB_BITS,32,` + define(`LIMB32',` $1') + define(`LIMB64',`') +',` + define(`LIMB32',`') + define(`LIMB64',` $1') +') + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +define(`us', `v4') + + +ASM_START() +PROLOGUE(mpn_copyi) + +LIMB32(`cmpi cr7, n, 11 ') +LIMB64(`cmpdi cr7, n, 5 ') + bge cr7, L(big) + + or. r0, n, n + beqlr cr0 + +C Handle small cases with plain operations + mtctr n +L(topS): +LIMB32(`lwz r0, 0(up) ') +LIMB64(`ld r0, 0(up) ') + addi up, up, GMP_LIMB_BYTES +LIMB32(`stw r0, 0(rp) ') +LIMB64(`std r0, 0(rp) ') + addi rp, rp, GMP_LIMB_BYTES + bdnz L(topS) + blr + +C Handle large cases with VMX operations +L(big): + mfspr r12, 256 C save VRSAVE + oris r0, r12, 0xf800 C Set VRSAVE bit 0-4 + mtspr 256, r0 + +LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4 +LIMB64(`rlwinm.
r7, rp, 29,31,31') C (rp >> 3) mod 2 + beq L(rp_aligned) + + subfic r7, r7, LIMBS_PER_VR + subf n, r7, n +L(top0): +LIMB32(`lwz r0, 0(up) ') +LIMB64(`ld r0, 0(up) ') + addi up, up, GMP_LIMB_BYTES +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stw r0, 0(rp) ') +LIMB64(`std r0, 0(rp) ') + addi rp, rp, GMP_LIMB_BYTES +LIMB32(`bne L(top0) ') + +L(rp_aligned): + +LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4 +LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2 + +LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n +LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n + mtctr r7 C copy loop count to count register + + li r10, 16 + + beq L(up_aligned) + + lvsl us, 0, up + +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. r0, n, 0x2 ') + beq L(1) + lvx v0, 0, up + lvx v2, r10, up + vperm v3, v0, v2, us + stvx v3, 0, rp + addi up, up, 32 + addi rp, rp, 16 + b L(lpu) +L(1): lvx v2, 0, up + addi up, up, 16 + b L(lpu) + + ALIGN(32) +L(lpu): lvx v0, 0, up + vperm v3, v2, v0, us + stvx v3, 0, rp + lvx v2, r10, up + addi up, up, 32 + vperm v3, v0, v2, us + stvx v3, r10, rp + addi rp, rp, 32 + bdnz L(lpu) + + addi up, up, -16 + b L(tail) + +L(up_aligned): + +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. r0, n, 0x2 ') + beq L(lpa) + lvx v0, 0, up + stvx v0, 0, rp + addi up, up, 16 + addi rp, rp, 16 + b L(lpa) + + ALIGN(32) +L(lpa): lvx v0, 0, up + lvx v1, r10, up + addi up, up, 32 + nop + stvx v0, 0, rp + stvx v1, r10, rp + addi rp, rp, 32 + bdnz L(lpa) + +L(tail): +LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4 +LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2 + beq L(ret) +LIMB32(`li r10, 0 ') +L(top2): +LIMB32(`lwzx r0, r10, up ') +LIMB64(`ld r0, 0(up) ') +LIMB32(`addic.
r7, r7, -1 ') +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 0(rp) ') +LIMB32(`addi r10, r10, GMP_LIMB_BYTES') +LIMB32(`bne L(top2) ') + +L(ret): mtspr 256, r12 C restore VRSAVE + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm b/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm new file mode 100644 index 0000000..d656d3b --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm @@ -0,0 +1,310 @@ +dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n, +dnl mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise +dnl logical operations. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C and,ior,andn,nior,xor iorn,xnor nand +C cycles/limb cycles/limb cycles/limb +C 7400,7410 (G4): 1.39 ? ? +C 744x,745x (G4+): 1.14 1.39 1.39 +C 970: 1.7 2.0 2.0 + +C STATUS +C * Works for all sizes and alignment for 32-bit limbs. +C * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
+C * Current performance makes this pointless for 970 + +C TODO +C * Might want to make variants when just one of the source operands needs +C vperm, and when neither needs it. The latter runs 50% faster on 7400. +C * Idea: If the source operands are equally aligned, we could do the logops +C first, then vperm before storing! That means we never need more than one +C vperm, ever! +C * Perhaps align `rp' after initial alignment loop? +C * Instead of having scalar code in the beginning and end, consider using +C read-modify-write vector code. +C * Software pipeline? Hopefully not too important, this is hairy enough +C already. +C * At least be more clever about operand loading, i.e., load v operands before +C u operands, since v operands are sometimes negated. + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) + +define(`vnegb', `') C default neg-before to null +define(`vnega', `') C default neg-after to null + +ifdef(`OPERATION_and_n', +` define(`func', `mpn_and_n') + define(`logopS',`and $1,$2,$3') + define(`logop', `vand $1,$2,$3')') +ifdef(`OPERATION_andn_n', +` define(`func', `mpn_andn_n') + define(`logopS',`andc $1,$2,$3') + define(`logop', `vandc $1,$2,$3')') +ifdef(`OPERATION_nand_n', +` define(`func', `mpn_nand_n') + define(`logopS',`nand $1,$2,$3') + define(`logop', `vand $1,$2,$3') + define(`vnega', `vnor $1,$2,$2')') +ifdef(`OPERATION_ior_n', +` define(`func', `mpn_ior_n') + define(`logopS',`or $1,$2,$3') + define(`logop', `vor $1,$2,$3')') +ifdef(`OPERATION_iorn_n', +` define(`func', `mpn_iorn_n') + define(`logopS',`orc $1,$2,$3') + define(`vnegb', `vnor $1,$2,$2') + define(`logop', `vor $1,$2,$3')') +ifdef(`OPERATION_nior_n', +` define(`func', `mpn_nior_n') + define(`logopS',`nor $1,$2,$3') + define(`logop', `vnor $1,$2,$3')') +ifdef(`OPERATION_xor_n', +` define(`func', `mpn_xor_n') + define(`logopS',`xor $1,$2,$3') + define(`logop', `vxor $1,$2,$3')')
+ifdef(`OPERATION_xnor_n', +` define(`func',`mpn_xnor_n') + define(`logopS',`eqv $1,$2,$3') + define(`vnegb', `vnor $1,$2,$2') + define(`logop', `vxor $1,$2,$3')') + +ifelse(GMP_LIMB_BITS,`32',` + define(`LIMB32',` $1') + define(`LIMB64',`') +',` + define(`LIMB32',`') + define(`LIMB64',` $1') +') + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +define(`us', `v8') +define(`vs', `v9') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + +LIMB32(`cmpwi cr0, n, 8 ') +LIMB64(`cmpdi cr0, n, 4 ') + bge L(big) + + mtctr n + +LIMB32(`lwz r8, 0(up) ') +LIMB32(`lwz r9, 0(vp) ') +LIMB32(`logopS( r0, r8, r9) ') +LIMB32(`stw r0, 0(rp) ') +LIMB32(`bdz L(endS) ') + +L(topS): +LIMB32(`lwzu r8, 4(up) ') +LIMB64(`ld r8, 0(up) ') +LIMB64(`addi up, up, GMP_LIMB_BYTES ') +LIMB32(`lwzu r9, 4(vp) ') +LIMB64(`ld r9, 0(vp) ') +LIMB64(`addi vp, vp, GMP_LIMB_BYTES ') + logopS( r0, r8, r9) +LIMB32(`stwu r0, 4(rp) ') +LIMB64(`std r0, 0(rp) ') +LIMB64(`addi rp, rp, GMP_LIMB_BYTES ') + bdnz L(topS) +L(endS): + blr + +L(big): mfspr r12, 256 C save VRSAVE + oris r0, r12, 0xfffc C Set VRSAVE bit 0-13 FIXME + mtspr 256, r0 + +C First loop until the destination is 16-byte aligned. This will execute 0 or 1 +C times for 64-bit machines, and 0 to 3 times for 32-bit machines. + +LIMB32(`rlwinm. r0, rp, 30,30,31') C (rp >> 2) mod 4 +LIMB64(`rlwinm. r0, rp, 29,31,31') C (rp >> 3) mod 2 + beq L(aligned) + + subfic r7, r0, LIMBS_PER_VR +LIMB32(`li r10, 0 ') + subf n, r7, n +L(top0): +LIMB32(`lwz r8, 0(up) ') +LIMB64(`ld r8, 0(up) ') + addi up, up, GMP_LIMB_BYTES +LIMB32(`lwz r9, 0(vp) ') +LIMB64(`ld r9, 0(vp) ') + addi vp, vp, GMP_LIMB_BYTES +LIMB32(`addic.
r7, r7, -1 ') + logopS( r0, r8, r9) +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 0(rp) ') +LIMB32(`addi r10, r10, GMP_LIMB_BYTES') +LIMB32(`bne L(top0) ') + + addi rp, rp, 16 C update rp, but preserve its alignment + +L(aligned): +LIMB64(`srdi r7, n, 1 ') C loop count corresponding to n +LIMB32(`srwi r7, n, 2 ') C loop count corresponding to n + mtctr r7 C copy loop count to count register + + li r10, 16 + lvsl us, 0, up + lvsl vs, 0, vp + + lvx v2, 0, up + lvx v3, 0, vp + bdnz L(gt1) + lvx v0, r10, up + lvx v1, r10, vp + vperm v4, v2, v0, us + vperm v5, v3, v1, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, 0, rp + addi up, up, 16 + addi vp, vp, 16 + addi rp, rp, 4 + b L(tail) + +L(gt1): addi up, up, 16 + addi vp, vp, 16 + +L(top): lvx v0, 0, up + lvx v1, 0, vp + vperm v4, v2, v0, us + vperm v5, v3, v1, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, 0, rp + bdz L(end) + lvx v2, r10, up + lvx v3, r10, vp + vperm v4, v0, v2, us + vperm v5, v1, v3, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, r10, rp + addi up, up, 32 + addi vp, vp, 32 + addi rp, rp, 32 + bdnz L(top) + + andi. r0, up, 15 + vxor v0, v0, v0 + beq 1f + lvx v0, 0, up +1: andi. r0, vp, 15 + vxor v1, v1, v1 + beq 1f + lvx v1, 0, vp +1: vperm v4, v2, v0, us + vperm v5, v3, v1, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, 0, rp + addi rp, rp, 4 + b L(tail) + +L(end): andi. r0, up, 15 + vxor v2, v2, v2 + beq 1f + lvx v2, r10, up +1: andi. r0, vp, 15 + vxor v3, v3, v3 + beq 1f + lvx v3, r10, vp +1: vperm v4, v0, v2, us + vperm v5, v1, v3, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, r10, rp + + addi up, up, 16 + addi vp, vp, 16 + addi rp, rp, 20 + +L(tail): +LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4 +LIMB64(`rlwinm.
r7, n, 0,31,31 ') C r7 = n mod 2 + beq L(ret) + addi rp, rp, 15 +LIMB32(`rlwinm rp, rp, 0,0,27 ') +LIMB64(`rldicr rp, rp, 0,59 ') + li r10, 0 +L(top2): +LIMB32(`lwzx r8, r10, up ') +LIMB64(`ldx r8, r10, up ') +LIMB32(`lwzx r9, r10, vp ') +LIMB64(`ldx r9, r10, vp ') +LIMB32(`addic. r7, r7, -1 ') + logopS( r0, r8, r9) +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 0(rp) ') +LIMB32(`addi r10, r10, GMP_LIMB_BYTES') +LIMB32(`bne L(top2) ') + +L(ret): mtspr 256, r12 C restore VRSAVE + blr +EPILOGUE() + +C This works for 64-bit PowerPC, since a limb ptr can only be aligned +C in 2 relevant ways, which means we can always find a pair of aligned +C pointers of rp, up, and vp. +C process words until rp is 16-byte aligned +C if (((up | vp) & 15) == 0) +C process with VMX without any vperm +C else if ((up & 15) != 0 && (vp & 15) != 0) +C process with VMX using vperm on store data +C else if ((up & 15) != 0) +C process with VMX using vperm on up data +C else +C process with VMX using vperm on vp data +C +C rlwinm r0, up, 0,28,31 +C rlwinm r0, vp, 0,28,31 +C cmpwi cr7, r0, 0 +C cror cr6, cr0, cr7 +C crand cr0, cr0, cr7 diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm b/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm new file mode 100644 index 0000000..2bb11cd --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm @@ -0,0 +1,388 @@ +dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1. + +dnl Copyright 2002, 2003, 2005-2007, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C cycles/limb
+C 603e: -
+C 604e: -
+C 75x (G3): -
+C 7400,7410 (G4): 1 simple load-use scheduling results in 0.75
+C 744x,745x (G4+): 0.75
+C ppc970: 0.75
+C power4: -
+C power5: -
+
+C TODO
+C * Either start using the low-end masking constants, or remove them.
+C * Merge multiple feed-in cases into a parameterized code block.
+C * Reduce register usage. It should be possible to almost halve it.
+
+define(`up', `r3')
+define(`n', `r4')
+C Vector roles: a0-a2 running sums, c0-c2 their carry counts, z zero, pv permute control, x0-x3 and y0-y3 extracted 24-bit fields.
+define(`a0', `v3')
+define(`a1', `v4')
+define(`a2', `v5')
+define(`c0', `v6')
+define(`c1', `v7')
+define(`c2', `v8')
+define(`z', `v9')
+define(`x0', `v10')
+define(`x1', `v11')
+define(`x2', `v12')
+define(`x3', `v13')
+define(`pv', `v14')
+define(`y0', `v0')
+define(`y1', `v1')
+define(`y2', `v2')
+define(`y3', `v15')
+C Entry: up = limb pointer, n = limb count; the result is returned in r3. n < 20 takes the scalar path, n >= 20 the VMX path at L(large).
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+ cmpwi cr0, n, 20 C tuned cutoff point
+ bge L(large)
+
+ li r9, 0 C result accumulator
+ mulli r10, n, 0xb C 0xb = ceil(32/3)
+ srwi. r10, r10, 5 C r10 = floor(n/3), n < 32
+ beq L(small_tail) C fewer than 3 limbs: no full group
+ mtctr r10 C one iteration per 3-limb group
+ lwz r6, 0(up)
+ lwz r7, 4(up)
+ lwzu r8, 8(up) C up now points at the last limb of the group
+ subf n, r10, n
+ subf n, r10, n
+ subf n, r10, n C n -= 3*floor(n/3), the limbs left for L(small_tail)
+ bdz L(small_end)
+
+ ALIGN(16)
+L(los): rlwinm r0, r6, 0,8,31
+ add r9, r9, r0 C add 24b from u0
+ srwi r0, r6, 24
+ lwz r6, 4(up)
+ rlwimi r0, r7, 8, 0x00ffff00 C --111100
+ add r9, r9, r0 C add 8b from u0 and 16b from u1
+ srwi r0, r7, 16
+ lwz r7, 8(up)
+ rlwimi r0, r8, 16, 0x00ff0000 C --221111
+ add r9, r9, r0 C add 16b from u1 and 8b from u2
+ srwi r0, r8, 8 C --222222
+ lwzu r8, 12(up)
+ add r9, r9, r0 C add 24b from u2
+ bdnz L(los)
+L(small_end):
+ rlwinm r0, r6, 0,8,31
+ add r9, r9, r0 C add 24b from u0
+ srwi r0, r6, 24
+ rlwimi r0, r7, 8, 0x00ffff00 C --111100
+ add r9, r9, r0 C add 8b from u0 and 16b from u1
+ srwi r0, r7, 16
+ rlwimi r0, r8, 16, 0x00ff0000 C --221111
+ add r9, r9, r0 C add 16b from u1 and 8b from u2
+ srwi r0, r8, 8 C --222222
+ add r9, r9, r0 C add 24b from u2
+
+ addi up, up, 4 C up -> first limb after the last group
+ rlwinm r0, r9, 0,8,31
+ srwi r9, r9, 24
+ add r9, r9, r0 C fold the top 8 bits of the sum into the low 24
+
+L(small_tail):
+ cmpi cr0, n, 1
+ blt L(ret) C no limbs left
+
+ lwz r6, 0(up)
+ rlwinm r0, r6, 0,8,31
+ srwi r6, r6, 24
+ add r9, r9, r0
+ add r9, r9, r6
+
+ beq L(ret) C cr0 still holds the n == 1 compare
+
+ lwz r6, 4(up)
+ rlwinm r0, r6, 8,8,23 C low 16 bits of the limb, moved up by 8
+ srwi r6, r6, 16 C high 16 bits of the limb
+ add r9, r9, r0
+ add r9, r9, r6
+
+L(ret): mr r3, r9
+ blr
+
+
+L(large):
+ stwu r1, -32(r1) C 32-byte frame, used below to spill the result vector
+ mfspr r10, 256 C save caller VRSAVE in r10; r10 must stay untouched until the final mtspr
+ oris r0, r10, 0xffff C Set VRSAVE bit 0-15
+ mtspr 256, r0
+
+ andi. r7, up, 15 C r7 = byte offset of up within its 16-byte block
+ vxor a0, v0, v0
+ lis r9, 0xaaaa
+ vxor a1, v0, v0
+ ori r9, r9, 0xaaab C r9 = 0xaaaaaaab, multiplier for the n/12 computations
+ vxor a2, v0, v0
+ li r5, 16
+ vxor c0, v0, v0
+ li r6, 32
+ vxor c1, v0, v0
+ LEAL( r11, cnsts) C CAUTION clobbers r0 for elf, darwin
+ vxor c2, v0, v0
+ vxor z, v0, v0
+
+ beq L(aligned16)
+
+ cmpwi cr7, r7, 8
+ bge cr7, L(na4)
+C Offset 4: the first block holds three limbs; clear word 0 and feed it in as a2.
+ lvx a2, 0, up
+ addi up, up, 16
+ vsldoi a2, a2, z, 4
+ vsldoi a2, z, a2, 12
+
+ addi n, n, 9 C count the 9 missing leading words of the first 12-word group
+ mulhwu r0, n, r9
+ srwi r0, r0, 3 C r0 = floor(n/12)
+ mtctr r0
+
+ mulli r8, r0, 12
+ subf n, r8, n
+ b L(2)
+
+L(na4): bne cr7, L(na8)
+C Offset 8: the first block holds two limbs; clear words 0-1 and feed it in as a1.
+ lvx a1, 0, up
+ addi up, up, -16
+ vsldoi a1, a1, z, 8
+ vsldoi a1, z, a1, 8
+
+ addi n, n, 6 C count the 6 missing leading words
+ mulhwu r0, n, r9
+ srwi r0, r0, 3 C r0 = floor(n/12)
+ mtctr r0
+
+ mulli r8, r0, 12
+ subf n, r8, n
+ b L(1)
+
+L(na8):
+ lvx a0, 0, up C offset 12: one limb in the first block; words 0-2 are cleared below
+ vsldoi a0, a0, z, 12
+ vsldoi a0, z, a0, 4
+
+ addi n, n, 3 C count the 3 missing leading words
+ mulhwu r0, n, r9
+ srwi r0, r0, 3 C r0 = floor(n/12)
+ mtctr r0
+
+ mulli r8, r0, 12
+ subf n, r8, n
+ b L(0)
+
+L(aligned16):
+ mulhwu r0, n, r9
+ srwi r0, r0, 3 C r0 = floor(n/12)
+ mtctr r0
+
+ mulli r8, r0, 12
+ subf n, r8, n
+
+ lvx a0, 0, up
+L(0): lvx a1, r5, up
+L(1): lvx a2, r6, up
+ addi up, up, 48
+L(2): bdz L(end) C the feed-in above was the first 12-word group
+ li r12, 256 C dcbt offsets
+ li r9, 288
+ ALIGN(32)
+L(top):
+ lvx v0, 0, up
+ vaddcuw v10, a0, v0
+ vadduwm a0, a0, v0
+ vadduwm c0, c0, v10
+
+ lvx v1, r5, up
+ vaddcuw v10, a1, v1
+ vadduwm a1, a1, v1
+ vadduwm c1, c1, v10
+
+ lvx v2, r6, up
+ dcbt up, r12
+ dcbt up, r9
+ addi up, up, 48
+ vaddcuw v10, a2, v2
+ vadduwm a2, a2, v2
+ vadduwm c2, c2, v10
+ bdnz L(top)
+
+L(end):
+C n = 0...11
+ cmpwi cr0, n, 0
+ beq L(sum)
+ cmpwi cr0, n, 4
+ ble L(tail.1..4)
+ cmpwi cr0, n, 8
+ ble L(tail.5..8)
+
+L(tail.9..11):
+ lvx v0, 0, up
+ vaddcuw v10, a0, v0
+ vadduwm a0, a0, v0
+ vadduwm c0, c0, v10
+
+ lvx v1, r5, up
+ vaddcuw v10, a1, v1
+ vadduwm a1, a1, v1
+ vadduwm c1, c1, v10
+
+ lvx v2, r6, up
+
+ addi r8, r11, 96 C r8 -> high-end masks in cnsts
+ rlwinm r3, n ,4,26,27 C r3 = 16*(n mod 4); up is no longer needed
+ lvx v11, r3, r8
+ vand v2, v2, v11 C drop the words beyond the operand
+
+ vaddcuw v10, a2, v2
+ vadduwm a2, a2, v2
+ vadduwm c2, c2, v10
+ b L(sum)
+
+L(tail.5..8):
+ lvx v0, 0, up
+ vaddcuw v10, a0, v0
+ vadduwm a0, a0, v0
+ vadduwm c0, c0, v10
+
+ lvx v1, r5, up
+
+ addi r8, r11, 96 C r8 -> high-end masks in cnsts
+ rlwinm r3, n ,4,26,27 C r3 = 16*(n mod 4); up is no longer needed
+ lvx v11, r3, r8
+ vand v1, v1, v11 C drop the words beyond the operand
+
+ vaddcuw v10, a1, v1
+ vadduwm a1, a1, v1
+ vadduwm c1, c1, v10
+ b L(sum)
+
+L(tail.1..4):
+ lvx v0, 0, up
+
+ addi r8, r11, 96 C r8 -> high-end masks in cnsts
+ rlwinm r3, n ,4,26,27 C r3 = 16*(n mod 4); up is no longer needed
+ lvx v11, r3, r8
+ vand v0, v0, v11 C drop the words beyond the operand
+
+ vaddcuw v10, a0, v0
+ vadduwm a0, a0, v0
+ vadduwm c0, c0, v10
+
+L(sum): lvx pv, 0, r11
+ vperm x0, a0, z, pv C extract 4 24-bit field from a0
+ vperm y0, c2, z, pv C same extraction applied to carry vector c2
+ lvx pv, r5, r11
+ vperm x1, a1, z, pv C extract 4 24-bit field from a1
+ vperm y1, c0, z, pv C same extraction applied to carry vector c0
+ lvx pv, r6, r11
+ vperm x2, a2, z, pv C extract 4 24-bit field from a2
+ vperm y2, c1, z, pv C same extraction applied to carry vector c1
+ li r7, 48 C offsets go in r7 (dead since the alignment dispatch), not r10 which holds the saved VRSAVE
+ lvx pv, r7, r11
+ vperm x3, a0, z, pv C extract remaining/partial a0 fields
+ vperm y3, c2, z, pv C same for carry vector c2
+ li r7, 64
+ lvx pv, r7, r11
+ vperm x3, a1, x3, pv C insert remaining/partial a1 fields
+ vperm y3, c0, y3, pv C same for carry vector c0
+ li r7, 80
+ lvx pv, r7, r11
+ vperm x3, a2, x3, pv C insert remaining/partial a2 fields
+ vperm y3, c1, y3, pv C same for carry vector c1
+
+C We now have 4 128-bit accumulators to sum
+ vadduwm x0, x0, x1
+ vadduwm x2, x2, x3
+ vadduwm x0, x0, x2
+
+ vadduwm y0, y0, y1
+ vadduwm y2, y2, y3
+ vadduwm y0, y0, y2
+
+ vadduwm x0, x0, y0
+
+C Reduce 32-bit fields
+ vsumsws x0, x0, z
+
+ li r7, 16
+ stvx x0, r7, r1 C spill the vector to the frame
+ lwz r3, 28(r1) C return its last word
+
+ mtspr 256, r10 C restore caller VRSAVE
+ addi r1, r1, 32
+ blr
+EPILOGUE()
+
+C load | v0 | v1 | v2 |
+C acc | a0 | a1 | a2 |
+C carry | c0 | c1 | c2 |
+C | 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 128
+C |---|---|---|---|---|---|---|---|---|---|---|---| 32
+C | | | | | | | | | | | | | | | | | 24
+C | | | | | | | | | 48
+
+C $---------------$---------------$---------------$---------------$
+C | . . . . . . . . . . . . . . . |
+C |_______________________________________________________________|
+C | | | | | | |
+C <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->
+
+
+DEF_OBJECT(cnsts,16)
+C Permutation vectors in the order they are used above
+C # 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0
+ .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1
+ .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2
+ .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0
+ .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1
+ .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2
+C Masks for high end of number (at cnsts+96, indexed by 16*(n mod 4) in the tail code)
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+C .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+C .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(cnsts)
diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm b/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm
new file mode 100644
index 0000000..943c92d
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm
@@ -0,0 +1,34 @@
+dnl PowerPC-32/VMX mpn_popcount.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+dnl No code of its own: this file declares mpn_popcount and pulls in powerpc64/vmx/popcount.asm via include_mpn.
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`powerpc64/vmx/popcount.asm')
-- 
cgit v1.2.3