author     Duncan Wilkie <antigravityd@gmail.com>  2023-11-18 06:11:09 -0600
committer  Duncan Wilkie <antigravityd@gmail.com>  2023-11-18 06:11:09 -0600
commit     11da511c784eca003deb90c23570f0873954e0de (patch)
tree       e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/x86_64/coreibwl
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/coreibwl')
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm        210
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h        246
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm           195
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm    368
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm  395
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm    710
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm    839
7 files changed, 2963 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm new file mode 100644 index 0000000..8d3a44a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm @@ -0,0 +1,210 @@ +dnl AMD64 mpn_addmul_1 optimised for Intel Broadwell. + +dnl Copyright 2015, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 n/a +C AMD bd3 n/a +C AMD bd4 ? +C AMD zen1 ? +C AMD zen2 ? +C AMD zen3 1.5 +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL 1.67 1.74 +C Intel SKL 1.63 1.71 +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Put an initial mulx before switching, targeting some free registers. +C * Tune feed-in code. +C * Trim nop execution after L(f2). +C * For DOS64, fix nop execution. 
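Editor's note: the dispatch in this file indexes an eight-entry jump table by n mod 8, so each L(fN) feed-in block enters the 8-way-unrolled mulx/adcx/adox loop at the right residue. For orientation, here is a minimal C model of what mpn_addmul_1 computes. The names limb, dlimb, and addmul_1 are illustration devices, not GMP's API (the real type and entry point are mp_limb_t and mpn_addmul_1), and the 128-bit type stands in for the low/high halves that mulx produces; a 64-bit limb (LP64) is assumed.

/* Minimal C model of mpn_addmul_1: rp[0..n-1] += up[0..n-1] * v0,
   returning the carry limb off the top.  Hypothetical names; assumes
   a 64-bit limb and a compiler with unsigned __int128 (GCC/Clang). */
typedef unsigned long limb;            /* one 64-bit limb */
typedef unsigned __int128 dlimb;       /* double limb, models mulx's two outputs */

limb addmul_1(limb *rp, const limb *up, long n, limb v0)
{
    limb cy = 0;
    for (long i = 0; i < n; i++) {
        dlimb t = (dlimb) up[i] * v0 + rp[i] + cy;  /* cannot overflow 128 bits */
        rp[i] = (limb) t;                           /* low half, like mulx's low */
        cy = (limb) (t >> 64);                      /* high half carries onward */
    }
    return cy;
}

The assembly replaces this serial cy chain with two independent carry chains (CF via adcx, OF via adox), which is what lets it approach the roughly 1.6-1.7 cycles/limb tabulated above.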
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl IFDOS(` define(`up', ``%rsi'') ') dnl +dnl IFDOS(` define(`rp', ``%rcx'') ') dnl +dnl IFDOS(` define(`vl', ``%r9'') ') dnl +dnl IFDOS(` define(`r9', ``rdi'') ') dnl +dnl IFDOS(` define(`n', ``%r8'') ') dnl +dnl IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_1) + FUNC_ENTRY(4) + + mov v0_param, %r10 + mov n_param, n + mov R32(n_param), R32(%r8) + shr $3, n + and $7, R32(%r8) C clear OF, CF as side-effect + mov %r10, %rdx + lea L(tab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%r8,4), %r8 + lea (%r8, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%r8,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f1), L(tab)) + JMPENT( L(f2), L(tab)) + JMPENT( L(f3), L(tab)) + JMPENT( L(f4), L(tab)) + JMPENT( L(f5), L(tab)) + JMPENT( L(f6), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +L(f0): mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea -1(n), n + jmp L(b0) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + jmp L(b3) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + jmp L(b4) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + jmp L(b5) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + jmp L(b6) + +L(f1): mulx( (up), %r9, %rax) + jrcxz L(1) + jmp L(b1) +L(1): add (rp), %r9 + mov %r9, (rp) + adc %rcx, %rax C relies on rcx = 0 + FUNC_EXIT() + ret + +L(end): adox( (rp), %r9) + mov %r9, (rp) + adox( %rcx, %rax) C relies on rcx = 0 + adc %rcx, %rax C relies on rcx = 0 + FUNC_EXIT() + ret + +ifdef(`PIC', +` nop;nop;nop;nop', +` nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop') + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + + ALIGN(32) +L(top): adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) + jrcxz L(end) +L(b1): mulx( 8,(up), %r10, %r8) + adox( (rp), %r9) + lea -1(n), n + mov %r9, (rp) + adcx( %rax, %r10) +L(b0): mulx( 16,(up), %r9, %rax) + adcx( %r8, %r9) + adox( 8,(rp), %r10) + mov %r10, 8(rp) +L(b7): mulx( 24,(up), %r10, %r8) + lea 64(up), up + adcx( %rax, %r10) + adox( 16,(rp), %r9) + mov %r9, 16(rp) +L(b6): mulx( -32,(up), %r9, %rax) + adox( 24,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 24(rp) +L(b5): mulx( -24,(up), %r10, %r8) + adcx( %rax, %r10) + adox( 32,(rp), %r9) + mov %r9, 32(rp) +L(b4): mulx( -16,(up), %r9, %rax) + adox( 40,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 40(rp) +L(b3): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + jmp L(top) + +L(f7): mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), rp + jmp L(b7) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h new file mode 100644 index 0000000..91c91b5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h @@ -0,0 +1,246 @@ +/* Broadwell gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. */ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */ +/* FFT tuning limit = 467,964,472 */ +/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 24 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 455 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 202 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 141 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 152 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 198 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 34 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 426 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 46 + +#define MULMOD_BNM1_THRESHOLD 16 +#define SQRMOD_BNM1_THRESHOLD 18 + +#define MUL_FFT_MODF_THRESHOLD 460 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \ + { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39, 8}, \ + { 79,10}, { 23, 9}, { 55,11}, { 15,10}, \ + { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \ + { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \ + { 87,11}, { 47,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ + { 95,10}, { 199,11}, { 111,12}, { 63, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 
447,14}, { 127,13}, \ + { 255,12}, { 607,13}, { 319,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,13}, { 639,12}, { 1279,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \ + { 2687,13}, { 1407,14}, { 767,13}, { 1535,12}, \ + { 3071,13}, { 1599,12}, { 3199,13}, { 1663,14}, \ + { 895,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,12}, { 6911,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \ + { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5247,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8703,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,17}, { 2047,16}, { 4095,15}, \ + { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 219 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 400, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127,11}, { 79,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319,12}, \ + { 95, 8}, { 1599, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,10}, { 735,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 671,12}, { 351,11}, { 735,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 799,13}, \ + { 447,12}, { 959,13}, { 511,12}, { 1023,13}, \ + { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2047,13}, { 4095,14}, { 2175,13}, \ + { 4351,14}, { 2303,13}, { 
4607,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4351,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8447,16}, { 4607,15}, \ + { 9471,14}, { 18943,15}, { 9983,14}, { 19967,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 215 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 80 +#define MULLO_MUL_N_THRESHOLD 11025 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 109 +#define SQRLO_SQR_THRESHOLD 7293 + +#define DC_DIV_QR_THRESHOLD 54 +#define DC_DIVAPPR_Q_THRESHOLD 183 +#define DC_BDIV_QR_THRESHOLD 86 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 58 +#define INV_NEWTON_THRESHOLD 171 +#define INV_APPR_THRESHOLD 171 + +#define BINV_NEWTON_THRESHOLD 292 +#define REDC_1_TO_REDC_2_THRESHOLD 33 +#define REDC_2_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 67 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1866 + +#define POWM_SEC_TABLE 2,10,191,494,712,1378 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 644 +#define SET_STR_PRECOMPUTE_THRESHOLD 1658 + +#define FAC_DSC_THRESHOLD 562 +#define FAC_ODD_THRESHOLD 48 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD2_DIV1_METHOD 5 /* 0.38% faster than 3 */ +#define HGCD_THRESHOLD 73 +#define HGCD_APPR_THRESHOLD 67 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 630 +#define GCDEXT_DC_THRESHOLD 365 +#define JACOBI_BASE_METHOD 1 /* 29.65% faster than 4 */ + +/* Tuneup completed successfully, took 239050 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm new file mode 100644 index 0000000..b7fae2f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm @@ -0,0 +1,195 @@ +dnl AMD64 mpn_mul_1 optimised for Intel Broadwell. + +dnl Copyright 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bull - +C AMD pile - +C AMD steam - +C AMD excavator - +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel core2 - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL 1.70 +C Intel BWL 1.51 +C Intel SKL 1.52 +C Intel atom - +C Intel SLM - +C VIA nano - + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Put an initial mulx before switching, targeting some free registers. +C * Tune feed-in code. +C * Trim nop execution after L(f2). +C * Port to DOS64, not forgetting nop execution. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') + +dnl ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl IFDOS(` define(`up', ``%rsi'') ') dnl +dnl IFDOS(` define(`rp', ``%rcx'') ') dnl +dnl IFDOS(` define(`vl', ``%r9'') ') dnl +dnl IFDOS(` define(`r9', ``rdi'') ') dnl +dnl IFDOS(` define(`n', ``%r8'') ') dnl +dnl IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_1) + + mov v0_param, %r10 + mov n_param, n + mov R32(n_param), R32(%r8) + shr $3, n + and $7, R32(%r8) C clear OF, CF as side-effect + mov %r10, %rdx + lea L(tab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%r8,4), %r8 + lea (%r8, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%r8,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f1), L(tab)) + JMPENT( L(f2), L(tab)) + JMPENT( L(f3), L(tab)) + JMPENT( L(f4), L(tab)) + JMPENT( L(f5), L(tab)) + JMPENT( L(f6), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +L(f0): mulx( (up), %r10, %r8) + lea 56(up), up + lea -8(rp), rp + jmp L(b0) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(b3) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(b4) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(b5) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea 40(rp), rp + inc n + jmp L(b6) + +L(f7): mulx( (up), %r9, %rax) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(b7) + +L(f1): mulx( (up), %r9, %rax) + test n, n + jnz L(b1) +L(1): mov %r9, (rp) + ret + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + test n, n + jz L(end) + + ALIGN(32) +L(top): mov %r10, -8(rp) + adc %r8, %r9 +L(b1): mulx( 8,(up), %r10, %r8) + adc %rax, %r10 + lea 64(up), up + mov %r9, (rp) +L(b0): mov %r10, 8(rp) + mulx( -48,(up), %r9, %rax) + lea 64(rp), rp + adc %r8, %r9 +L(b7): mulx( -40,(up), %r10, %r8) + mov %r9, -48(rp) + adc %rax, %r10 +L(b6): mov %r10, -40(rp) + mulx( -32,(up), %r9, %rax) + adc %r8, %r9 +L(b5): mulx( -24,(up), %r10, %r8) + mov %r9, -32(rp) + adc %rax, %r10 +L(b4): mulx( -16,(up), %r9, %rax) + mov %r10, -24(rp) + adc %r8, %r9 +L(b3): mulx( -8,(up), %r10, %r8) + adc %rax, %r10 + mov %r9, -16(rp) + dec n + mulx( (up), %r9, %rax) + jnz L(top) + +L(end): mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + adc %rcx, %rax + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm new file mode 100644 index 0000000..7ca5a9b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm @@ -0,0 +1,368 @@ +dnl AMD64 mpn_mul_basecase optimised for Intel Broadwell. + +dnl Copyright 2015 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 addmul_1 +C AMD K8,K9 n/a n/a +C AMD K10 n/a n/a +C AMD bd1 n/a n/a +C AMD bd2 n/a n/a +C AMD bd3 n/a n/a +C AMD bd4 ? ? +C AMD zen ? ? +C AMD bt1 n/a n/a +C AMD bt2 n/a n/a +C Intel P4 n/a n/a +C Intel PNR n/a n/a +C Intel NHM n/a n/a +C Intel SBR n/a n/a +C Intel IBR n/a n/a +C Intel HWL 1.68 n/a +C Intel BWL 1.51 1.67-1.74 +C Intel SKL 1.52 1.63-1.71 +C Intel atom n/a n/a +C Intel SLM n/a n/a +C VIA nano n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Do overlapped software pipelining. +C * When changing this, make sure the code which falls into the inner loops +C does not execute too many no-ops (for both PIC and non-PIC). 
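Editor's note: structurally, mul_basecase is one mul_1 pass for the first limb of vp followed by vn-1 addmul_1 passes, which is why the file carries two jump tables (L(mtab) for the mul_1 feed-ins, L(atab) for the addmul_1 feed-ins). A hedged C sketch of that organisation follows, reusing the illustrative limb/dlimb/addmul_1 model from the addmul_1.asm note above; mul_1 is the same loop without the rp[i] addend, and none of these names are GMP's real API.

/* rp[0..n-1] = up[0..n-1] * v0; return the high limb.  */
limb mul_1(limb *rp, const limb *up, long n, limb v0)
{
    limb cy = 0;
    for (long i = 0; i < n; i++) {
        dlimb t = (dlimb) up[i] * v0 + cy;
        rp[i] = (limb) t;
        cy = (limb) (t >> 64);
    }
    return cy;
}

/* Schoolbook un x vn product into rp[0..un+vn-1]; requires un >= vn >= 1. */
void mul_basecase(limb *rp, const limb *up, long un,
                  const limb *vp, long vn)
{
    rp[un] = mul_1(rp, up, un, vp[0]);               /* first row writes rp */
    for (long j = 1; j < vn; j++)
        rp[un + j] = addmul_1(rp + j, up, un, vp[j]); /* later rows accumulate */
}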
+ +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp_param',`%rcx') +define(`vn', `%r8') + +define(`n', `%rcx') +define(`n_save', `%rbp') +define(`vp', `%r14') +define(`unneg', `%rbx') +define(`v0', `%rdx') +define(`jaddr', `%rax') + +define(`w0', `%r12') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + cmp $2, un_param + ja L(gen) + mov (vp_param), %rdx + mulx( (up), %rax, %r9) C 0 1 + je L(s2x) + +L(s11): mov %rax, (rp) + mov %r9, 8(rp) + FUNC_EXIT() + ret + +L(s2x): cmp $2, vn + mulx( 8,(up), %r8, %r10) C 1 2 + je L(s22) + +L(s21): add %r8, %r9 + adc $0, %r10 + mov %rax, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + FUNC_EXIT() + ret + +L(s22): add %r8, %r9 C 1 + adc $0, %r10 C 2 + mov 8(vp_param), %rdx + mov %rax, (rp) + mulx( (up), %r8, %r11) C 1 2 + mulx( 8,(up), %rax, %rdx) C 2 3 + add %r11, %rax C 2 + adc $0, %rdx C 3 + add %r8, %r9 C 1 + adc %rax, %r10 C 2 + adc $0, %rdx C 3 + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + + ALIGN(16) +L(gen): + push %rbx + push %rbp + push %r12 + push %r14 + + mov vp_param, vp + lea 1(un_param), unneg + mov un_param, n_save + mov R32(un_param), R32(%rax) + and $-8, unneg + shr $3, n_save C loop count + neg unneg + and $7, R32(%rax) C clear CF for adc as side-effect + C note that rax lives very long + mov n_save, n + mov (vp), v0 + lea 8(vp), vp + + lea L(mtab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %r11 + lea (%r11, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + +L(mf0): mulx( (up), w2, w3) + lea 56(up), up + lea -8(rp), rp + jmp L(mb0) + +L(mf3): mulx( (up), w0, w1) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(mb3) + +L(mf4): mulx( (up), w2, w3) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(mb4) + +L(mf5): mulx( (up), w0, w1) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(mb5) + +L(mf6): mulx( (up), w2, w3) + lea 40(up), up + lea 40(rp), rp + inc n + jmp L(mb6) + +L(mf7): mulx( (up), w0, w1) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(mb7) + +L(mf1): mulx( (up), w0, w1) + jmp L(mb1) + +L(mf2): mulx( (up), w2, w3) + lea 8(up), up + lea 8(rp), rp + mulx( (up), w0, w1) + + ALIGN(16) +L(m1top): + mov w2, -8(rp) + adc w3, w0 +L(mb1): mulx( 8,(up), w2, w3) + adc w1, w2 + lea 64(up), up + mov w0, (rp) +L(mb0): mov w2, 8(rp) + mulx( -48,(up), w0, w1) + lea 64(rp), rp + adc w3, w0 +L(mb7): mulx( -40,(up), w2, w3) + mov w0, -48(rp) + adc w1, w2 +L(mb6): mov w2, -40(rp) + mulx( -32,(up), w0, w1) + adc w3, w0 +L(mb5): mulx( -24,(up), w2, w3) + mov w0, -32(rp) + adc w1, w2 +L(mb4): mulx( -16,(up), w0, w1) + mov w2, -24(rp) + adc w3, w0 +L(mb3): mulx( -8,(up), w2, w3) + adc w1, w2 + mov w0, -16(rp) + dec n + mulx( (up), w0, w1) + jnz L(m1top) + +L(m1end): + mov w2, -8(rp) + adc w3, w0 + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + + dec vn + jz L(done) + + lea L(atab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax, %r10), jaddr +',` + mov (%r10,%rax,8), jaddr +') + +L(outer): + lea (up,unneg,8), up + mov n_save, n + mov (vp), v0 + lea 8(vp), vp + jmp *jaddr + +L(f0): mulx( 8,(up), w2, w3) + lea 8(rp,unneg,8), rp + lea -1(n), n + jmp L(b0) + +L(f3): mulx( -16,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b3) + +L(f4): mulx( -24,(up), w2, w3) + lea -56(rp,unneg,8), rp + jmp L(b4) + +L(f5): mulx( -32,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b5) + 
+L(f6):	mulx(	-40,(up), w2, w3)
+	lea	-56(rp,unneg,8), rp
+	jmp	L(b6)
+
+L(f7):	mulx(	16,(up), w0, w1)
+	lea	8(rp,unneg,8), rp
+	jmp	L(b7)
+
+L(f1):	mulx(	(up), w0, w1)
+	lea	8(rp,unneg,8), rp
+	jmp	L(b1)
+
+L(am1end):
+	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+
+	dec	vn			C clear OF as side-effect
+	jnz	L(outer)
+L(done):
+	pop	%r14
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(f2):	mulx(	-8,(up), w2, w3)
+	lea	8(rp,unneg,8), rp
+	mulx(	(up), w0, w1)
+
+	ALIGN(16)
+L(am1top):
+	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(am1end)
+L(b1):	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	-1(n), n
+	mov	w0, (rp)
+	adcx(	w1, w2)
+L(b0):	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+L(b7):	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+L(b6):	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+L(b5):	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+L(b4):	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+L(b3):	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(am1top)
+
+	JUMPTABSECT
+	ALIGN(8)
+L(mtab):JMPENT(	L(mf0), L(mtab))
+	JMPENT(	L(mf1), L(mtab))
+	JMPENT(	L(mf2), L(mtab))
+	JMPENT(	L(mf3), L(mtab))
+	JMPENT(	L(mf4), L(mtab))
+	JMPENT(	L(mf5), L(mtab))
+	JMPENT(	L(mf6), L(mtab))
+	JMPENT(	L(mf7), L(mtab))
+L(atab):JMPENT(	L(f0), L(atab))
+	JMPENT(	L(f1), L(atab))
+	JMPENT(	L(f2), L(atab))
+	JMPENT(	L(f3), L(atab))
+	JMPENT(	L(f4), L(atab))
+	JMPENT(	L(f5), L(atab))
+	JMPENT(	L(f6), L(atab))
+	JMPENT(	L(f7), L(atab))
+	TEXT
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
new file mode 100644
index 0000000..5cdb209
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
@@ -0,0 +1,395 @@
+dnl  AMD64 mpn_mullo_basecase optimised for Intel Broadwell.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
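Editor's note: mpn_mullo_basecase returns only the low n limbs of the product, so every partial product that lands at limb position n or above can be dropped. In the sketch below (same illustrative limb/dlimb/mul_1/addmul_1 helpers as in the earlier notes, not GMP's real API) that shows up as each successive row being one limb shorter; the assembly realises the same shape with truncated mul_1/addmul_1 passes plus the dedicated corner code mentioned in its TODO list.

/* rp[0..n-1] = low n limbs of up[0..n-1] * vp[0..n-1].  Carries out of
   limb n-1 are simply discarded, matching the mod B^n semantics. */
void mullo_basecase(limb *rp, const limb *up, const limb *vp, long n)
{
    mul_1(rp, up, n, vp[0]);                 /* row 0: full length, top carry dropped */
    for (long j = 1; j < n; j++)
        addmul_1(rp + j, up, n - j, vp[j]);  /* row j: only n-j limbs still matter */
}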
+ +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r11') +define(`jmpreg',`%rbx') +define(`nn', `%rbp') + +C TODO +C * Suppress more rp[] rewrites in corner. +C * Rearrange feed-in jumps for short branch forms. +C * Perhaps roll out the heavy artillery and 8-way unroll outer loop. Since +C feed-in code implodes, the blow-up will not be more than perhaps 4x. +C * Micro-optimise critical lead-in code block around L(ent). +C * Write n < 4 code specifically for Broadwell (current code is for Haswell). + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + cmp $4, R32(n) + jae L(big) + + mov vp_param, vp + mov (up), %rdx + + cmp $2, R32(n) + jae L(gt1) +L(n1): imul (vp), %rdx + mov %rdx, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp), %r9 + mulx( %r9, %rax, %rdx) + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp), %r9 + mulx( %r9, %rax, %r10) C u0 x v0 + mov %rax, (rp) + mov 8(up), %rdx + mulx( %r9, %rax, %rdx) C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r8 + mov (up), %rdx + mulx( %r8, %rax, %rdx) C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r8 C u1 x v1 + add %r8, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret + + ALIGN(16) +L(big): push %r14 + push %r12 + push %rbx + push %rbp + mov -8(vp_param,n,8), %r14 C FIXME Put at absolute end + imul (up), %r14 C FIXME Put at absolute end + lea -3(n), R32(nn) + lea 8(vp_param), vp + mov (vp_param), %rdx + + mov R32(n), R32(%rax) + shr $3, R32(n) + and $7, R32(%rax) C clear OF, CF as side-effect + lea L(mtab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + +L(mf0): mulx( (up), %r10, %r8) + lea 56(up), up + lea -8(rp), rp + lea L(f7)(%rip), jmpreg + jmp L(mb0) + +L(mf3): mulx( (up), %r9, %rax) + lea 16(up), up + lea 16(rp), rp + jrcxz L(mc) + inc R32(n) + lea L(f2)(%rip), jmpreg + jmp L(mb3) + +L(mc): mulx( -8,(up), %r10, %r8) + add %rax, %r10 + mov %r9, -16(rp) + mulx( (up), %r9, %rax) + mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + jmp L(c2) + +L(mf4): mulx( (up), %r10, %r8) + lea 24(up), up + lea 24(rp), rp + inc R32(n) + lea L(f3)(%rip), jmpreg + jmp L(mb4) + +L(mf5): mulx( (up), %r9, %rax) + lea 32(up), up + lea 32(rp), rp + inc R32(n) + lea L(f4)(%rip), jmpreg + jmp L(mb5) + +L(mf6): mulx( (up), %r10, %r8) + lea 40(up), up + lea 40(rp), rp + inc R32(n) + lea L(f5)(%rip), jmpreg + jmp L(mb6) + +L(mf7): mulx( (up), %r9, %rax) + lea 48(up), up + lea 48(rp), rp + lea L(f6)(%rip), jmpreg + jmp L(mb7) + +L(mf1): mulx( (up), %r9, %rax) + lea L(f0)(%rip), jmpreg + jmp L(mb1) + +L(mf2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + lea L(f1)(%rip), jmpreg + mulx( (up), %r9, %rax) + +C FIXME ugly fallthrough FIXME + ALIGN(32) +L(mtop):mov %r10, -8(rp) + adc %r8, %r9 +L(mb1): mulx( 8,(up), %r10, %r8) + adc %rax, %r10 + lea 64(up), up + mov %r9, (rp) +L(mb0): mov %r10, 8(rp) + mulx( -48,(up), %r9, %rax) + lea 64(rp), rp + adc %r8, %r9 +L(mb7): mulx( -40,(up), %r10, %r8) + mov %r9, -48(rp) + adc %rax, %r10 +L(mb6): mov %r10, -40(rp) + mulx( -32,(up), %r9, %rax) + adc %r8, %r9 +L(mb5): mulx( -24,(up), %r10, %r8) + mov %r9, -32(rp) + adc 
%rax, %r10 +L(mb4): mulx( -16,(up), %r9, %rax) + mov %r10, -24(rp) + adc %r8, %r9 +L(mb3): mulx( -8,(up), %r10, %r8) + adc %rax, %r10 + mov %r9, -16(rp) + dec R32(n) + mulx( (up), %r9, %rax) + jnz L(mtop) + +L(mend):mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + adc %rcx, %rax + + lea 8(,nn,8), %r12 + neg %r12 + shr $3, R32(nn) + jmp L(ent) + +L(f0): mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea L(f7)(%rip), jmpreg + jmp L(b0) + +L(f1): mulx( (up), %r9, %rax) + lea -1(nn), R32(nn) + lea L(f0)(%rip), jmpreg + jmp L(b1) + +L(end): adox( (rp), %r9) + mov %r9, (rp) + adox( %rcx, %rax) C relies on rcx = 0 + adc %rcx, %rax C FIXME suppress, use adc below; reqs ent path edits + lea 8(%r12), %r12 +L(ent): mulx( 8,(up), %r10, %r8) C r8 unused (use imul?) + add %rax, %r14 + add %r10, %r14 C h + lea (up,%r12), up C reset up + lea 8(rp,%r12), rp C reset rp + mov (vp), %rdx + lea 8(vp), vp + or R32(nn), R32(n) C copy count, clear CF,OF (n = 0 prior) + jmp *jmpreg + +L(f7): mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), rp + lea L(f6)(%rip), jmpreg + jmp L(b7) + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + lea L(f1)(%rip), jmpreg + +C FIXME ugly fallthrough FIXME + ALIGN(32) +L(top): adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) + jrcxz L(end) +L(b1): mulx( 8,(up), %r10, %r8) + adox( (rp), %r9) + lea -1(n), R32(n) + mov %r9, (rp) + adcx( %rax, %r10) +L(b0): mulx( 16,(up), %r9, %rax) + adcx( %r8, %r9) + adox( 8,(rp), %r10) + mov %r10, 8(rp) +L(b7): mulx( 24,(up), %r10, %r8) + lea 64(up), up + adcx( %rax, %r10) + adox( 16,(rp), %r9) + mov %r9, 16(rp) +L(b6): mulx( -32,(up), %r9, %rax) + adox( 24,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 24(rp) +L(b5): mulx( -24,(up), %r10, %r8) + adcx( %rax, %r10) + adox( 32,(rp), %r9) + mov %r9, 32(rp) +L(b4): mulx( -16,(up), %r9, %rax) + adox( 40,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 40(rp) +L(b3): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + jmp L(top) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + lea L(f5)(%rip), jmpreg + jmp L(b6) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + lea L(f4)(%rip), jmpreg + jmp L(b5) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + lea L(f3)(%rip), jmpreg + jmp L(b4) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + jrcxz L(cor) + lea L(f2)(%rip), jmpreg + jmp L(b3) + +L(cor): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) C FIXME suppress + adox( (rp), %r9) + mov %r9, (rp) C FIXME suppress + adox( %rcx, %rax) +L(c2): + mulx( 8,(up), %r10, %r8) + adc %rax, %r14 + add %r10, %r14 + mov (vp), %rdx + test R32(%rcx), R32(%rcx) + mulx( -16,(up), %r10, %r8) + mulx( -8,(up), %r9, %rax) + adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) + adox( (rp), %r9) + adox( %rcx, %rax) + adc %rcx, %rax + mulx( (up), %r10, %r8) + add %rax, %r14 + add %r10, %r14 + mov 8(vp), %rdx + mulx( -16,(up), %rcx, %rax) + add %r9, %rcx + mov %rcx, (rp) + adc $0, %rax + mulx( -8,(up), %r10, %r8) + add %rax, %r14 + add %r10, %r14 + mov %r14, 8(rp) + pop %rbp + pop %rbx + pop %r12 + pop %r14 + FUNC_EXIT() + ret +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(mtab):JMPENT( L(mf7), L(mtab)) + JMPENT( L(mf0), L(mtab)) + JMPENT( L(mf1), L(mtab)) + JMPENT( L(mf2), L(mtab)) + 
JMPENT(	L(mf3), L(mtab))
+	JMPENT(	L(mf4), L(mtab))
+	JMPENT(	L(mf5), L(mtab))
+	JMPENT(	L(mf6), L(mtab))
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
new file mode 100644
index 0000000..ff35124
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
@@ -0,0 +1,710 @@
+dnl  AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell.
+
+dnl  Copyright 2015, 2021 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb	mul_1		addmul_1
+C AMD K8,K9	n/a		n/a
+C AMD K10	n/a		n/a
+C AMD bd1	n/a		n/a
+C AMD bd2	n/a		n/a
+C AMD bd3	n/a		n/a
+C AMD bd4	?		?
+C AMD zn1	?		?
+C AMD zn2	?		?
+C AMD zn3	?		?
+C AMD bt1	n/a		n/a
+C AMD bt2	n/a		n/a
+C Intel P4	n/a		n/a
+C Intel PNR	n/a		n/a
+C Intel NHM	n/a		n/a
+C Intel SBR	n/a		n/a
+C Intel IBR	n/a		n/a
+C Intel HWL	1.68		n/a
+C Intel BWL	1.51	1.67-1.74
+C Intel SKL	1.52	1.63-1.71
+C Intel atom	n/a		n/a
+C Intel SLM	n/a		n/a
+C VIA nano	n/a		n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Do overlapped software pipelining.
+C  * Reduce register use, i.e., by combining n_neg and n_save.
+C  * Suppress initial store through up, it's always a zero.
+C  * Streamline up and dp setup.
+C  * When changing this, make sure the code which falls into the inner loops
+C    does not execute too many no-ops (for both PIC and non-PIC).
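Editor's note: this routine computes a schoolbook Hensel (2-adic) remainder; its prototype is in the dnl comment just below. Each step chooses a quotient limb q so that adding q*D clears the next low limb of U, and after un-dn steps the remainder sits in the top dn limbs of up. A hedged C sketch follows, using the illustrative helpers from the earlier notes. It assumes the precomputed dinv satisfies dinv * dp[0] == -1 (mod B), which is what makes the addmul-style update zero up[i]; the exact sign convention GMP uses for dinv is defined in its internals (gmp-impl.h), not here.

/* U (un limbs) gets q*D added at successive offsets until its low
   un-dn limbs are all zero; the remainder is left in up[un-dn..un-1]
   and the carry off the top is returned.  Requires dp[0] odd. */
limb sbpi1_bdiv_r(limb *up, long un, const limb *dp, long dn, limb dinv)
{
    limb cy = 0;                                /* carry chain, kept in %r13 above */
    for (long i = 0; i < un - dn; i++) {
        limb q = dinv * up[i];                  /* next quotient limb, mod B */
        limb hi = addmul_1(up + i, dp, dn, q);  /* zeroes up[i] by construction */
        dlimb s = (dlimb) up[i + dn] + hi + cy; /* fold high limb into next column */
        up[i + dn] = (limb) s;
        cy = (limb) (s >> 64);
    }
    return cy;
}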
+ +dnl mp_limb_t +dnl mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un, +dnl mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) + +define(`up', `%rdi') +define(`un', `%rsi') +define(`dp_param',`%rdx') +define(`dn_param',`%rcx') +define(`dinv', `%r8') + +define(`n', `%rcx') +define(`n_save', `%rbp') +define(`dp', `%r14') +define(`n_neg', `%rbx') +define(`q', `%rdx') +define(`jaddr', `%rax') + +define(`w0', `%r12') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ifdef(`MAX_SPECIAL',,` +define(`MAX_SPECIAL', 8)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sbpi1_bdiv_r) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + + lea L(atab)(%rip), %r10 + + cmp $MAX_SPECIAL, dn_param + jbe L(sma) + +ifelse(MAX_SPECIAL,8,,` +forloop(i,eval(MAX_SPECIAL+1),9,`L(i): +')') + +L(gen): push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + + sub dn_param, un C outer loop count + + lea -8(,dn_param,8), n_neg + neg n_neg + mov dn_param, n_save + mov R32(dn_param), R32(%rax) + shr $3, n_save C loop count + and $7, R32(%rax) C clear CF and OF as side-effect + +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax,%r10), jaddr +',` + mov (%r10,%rax,8), jaddr +') + mov (up), q + imul dinv, q + jmp L(outer) + +L(f0): mulx( (dp), w2, w3) + lea -1(n), n + mulx( 8,(dp), w0, w1) + lea -8(dp), dp + adcx( w3, w0) + adox( (up), w2) + lea -8(up), up + jmp L(b0x) + +L(f3): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + adox( (up), w0) + lea -48(up), up + lea 16(dp), dp + jmp L(b3x) + +L(f4): mulx( (dp), w2, w3) + mulx( 8,(dp), w0, w1) + lea 24(dp), dp + adox( (up), w2) + lea -40(up), up + adcx( w3, w0) + jmp L(b4x) + +L(f5): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + lea 32(dp), dp + adcx( w1, w2) + adox( (up), w0) + lea -32(up), up + jmp L(b5x) + +L(f6): mulx( (dp), w2, w3) + mulx( 8,(dp), w0, w1) + lea 40(dp), dp + adox( (up), w2) + lea -24(up), up + adcx( w3, w0) + jmp L(b6x) + +L(f7): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + lea 48(dp), dp + adcx( w1, w2) + adox( (up), w0) + lea -16(up), up + jmp L(b7x) + +L(f1): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + adox( (up), w0) + lea -1(n), n + jmp L(b1x) + +L(f2): mulx( (dp), w2, w3) + mulx( 8,(dp), w0, w1) + lea 8(dp), dp + adox( (up), w2) + lea 8(up), up + adcx( w3, w0) + jmp L(b2x) + +L(end): adox( (up), w0) + adox( %rcx, w1) C relies on rcx = 0 + mov w0, (up) + adc %rcx, w1 C relies on rcx = 0 + mov 8(up,n_neg), q C Compute next quotient early... 
+ mulx( dinv, q, %r12) C ...(unused in last iteration) + bt $0, R32(%r13) + adc w1, 8(up) + setc R8(%r13) + dec un C clear OF as side-effect + jz L(done) + + lea (dp,n_neg), dp C reset dp to D[]'s beginning + lea 8(up,n_neg), up C point up to U[]'s current beginning +L(outer): + mov n_save, n + test %eax, %eax C clear CF and OF + jmp *jaddr + + ALIGN(16) +L(top): adox( -8,(up), w2) + adcx( w3, w0) + mov w2, -8(up) + jrcxz L(end) +L(b2x): mulx( 8,(dp), w2, w3) + adox( (up), w0) + lea -1(n), n + mov w0, (up) +L(b1x): adcx( w1, w2) + mulx( 16,(dp), w0, w1) + adcx( w3, w0) + adox( 8,(up), w2) + mov w2, 8(up) +L(b0x): mulx( 24,(dp), w2, w3) + lea 64(dp), dp + adcx( w1, w2) + adox( 16,(up), w0) + mov w0, 16(up) +L(b7x): mulx( -32,(dp), w0, w1) + adox( 24,(up), w2) + adcx( w3, w0) + mov w2, 24(up) +L(b6x): mulx( -24,(dp), w2, w3) + adcx( w1, w2) + adox( 32,(up), w0) + mov w0, 32(up) +L(b5x): mulx( -16,(dp), w0, w1) + adox( 40,(up), w2) + adcx( w3, w0) + mov w2, 40(up) +L(b4x): adox( 48,(up), w0) + mulx( -8,(dp), w2, w3) + mov w0, 48(up) +L(b3x): lea 64(up), up + adcx( w1, w2) + mulx( (dp), w0, w1) + jmp L(top) + +L(done):mov %r13, %rax + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(sma): +ifdef(`PIC', +` movslq 28(%r10,dn_param,4), %rax + lea (%rax,%r10), jaddr +',` + mov 56(%r10,dn_param,8), jaddr +') + jmp *jaddr + +L(1): mov (dp_param), %r10 + xor R32(%rax), R32(%rax) + mov (up), %rdx + dec un + mov %rdx, %r9 +L(o1): mulx( dinv, %rdx, %r11) C next quotient + lea 8(up), up + mulx( %r10, %rcx, %rdx) C 0 1 + add %r9, %rcx C 0 + adc %rax, %rdx C 1 + add (up), %rdx C 1 + setc R8(%rax) C 2 + mov %rdx, %r9 C 1 + dec un + jnz L(o1) + mov %r9, (up) + + FUNC_EXIT() + ret + +ifdef(`VER',,`define(`VER',1)') +L(2): push %r12 + push %r14 + + mov dp_param, dp C free up rdx + sub dn_param, un C loop count + mov (up), q + imul dinv, q + +ifelse(VER,0,` + xor R32(%rax), R32(%rax) +L(o2): test %eax, %eax C clear CF and OF + mulx( (dp), w2, w3) C 0 1 + mulx( 8,(dp), %rdx, w1) C 1 2 + add (up), w2 C 0 + adc 8(up), %rdx C 1 + adc $0, w1 C 2 cannot carry further + add w3, %rdx C 1 + mov %rdx, 8(up) C 1 + adc $0, w1 C 2 + imul dinv, q C + bt $0, R32(%rax) + adc 16(up), w1 C 2 + mov w1, 16(up) + setc R8(%rax) + lea 8(up), up + dec un + jnz L(o2) +') +ifelse(VER,1,` + push %rbx + push %r13 + xor R32(%r13), R32(%r13) + mov (up), %rax + mov 8(up), %rbx +L(o2): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) C 0 1 + mulx( 8,(dp), %rdx, w1) C 1 2 + adox( %rax, w2) C 0 + adcx( w3, %rdx) C 1 + adox( %rbx, %rdx) C 1 + adox( %rcx, w1) C 2 cannot carry further + mov %rdx, %rax C 1 + adc %rcx, w1 C 2 + imul dinv, q C + bt $0, R32(%r13) + adc 16(up), w1 C 2 + mov w1, %rbx + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o2) + + mov %rax, (up) + mov %rbx, 8(up) + mov %r13, %rax + pop %r13 + pop %rbx +') +ifelse(VER,2,` + xor R32(%rax), R32(%rax) + mov (up), %r10 + mov 8(up), %r9 +L(o2): mulx( (dp), %r12, %r11) + mulx( 8,(dp), %rdx, %rcx) + add %r11, %rdx C 1 + adc $0, %rcx C 2 + add %r10, %r12 C 0 add just to produce carry + adc %r9, %rdx C 1 + mov %rdx, %r10 C 1 + mulx( dinv, %rdx, %r12) C next quotient + adc %rax, %rcx C 2 + setc R8(%rax) C 3 + mov 16(up), %r9 C 2 + add %rcx, %r9 C 2 + adc $0, R32(%rax) C 3 + lea 8(up), up + dec un + jnz L(o2) + + mov %r10, (up) + mov %r9, 8(up) +') +ifelse(VER,3,` + xor R32(%rax), R32(%rax) + mov (up), %r10 + mov 8(up), %r9 +L(o2): mulx( (dp), %r12, %r11) + add %r10, %r12 C 0 add just to produce carry + mulx( 8,(dp), %rdx, %rcx) + adc %r11, %rdx C 1 + adc $0, 
%rcx C 2 + add %r9, %rdx C 1 + mov %rdx, %r10 C 1 + mulx( dinv, %rdx, %r12) C next quotient + adc %rax, %rcx C 2 + setc R8(%rax) C 3 + mov 16(up), %r9 C 2 + add %rcx, %r9 C 2 + adc $0, R32(%rax) C 3 + lea 8(up), up + dec un + jnz L(o2) + + mov %r10, (up) + mov %r9, 8(up) +') + pop %r14 + pop %r12 + FUNC_EXIT() + ret + +ifelse(eval(MAX_SPECIAL>=3),1,` +L(3): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o3): xor R32(%rcx), R32(%rcx) C clear rcx, CF, and OF + mulx( (dp), w0, w1) C 0 1 + adox( %rax, w0) C 0 + mulx( 8,(dp), %rax, w3) C 1 2 + adcx( w1, %rax) C 1 + adox( %rbx, %rax) C 1 + mulx( 16,(dp), %rbx, w1) C 2 3 + mov dinv, q C 1 + mulx( %rax, q, w0) + adcx( w3, %rbx) C 2 + adox( 16,(up), %rbx) C 2 + adox( %rcx, w1) C 3 + adc $0, w1 C 3 + bt $0, R32(%r13) + adc w1, 24(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o3) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=4),1,` +L(4): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o4): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) + adox( %rax, w2) + mulx( 8,(dp), %rax, w1) + adcx( w3, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w3) + adcx( w1, %rbx) + mulx( 24,(dp), w0, w1) + mov dinv, q + mulx( %rax, q, w2) + adox( 16,(up), %rbx) + adcx( w3, w0) + adox( 24,(up), w0) + adox( %rcx, w1) + mov w0, 24(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 32(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o4) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=5),1,` +L(5): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o5): xor R32(%rcx), R32(%rcx) + mulx( (dp), w0, w1) + adox( %rax, w0) + mulx( 8,(dp), %rax, w3) + adcx( w1, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w1) + adcx( w3, %rbx) + adox( 16,(up), %rbx) + mulx( 24,(dp), w2, w3) + adcx( w1, w2) + mulx( 32,(dp), w0, w1) + adox( 24,(up), w2) + adcx( w3, w0) + mov dinv, q + mulx( %rax, q, w3) + mov w2, 24(up) + adox( 32,(up), w0) + adox( %rcx, w1) + mov w0, 32(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 40(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o5) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=6),1,` +L(6): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o6): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) + adox( %rax, w2) + mulx( 8,(dp), %rax, w1) + adcx( w3, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w3) + adcx( w1, %rbx) + mulx( 24,(dp), w0, w1) + adox( 16,(up), %rbx) + adcx( w3, w0) + adox( 24,(up), w0) + mulx( 32,(dp), w2, w3) + mov w0, 24(up) + adcx( w1, w2) + mulx( 40,(dp), w0, w1) + adox( 32,(up), w2) + adcx( w3, w0) + mov dinv, q + mulx( %rax, q, w3) + mov w2, 32(up) + adox( 40,(up), w0) + adox( %rcx, w1) + mov w0, 40(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 48(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o6) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=7),1,` +L(7): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp + xor %r13, %r13 + sub dn_param, un + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul 
dinv, q +L(o7): xor R32(%rcx), R32(%rcx) + mulx( (dp), w0, w1) + adox( %rax, w0) + mulx( 8,(dp), %rax, w3) + adcx( w1, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w1) + adcx( w3, %rbx) + mulx( 24,(dp), w2, w3) + adcx( w1, w2) + adox( 16,(up), %rbx) + mulx( 32,(dp), w0, w1) + adox( 24,(up), w2) + adcx( w3, w0) + mov w2, 24(up) + adox( 32,(up), w0) + mulx( 40,(dp), w2, w3) + mov w0, 32(up) + adcx( w1, w2) + mulx( 48,(dp), w0, w1) + adox( 40,(up), w2) + adcx( w3, w0) + mov w2, 40(up) + mov %rax, q + mulx( dinv, q, w2) + adox( 48,(up), w0) + adox( %rcx, w1) + mov w0, 48(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 56(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o7) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=8),1,` +L(8): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp + xor %r13, %r13 + sub dn_param, un + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o8): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) + adox( %rax, w2) + mulx( 8,(dp), %rax, w1) + adcx( w3, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w3) + adcx( w1, %rbx) + mulx( 24,(dp), w0, w1) + adox( 16,(up), %rbx) + adcx( w3, w0) + mulx( 32,(dp), w2, w3) + adcx( w1, w2) + adox( 24,(up), w0) + mov w0, 24(up) + mulx( 40,(dp), w0, w1) + adox( 32,(up), w2) + adcx( w3, w0) + mov w2, 32(up) + adox( 40,(up), w0) + mulx( 48,(dp), w2, w3) + mov w0, 40(up) + adcx( w1, w2) + mulx( 56,(dp), w0, w1) + adox( 48,(up), w2) + adcx( w3, w0) + mov dinv, q + mulx( %rax, q, w3) + mov w2, 48(up) + adox( 56,(up), w0) + adox( %rcx, w1) + mov w0, 56(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 64(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o8) + jmp L(esma) +') + +L(esma):mov %rax, (up) + mov %rbx, 8(up) + mov %r13, %rax + pop %r14 + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret + + + JUMPTABSECT + ALIGN(8) +L(atab):JMPENT( L(f0), L(atab)) + JMPENT( L(f1), L(atab)) + JMPENT( L(f2), L(atab)) + JMPENT( L(f3), L(atab)) + JMPENT( L(f4), L(atab)) + JMPENT( L(f5), L(atab)) + JMPENT( L(f6), L(atab)) + JMPENT( L(f7), L(atab)) + JMPENT( L(1), L(atab)) + JMPENT( L(2), L(atab)) + JMPENT( L(3), L(atab)) + JMPENT( L(4), L(atab)) + JMPENT( L(5), L(atab)) + JMPENT( L(6), L(atab)) + JMPENT( L(7), L(atab)) + JMPENT( L(8), L(atab)) + TEXT +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm new file mode 100644 index 0000000..e81b01b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm @@ -0,0 +1,839 @@ +dnl AMD64 mpn_sqr_basecase optimised for Intel Broadwell. + +dnl Copyright 2015, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb	mul_1		addmul_1
+C AMD K8,K9	n/a		n/a
+C AMD K10	n/a		n/a
+C AMD bd1	n/a		n/a
+C AMD bd2	n/a		n/a
+C AMD bd3	n/a		n/a
+C AMD bd4	?		?
+C AMD zen	?		?
+C AMD bt1	n/a		n/a
+C AMD bt2	n/a		n/a
+C Intel P4	n/a		n/a
+C Intel PNR	n/a		n/a
+C Intel NHM	n/a		n/a
+C Intel SBR	n/a		n/a
+C Intel IBR	n/a		n/a
+C Intel HWL	1.68		n/a
+C Intel BWL	1.51	1.67-1.74
+C Intel SKL	1.52	1.63-1.71
+C Intel atom	n/a		n/a
+C Intel SLM	n/a		n/a
+C VIA nano	n/a		n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * We have 8 addmul_1 loops which fall into each other.  The idea is to save
+C    on switching code, since a circularly updated computed goto target will
+C    hardly allow correct branch prediction.  On 2nd thought, we now might make
+C    each of the 8 loop branches be poorly predicted since they will be
+C    executed fewer times for each time.  With just one addmul_1 loop, the loop
+C    count will change only once each 8th time.
+C  * Do overlapped software pipelining.
+C  * Perhaps load in shrx/sarx, eliminating separate load insn.
+C  * Schedule add+store in small n code.
+C  * Try swapping adox and adcx insn, making mulx have more time to run.
+
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`un_param', `%rdx')
+
+define(`n',       `%rcx')
+define(`un_save', `%rbx')
+define(`u0',      `%rdx')
+
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+
+	cmp	$2, un_param
+	jae	L(gt1)
+
+	mov	(up), %rdx
+	mulx(	%rdx, %rax, %rdx)
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt1):	jne	L(gt2)
+
+	mov	(up), %rdx
+	mov	8(up), %rcx
+	mulx(	%rcx, %r9, %r10)	C v0 * v1	W 1 2
+	mulx(	%rdx, %rax, %r8)	C v0 * v0	W 0 1
+	mov	%rcx, %rdx
+	mulx(	%rdx, %r11, %rdx)	C v1 * v1	W 2 3
+	add	%r9, %r9		C		W 1
+	adc	%r10, %r10		C		W 2
+	adc	$0, %rdx		C		W 3
+	add	%r9, %r8		C		W 1
+	adc	%r11, %r10		C		W 2
+	adc	$0, %rdx		C		W 3
+	mov	%rax, (rp)
+	mov	%r8, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%rdx, 24(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt2):	cmp	$4, un_param
+	jae	L(gt3)
+
+	push	%rbx
+	mov	(up), %rdx
+	mulx(	8,(up), w2, w3)
+	mulx(	16,(up), w0, w1)
+	add	w3, w0
+	mov	8(up), %rdx
+	mulx(	16,(up), %rax, w3)
+	adc	%rax, w1
+	adc	$0, w3
+	test	R32(%rbx), R32(%rbx)
+	mov	(up), %rdx
+	mulx(	%rdx, %rbx, %rcx)
+	mov	%rbx, (rp)
+	mov	8(up), %rdx
+	mulx(	%rdx, %rax, %rbx)
+	mov	16(up), %rdx
+	mulx(	%rdx, %rsi, %rdx)
+	adcx(	w2, w2)
+	adcx(	w0, w0)
+	adcx(	w1, w1)
+	adcx(	w3, w3)
+	adox(	w2, %rcx)
+	adox(	w0, %rax)
+	adox(	w1, %rbx)
+	adox(	w3, %rsi)
+	mov	$0, R32(%r8)
+	adox(	%r8, %rdx)
+	adcx(	%r8, %rdx)
+	mov	%rcx, 8(rp)
+	mov	%rax, 16(rp)
+	mov	%rbx, 24(rp)
+	mov	%rsi, 32(rp)
+	mov	%rdx, 40(rp)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(gt3):	push	%rbx
+
+	lea	-3(un_param), R32(un_save)
+	lea	5(un_param), R32(n)
+	mov	R32(un_param), R32(%rax)
+	and	$-8, R32(un_save)
+	shr	$3, R32(n)		C count for mul_1 loop
+	neg	un_save			C 8*count and offset for addmul_1 loops
+	and	$7, R32(%rax)		C clear CF for adc as side-effect
+
+	mov	(up), u0
+
+	lea	L(mtab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %r8
+	lea	(%r8, %r10), %r10
+	jmp	*%r10
+',`
+	jmp	*(%r10,%rax,8)
+')
+
+L(mf0):	mulx(	u0, w0,
w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + lea 64(up), up + add w1, w2 + jmp L(mb0) + +L(mf3): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mov w2, (rp) + mulx( 8,(up), w0, w1) + lea 24(up), up + lea 24(rp), rp + add w3, w0 + jmp L(mb3) + +L(mf4): mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) + lea 32(up), up + lea 32(rp), rp + add w1, w2 + jmp L(mb4) + +L(mf5): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) + lea 40(up), up + lea 40(rp), rp + add w3, w0 + jmp L(mb5) + +L(mf6): mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) + lea 48(up), up + lea 48(rp), rp + add w1, w2 + jmp L(mb6) + +L(mf7): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) + lea 56(up), up + lea 56(rp), rp + add w3, w0 + jmp L(mb7) + +L(mf1): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) + lea 8(up), up + lea 8(rp), rp + add w3, w0 + jmp L(mb1) + +L(mf2): mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) + lea 16(up), up + lea 16(rp), rp + dec R32(n) + add w1, w2 + mulx( (up), w0, w1) + + ALIGN(16) +L(top): mov w2, -8(rp) + adc w3, w0 +L(mb1): mulx( 8,(up), w2, w3) + adc w1, w2 + lea 64(up), up +L(mb0): mov w0, (rp) + mov w2, 8(rp) + mulx( -48,(up), w0, w1) + lea 64(rp), rp + adc w3, w0 +L(mb7): mulx( -40,(up), w2, w3) + mov w0, -48(rp) + adc w1, w2 +L(mb6): mov w2, -40(rp) + mulx( -32,(up), w0, w1) + adc w3, w0 +L(mb5): mulx( -24,(up), w2, w3) + mov w0, -32(rp) + adc w1, w2 +L(mb4): mulx( -16,(up), w0, w1) + mov w2, -24(rp) + adc w3, w0 +L(mb3): mulx( -8,(up), w2, w3) + adc w1, w2 + mov w0, -16(rp) + dec R32(n) + mulx( (up), w0, w1) + jnz L(top) + +L(end): mov w2, -8(rp) + adc w3, w0 +C mov w0, (rp) +C adc %rcx, w1 +C mov w1, 8(rp) + + lea L(atab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %r11 + lea (%r11, %r10), %r11 +',` + mov (%r10,%rax,8), %r11 +') + mov $63, R32(%rax) + jmp *%r11 + +L(ed0): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f7): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea -64(up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov (up), w1 C up[-1] + mov 8(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + jmp L(b7) + + ALIGN(16) +L(tp0): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed0) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) +L(b0): mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp0) + +L(ed1): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f0): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea -64(up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -8(up), w3 C up[-1] + mov (up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" 
arg in C code + adcx( w3, w0) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + jmp L(b0) + + ALIGN(16) +L(tp1): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed1) +L(b1): mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp1) + +L(ed2): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f1): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea 8(un_save), un_save + lea -56(rp,un_save,8), rp + mov -16(up), w1 C up[-1] + mov -8(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) C FIXME: crossjump? + mulx( (up), w0, w1) + adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jmp L(b1) + + ALIGN(16) +L(tp2): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed2) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) +L(b2): adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp2) + +L(ed3): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f2): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + or R32(un_save), R32(n) + jz L(cor3) + lea -56(rp,un_save,8), rp + mov -24(up), w3 C up[-1] + mov -16(up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" arg in C code + adcx( w3, w0) + jmp L(b2) + + ALIGN(16) +L(tp3): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed3) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) +L(b3): mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp3) + +L(ed4): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f3): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -32(up), w1 C up[-1] + mov 
-24(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) + jmp L(b3) + + ALIGN(16) +L(tp4): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed4) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) +L(b4): mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp4) + +L(ed5): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f4): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -40(up), w3 C up[-1] + mov -32(up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" arg in C code + adcx( w3, w0) + jmp L(b4) + + ALIGN(16) +L(tp5): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed5) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) +L(b5): mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp5) + +L(ed6): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f5): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -48(up), w1 C up[-1] + mov -40(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) + jmp L(b5) + + ALIGN(16) +L(tp6): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed6) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up +L(b6): adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp6) + +L(ed7): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f6): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -56(up), w3 C up[-1] + mov -48(up), u0 C up[0] + shrx( 
%rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ mulx( -40,(up), w2, w3)
+ jmp L(b6)
+
+ ALIGN(16)
+L(tp7): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed7)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+L(b7): adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp7)
+
+L(cor3):lea -64(rp), rp
+ mov -24(up), w3 C up[-1]
+ mov -16(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ adox( 56,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 56(rp)
+ adcx( w1, w2)
+ mulx( (up), %rbx, w1)
+ adox( 64,(rp), w2)
+ adcx( w3, %rbx)
+ mov w2, 64(rp)
+ adox( 72,(rp), %rbx)
+ adox( %rcx, w1) C relies on rcx = 0
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 80(rp) C FIXME
+C wd2
+ mov -16(up), w1 C up[-1]
+ mov -8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2)
+ mulx( (up), w0, %rax)
+ adox( %rbx, w2)
+ adcx( w3, w0)
+ mov w2, 72(rp)
+ adox( 80,(rp), w0)
+ adox( %rcx, %rax) C relies on rcx = 0
+ mov w0, 80(rp)
+ adc %rcx, %rax C relies on rcx = 0
+C wd1
+ mov -8(up), w3 C up[-1]
+ mov (up), u0 C up[0]
+ sar $63, w3
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ adcx( w3, w0)
+ adox( %rax, w0)
+ mov w0, 88(rp)
+ adcx( %rcx, w1)
+ adox( %rcx, w1)
+ mov w1, 96(rp)
+
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf7), L(mtab))
+ JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
+L(atab):JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ TEXT
+EPILOGUE()
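
Editor's note on the recurring feed-in sequence: at every L(f*) entry and in L(cor3), the four instructions shrx/sarx/and/lea (with %rax = 63) rebuild the values the comments call "ci" and "u0" before the next addmul pass. The following is a rough C transcription of just that arithmetic, not GMP's actual companion code; the helper name sqr_feed_in and the uint64_t limb type are illustrative assumptions.

#include <stdint.h>

/* Hypothetical helper mirroring the asm feed-in, assuming 64-bit limbs
   and arithmetic right shift for signed types (true on the x86-64
   targets this file serves).  um1 is up[-1], u0 is up[0].  */
static inline void
sqr_feed_in (uint64_t um1, uint64_t u0, uint64_t *u0_out, uint64_t *ci)
{
  uint64_t hi  = um1 >> 63;                        /* shrx: bit lost when up[-1] was doubled */
  uint64_t msk = (uint64_t) ((int64_t) um1 >> 63); /* sarx: 0 or all ones */
  *ci = msk & u0;               /* and: carry-in is up[0] iff bit 63 of up[-1] is set */
  *u0_out = 2 * u0 + hi;        /* lea (w0,u0,2): doubled limb plus the shifted-out bit */
}

Read this way, each chunk's multiplier is the current limb doubled (for the 2*u[i]*u[j] cross products of squaring), the bit shifted out of the previous doubled limb is folded into the new multiplier, and the conditional up[0] carry-in appears to compensate for the 2^64 overflow of doubling up[-1], so no separate doubling pass over the operand is needed.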