path: root/gmp-6.3.0/mpn/x86_64/coreibwl
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/coreibwl')
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm        210
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h        246
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm           195
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm    368
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm  395
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm    710
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm    839
7 files changed, 2963 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm
new file mode 100644
index 0000000..8d3a44a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm
@@ -0,0 +1,210 @@
+dnl AMD64 mpn_addmul_1 optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 n/a
+C AMD bd2 n/a
+C AMD bd3 n/a
+C AMD bd4 ?
+C AMD zen1 ?
+C AMD zen2 ?
+C AMD zen3 1.5
+C AMD bt1 n/a
+C AMD bt2 n/a
+C Intel P4 n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL n/a
+C Intel BWL 1.67 1.74
+C Intel SKL 1.63 1.71
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Put an initial mulx before switching, targeting some free registers.
+C * Tune feed-in code.
+C * Trim nop execution after L(f2).
+C * For DOS64, fix nop execution.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl IFDOS(` define(`up', ``%rsi'') ') dnl
+dnl IFDOS(` define(`rp', ``%rcx'') ') dnl
+dnl IFDOS(` define(`vl', ``%r9'') ') dnl
+dnl IFDOS(` define(`r9', ``rdi'') ') dnl
+dnl IFDOS(` define(`n', ``%r8'') ') dnl
+dnl IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_addmul_1)
+ FUNC_ENTRY(4)
+
+ mov v0_param, %r10
+ mov n_param, n
+ mov R32(n_param), R32(%r8)
+ shr $3, n
+ and $7, R32(%r8) C clear OF, CF as side-effect
+ mov %r10, %rdx
+ lea L(tab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%r8,4), %r8
+ lea (%r8, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%r8,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(f0), L(tab))
+ JMPENT( L(f1), L(tab))
+ JMPENT( L(f2), L(tab))
+ JMPENT( L(f3), L(tab))
+ JMPENT( L(f4), L(tab))
+ JMPENT( L(f5), L(tab))
+ JMPENT( L(f6), L(tab))
+ JMPENT( L(f7), L(tab))
+ TEXT
+
+L(f0): mulx( (up), %r10, %r8)
+ lea -8(up), up
+ lea -8(rp), rp
+ lea -1(n), n
+ jmp L(b0)
+
+L(f3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea -48(rp), rp
+ jmp L(b3)
+
+L(f4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea -40(rp), rp
+ jmp L(b4)
+
+L(f5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea -32(rp), rp
+ jmp L(b5)
+
+L(f6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea -24(rp), rp
+ jmp L(b6)
+
+L(f1): mulx( (up), %r9, %rax)
+ jrcxz L(1)
+ jmp L(b1)
+L(1): add (rp), %r9
+ mov %r9, (rp)
+ adc %rcx, %rax C relies on rcx = 0
+ FUNC_EXIT()
+ ret
+
+L(end): adox( (rp), %r9)
+ mov %r9, (rp)
+ adox( %rcx, %rax) C relies on rcx = 0
+ adc %rcx, %rax C relies on rcx = 0
+ FUNC_EXIT()
+ ret
+
+ifdef(`PIC',
+` nop;nop;nop;nop',
+` nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop')
+
+L(f2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), %r9, %rax)
+
+ ALIGN(32)
+L(top): adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp)
+ jrcxz L(end)
+L(b1): mulx( 8,(up), %r10, %r8)
+ adox( (rp), %r9)
+ lea -1(n), n
+ mov %r9, (rp)
+ adcx( %rax, %r10)
+L(b0): mulx( 16,(up), %r9, %rax)
+ adcx( %r8, %r9)
+ adox( 8,(rp), %r10)
+ mov %r10, 8(rp)
+L(b7): mulx( 24,(up), %r10, %r8)
+ lea 64(up), up
+ adcx( %rax, %r10)
+ adox( 16,(rp), %r9)
+ mov %r9, 16(rp)
+L(b6): mulx( -32,(up), %r9, %rax)
+ adox( 24,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 24(rp)
+L(b5): mulx( -24,(up), %r10, %r8)
+ adcx( %rax, %r10)
+ adox( 32,(rp), %r9)
+ mov %r9, 32(rp)
+L(b4): mulx( -16,(up), %r9, %rax)
+ adox( 40,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 40(rp)
+L(b3): adox( 48,(rp), %r9)
+ mulx( -8,(up), %r10, %r8)
+ mov %r9, 48(rp)
+ lea 64(rp), rp
+ adcx( %rax, %r10)
+ mulx( (up), %r9, %rax)
+ jmp L(top)
+
+L(f7): mulx( (up), %r9, %rax)
+ lea -16(up), up
+ lea -16(rp), rp
+ jmp L(b7)
+EPILOGUE()
+ASM_END()
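
For reference, the contract implemented by the file above is rp[0..n-1] += up[0..n-1] * v0, returning the final carry limb; the jump table L(tab) merely dispatches on n mod 8 into the unrolled loop. Below is a minimal C sketch of that contract only, not the GMP implementation: the name ref_addmul_1, the 64-bit-limb typedef, and the use of the GCC/Clang unsigned __int128 extension are all illustrative assumptions.

    #include <stdint.h>

    typedef uint64_t mp_limb_t;              /* assumes 64-bit limbs, as on x86-64 */

    /* rp[0..n-1] += up[0..n-1] * v0; returns the carry out of the top limb. */
    mp_limb_t
    ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
          rp[i] = (mp_limb_t) t;             /* low 64 bits stay in rp[i] */
          cy = (mp_limb_t) (t >> 64);        /* high 64 bits carry into the next limb */
        }
      return cy;
    }

The 128-bit intermediate cannot overflow: (2^64-1)^2 plus two limbs of at most 2^64-1 is exactly 2^128-1.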
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h
new file mode 100644
index 0000000..91c91b5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h
@@ -0,0 +1,246 @@
+/* Broadwell gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */
+/* FFT tuning limit = 467,964,472 */
+/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 24
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 22
+
+#define DIV_1_VS_MUL_1_PERCENT 455
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 202
+#define MUL_TOOM6H_THRESHOLD 303
+#define MUL_TOOM8H_THRESHOLD 406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 141
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 152
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 198
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 34
+#define SQR_TOOM3_THRESHOLD 117
+#define SQR_TOOM4_THRESHOLD 336
+#define SQR_TOOM6_THRESHOLD 426
+#define SQR_TOOM8_THRESHOLD 547
+
+#define MULMID_TOOM42_THRESHOLD 46
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 18
+
+#define MUL_FFT_MODF_THRESHOLD 460 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \
+ { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 39, 8}, \
+ { 79,10}, { 23, 9}, { 55,11}, { 15,10}, \
+ { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \
+ { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \
+ { 87,11}, { 47,10}, { 103,12}, { 31,11}, \
+ { 63,10}, { 135,11}, { 79,10}, { 167,11}, \
+ { 95,10}, { 199,11}, { 111,12}, { 63, 8}, \
+ { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,12}, { 95,11}, { 191,10}, { 383,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,11}, { 367,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,11}, { 447,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
+ { 255,12}, { 607,13}, { 319,12}, { 735,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
+ { 1151,13}, { 639,12}, { 1279,13}, { 703,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 959,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \
+ { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \
+ { 2687,13}, { 1407,14}, { 767,13}, { 1535,12}, \
+ { 3071,13}, { 1599,12}, { 3199,13}, { 1663,14}, \
+ { 895,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \
+ { 3199,14}, { 1663,13}, { 3455,12}, { 6911,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \
+ { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
+ { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \
+ { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \
+ { 2559,14}, { 5247,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \
+ { 8703,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,14}, { 15359,17}, { 2047,16}, { 4095,15}, \
+ { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 219
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 400, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 127,11}, { 79,10}, \
+ { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
+ { 575,10}, { 303,11}, { 159,10}, { 319,12}, \
+ { 95, 8}, { 1599, 9}, { 831,11}, { 223,10}, \
+ { 447,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,11}, { 367,10}, { 735,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 543,12}, { 287,11}, { 607,12}, \
+ { 319,11}, { 671,12}, { 351,11}, { 735,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 607,13}, \
+ { 319,12}, { 735,13}, { 383,12}, { 799,13}, \
+ { 447,12}, { 959,13}, { 511,12}, { 1023,13}, \
+ { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \
+ { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \
+ { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1791,16}, { 511,15}, \
+ { 1023,14}, { 2047,13}, { 4095,14}, { 2175,13}, \
+ { 4351,14}, { 2303,13}, { 4607,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4351,15}, { 2303,14}, { 4863,15}, { 2559,14}, \
+ { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \
+ { 2047,16}, { 4095,15}, { 8447,16}, { 4607,15}, \
+ { 9471,14}, { 18943,15}, { 9983,14}, { 19967,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 215
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 80
+#define MULLO_MUL_N_THRESHOLD 11025
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 109
+#define SQRLO_SQR_THRESHOLD 7293
+
+#define DC_DIV_QR_THRESHOLD 54
+#define DC_DIVAPPR_Q_THRESHOLD 183
+#define DC_BDIV_QR_THRESHOLD 86
+#define DC_BDIV_Q_THRESHOLD 160
+
+#define INV_MULMOD_BNM1_THRESHOLD 58
+#define INV_NEWTON_THRESHOLD 171
+#define INV_APPR_THRESHOLD 171
+
+#define BINV_NEWTON_THRESHOLD 292
+#define REDC_1_TO_REDC_2_THRESHOLD 33
+#define REDC_2_TO_REDC_N_THRESHOLD 63
+
+#define MU_DIV_QR_THRESHOLD 1589
+#define MU_DIVAPPR_Q_THRESHOLD 1589
+#define MUPI_DIV_QR_THRESHOLD 67
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1866
+
+#define POWM_SEC_TABLE 2,10,191,494,712,1378
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 644
+#define SET_STR_PRECOMPUTE_THRESHOLD 1658
+
+#define FAC_DSC_THRESHOLD 562
+#define FAC_ODD_THRESHOLD 48
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD2_DIV1_METHOD 5 /* 0.38% faster than 3 */
+#define HGCD_THRESHOLD 73
+#define HGCD_APPR_THRESHOLD 67
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 630
+#define GCDEXT_DC_THRESHOLD 365
+#define JACOBI_BASE_METHOD 1 /* 29.65% faster than 4 */
+
+/* Tuneup completed successfully, took 239050 seconds */
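
The *_THRESHOLD values above are crossover points, measured in limbs, at which GMP's generic code switches from one algorithm to the next for this particular CPU. A hedged sketch of how such a cutoff is typically consumed follows; the function names are hypothetical stand-ins, not GMP's internal API.

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* Hypothetical helpers standing in for the basecase and Toom-22 routines. */
    void my_mul_basecase (mp_limb_t *rp, const mp_limb_t *ap, const mp_limb_t *bp, long n);
    void my_mul_toom22   (mp_limb_t *rp, const mp_limb_t *ap, const mp_limb_t *bp, long n);

    #define MY_MUL_TOOM22_THRESHOLD 26      /* the Broadwell value tuned above */

    /* Multiply two n-limb numbers, picking the algorithm from the tuned cutoff. */
    void
    my_mul_n (mp_limb_t *rp, const mp_limb_t *ap, const mp_limb_t *bp, long n)
    {
      if (n < MY_MUL_TOOM22_THRESHOLD)
        my_mul_basecase (rp, ap, bp, n);    /* O(n^2) schoolbook wins for small n */
      else
        my_mul_toom22 (rp, ap, bp, n);      /* Karatsuba-style split wins past the cutoff */
    }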
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm
new file mode 100644
index 0000000..b7fae2f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm
@@ -0,0 +1,195 @@
+dnl AMD64 mpn_mul_1 optimised for Intel Broadwell.
+
+dnl Copyright 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bull -
+C AMD pile -
+C AMD steam -
+C AMD excavator -
+C AMD bobcat -
+C AMD jaguar -
+C Intel P4 -
+C Intel core2 -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL 1.70
+C Intel BWL 1.51
+C Intel SKL 1.52
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Put an initial mulx before switching, targeting some free registers.
+C * Tune feed-in code.
+C * Trim nop execution after L(f2).
+C * Port to DOS64, not forgetting nop execution.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+
+dnl ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl IFDOS(` define(`up', ``%rsi'') ') dnl
+dnl IFDOS(` define(`rp', ``%rcx'') ') dnl
+dnl IFDOS(` define(`vl', ``%r9'') ') dnl
+dnl IFDOS(` define(`r9', ``rdi'') ') dnl
+dnl IFDOS(` define(`n', ``%r8'') ') dnl
+dnl IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_1)
+
+ mov v0_param, %r10
+ mov n_param, n
+ mov R32(n_param), R32(%r8)
+ shr $3, n
+ and $7, R32(%r8) C clear OF, CF as side-effect
+ mov %r10, %rdx
+ lea L(tab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%r8,4), %r8
+ lea (%r8, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%r8,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(f0), L(tab))
+ JMPENT( L(f1), L(tab))
+ JMPENT( L(f2), L(tab))
+ JMPENT( L(f3), L(tab))
+ JMPENT( L(f4), L(tab))
+ JMPENT( L(f5), L(tab))
+ JMPENT( L(f6), L(tab))
+ JMPENT( L(f7), L(tab))
+ TEXT
+
+L(f0): mulx( (up), %r10, %r8)
+ lea 56(up), up
+ lea -8(rp), rp
+ jmp L(b0)
+
+L(f3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea 16(rp), rp
+ inc n
+ jmp L(b3)
+
+L(f4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea 24(rp), rp
+ inc n
+ jmp L(b4)
+
+L(f5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea 32(rp), rp
+ inc n
+ jmp L(b5)
+
+L(f6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea 40(rp), rp
+ inc n
+ jmp L(b6)
+
+L(f7): mulx( (up), %r9, %rax)
+ lea 48(up), up
+ lea 48(rp), rp
+ inc n
+ jmp L(b7)
+
+L(f1): mulx( (up), %r9, %rax)
+ test n, n
+ jnz L(b1)
+L(1): mov %r9, (rp)
+ ret
+
+L(f2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), %r9, %rax)
+ test n, n
+ jz L(end)
+
+ ALIGN(32)
+L(top): mov %r10, -8(rp)
+ adc %r8, %r9
+L(b1): mulx( 8,(up), %r10, %r8)
+ adc %rax, %r10
+ lea 64(up), up
+ mov %r9, (rp)
+L(b0): mov %r10, 8(rp)
+ mulx( -48,(up), %r9, %rax)
+ lea 64(rp), rp
+ adc %r8, %r9
+L(b7): mulx( -40,(up), %r10, %r8)
+ mov %r9, -48(rp)
+ adc %rax, %r10
+L(b6): mov %r10, -40(rp)
+ mulx( -32,(up), %r9, %rax)
+ adc %r8, %r9
+L(b5): mulx( -24,(up), %r10, %r8)
+ mov %r9, -32(rp)
+ adc %rax, %r10
+L(b4): mulx( -16,(up), %r9, %rax)
+ mov %r10, -24(rp)
+ adc %r8, %r9
+L(b3): mulx( -8,(up), %r10, %r8)
+ adc %rax, %r10
+ mov %r9, -16(rp)
+ dec n
+ mulx( (up), %r9, %rax)
+ jnz L(top)
+
+L(end): mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ adc %rcx, %rax
+ ret
+EPILOGUE()
+ASM_END()
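
The contract of mpn_mul_1, for reference, is rp[0..n-1] = up[0..n-1] * v0 with the most significant limb of the full product returned. A minimal C sketch of that contract (illustrative, not the GMP code; same 64-bit-limb and unsigned __int128 assumptions as the earlier addmul_1 sketch):

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* rp[0..n-1] = up[0..n-1] * v0; returns the high (carry) limb. */
    mp_limb_t
    ref_mul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v0 + cy;
          rp[i] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
      return cy;
    }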
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm
new file mode 100644
index 0000000..7ca5a9b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm
@@ -0,0 +1,368 @@
+dnl AMD64 mpn_mul_basecase optimised for Intel Broadwell.
+
+dnl Copyright 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 addmul_1
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bd1 n/a n/a
+C AMD bd2 n/a n/a
+C AMD bd3 n/a n/a
+C AMD bd4 ? ?
+C AMD zen ? ?
+C AMD bt1 n/a n/a
+C AMD bt2 n/a n/a
+C Intel P4 n/a n/a
+C Intel PNR n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.68 n/a
+C Intel BWL 1.51 1.67-1.74
+C Intel SKL 1.52 1.63-1.71
+C Intel atom n/a n/a
+C Intel SLM n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Do overlapped software pipelining.
+C * When changing this, make sure the code which falls into the inner loops
+C does not execute too many no-ops (for both PIC and non-PIC).
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp_param',`%rcx')
+define(`vn', `%r8')
+
+define(`n', `%rcx')
+define(`n_save', `%rbp')
+define(`vp', `%r14')
+define(`unneg', `%rbx')
+define(`v0', `%rdx')
+define(`jaddr', `%rax')
+
+define(`w0', `%r12')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ cmp $2, un_param
+ ja L(gen)
+ mov (vp_param), %rdx
+ mulx( (up), %rax, %r9) C 0 1
+ je L(s2x)
+
+L(s11): mov %rax, (rp)
+ mov %r9, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(s2x): cmp $2, vn
+ mulx( 8,(up), %r8, %r10) C 1 2
+ je L(s22)
+
+L(s21): add %r8, %r9
+ adc $0, %r10
+ mov %rax, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ FUNC_EXIT()
+ ret
+
+L(s22): add %r8, %r9 C 1
+ adc $0, %r10 C 2
+ mov 8(vp_param), %rdx
+ mov %rax, (rp)
+ mulx( (up), %r8, %r11) C 1 2
+ mulx( 8,(up), %rax, %rdx) C 2 3
+ add %r11, %rax C 2
+ adc $0, %rdx C 3
+ add %r8, %r9 C 1
+ adc %rax, %r10 C 2
+ adc $0, %rdx C 3
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(gen):
+ push %rbx
+ push %rbp
+ push %r12
+ push %r14
+
+ mov vp_param, vp
+ lea 1(un_param), unneg
+ mov un_param, n_save
+ mov R32(un_param), R32(%rax)
+ and $-8, unneg
+ shr $3, n_save C loop count
+ neg unneg
+ and $7, R32(%rax) C clear CF for adc as side-effect
+ C note that rax lives very long
+ mov n_save, n
+ mov (vp), v0
+ lea 8(vp), vp
+
+ lea L(mtab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %r11
+ lea (%r11, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+
+L(mf0): mulx( (up), w2, w3)
+ lea 56(up), up
+ lea -8(rp), rp
+ jmp L(mb0)
+
+L(mf3): mulx( (up), w0, w1)
+ lea 16(up), up
+ lea 16(rp), rp
+ inc n
+ jmp L(mb3)
+
+L(mf4): mulx( (up), w2, w3)
+ lea 24(up), up
+ lea 24(rp), rp
+ inc n
+ jmp L(mb4)
+
+L(mf5): mulx( (up), w0, w1)
+ lea 32(up), up
+ lea 32(rp), rp
+ inc n
+ jmp L(mb5)
+
+L(mf6): mulx( (up), w2, w3)
+ lea 40(up), up
+ lea 40(rp), rp
+ inc n
+ jmp L(mb6)
+
+L(mf7): mulx( (up), w0, w1)
+ lea 48(up), up
+ lea 48(rp), rp
+ inc n
+ jmp L(mb7)
+
+L(mf1): mulx( (up), w0, w1)
+ jmp L(mb1)
+
+L(mf2): mulx( (up), w2, w3)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), w0, w1)
+
+ ALIGN(16)
+L(m1top):
+ mov w2, -8(rp)
+ adc w3, w0
+L(mb1): mulx( 8,(up), w2, w3)
+ adc w1, w2
+ lea 64(up), up
+ mov w0, (rp)
+L(mb0): mov w2, 8(rp)
+ mulx( -48,(up), w0, w1)
+ lea 64(rp), rp
+ adc w3, w0
+L(mb7): mulx( -40,(up), w2, w3)
+ mov w0, -48(rp)
+ adc w1, w2
+L(mb6): mov w2, -40(rp)
+ mulx( -32,(up), w0, w1)
+ adc w3, w0
+L(mb5): mulx( -24,(up), w2, w3)
+ mov w0, -32(rp)
+ adc w1, w2
+L(mb4): mulx( -16,(up), w0, w1)
+ mov w2, -24(rp)
+ adc w3, w0
+L(mb3): mulx( -8,(up), w2, w3)
+ adc w1, w2
+ mov w0, -16(rp)
+ dec n
+ mulx( (up), w0, w1)
+ jnz L(m1top)
+
+L(m1end):
+ mov w2, -8(rp)
+ adc w3, w0
+ mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+
+ dec vn
+ jz L(done)
+
+ lea L(atab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax, %r10), jaddr
+',`
+ mov (%r10,%rax,8), jaddr
+')
+
+L(outer):
+ lea (up,unneg,8), up
+ mov n_save, n
+ mov (vp), v0
+ lea 8(vp), vp
+ jmp *jaddr
+
+L(f0): mulx( 8,(up), w2, w3)
+ lea 8(rp,unneg,8), rp
+ lea -1(n), n
+ jmp L(b0)
+
+L(f3): mulx( -16,(up), w0, w1)
+ lea -56(rp,unneg,8), rp
+ jmp L(b3)
+
+L(f4): mulx( -24,(up), w2, w3)
+ lea -56(rp,unneg,8), rp
+ jmp L(b4)
+
+L(f5): mulx( -32,(up), w0, w1)
+ lea -56(rp,unneg,8), rp
+ jmp L(b5)
+
+L(f6): mulx( -40,(up), w2, w3)
+ lea -56(rp,unneg,8), rp
+ jmp L(b6)
+
+L(f7): mulx( 16,(up), w0, w1)
+ lea 8(rp,unneg,8), rp
+ jmp L(b7)
+
+L(f1): mulx( (up), w0, w1)
+ lea 8(rp,unneg,8), rp
+ jmp L(b1)
+
+L(am1end):
+ adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+ mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+
+ dec vn C clear OF as side-effect
+ jnz L(outer)
+L(done):
+ pop %r14
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(f2): mulx( -8,(up), w2, w3)
+ lea 8(rp,unneg,8), rp
+ mulx( (up), w0, w1)
+
+ ALIGN(16)
+L(am1top):
+ adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(am1end)
+L(b1): mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea -1(n), n
+ mov w0, (rp)
+ adcx( w1, w2)
+L(b0): mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+L(b7): mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+L(b6): mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+L(b5): mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+L(b4): mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+L(b3): adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(am1top)
+
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
+ JMPENT( L(mf7), L(mtab))
+L(atab):JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ TEXT
+EPILOGUE()
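
Structurally, the file above is one mul_1-style pass for vp[0] (the L(m1top) loop) followed by one addmul_1-style pass per remaining limb of vp (the L(am1top) loop), with the L(mtab)/L(atab) jump tables handling the feed-in for each residue of un mod 8. The same shape in C, as a sketch that reuses the ref_mul_1 and ref_addmul_1 sketches given earlier:

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* The single-limb primitives sketched after addmul_1.asm and mul_1.asm above. */
    mp_limb_t ref_mul_1    (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0);
    mp_limb_t ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0);

    /* rp[0..un+vn-1] = {up,un} * {vp,vn}; requires un >= vn >= 1 and rp disjoint. */
    void
    ref_mul_basecase (mp_limb_t *rp, const mp_limb_t *up, long un,
                      const mp_limb_t *vp, long vn)
    {
      rp[un] = ref_mul_1 (rp, up, un, vp[0]);              /* first row: plain product */
      for (long j = 1; j < vn; j++)
        rp[un + j] = ref_addmul_1 (rp + j, up, un, vp[j]); /* later rows accumulate */
    }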
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
new file mode 100644
index 0000000..5cdb209
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
@@ -0,0 +1,395 @@
+dnl AMD64 mpn_mullo_basecase optimised for Intel Broadwell.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r11')
+define(`jmpreg',`%rbx')
+define(`nn', `%rbp')
+
+C TODO
+C * Suppress more rp[] rewrites in corner.
+C * Rearrange feed-in jumps for short branch forms.
+C * Perhaps roll out the heavy artillery and 8-way unroll outer loop. Since
+C feed-in code implodes, the blow-up will not be more than perhaps 4x.
+C * Micro-optimise critical lead-in code block around L(ent).
+C * Write n < 4 code specifically for Broadwell (current code is for Haswell).
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+ cmp $4, R32(n)
+ jae L(big)
+
+ mov vp_param, vp
+ mov (up), %rdx
+
+ cmp $2, R32(n)
+ jae L(gt1)
+L(n1): imul (vp), %rdx
+ mov %rdx, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp), %r9
+ mulx( %r9, %rax, %rdx)
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp), %r9
+ mulx( %r9, %rax, %r10) C u0 x v0
+ mov %rax, (rp)
+ mov 8(up), %rdx
+ mulx( %r9, %rax, %rdx) C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r8
+ mov (up), %rdx
+ mulx( %r8, %rax, %rdx) C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r8 C u1 x v1
+ add %r8, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(big): push %r14
+ push %r12
+ push %rbx
+ push %rbp
+ mov -8(vp_param,n,8), %r14 C FIXME Put at absolute end
+ imul (up), %r14 C FIXME Put at absolute end
+ lea -3(n), R32(nn)
+ lea 8(vp_param), vp
+ mov (vp_param), %rdx
+
+ mov R32(n), R32(%rax)
+ shr $3, R32(n)
+ and $7, R32(%rax) C clear OF, CF as side-effect
+ lea L(mtab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+
+L(mf0): mulx( (up), %r10, %r8)
+ lea 56(up), up
+ lea -8(rp), rp
+ lea L(f7)(%rip), jmpreg
+ jmp L(mb0)
+
+L(mf3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea 16(rp), rp
+ jrcxz L(mc)
+ inc R32(n)
+ lea L(f2)(%rip), jmpreg
+ jmp L(mb3)
+
+L(mc): mulx( -8,(up), %r10, %r8)
+ add %rax, %r10
+ mov %r9, -16(rp)
+ mulx( (up), %r9, %rax)
+ mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ jmp L(c2)
+
+L(mf4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea 24(rp), rp
+ inc R32(n)
+ lea L(f3)(%rip), jmpreg
+ jmp L(mb4)
+
+L(mf5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea 32(rp), rp
+ inc R32(n)
+ lea L(f4)(%rip), jmpreg
+ jmp L(mb5)
+
+L(mf6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea 40(rp), rp
+ inc R32(n)
+ lea L(f5)(%rip), jmpreg
+ jmp L(mb6)
+
+L(mf7): mulx( (up), %r9, %rax)
+ lea 48(up), up
+ lea 48(rp), rp
+ lea L(f6)(%rip), jmpreg
+ jmp L(mb7)
+
+L(mf1): mulx( (up), %r9, %rax)
+ lea L(f0)(%rip), jmpreg
+ jmp L(mb1)
+
+L(mf2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ lea L(f1)(%rip), jmpreg
+ mulx( (up), %r9, %rax)
+
+C FIXME ugly fallthrough FIXME
+ ALIGN(32)
+L(mtop):mov %r10, -8(rp)
+ adc %r8, %r9
+L(mb1): mulx( 8,(up), %r10, %r8)
+ adc %rax, %r10
+ lea 64(up), up
+ mov %r9, (rp)
+L(mb0): mov %r10, 8(rp)
+ mulx( -48,(up), %r9, %rax)
+ lea 64(rp), rp
+ adc %r8, %r9
+L(mb7): mulx( -40,(up), %r10, %r8)
+ mov %r9, -48(rp)
+ adc %rax, %r10
+L(mb6): mov %r10, -40(rp)
+ mulx( -32,(up), %r9, %rax)
+ adc %r8, %r9
+L(mb5): mulx( -24,(up), %r10, %r8)
+ mov %r9, -32(rp)
+ adc %rax, %r10
+L(mb4): mulx( -16,(up), %r9, %rax)
+ mov %r10, -24(rp)
+ adc %r8, %r9
+L(mb3): mulx( -8,(up), %r10, %r8)
+ adc %rax, %r10
+ mov %r9, -16(rp)
+ dec R32(n)
+ mulx( (up), %r9, %rax)
+ jnz L(mtop)
+
+L(mend):mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ adc %rcx, %rax
+
+ lea 8(,nn,8), %r12
+ neg %r12
+ shr $3, R32(nn)
+ jmp L(ent)
+
+L(f0): mulx( (up), %r10, %r8)
+ lea -8(up), up
+ lea -8(rp), rp
+ lea L(f7)(%rip), jmpreg
+ jmp L(b0)
+
+L(f1): mulx( (up), %r9, %rax)
+ lea -1(nn), R32(nn)
+ lea L(f0)(%rip), jmpreg
+ jmp L(b1)
+
+L(end): adox( (rp), %r9)
+ mov %r9, (rp)
+ adox( %rcx, %rax) C relies on rcx = 0
+ adc %rcx, %rax C FIXME suppress, use adc below; reqs ent path edits
+ lea 8(%r12), %r12
+L(ent): mulx( 8,(up), %r10, %r8) C r8 unused (use imul?)
+ add %rax, %r14
+ add %r10, %r14 C h
+ lea (up,%r12), up C reset up
+ lea 8(rp,%r12), rp C reset rp
+ mov (vp), %rdx
+ lea 8(vp), vp
+ or R32(nn), R32(n) C copy count, clear CF,OF (n = 0 prior)
+ jmp *jmpreg
+
+L(f7): mulx( (up), %r9, %rax)
+ lea -16(up), up
+ lea -16(rp), rp
+ lea L(f6)(%rip), jmpreg
+ jmp L(b7)
+
+L(f2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), %r9, %rax)
+ lea L(f1)(%rip), jmpreg
+
+C FIXME ugly fallthrough FIXME
+ ALIGN(32)
+L(top): adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp)
+ jrcxz L(end)
+L(b1): mulx( 8,(up), %r10, %r8)
+ adox( (rp), %r9)
+ lea -1(n), R32(n)
+ mov %r9, (rp)
+ adcx( %rax, %r10)
+L(b0): mulx( 16,(up), %r9, %rax)
+ adcx( %r8, %r9)
+ adox( 8,(rp), %r10)
+ mov %r10, 8(rp)
+L(b7): mulx( 24,(up), %r10, %r8)
+ lea 64(up), up
+ adcx( %rax, %r10)
+ adox( 16,(rp), %r9)
+ mov %r9, 16(rp)
+L(b6): mulx( -32,(up), %r9, %rax)
+ adox( 24,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 24(rp)
+L(b5): mulx( -24,(up), %r10, %r8)
+ adcx( %rax, %r10)
+ adox( 32,(rp), %r9)
+ mov %r9, 32(rp)
+L(b4): mulx( -16,(up), %r9, %rax)
+ adox( 40,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 40(rp)
+L(b3): adox( 48,(rp), %r9)
+ mulx( -8,(up), %r10, %r8)
+ mov %r9, 48(rp)
+ lea 64(rp), rp
+ adcx( %rax, %r10)
+ mulx( (up), %r9, %rax)
+ jmp L(top)
+
+L(f6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea -24(rp), rp
+ lea L(f5)(%rip), jmpreg
+ jmp L(b6)
+
+L(f5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea -32(rp), rp
+ lea L(f4)(%rip), jmpreg
+ jmp L(b5)
+
+L(f4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea -40(rp), rp
+ lea L(f3)(%rip), jmpreg
+ jmp L(b4)
+
+L(f3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea -48(rp), rp
+ jrcxz L(cor)
+ lea L(f2)(%rip), jmpreg
+ jmp L(b3)
+
+L(cor): adox( 48,(rp), %r9)
+ mulx( -8,(up), %r10, %r8)
+ mov %r9, 48(rp)
+ lea 64(rp), rp
+ adcx( %rax, %r10)
+ mulx( (up), %r9, %rax)
+ adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp) C FIXME suppress
+ adox( (rp), %r9)
+ mov %r9, (rp) C FIXME suppress
+ adox( %rcx, %rax)
+L(c2):
+ mulx( 8,(up), %r10, %r8)
+ adc %rax, %r14
+ add %r10, %r14
+ mov (vp), %rdx
+ test R32(%rcx), R32(%rcx)
+ mulx( -16,(up), %r10, %r8)
+ mulx( -8,(up), %r9, %rax)
+ adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp)
+ adox( (rp), %r9)
+ adox( %rcx, %rax)
+ adc %rcx, %rax
+ mulx( (up), %r10, %r8)
+ add %rax, %r14
+ add %r10, %r14
+ mov 8(vp), %rdx
+ mulx( -16,(up), %rcx, %rax)
+ add %r9, %rcx
+ mov %rcx, (rp)
+ adc $0, %rax
+ mulx( -8,(up), %r10, %r8)
+ add %rax, %r14
+ add %r10, %r14
+ mov %r14, 8(rp)
+ pop %rbp
+ pop %rbx
+ pop %r12
+ pop %r14
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf7), L(mtab))
+ JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
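
mpn_mullo_basecase returns only the low n limbs of the product, i.e. {up,n} * {vp,n} mod B^n with B = 2^64, which is why successive rows above get shorter and why the last few columns are handled by the special L(cor) corner code. A C sketch of that contract, again reusing the earlier ref_mul_1/ref_addmul_1 sketches:

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    mp_limb_t ref_mul_1    (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0);
    mp_limb_t ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0);

    /* rp[0..n-1] = low n limbs of {up,n} * {vp,n}; the high half is never formed. */
    void
    ref_mullo_basecase (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, long n)
    {
      ref_mul_1 (rp, up, n, vp[0]);               /* carry out of limb n-1 is discarded */
      for (long j = 1; j < n; j++)
        ref_addmul_1 (rp + j, up, n - j, vp[j]);  /* only columns below n contribute */
    }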
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
new file mode 100644
index 0000000..ff35124
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
@@ -0,0 +1,710 @@
+dnl AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 addmul_1
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bd1 n/a n/a
+C AMD bd2 n/a n/a
+C AMD bd3 n/a n/a
+C AMD bd4 ? ?
+C AMD zn1 ? ?
+C AMD zn2 ? ?
+C AMD zn3 ? ?
+C AMD bt1 n/a n/a
+C AMD bt2 n/a n/a
+C Intel P4 n/a n/a
+C Intel PNR n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.68 n/a
+C Intel BWL 1.51 1.67-1.74
+C Intel SKL 1.52 1.63-1.71
+C Intel atom n/a n/a
+C Intel SLM n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Do overlapped software pipelining.
+C * Reduce register use, i.e., by combining n_neg and n_save.
+C * Suppress initial store through up, it's always a zero.
+C * Streamline up and dp setup.
+C * When changing this, make sure the code which falls into the inner loops
+C does not execute too many no-ops (for both PIC and non-PIC).
+
+dnl mp_limb_t
+dnl mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+dnl mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+
+define(`up', `%rdi')
+define(`un', `%rsi')
+define(`dp_param',`%rdx')
+define(`dn_param',`%rcx')
+define(`dinv', `%r8')
+
+define(`n', `%rcx')
+define(`n_save', `%rbp')
+define(`dp', `%r14')
+define(`n_neg', `%rbx')
+define(`q', `%rdx')
+define(`jaddr', `%rax')
+
+define(`w0', `%r12')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ifdef(`MAX_SPECIAL',,`
+define(`MAX_SPECIAL', 8)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+
+ lea L(atab)(%rip), %r10
+
+ cmp $MAX_SPECIAL, dn_param
+ jbe L(sma)
+
+ifelse(MAX_SPECIAL,8,,`
+forloop(i,eval(MAX_SPECIAL+1),9,`L(i):
+')')
+
+L(gen): push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+
+ sub dn_param, un C outer loop count
+
+ lea -8(,dn_param,8), n_neg
+ neg n_neg
+ mov dn_param, n_save
+ mov R32(dn_param), R32(%rax)
+ shr $3, n_save C loop count
+ and $7, R32(%rax) C clear CF and OF as side-effect
+
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax,%r10), jaddr
+',`
+ mov (%r10,%rax,8), jaddr
+')
+ mov (up), q
+ imul dinv, q
+ jmp L(outer)
+
+L(f0): mulx( (dp), w2, w3)
+ lea -1(n), n
+ mulx( 8,(dp), w0, w1)
+ lea -8(dp), dp
+ adcx( w3, w0)
+ adox( (up), w2)
+ lea -8(up), up
+ jmp L(b0x)
+
+L(f3): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -48(up), up
+ lea 16(dp), dp
+ jmp L(b3x)
+
+L(f4): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 24(dp), dp
+ adox( (up), w2)
+ lea -40(up), up
+ adcx( w3, w0)
+ jmp L(b4x)
+
+L(f5): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ lea 32(dp), dp
+ adcx( w1, w2)
+ adox( (up), w0)
+ lea -32(up), up
+ jmp L(b5x)
+
+L(f6): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 40(dp), dp
+ adox( (up), w2)
+ lea -24(up), up
+ adcx( w3, w0)
+ jmp L(b6x)
+
+L(f7): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ lea 48(dp), dp
+ adcx( w1, w2)
+ adox( (up), w0)
+ lea -16(up), up
+ jmp L(b7x)
+
+L(f1): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -1(n), n
+ jmp L(b1x)
+
+L(f2): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 8(dp), dp
+ adox( (up), w2)
+ lea 8(up), up
+ adcx( w3, w0)
+ jmp L(b2x)
+
+L(end): adox( (up), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+ mov w0, (up)
+ adc %rcx, w1 C relies on rcx = 0
+ mov 8(up,n_neg), q C Compute next quotient early...
+ mulx( dinv, q, %r12) C ...(unused in last iteration)
+ bt $0, R32(%r13)
+ adc w1, 8(up)
+ setc R8(%r13)
+ dec un C clear OF as side-effect
+ jz L(done)
+
+ lea (dp,n_neg), dp C reset dp to D[]'s beginning
+ lea 8(up,n_neg), up C point up to U[]'s current beginning
+L(outer):
+ mov n_save, n
+ test %eax, %eax C clear CF and OF
+ jmp *jaddr
+
+ ALIGN(16)
+L(top): adox( -8,(up), w2)
+ adcx( w3, w0)
+ mov w2, -8(up)
+ jrcxz L(end)
+L(b2x): mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -1(n), n
+ mov w0, (up)
+L(b1x): adcx( w1, w2)
+ mulx( 16,(dp), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(up), w2)
+ mov w2, 8(up)
+L(b0x): mulx( 24,(dp), w2, w3)
+ lea 64(dp), dp
+ adcx( w1, w2)
+ adox( 16,(up), w0)
+ mov w0, 16(up)
+L(b7x): mulx( -32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov w2, 24(up)
+L(b6x): mulx( -24,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(up), w0)
+ mov w0, 32(up)
+L(b5x): mulx( -16,(dp), w0, w1)
+ adox( 40,(up), w2)
+ adcx( w3, w0)
+ mov w2, 40(up)
+L(b4x): adox( 48,(up), w0)
+ mulx( -8,(dp), w2, w3)
+ mov w0, 48(up)
+L(b3x): lea 64(up), up
+ adcx( w1, w2)
+ mulx( (dp), w0, w1)
+ jmp L(top)
+
+L(done):mov %r13, %rax
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(sma):
+ifdef(`PIC',
+` movslq 28(%r10,dn_param,4), %rax
+ lea (%rax,%r10), jaddr
+',`
+ mov 56(%r10,dn_param,8), jaddr
+')
+ jmp *jaddr
+
+L(1): mov (dp_param), %r10
+ xor R32(%rax), R32(%rax)
+ mov (up), %rdx
+ dec un
+ mov %rdx, %r9
+L(o1): mulx( dinv, %rdx, %r11) C next quotient
+ lea 8(up), up
+ mulx( %r10, %rcx, %rdx) C 0 1
+ add %r9, %rcx C 0
+ adc %rax, %rdx C 1
+ add (up), %rdx C 1
+ setc R8(%rax) C 2
+ mov %rdx, %r9 C 1
+ dec un
+ jnz L(o1)
+ mov %r9, (up)
+
+ FUNC_EXIT()
+ ret
+
+ifdef(`VER',,`define(`VER',1)')
+L(2): push %r12
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ sub dn_param, un C loop count
+ mov (up), q
+ imul dinv, q
+
+ifelse(VER,0,`
+ xor R32(%rax), R32(%rax)
+L(o2): test %eax, %eax C clear CF and OF
+ mulx( (dp), w2, w3) C 0 1
+ mulx( 8,(dp), %rdx, w1) C 1 2
+ add (up), w2 C 0
+ adc 8(up), %rdx C 1
+ adc $0, w1 C 2 cannot carry further
+ add w3, %rdx C 1
+ mov %rdx, 8(up) C 1
+ adc $0, w1 C 2
+ imul dinv, q C
+ bt $0, R32(%rax)
+ adc 16(up), w1 C 2
+ mov w1, 16(up)
+ setc R8(%rax)
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+')
+ifelse(VER,1,`
+ push %rbx
+ push %r13
+ xor R32(%r13), R32(%r13)
+ mov (up), %rax
+ mov 8(up), %rbx
+L(o2): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3) C 0 1
+ mulx( 8,(dp), %rdx, w1) C 1 2
+ adox( %rax, w2) C 0
+ adcx( w3, %rdx) C 1
+ adox( %rbx, %rdx) C 1
+ adox( %rcx, w1) C 2 cannot carry further
+ mov %rdx, %rax C 1
+ adc %rcx, w1 C 2
+ imul dinv, q C
+ bt $0, R32(%r13)
+ adc 16(up), w1 C 2
+ mov w1, %rbx
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %rax, (up)
+ mov %rbx, 8(up)
+ mov %r13, %rax
+ pop %r13
+ pop %rbx
+')
+ifelse(VER,2,`
+ xor R32(%rax), R32(%rax)
+ mov (up), %r10
+ mov 8(up), %r9
+L(o2): mulx( (dp), %r12, %r11)
+ mulx( 8,(dp), %rdx, %rcx)
+ add %r11, %rdx C 1
+ adc $0, %rcx C 2
+ add %r10, %r12 C 0 add just to produce carry
+ adc %r9, %rdx C 1
+ mov %rdx, %r10 C 1
+ mulx( dinv, %rdx, %r12) C next quotient
+ adc %rax, %rcx C 2
+ setc R8(%rax) C 3
+ mov 16(up), %r9 C 2
+ add %rcx, %r9 C 2
+ adc $0, R32(%rax) C 3
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %r10, (up)
+ mov %r9, 8(up)
+')
+ifelse(VER,3,`
+ xor R32(%rax), R32(%rax)
+ mov (up), %r10
+ mov 8(up), %r9
+L(o2): mulx( (dp), %r12, %r11)
+ add %r10, %r12 C 0 add just to produce carry
+ mulx( 8,(dp), %rdx, %rcx)
+ adc %r11, %rdx C 1
+ adc $0, %rcx C 2
+ add %r9, %rdx C 1
+ mov %rdx, %r10 C 1
+ mulx( dinv, %rdx, %r12) C next quotient
+ adc %rax, %rcx C 2
+ setc R8(%rax) C 3
+ mov 16(up), %r9 C 2
+ add %rcx, %r9 C 2
+ adc $0, R32(%rax) C 3
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %r10, (up)
+ mov %r9, 8(up)
+')
+ pop %r14
+ pop %r12
+ FUNC_EXIT()
+ ret
+
+ifelse(eval(MAX_SPECIAL>=3),1,`
+L(3): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o3): xor R32(%rcx), R32(%rcx) C clear rcx, CF, and OF
+ mulx( (dp), w0, w1) C 0 1
+ adox( %rax, w0) C 0
+ mulx( 8,(dp), %rax, w3) C 1 2
+ adcx( w1, %rax) C 1
+ adox( %rbx, %rax) C 1
+ mulx( 16,(dp), %rbx, w1) C 2 3
+ mov dinv, q C 1
+ mulx( %rax, q, w0)
+ adcx( w3, %rbx) C 2
+ adox( 16,(up), %rbx) C 2
+ adox( %rcx, w1) C 3
+ adc $0, w1 C 3
+ bt $0, R32(%r13)
+ adc w1, 24(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o3)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=4),1,`
+L(4): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o4): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ mov dinv, q
+ mulx( %rax, q, w2)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ adox( 24,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 24(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 32(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o4)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=5),1,`
+L(5): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o5): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w0, w1)
+ adox( %rax, w0)
+ mulx( 8,(dp), %rax, w3)
+ adcx( w1, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w1)
+ adcx( w3, %rbx)
+ adox( 16,(up), %rbx)
+ mulx( 24,(dp), w2, w3)
+ adcx( w1, w2)
+ mulx( 32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 24(up)
+ adox( 32,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 32(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 40(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o5)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=6),1,`
+L(6): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o6): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ adox( 24,(up), w0)
+ mulx( 32,(dp), w2, w3)
+ mov w0, 24(up)
+ adcx( w1, w2)
+ mulx( 40,(dp), w0, w1)
+ adox( 32,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 32(up)
+ adox( 40,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 40(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 48(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o6)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=7),1,`
+L(7): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp
+ xor %r13, %r13
+ sub dn_param, un
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o7): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w0, w1)
+ adox( %rax, w0)
+ mulx( 8,(dp), %rax, w3)
+ adcx( w1, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w1)
+ adcx( w3, %rbx)
+ mulx( 24,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 16,(up), %rbx)
+ mulx( 32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov w2, 24(up)
+ adox( 32,(up), w0)
+ mulx( 40,(dp), w2, w3)
+ mov w0, 32(up)
+ adcx( w1, w2)
+ mulx( 48,(dp), w0, w1)
+ adox( 40,(up), w2)
+ adcx( w3, w0)
+ mov w2, 40(up)
+ mov %rax, q
+ mulx( dinv, q, w2)
+ adox( 48,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 48(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 56(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o7)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=8),1,`
+L(8): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp
+ xor %r13, %r13
+ sub dn_param, un
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o8): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ mulx( 32,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 24,(up), w0)
+ mov w0, 24(up)
+ mulx( 40,(dp), w0, w1)
+ adox( 32,(up), w2)
+ adcx( w3, w0)
+ mov w2, 32(up)
+ adox( 40,(up), w0)
+ mulx( 48,(dp), w2, w3)
+ mov w0, 40(up)
+ adcx( w1, w2)
+ mulx( 56,(dp), w0, w1)
+ adox( 48,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 48(up)
+ adox( 56,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 56(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 64(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o8)
+ jmp L(esma)
+')
+
+L(esma):mov %rax, (up)
+ mov %rbx, 8(up)
+ mov %r13, %rax
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+
+ JUMPTABSECT
+ ALIGN(8)
+L(atab):JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ JMPENT( L(1), L(atab))
+ JMPENT( L(2), L(atab))
+ JMPENT( L(3), L(atab))
+ JMPENT( L(4), L(atab))
+ JMPENT( L(5), L(atab))
+ JMPENT( L(6), L(atab))
+ JMPENT( L(7), L(atab))
+ JMPENT( L(8), L(atab))
+ TEXT
+EPILOGUE()
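
mpn_sbpi1_bdiv_r performs schoolbook Hensel (right-to-left, 2-adic) division of {up,un} by the odd divisor {dp,dn}, using the precomputed limb dinv, a modular inverse related to dp[0], so that each quotient limb cancels the current low limb of U. The C sketch below shows only that core idea under the assumption dinv = dp[0]^-1 mod B; it is not GMP's exact contract (in particular it drops the carry/borrow return value), and the names are illustrative.

    #include <stdint.h>

    typedef uint64_t mp_limb_t;              /* 64-bit limbs assumed */

    /* rp[0..n-1] -= up[0..n-1] * v0; returns the borrow out of the top limb. */
    static mp_limb_t
    ref_submul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0)
    {
      mp_limb_t bw = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v0 + bw;
          mp_limb_t lo = (mp_limb_t) t;
          bw = (mp_limb_t) (t >> 64) + (rp[i] < lo);
          rp[i] -= lo;
        }
      return bw;
    }

    /* Zero the low un-dn limbs of U by subtracting suitable multiples of D. */
    void
    hensel_bdiv_r_sketch (mp_limb_t *up, long un,
                          const mp_limb_t *dp, long dn, mp_limb_t dinv)
    {
      for (long i = 0; i < un - dn; i++)
        {
          mp_limb_t q = up[i] * dinv;                /* q*dp[0] == up[i] (mod B) */
          mp_limb_t b = ref_submul_1 (up + i, dp, dn, q);
          for (long k = i + dn; b != 0 && k < un; k++)   /* propagate the borrow */
            {
              mp_limb_t t = up[k];
              up[k] = t - b;
              b = (t < b);
            }
        }
      /* Now up[0..un-dn-1] == 0 and up[un-dn..un-1] holds the Hensel remainder. */
    }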
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm
new file mode 100644
index 0000000..e81b01b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm
@@ -0,0 +1,839 @@
+dnl AMD64 mpn_sqr_basecase optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 addmul_1
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bd1 n/a n/a
+C AMD bd2 n/a n/a
+C AMD bd3 n/a n/a
+C AMD bd4 ? ?
+C AMD zen ? ?
+C AMD bt1 n/a n/a
+C AMD bt2 n/a n/a
+C Intel P4 n/a n/a
+C Intel PNR n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.68 n/a
+C Intel BWL 1.51 1.67-1.74
+C Intel SKL 1.52 1.63-1.71
+C Intel atom n/a n/a
+C Intel SLM n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * We have 8 addmul_1 loops which fall into each other. The idea is to save
+C on switching code, since a circularly updated computed goto target will
+C hardly allow correct branch prediction. On 2nd thought, we now might make
+C each of the 8 loop branches be poorly predicted since they will be
+C executed fewer times for each time. With just one addmul_1 loop, the loop
+C count will change only once each 8th time.
+C * Do overlapped software pipelining.
+C * Perhaps load in shrx/sarx, eliminating separate load insn.
+C * Schedule add+stored in small n code.
+C * Try swapping adox and adcx insn, making mulx have more time to run.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+
+define(`n', `%rcx')
+define(`un_save', `%rbx')
+define(`u0', `%rdx')
+
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ cmp $2, un_param
+ jae L(gt1)
+
+ mov (up), %rdx
+ mulx( %rdx, %rax, %rdx)
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): jne L(gt2)
+
+ mov (up), %rdx
+ mov 8(up), %rcx
+ mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
+ mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
+ mov %rcx, %rdx
+ mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
+ add %r9, %r9 C W 1
+ adc %r10, %r10 C W 2
+ adc $0, %rdx C W 3
+ add %r9, %r8 C W 1
+ adc %r11, %r10 C W 2
+ adc $0, %rdx C W 3
+ mov %rax, (rp)
+ mov %r8, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt2): cmp $4, un_param
+ jae L(gt3)
+
+ push %rbx
+ mov (up), %rdx
+ mulx( 8,(up), w2, w3)
+ mulx( 16,(up), w0, w1)
+ add w3, w0
+ mov 8(up), %rdx
+ mulx( 16,(up), %rax, w3)
+ adc %rax, w1
+ adc $0, w3
+ test R32(%rbx), R32(%rbx)
+ mov (up), %rdx
+ mulx( %rdx, %rbx, %rcx)
+ mov %rbx, (rp)
+ mov 8(up), %rdx
+ mulx( %rdx, %rax, %rbx)
+ mov 16(up), %rdx
+ mulx( %rdx, %rsi, %rdx)
+ adcx( w2, w2)
+ adcx( w0, w0)
+ adcx( w1, w1)
+ adcx( w3, w3)
+ adox( w2, %rcx)
+ adox( w0, %rax)
+ adox( w1, %rbx)
+ adox( w3, %rsi)
+ mov $0, R32(%r8)
+ adox( %r8, %rdx)
+ adcx( %r8, %rdx)
+ mov %rcx, 8(rp)
+ mov %rax, 16(rp)
+ mov %rbx, 24(rp)
+ mov %rsi, 32(rp)
+ mov %rdx, 40(rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(gt3): push %rbx
+
+ lea -3(un_param), R32(un_save)
+ lea 5(un_param), R32(n)
+ mov R32(un_param), R32(%rax)
+ and $-8, R32(un_save)
+ shr $3, R32(n) C count for mul_1 loop
+	neg	un_save			C 8*count and offset for addmul_1 loops
+ and $7, R32(%rax) C clear CF for adc as side-effect
+
+ mov (up), u0
+
+ lea L(mtab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %r8
+ lea (%r8, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+
+L(mf0): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ lea 64(up), up
+ add w1, w2
+ jmp L(mb0)
+
+L(mf3): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mov w2, (rp)
+ mulx( 8,(up), w0, w1)
+ lea 24(up), up
+ lea 24(rp), rp
+ add w3, w0
+ jmp L(mb3)
+
+L(mf4): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ add w1, w2
+ jmp L(mb4)
+
+L(mf5): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
+ lea 40(up), up
+ lea 40(rp), rp
+ add w3, w0
+ jmp L(mb5)
+
+L(mf6): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
+ lea 48(up), up
+ lea 48(rp), rp
+ add w1, w2
+ jmp L(mb6)
+
+L(mf7): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
+ lea 56(up), up
+ lea 56(rp), rp
+ add w3, w0
+ jmp L(mb7)
+
+L(mf1): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
+ lea 8(up), up
+ lea 8(rp), rp
+ add w3, w0
+ jmp L(mb1)
+
+L(mf2): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
+ lea 16(up), up
+ lea 16(rp), rp
+ dec R32(n)
+ add w1, w2
+ mulx( (up), w0, w1)
+
+ ALIGN(16)
+L(top): mov w2, -8(rp)
+ adc w3, w0
+L(mb1): mulx( 8,(up), w2, w3)
+ adc w1, w2
+ lea 64(up), up
+L(mb0): mov w0, (rp)
+ mov w2, 8(rp)
+ mulx( -48,(up), w0, w1)
+ lea 64(rp), rp
+ adc w3, w0
+L(mb7): mulx( -40,(up), w2, w3)
+ mov w0, -48(rp)
+ adc w1, w2
+L(mb6): mov w2, -40(rp)
+ mulx( -32,(up), w0, w1)
+ adc w3, w0
+L(mb5): mulx( -24,(up), w2, w3)
+ mov w0, -32(rp)
+ adc w1, w2
+L(mb4): mulx( -16,(up), w0, w1)
+ mov w2, -24(rp)
+ adc w3, w0
+L(mb3): mulx( -8,(up), w2, w3)
+ adc w1, w2
+ mov w0, -16(rp)
+ dec R32(n)
+ mulx( (up), w0, w1)
+ jnz L(top)
+
+L(end): mov w2, -8(rp)
+ adc w3, w0
+C mov w0, (rp)
+C adc %rcx, w1
+C mov w1, 8(rp)
+
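+C Dispatch to the addmul feed-ins f0..f7 through L(atab), again indexed by
+C un mod 8 (still in rax); rax is then reloaded with 63, the shift count
+C used by shrx/sarx in those feed-in blocks.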
+ lea L(atab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %r11
+ lea (%r11, %r10), %r11
+',`
+ mov (%r10,%rax,8), %r11
+')
+ mov $63, R32(%rax)
+ jmp *%r11
+
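+C Each ed*/f* block sets up the next addmul_1-style pass: the doubled
+C multiplier for the new row is rebuilt as 2*up[0] plus bit 63 of up[-1]
+C (shrx + lea), and the initial carry is up[0] masked by the sign of
+C up[-1] (sarx + and), matching the "ci" and "u0" variables referenced in
+C the comments from the corresponding C code.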
+L(ed0): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f7): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea -64(up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov (up), w1 C up[-1]
+ mov 8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ jmp L(b7)
+
+ ALIGN(16)
+L(tp0): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed0)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+L(b0): mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp0)
+
+L(ed1): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f0): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea -64(up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -8(up), w3 C up[-1]
+ mov (up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ jmp L(b0)
+
+ ALIGN(16)
+L(tp1): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed1)
+L(b1): mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp1)
+
+L(ed2): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f1): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea 8(un_save), un_save
+ lea -56(rp,un_save,8), rp
+ mov -16(up), w1 C up[-1]
+ mov -8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2) C FIXME: crossjump?
+ mulx( (up), w0, w1)
+ adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jmp L(b1)
+
+ ALIGN(16)
+L(tp2): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed2)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+L(b2): adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp2)
+
+L(ed3): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f2): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ or R32(un_save), R32(n)
+ jz L(cor3)
+ lea -56(rp,un_save,8), rp
+ mov -24(up), w3 C up[-1]
+ mov -16(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ jmp L(b2)
+
+ ALIGN(16)
+L(tp3): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed3)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+L(b3): mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp3)
+
+L(ed4): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f3): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -32(up), w1 C up[-1]
+ mov -24(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2)
+ jmp L(b3)
+
+ ALIGN(16)
+L(tp4): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed4)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+L(b4): mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp4)
+
+L(ed5): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f4): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -40(up), w3 C up[-1]
+ mov -32(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ jmp L(b4)
+
+ ALIGN(16)
+L(tp5): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed5)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+L(b5): mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp5)
+
+L(ed6): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f5): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -48(up), w1 C up[-1]
+ mov -40(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2)
+ jmp L(b5)
+
+ ALIGN(16)
+L(tp6): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed6)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+L(b6): adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp6)
+
+L(ed7): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f6): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -56(up), w3 C up[-1]
+ mov -48(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ mulx( -40,(up), w2, w3)
+ jmp L(b6)
+
+ ALIGN(16)
+L(tp7): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed7)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+L(b7): adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp7)
+
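+C L(cor3): straight-line code for the final three rows, entered once the
+C "or/jz" after L(f2) finds no further full passes to run.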
+L(cor3):lea -64(rp), rp
+ mov -24(up), w3 C up[-1]
+ mov -16(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ adox( 56,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 56(rp)
+ adcx( w1, w2)
+ mulx( (up), %rbx, w1)
+ adox( 64,(rp), w2)
+ adcx( w3, %rbx)
+ mov w2, 64(rp)
+ adox( 72,(rp), %rbx)
+ adox( %rcx, w1) C relies on rcx = 0
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 80(rp) C FIXME
+C wd2
+ mov -16(up), w1 C up[-1]
+ mov -8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2)
+ mulx( (up), w0, %rax)
+ adox( %rbx, w2)
+ adcx( w3, w0)
+ mov w2, 72(rp)
+ adox( 80,(rp), w0)
+ adox( %rcx, %rax) C relies on rcx = 0
+ mov w0, 80(rp)
+ adc %rcx, %rax C relies on rcx = 0
+C wd1
+ mov -8(up), w3 C up[-1]
+ mov (up), u0 C up[0]
+ sar $63, w3
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ adcx( w3, w0)
+ adox( %rax, w0)
+ mov w0, 88(rp)
+ adcx( %rcx, w1)
+ adox( %rcx, w1)
+ mov w1, 96(rp)
+
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
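+C Jump tables: L(mtab) selects the mul_1 feed-in and L(atab) the addmul
+C feed-in, both indexed by un mod 8 (note the rotated entry order).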
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf7), L(mtab))
+ JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
+L(atab):JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ TEXT
+EPILOGUE()