path: root/gmp-6.3.0/mpn/x86_64/coreibwl
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/coreibwl')
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm        210
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h        246
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm           195
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm    368
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm  395
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm    710
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm    839
7 files changed, 2963 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm
new file mode 100644
index 0000000..8d3a44a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm
@@ -0,0 +1,210 @@
+dnl AMD64 mpn_addmul_1 optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 n/a
+C AMD bd2 n/a
+C AMD bd3 n/a
+C AMD bd4 ?
+C AMD zen1 ?
+C AMD zen2 ?
+C AMD zen3 1.5
+C AMD bt1 n/a
+C AMD bt2 n/a
+C Intel P4 n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL n/a
+C Intel BWL 1.67 1.74
+C Intel SKL 1.63 1.71
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Put an initial mulx before switching, targeting some free registers.
+C * Tune feed-in code.
+C * Trim nop execution after L(f2).
+C * For DOS64, fix nop execution.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl IFDOS(` define(`up', ``%rsi'') ') dnl
+dnl IFDOS(` define(`rp', ``%rcx'') ') dnl
+dnl IFDOS(` define(`vl', ``%r9'') ') dnl
+dnl IFDOS(` define(`r9', ``rdi'') ') dnl
+dnl IFDOS(` define(`n', ``%r8'') ') dnl
+dnl IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_addmul_1)
+ FUNC_ENTRY(4)
+
+ mov v0_param, %r10
+ mov n_param, n
+ mov R32(n_param), R32(%r8)
+ shr $3, n
+ and $7, R32(%r8) C clear OF, CF as side-effect
+ mov %r10, %rdx
+ lea L(tab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%r8,4), %r8
+ lea (%r8, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%r8,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(f0), L(tab))
+ JMPENT( L(f1), L(tab))
+ JMPENT( L(f2), L(tab))
+ JMPENT( L(f3), L(tab))
+ JMPENT( L(f4), L(tab))
+ JMPENT( L(f5), L(tab))
+ JMPENT( L(f6), L(tab))
+ JMPENT( L(f7), L(tab))
+ TEXT
+
+L(f0): mulx( (up), %r10, %r8)
+ lea -8(up), up
+ lea -8(rp), rp
+ lea -1(n), n
+ jmp L(b0)
+
+L(f3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea -48(rp), rp
+ jmp L(b3)
+
+L(f4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea -40(rp), rp
+ jmp L(b4)
+
+L(f5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea -32(rp), rp
+ jmp L(b5)
+
+L(f6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea -24(rp), rp
+ jmp L(b6)
+
+L(f1): mulx( (up), %r9, %rax)
+ jrcxz L(1)
+ jmp L(b1)
+L(1): add (rp), %r9
+ mov %r9, (rp)
+ adc %rcx, %rax C relies on rcx = 0
+ FUNC_EXIT()
+ ret
+
+L(end): adox( (rp), %r9)
+ mov %r9, (rp)
+ adox( %rcx, %rax) C relies on rcx = 0
+ adc %rcx, %rax C relies on rcx = 0
+ FUNC_EXIT()
+ ret
+
+ifdef(`PIC',
+` nop;nop;nop;nop',
+` nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop')
+
+L(f2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), %r9, %rax)
+
+ ALIGN(32)
+L(top): adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp)
+ jrcxz L(end)
+L(b1): mulx( 8,(up), %r10, %r8)
+ adox( (rp), %r9)
+ lea -1(n), n
+ mov %r9, (rp)
+ adcx( %rax, %r10)
+L(b0): mulx( 16,(up), %r9, %rax)
+ adcx( %r8, %r9)
+ adox( 8,(rp), %r10)
+ mov %r10, 8(rp)
+L(b7): mulx( 24,(up), %r10, %r8)
+ lea 64(up), up
+ adcx( %rax, %r10)
+ adox( 16,(rp), %r9)
+ mov %r9, 16(rp)
+L(b6): mulx( -32,(up), %r9, %rax)
+ adox( 24,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 24(rp)
+L(b5): mulx( -24,(up), %r10, %r8)
+ adcx( %rax, %r10)
+ adox( 32,(rp), %r9)
+ mov %r9, 32(rp)
+L(b4): mulx( -16,(up), %r9, %rax)
+ adox( 40,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 40(rp)
+L(b3): adox( 48,(rp), %r9)
+ mulx( -8,(up), %r10, %r8)
+ mov %r9, 48(rp)
+ lea 64(rp), rp
+ adcx( %rax, %r10)
+ mulx( (up), %r9, %rax)
+ jmp L(top)
+
+L(f7): mulx( (up), %r9, %rax)
+ lea -16(up), up
+ lea -16(rp), rp
+ jmp L(b7)
+EPILOGUE()
+ASM_END()
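
For reference, the contract implemented by the file above is rp[0..n-1] += up[0..n-1] * v0, returning the final carry limb; the jump table L(tab) merely dispatches on n mod 8 into the unrolled loop. Below is a minimal C sketch of that contract only, not the GMP implementation: the name ref_addmul_1, the 64-bit-limb typedef, and the use of the GCC/Clang unsigned __int128 extension are all illustrative assumptions.

    #include <stdint.h>

    typedef uint64_t mp_limb_t;              /* assumes 64-bit limbs, as on x86-64 */

    /* rp[0..n-1] += up[0..n-1] * v0; returns the carry out of the top limb. */
    mp_limb_t
    ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
          rp[i] = (mp_limb_t) t;             /* low 64 bits stay in rp[i] */
          cy = (mp_limb_t) (t >> 64);        /* high 64 bits carry into the next limb */
        }
      return cy;
    }

The 128-bit intermediate cannot overflow: (2^64-1)^2 plus two limbs of at most 2^64-1 is exactly 2^128-1.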
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h
new file mode 100644
index 0000000..91c91b5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h
@@ -0,0 +1,246 @@
+/* Broadwell gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */
+/* FFT tuning limit = 467,964,472 */
+/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 24
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 22
+
+#define DIV_1_VS_MUL_1_PERCENT 455
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 202
+#define MUL_TOOM6H_THRESHOLD 303
+#define MUL_TOOM8H_THRESHOLD 406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 141
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 152
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 198
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 34
+#define SQR_TOOM3_THRESHOLD 117
+#define SQR_TOOM4_THRESHOLD 336
+#define SQR_TOOM6_THRESHOLD 426
+#define SQR_TOOM8_THRESHOLD 547
+
+#define MULMID_TOOM42_THRESHOLD 46
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 18
+
+#define MUL_FFT_MODF_THRESHOLD 460 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \
+ { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 39, 8}, \
+ { 79,10}, { 23, 9}, { 55,11}, { 15,10}, \
+ { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \
+ { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \
+ { 87,11}, { 47,10}, { 103,12}, { 31,11}, \
+ { 63,10}, { 135,11}, { 79,10}, { 167,11}, \
+ { 95,10}, { 199,11}, { 111,12}, { 63, 8}, \
+ { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,12}, { 95,11}, { 191,10}, { 383,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,11}, { 367,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,11}, { 447,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
+ { 255,12}, { 607,13}, { 319,12}, { 735,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
+ { 1151,13}, { 639,12}, { 1279,13}, { 703,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 959,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \
+ { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \
+ { 2687,13}, { 1407,14}, { 767,13}, { 1535,12}, \
+ { 3071,13}, { 1599,12}, { 3199,13}, { 1663,14}, \
+ { 895,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \
+ { 3199,14}, { 1663,13}, { 3455,12}, { 6911,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \
+ { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
+ { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \
+ { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \
+ { 2559,14}, { 5247,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \
+ { 8703,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,14}, { 15359,17}, { 2047,16}, { 4095,15}, \
+ { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 219
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 400, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 79,11}, { 47,10}, { 95,12}, \
+ { 31,11}, { 63,10}, { 127,11}, { 79,10}, \
+ { 159,11}, { 95,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \
+ { 575,10}, { 303,11}, { 159,10}, { 319,12}, \
+ { 95, 8}, { 1599, 9}, { 831,11}, { 223,10}, \
+ { 447,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,11}, { 367,10}, { 735,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 543,12}, { 287,11}, { 607,12}, \
+ { 319,11}, { 671,12}, { 351,11}, { 735,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 607,13}, \
+ { 319,12}, { 735,13}, { 383,12}, { 799,13}, \
+ { 447,12}, { 959,13}, { 511,12}, { 1023,13}, \
+ { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \
+ { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \
+ { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1791,16}, { 511,15}, \
+ { 1023,14}, { 2047,13}, { 4095,14}, { 2175,13}, \
+ { 4351,14}, { 2303,13}, { 4607,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4351,15}, { 2303,14}, { 4863,15}, { 2559,14}, \
+ { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \
+ { 2047,16}, { 4095,15}, { 8447,16}, { 4607,15}, \
+ { 9471,14}, { 18943,15}, { 9983,14}, { 19967,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 215
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 80
+#define MULLO_MUL_N_THRESHOLD 11025
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 109
+#define SQRLO_SQR_THRESHOLD 7293
+
+#define DC_DIV_QR_THRESHOLD 54
+#define DC_DIVAPPR_Q_THRESHOLD 183
+#define DC_BDIV_QR_THRESHOLD 86
+#define DC_BDIV_Q_THRESHOLD 160
+
+#define INV_MULMOD_BNM1_THRESHOLD 58
+#define INV_NEWTON_THRESHOLD 171
+#define INV_APPR_THRESHOLD 171
+
+#define BINV_NEWTON_THRESHOLD 292
+#define REDC_1_TO_REDC_2_THRESHOLD 33
+#define REDC_2_TO_REDC_N_THRESHOLD 63
+
+#define MU_DIV_QR_THRESHOLD 1589
+#define MU_DIVAPPR_Q_THRESHOLD 1589
+#define MUPI_DIV_QR_THRESHOLD 67
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1866
+
+#define POWM_SEC_TABLE 2,10,191,494,712,1378
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 644
+#define SET_STR_PRECOMPUTE_THRESHOLD 1658
+
+#define FAC_DSC_THRESHOLD 562
+#define FAC_ODD_THRESHOLD 48
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD2_DIV1_METHOD 5 /* 0.38% faster than 3 */
+#define HGCD_THRESHOLD 73
+#define HGCD_APPR_THRESHOLD 67
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 630
+#define GCDEXT_DC_THRESHOLD 365
+#define JACOBI_BASE_METHOD 1 /* 29.65% faster than 4 */
+
+/* Tuneup completed successfully, took 239050 seconds */
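
The *_THRESHOLD values above are crossover points, measured in limbs, at which GMP's generic code switches from one algorithm to the next for this particular CPU. A hedged sketch of how such a cutoff is typically consumed follows; the function names are hypothetical stand-ins, not GMP's internal API.

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* Hypothetical helpers standing in for the basecase and Toom-22 routines. */
    void my_mul_basecase (mp_limb_t *rp, const mp_limb_t *ap, const mp_limb_t *bp, long n);
    void my_mul_toom22   (mp_limb_t *rp, const mp_limb_t *ap, const mp_limb_t *bp, long n);

    #define MY_MUL_TOOM22_THRESHOLD 26      /* the Broadwell value tuned above */

    /* Multiply two n-limb numbers, picking the algorithm from the tuned cutoff. */
    void
    my_mul_n (mp_limb_t *rp, const mp_limb_t *ap, const mp_limb_t *bp, long n)
    {
      if (n < MY_MUL_TOOM22_THRESHOLD)
        my_mul_basecase (rp, ap, bp, n);    /* O(n^2) schoolbook wins for small n */
      else
        my_mul_toom22 (rp, ap, bp, n);      /* Karatsuba-style split wins past the cutoff */
    }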
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm
new file mode 100644
index 0000000..b7fae2f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm
@@ -0,0 +1,195 @@
+dnl AMD64 mpn_mul_1 optimised for Intel Broadwell.
+
+dnl Copyright 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bull -
+C AMD pile -
+C AMD steam -
+C AMD excavator -
+C AMD bobcat -
+C AMD jaguar -
+C Intel P4 -
+C Intel core2 -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL 1.70
+C Intel BWL 1.51
+C Intel SKL 1.52
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Put an initial mulx before switching, targeting some free registers.
+C * Tune feed-in code.
+C * Trim nop execution after L(f2).
+C * Port to DOS64, not forgetting nop execution.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+
+dnl ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl IFDOS(` define(`up', ``%rsi'') ') dnl
+dnl IFDOS(` define(`rp', ``%rcx'') ') dnl
+dnl IFDOS(` define(`vl', ``%r9'') ') dnl
+dnl IFDOS(` define(`r9', ``rdi'') ') dnl
+dnl IFDOS(` define(`n', ``%r8'') ') dnl
+dnl IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_1)
+
+ mov v0_param, %r10
+ mov n_param, n
+ mov R32(n_param), R32(%r8)
+ shr $3, n
+ and $7, R32(%r8) C clear OF, CF as side-effect
+ mov %r10, %rdx
+ lea L(tab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%r8,4), %r8
+ lea (%r8, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%r8,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(f0), L(tab))
+ JMPENT( L(f1), L(tab))
+ JMPENT( L(f2), L(tab))
+ JMPENT( L(f3), L(tab))
+ JMPENT( L(f4), L(tab))
+ JMPENT( L(f5), L(tab))
+ JMPENT( L(f6), L(tab))
+ JMPENT( L(f7), L(tab))
+ TEXT
+
+L(f0): mulx( (up), %r10, %r8)
+ lea 56(up), up
+ lea -8(rp), rp
+ jmp L(b0)
+
+L(f3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea 16(rp), rp
+ inc n
+ jmp L(b3)
+
+L(f4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea 24(rp), rp
+ inc n
+ jmp L(b4)
+
+L(f5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea 32(rp), rp
+ inc n
+ jmp L(b5)
+
+L(f6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea 40(rp), rp
+ inc n
+ jmp L(b6)
+
+L(f7): mulx( (up), %r9, %rax)
+ lea 48(up), up
+ lea 48(rp), rp
+ inc n
+ jmp L(b7)
+
+L(f1): mulx( (up), %r9, %rax)
+ test n, n
+ jnz L(b1)
+L(1): mov %r9, (rp)
+ ret
+
+L(f2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), %r9, %rax)
+ test n, n
+ jz L(end)
+
+ ALIGN(32)
+L(top): mov %r10, -8(rp)
+ adc %r8, %r9
+L(b1): mulx( 8,(up), %r10, %r8)
+ adc %rax, %r10
+ lea 64(up), up
+ mov %r9, (rp)
+L(b0): mov %r10, 8(rp)
+ mulx( -48,(up), %r9, %rax)
+ lea 64(rp), rp
+ adc %r8, %r9
+L(b7): mulx( -40,(up), %r10, %r8)
+ mov %r9, -48(rp)
+ adc %rax, %r10
+L(b6): mov %r10, -40(rp)
+ mulx( -32,(up), %r9, %rax)
+ adc %r8, %r9
+L(b5): mulx( -24,(up), %r10, %r8)
+ mov %r9, -32(rp)
+ adc %rax, %r10
+L(b4): mulx( -16,(up), %r9, %rax)
+ mov %r10, -24(rp)
+ adc %r8, %r9
+L(b3): mulx( -8,(up), %r10, %r8)
+ adc %rax, %r10
+ mov %r9, -16(rp)
+ dec n
+ mulx( (up), %r9, %rax)
+ jnz L(top)
+
+L(end): mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ adc %rcx, %rax
+ ret
+EPILOGUE()
+ASM_END()
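
The contract of mpn_mul_1, for reference, is rp[0..n-1] = up[0..n-1] * v0 with the most significant limb of the full product returned. A minimal C sketch of that contract (illustrative, not the GMP code; same 64-bit-limb and unsigned __int128 assumptions as the earlier addmul_1 sketch):

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* rp[0..n-1] = up[0..n-1] * v0; returns the high (carry) limb. */
    mp_limb_t
    ref_mul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v0 + cy;
          rp[i] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
      return cy;
    }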
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm
new file mode 100644
index 0000000..7ca5a9b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm
@@ -0,0 +1,368 @@
+dnl AMD64 mpn_mul_basecase optimised for Intel Broadwell.
+
+dnl Copyright 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 addmul_1
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bd1 n/a n/a
+C AMD bd2 n/a n/a
+C AMD bd3 n/a n/a
+C AMD bd4 ? ?
+C AMD zen ? ?
+C AMD bt1 n/a n/a
+C AMD bt2 n/a n/a
+C Intel P4 n/a n/a
+C Intel PNR n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.68 n/a
+C Intel BWL 1.51 1.67-1.74
+C Intel SKL 1.52 1.63-1.71
+C Intel atom n/a n/a
+C Intel SLM n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Do overlapped software pipelining.
+C * When changing this, make sure the code which falls into the inner loops
+C does not execute too many no-ops (for both PIC and non-PIC).
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp_param',`%rcx')
+define(`vn', `%r8')
+
+define(`n', `%rcx')
+define(`n_save', `%rbp')
+define(`vp', `%r14')
+define(`unneg', `%rbx')
+define(`v0', `%rdx')
+define(`jaddr', `%rax')
+
+define(`w0', `%r12')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ cmp $2, un_param
+ ja L(gen)
+ mov (vp_param), %rdx
+ mulx( (up), %rax, %r9) C 0 1
+ je L(s2x)
+
+L(s11): mov %rax, (rp)
+ mov %r9, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(s2x): cmp $2, vn
+ mulx( 8,(up), %r8, %r10) C 1 2
+ je L(s22)
+
+L(s21): add %r8, %r9
+ adc $0, %r10
+ mov %rax, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ FUNC_EXIT()
+ ret
+
+L(s22): add %r8, %r9 C 1
+ adc $0, %r10 C 2
+ mov 8(vp_param), %rdx
+ mov %rax, (rp)
+ mulx( (up), %r8, %r11) C 1 2
+ mulx( 8,(up), %rax, %rdx) C 2 3
+ add %r11, %rax C 2
+ adc $0, %rdx C 3
+ add %r8, %r9 C 1
+ adc %rax, %r10 C 2
+ adc $0, %rdx C 3
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(gen):
+ push %rbx
+ push %rbp
+ push %r12
+ push %r14
+
+ mov vp_param, vp
+ lea 1(un_param), unneg
+ mov un_param, n_save
+ mov R32(un_param), R32(%rax)
+ and $-8, unneg
+ shr $3, n_save C loop count
+ neg unneg
+ and $7, R32(%rax) C clear CF for adc as side-effect
+ C note that rax lives very long
+ mov n_save, n
+ mov (vp), v0
+ lea 8(vp), vp
+
+ lea L(mtab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %r11
+ lea (%r11, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+
+L(mf0): mulx( (up), w2, w3)
+ lea 56(up), up
+ lea -8(rp), rp
+ jmp L(mb0)
+
+L(mf3): mulx( (up), w0, w1)
+ lea 16(up), up
+ lea 16(rp), rp
+ inc n
+ jmp L(mb3)
+
+L(mf4): mulx( (up), w2, w3)
+ lea 24(up), up
+ lea 24(rp), rp
+ inc n
+ jmp L(mb4)
+
+L(mf5): mulx( (up), w0, w1)
+ lea 32(up), up
+ lea 32(rp), rp
+ inc n
+ jmp L(mb5)
+
+L(mf6): mulx( (up), w2, w3)
+ lea 40(up), up
+ lea 40(rp), rp
+ inc n
+ jmp L(mb6)
+
+L(mf7): mulx( (up), w0, w1)
+ lea 48(up), up
+ lea 48(rp), rp
+ inc n
+ jmp L(mb7)
+
+L(mf1): mulx( (up), w0, w1)
+ jmp L(mb1)
+
+L(mf2): mulx( (up), w2, w3)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), w0, w1)
+
+ ALIGN(16)
+L(m1top):
+ mov w2, -8(rp)
+ adc w3, w0
+L(mb1): mulx( 8,(up), w2, w3)
+ adc w1, w2
+ lea 64(up), up
+ mov w0, (rp)
+L(mb0): mov w2, 8(rp)
+ mulx( -48,(up), w0, w1)
+ lea 64(rp), rp
+ adc w3, w0
+L(mb7): mulx( -40,(up), w2, w3)
+ mov w0, -48(rp)
+ adc w1, w2
+L(mb6): mov w2, -40(rp)
+ mulx( -32,(up), w0, w1)
+ adc w3, w0
+L(mb5): mulx( -24,(up), w2, w3)
+ mov w0, -32(rp)
+ adc w1, w2
+L(mb4): mulx( -16,(up), w0, w1)
+ mov w2, -24(rp)
+ adc w3, w0
+L(mb3): mulx( -8,(up), w2, w3)
+ adc w1, w2
+ mov w0, -16(rp)
+ dec n
+ mulx( (up), w0, w1)
+ jnz L(m1top)
+
+L(m1end):
+ mov w2, -8(rp)
+ adc w3, w0
+ mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+
+ dec vn
+ jz L(done)
+
+ lea L(atab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax, %r10), jaddr
+',`
+ mov (%r10,%rax,8), jaddr
+')
+
+L(outer):
+ lea (up,unneg,8), up
+ mov n_save, n
+ mov (vp), v0
+ lea 8(vp), vp
+ jmp *jaddr
+
+L(f0): mulx( 8,(up), w2, w3)
+ lea 8(rp,unneg,8), rp
+ lea -1(n), n
+ jmp L(b0)
+
+L(f3): mulx( -16,(up), w0, w1)
+ lea -56(rp,unneg,8), rp
+ jmp L(b3)
+
+L(f4): mulx( -24,(up), w2, w3)
+ lea -56(rp,unneg,8), rp
+ jmp L(b4)
+
+L(f5): mulx( -32,(up), w0, w1)
+ lea -56(rp,unneg,8), rp
+ jmp L(b5)
+
+L(f6): mulx( -40,(up), w2, w3)
+ lea -56(rp,unneg,8), rp
+ jmp L(b6)
+
+L(f7): mulx( 16,(up), w0, w1)
+ lea 8(rp,unneg,8), rp
+ jmp L(b7)
+
+L(f1): mulx( (up), w0, w1)
+ lea 8(rp,unneg,8), rp
+ jmp L(b1)
+
+L(am1end):
+ adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+ mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+
+ dec vn C clear OF as side-effect
+ jnz L(outer)
+L(done):
+ pop %r14
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(f2): mulx( -8,(up), w2, w3)
+ lea 8(rp,unneg,8), rp
+ mulx( (up), w0, w1)
+
+ ALIGN(16)
+L(am1top):
+ adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(am1end)
+L(b1): mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea -1(n), n
+ mov w0, (rp)
+ adcx( w1, w2)
+L(b0): mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+L(b7): mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+L(b6): mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+L(b5): mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+L(b4): mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+L(b3): adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(am1top)
+
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
+ JMPENT( L(mf7), L(mtab))
+L(atab):JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ TEXT
+EPILOGUE()
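
Structurally, the file above is one mul_1-style pass for vp[0] (the L(m1top) loop) followed by one addmul_1-style pass per remaining limb of vp (the L(am1top) loop), with the L(mtab)/L(atab) jump tables handling the feed-in for each residue of un mod 8. The same shape in C, as a sketch that reuses the ref_mul_1 and ref_addmul_1 sketches given earlier:

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* The single-limb primitives sketched after addmul_1.asm and mul_1.asm above. */
    mp_limb_t ref_mul_1    (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0);
    mp_limb_t ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0);

    /* rp[0..un+vn-1] = {up,un} * {vp,vn}; requires un >= vn >= 1 and rp disjoint. */
    void
    ref_mul_basecase (mp_limb_t *rp, const mp_limb_t *up, long un,
                      const mp_limb_t *vp, long vn)
    {
      rp[un] = ref_mul_1 (rp, up, un, vp[0]);              /* first row: plain product */
      for (long j = 1; j < vn; j++)
        rp[un + j] = ref_addmul_1 (rp + j, up, un, vp[j]); /* later rows accumulate */
    }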
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
new file mode 100644
index 0000000..5cdb209
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
@@ -0,0 +1,395 @@
+dnl AMD64 mpn_mullo_basecase optimised for Intel Broadwell.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r11')
+define(`jmpreg',`%rbx')
+define(`nn', `%rbp')
+
+C TODO
+C * Suppress more rp[] rewrites in corner.
+C * Rearrange feed-in jumps for short branch forms.
+C * Perhaps roll out the heavy artillery and 8-way unroll outer loop. Since
+C feed-in code implodes, the blow-up will not be more than perhaps 4x.
+C * Micro-optimise critical lead-in code block around L(ent).
+C * Write n < 4 code specifically for Broadwell (current code is for Haswell).
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+ cmp $4, R32(n)
+ jae L(big)
+
+ mov vp_param, vp
+ mov (up), %rdx
+
+ cmp $2, R32(n)
+ jae L(gt1)
+L(n1): imul (vp), %rdx
+ mov %rdx, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp), %r9
+ mulx( %r9, %rax, %rdx)
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp), %r9
+ mulx( %r9, %rax, %r10) C u0 x v0
+ mov %rax, (rp)
+ mov 8(up), %rdx
+ mulx( %r9, %rax, %rdx) C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r8
+ mov (up), %rdx
+ mulx( %r8, %rax, %rdx) C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r8 C u1 x v1
+ add %r8, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(big): push %r14
+ push %r12
+ push %rbx
+ push %rbp
+ mov -8(vp_param,n,8), %r14 C FIXME Put at absolute end
+ imul (up), %r14 C FIXME Put at absolute end
+ lea -3(n), R32(nn)
+ lea 8(vp_param), vp
+ mov (vp_param), %rdx
+
+ mov R32(n), R32(%rax)
+ shr $3, R32(n)
+ and $7, R32(%rax) C clear OF, CF as side-effect
+ lea L(mtab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+
+L(mf0): mulx( (up), %r10, %r8)
+ lea 56(up), up
+ lea -8(rp), rp
+ lea L(f7)(%rip), jmpreg
+ jmp L(mb0)
+
+L(mf3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea 16(rp), rp
+ jrcxz L(mc)
+ inc R32(n)
+ lea L(f2)(%rip), jmpreg
+ jmp L(mb3)
+
+L(mc): mulx( -8,(up), %r10, %r8)
+ add %rax, %r10
+ mov %r9, -16(rp)
+ mulx( (up), %r9, %rax)
+ mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ jmp L(c2)
+
+L(mf4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea 24(rp), rp
+ inc R32(n)
+ lea L(f3)(%rip), jmpreg
+ jmp L(mb4)
+
+L(mf5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea 32(rp), rp
+ inc R32(n)
+ lea L(f4)(%rip), jmpreg
+ jmp L(mb5)
+
+L(mf6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea 40(rp), rp
+ inc R32(n)
+ lea L(f5)(%rip), jmpreg
+ jmp L(mb6)
+
+L(mf7): mulx( (up), %r9, %rax)
+ lea 48(up), up
+ lea 48(rp), rp
+ lea L(f6)(%rip), jmpreg
+ jmp L(mb7)
+
+L(mf1): mulx( (up), %r9, %rax)
+ lea L(f0)(%rip), jmpreg
+ jmp L(mb1)
+
+L(mf2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ lea L(f1)(%rip), jmpreg
+ mulx( (up), %r9, %rax)
+
+C FIXME ugly fallthrough FIXME
+ ALIGN(32)
+L(mtop):mov %r10, -8(rp)
+ adc %r8, %r9
+L(mb1): mulx( 8,(up), %r10, %r8)
+ adc %rax, %r10
+ lea 64(up), up
+ mov %r9, (rp)
+L(mb0): mov %r10, 8(rp)
+ mulx( -48,(up), %r9, %rax)
+ lea 64(rp), rp
+ adc %r8, %r9
+L(mb7): mulx( -40,(up), %r10, %r8)
+ mov %r9, -48(rp)
+ adc %rax, %r10
+L(mb6): mov %r10, -40(rp)
+ mulx( -32,(up), %r9, %rax)
+ adc %r8, %r9
+L(mb5): mulx( -24,(up), %r10, %r8)
+ mov %r9, -32(rp)
+ adc %rax, %r10
+L(mb4): mulx( -16,(up), %r9, %rax)
+ mov %r10, -24(rp)
+ adc %r8, %r9
+L(mb3): mulx( -8,(up), %r10, %r8)
+ adc %rax, %r10
+ mov %r9, -16(rp)
+ dec R32(n)
+ mulx( (up), %r9, %rax)
+ jnz L(mtop)
+
+L(mend):mov %r10, -8(rp)
+ adc %r8, %r9
+ mov %r9, (rp)
+ adc %rcx, %rax
+
+ lea 8(,nn,8), %r12
+ neg %r12
+ shr $3, R32(nn)
+ jmp L(ent)
+
+L(f0): mulx( (up), %r10, %r8)
+ lea -8(up), up
+ lea -8(rp), rp
+ lea L(f7)(%rip), jmpreg
+ jmp L(b0)
+
+L(f1): mulx( (up), %r9, %rax)
+ lea -1(nn), R32(nn)
+ lea L(f0)(%rip), jmpreg
+ jmp L(b1)
+
+L(end): adox( (rp), %r9)
+ mov %r9, (rp)
+ adox( %rcx, %rax) C relies on rcx = 0
+ adc %rcx, %rax C FIXME suppress, use adc below; reqs ent path edits
+ lea 8(%r12), %r12
+L(ent): mulx( 8,(up), %r10, %r8) C r8 unused (use imul?)
+ add %rax, %r14
+ add %r10, %r14 C h
+ lea (up,%r12), up C reset up
+ lea 8(rp,%r12), rp C reset rp
+ mov (vp), %rdx
+ lea 8(vp), vp
+ or R32(nn), R32(n) C copy count, clear CF,OF (n = 0 prior)
+ jmp *jmpreg
+
+L(f7): mulx( (up), %r9, %rax)
+ lea -16(up), up
+ lea -16(rp), rp
+ lea L(f6)(%rip), jmpreg
+ jmp L(b7)
+
+L(f2): mulx( (up), %r10, %r8)
+ lea 8(up), up
+ lea 8(rp), rp
+ mulx( (up), %r9, %rax)
+ lea L(f1)(%rip), jmpreg
+
+C FIXME ugly fallthrough FIXME
+ ALIGN(32)
+L(top): adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp)
+ jrcxz L(end)
+L(b1): mulx( 8,(up), %r10, %r8)
+ adox( (rp), %r9)
+ lea -1(n), R32(n)
+ mov %r9, (rp)
+ adcx( %rax, %r10)
+L(b0): mulx( 16,(up), %r9, %rax)
+ adcx( %r8, %r9)
+ adox( 8,(rp), %r10)
+ mov %r10, 8(rp)
+L(b7): mulx( 24,(up), %r10, %r8)
+ lea 64(up), up
+ adcx( %rax, %r10)
+ adox( 16,(rp), %r9)
+ mov %r9, 16(rp)
+L(b6): mulx( -32,(up), %r9, %rax)
+ adox( 24,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 24(rp)
+L(b5): mulx( -24,(up), %r10, %r8)
+ adcx( %rax, %r10)
+ adox( 32,(rp), %r9)
+ mov %r9, 32(rp)
+L(b4): mulx( -16,(up), %r9, %rax)
+ adox( 40,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, 40(rp)
+L(b3): adox( 48,(rp), %r9)
+ mulx( -8,(up), %r10, %r8)
+ mov %r9, 48(rp)
+ lea 64(rp), rp
+ adcx( %rax, %r10)
+ mulx( (up), %r9, %rax)
+ jmp L(top)
+
+L(f6): mulx( (up), %r10, %r8)
+ lea 40(up), up
+ lea -24(rp), rp
+ lea L(f5)(%rip), jmpreg
+ jmp L(b6)
+
+L(f5): mulx( (up), %r9, %rax)
+ lea 32(up), up
+ lea -32(rp), rp
+ lea L(f4)(%rip), jmpreg
+ jmp L(b5)
+
+L(f4): mulx( (up), %r10, %r8)
+ lea 24(up), up
+ lea -40(rp), rp
+ lea L(f3)(%rip), jmpreg
+ jmp L(b4)
+
+L(f3): mulx( (up), %r9, %rax)
+ lea 16(up), up
+ lea -48(rp), rp
+ jrcxz L(cor)
+ lea L(f2)(%rip), jmpreg
+ jmp L(b3)
+
+L(cor): adox( 48,(rp), %r9)
+ mulx( -8,(up), %r10, %r8)
+ mov %r9, 48(rp)
+ lea 64(rp), rp
+ adcx( %rax, %r10)
+ mulx( (up), %r9, %rax)
+ adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp) C FIXME suppress
+ adox( (rp), %r9)
+ mov %r9, (rp) C FIXME suppress
+ adox( %rcx, %rax)
+L(c2):
+ mulx( 8,(up), %r10, %r8)
+ adc %rax, %r14
+ add %r10, %r14
+ mov (vp), %rdx
+ test R32(%rcx), R32(%rcx)
+ mulx( -16,(up), %r10, %r8)
+ mulx( -8,(up), %r9, %rax)
+ adox( -8,(rp), %r10)
+ adcx( %r8, %r9)
+ mov %r10, -8(rp)
+ adox( (rp), %r9)
+ adox( %rcx, %rax)
+ adc %rcx, %rax
+ mulx( (up), %r10, %r8)
+ add %rax, %r14
+ add %r10, %r14
+ mov 8(vp), %rdx
+ mulx( -16,(up), %rcx, %rax)
+ add %r9, %rcx
+ mov %rcx, (rp)
+ adc $0, %rax
+ mulx( -8,(up), %r10, %r8)
+ add %rax, %r14
+ add %r10, %r14
+ mov %r14, 8(rp)
+ pop %rbp
+ pop %rbx
+ pop %r12
+ pop %r14
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf7), L(mtab))
+ JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
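
mpn_mullo_basecase returns only the low n limbs of the product, i.e. {up,n} * {vp,n} mod B^n with B = 2^64, which is why successive rows above get shorter and why the last few columns are handled by the special L(cor) corner code. A C sketch of that contract, again reusing the earlier ref_mul_1/ref_addmul_1 sketches:

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    mp_limb_t ref_mul_1    (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0);
    mp_limb_t ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0);

    /* rp[0..n-1] = low n limbs of {up,n} * {vp,n}; the high half is never formed. */
    void
    ref_mullo_basecase (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, long n)
    {
      ref_mul_1 (rp, up, n, vp[0]);               /* carry out of limb n-1 is discarded */
      for (long j = 1; j < n; j++)
        ref_addmul_1 (rp + j, up, n - j, vp[j]);  /* only columns below n contribute */
    }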
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
new file mode 100644
index 0000000..ff35124
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
@@ -0,0 +1,710 @@
+dnl AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 addmul_1
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bd1 n/a n/a
+C AMD bd2 n/a n/a
+C AMD bd3 n/a n/a
+C AMD bd4 ? ?
+C AMD zn1 ? ?
+C AMD zn2 ? ?
+C AMD zn3 ? ?
+C AMD bt1 n/a n/a
+C AMD bt2 n/a n/a
+C Intel P4 n/a n/a
+C Intel PNR n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.68 n/a
+C Intel BWL 1.51 1.67-1.74
+C Intel SKL 1.52 1.63-1.71
+C Intel atom n/a n/a
+C Intel SLM n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Do overlapped software pipelining.
+C * Reduce register use, i.e., by combining n_neg and n_save.
+C * Suppress initial store through up, it's always a zero.
+C * Streamline up and dp setup.
+C * When changing this, make sure the code which falls into the inner loops
+C does not execute too many no-ops (for both PIC and non-PIC).
+
+dnl mp_limb_t
+dnl mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+dnl mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+
+define(`up', `%rdi')
+define(`un', `%rsi')
+define(`dp_param',`%rdx')
+define(`dn_param',`%rcx')
+define(`dinv', `%r8')
+
+define(`n', `%rcx')
+define(`n_save', `%rbp')
+define(`dp', `%r14')
+define(`n_neg', `%rbx')
+define(`q', `%rdx')
+define(`jaddr', `%rax')
+
+define(`w0', `%r12')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ifdef(`MAX_SPECIAL',,`
+define(`MAX_SPECIAL', 8)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+
+ lea L(atab)(%rip), %r10
+
+ cmp $MAX_SPECIAL, dn_param
+ jbe L(sma)
+
+ifelse(MAX_SPECIAL,8,,`
+forloop(i,eval(MAX_SPECIAL+1),9,`L(i):
+')')
+
+L(gen): push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+
+ sub dn_param, un C outer loop count
+
+ lea -8(,dn_param,8), n_neg
+ neg n_neg
+ mov dn_param, n_save
+ mov R32(dn_param), R32(%rax)
+ shr $3, n_save C loop count
+ and $7, R32(%rax) C clear CF and OF as side-effect
+
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %rax
+ lea (%rax,%r10), jaddr
+',`
+ mov (%r10,%rax,8), jaddr
+')
+ mov (up), q
+ imul dinv, q
+ jmp L(outer)
+
+L(f0): mulx( (dp), w2, w3)
+ lea -1(n), n
+ mulx( 8,(dp), w0, w1)
+ lea -8(dp), dp
+ adcx( w3, w0)
+ adox( (up), w2)
+ lea -8(up), up
+ jmp L(b0x)
+
+L(f3): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -48(up), up
+ lea 16(dp), dp
+ jmp L(b3x)
+
+L(f4): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 24(dp), dp
+ adox( (up), w2)
+ lea -40(up), up
+ adcx( w3, w0)
+ jmp L(b4x)
+
+L(f5): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ lea 32(dp), dp
+ adcx( w1, w2)
+ adox( (up), w0)
+ lea -32(up), up
+ jmp L(b5x)
+
+L(f6): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 40(dp), dp
+ adox( (up), w2)
+ lea -24(up), up
+ adcx( w3, w0)
+ jmp L(b6x)
+
+L(f7): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ lea 48(dp), dp
+ adcx( w1, w2)
+ adox( (up), w0)
+ lea -16(up), up
+ jmp L(b7x)
+
+L(f1): mulx( (dp), w0, w1)
+ mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -1(n), n
+ jmp L(b1x)
+
+L(f2): mulx( (dp), w2, w3)
+ mulx( 8,(dp), w0, w1)
+ lea 8(dp), dp
+ adox( (up), w2)
+ lea 8(up), up
+ adcx( w3, w0)
+ jmp L(b2x)
+
+L(end): adox( (up), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+ mov w0, (up)
+ adc %rcx, w1 C relies on rcx = 0
+ mov 8(up,n_neg), q C Compute next quotient early...
+ mulx( dinv, q, %r12) C ...(unused in last iteration)
+ bt $0, R32(%r13)
+ adc w1, 8(up)
+ setc R8(%r13)
+ dec un C clear OF as side-effect
+ jz L(done)
+
+ lea (dp,n_neg), dp C reset dp to D[]'s beginning
+ lea 8(up,n_neg), up C point up to U[]'s current beginning
+L(outer):
+ mov n_save, n
+ test %eax, %eax C clear CF and OF
+ jmp *jaddr
+
+ ALIGN(16)
+L(top): adox( -8,(up), w2)
+ adcx( w3, w0)
+ mov w2, -8(up)
+ jrcxz L(end)
+L(b2x): mulx( 8,(dp), w2, w3)
+ adox( (up), w0)
+ lea -1(n), n
+ mov w0, (up)
+L(b1x): adcx( w1, w2)
+ mulx( 16,(dp), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(up), w2)
+ mov w2, 8(up)
+L(b0x): mulx( 24,(dp), w2, w3)
+ lea 64(dp), dp
+ adcx( w1, w2)
+ adox( 16,(up), w0)
+ mov w0, 16(up)
+L(b7x): mulx( -32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov w2, 24(up)
+L(b6x): mulx( -24,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(up), w0)
+ mov w0, 32(up)
+L(b5x): mulx( -16,(dp), w0, w1)
+ adox( 40,(up), w2)
+ adcx( w3, w0)
+ mov w2, 40(up)
+L(b4x): adox( 48,(up), w0)
+ mulx( -8,(dp), w2, w3)
+ mov w0, 48(up)
+L(b3x): lea 64(up), up
+ adcx( w1, w2)
+ mulx( (dp), w0, w1)
+ jmp L(top)
+
+L(done):mov %r13, %rax
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(sma):
+ifdef(`PIC',
+` movslq 28(%r10,dn_param,4), %rax
+ lea (%rax,%r10), jaddr
+',`
+ mov 56(%r10,dn_param,8), jaddr
+')
+ jmp *jaddr
+
+L(1): mov (dp_param), %r10
+ xor R32(%rax), R32(%rax)
+ mov (up), %rdx
+ dec un
+ mov %rdx, %r9
+L(o1): mulx( dinv, %rdx, %r11) C next quotient
+ lea 8(up), up
+ mulx( %r10, %rcx, %rdx) C 0 1
+ add %r9, %rcx C 0
+ adc %rax, %rdx C 1
+ add (up), %rdx C 1
+ setc R8(%rax) C 2
+ mov %rdx, %r9 C 1
+ dec un
+ jnz L(o1)
+ mov %r9, (up)
+
+ FUNC_EXIT()
+ ret
+
+ifdef(`VER',,`define(`VER',1)')
+L(2): push %r12
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ sub dn_param, un C loop count
+ mov (up), q
+ imul dinv, q
+
+ifelse(VER,0,`
+ xor R32(%rax), R32(%rax)
+L(o2): test %eax, %eax C clear CF and OF
+ mulx( (dp), w2, w3) C 0 1
+ mulx( 8,(dp), %rdx, w1) C 1 2
+ add (up), w2 C 0
+ adc 8(up), %rdx C 1
+ adc $0, w1 C 2 cannot carry further
+ add w3, %rdx C 1
+ mov %rdx, 8(up) C 1
+ adc $0, w1 C 2
+ imul dinv, q C
+ bt $0, R32(%rax)
+ adc 16(up), w1 C 2
+ mov w1, 16(up)
+ setc R8(%rax)
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+')
+ifelse(VER,1,`
+ push %rbx
+ push %r13
+ xor R32(%r13), R32(%r13)
+ mov (up), %rax
+ mov 8(up), %rbx
+L(o2): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3) C 0 1
+ mulx( 8,(dp), %rdx, w1) C 1 2
+ adox( %rax, w2) C 0
+ adcx( w3, %rdx) C 1
+ adox( %rbx, %rdx) C 1
+ adox( %rcx, w1) C 2 cannot carry further
+ mov %rdx, %rax C 1
+ adc %rcx, w1 C 2
+ imul dinv, q C
+ bt $0, R32(%r13)
+ adc 16(up), w1 C 2
+ mov w1, %rbx
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %rax, (up)
+ mov %rbx, 8(up)
+ mov %r13, %rax
+ pop %r13
+ pop %rbx
+')
+ifelse(VER,2,`
+ xor R32(%rax), R32(%rax)
+ mov (up), %r10
+ mov 8(up), %r9
+L(o2): mulx( (dp), %r12, %r11)
+ mulx( 8,(dp), %rdx, %rcx)
+ add %r11, %rdx C 1
+ adc $0, %rcx C 2
+ add %r10, %r12 C 0 add just to produce carry
+ adc %r9, %rdx C 1
+ mov %rdx, %r10 C 1
+ mulx( dinv, %rdx, %r12) C next quotient
+ adc %rax, %rcx C 2
+ setc R8(%rax) C 3
+ mov 16(up), %r9 C 2
+ add %rcx, %r9 C 2
+ adc $0, R32(%rax) C 3
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %r10, (up)
+ mov %r9, 8(up)
+')
+ifelse(VER,3,`
+ xor R32(%rax), R32(%rax)
+ mov (up), %r10
+ mov 8(up), %r9
+L(o2): mulx( (dp), %r12, %r11)
+ add %r10, %r12 C 0 add just to produce carry
+ mulx( 8,(dp), %rdx, %rcx)
+ adc %r11, %rdx C 1
+ adc $0, %rcx C 2
+ add %r9, %rdx C 1
+ mov %rdx, %r10 C 1
+ mulx( dinv, %rdx, %r12) C next quotient
+ adc %rax, %rcx C 2
+ setc R8(%rax) C 3
+ mov 16(up), %r9 C 2
+ add %rcx, %r9 C 2
+ adc $0, R32(%rax) C 3
+ lea 8(up), up
+ dec un
+ jnz L(o2)
+
+ mov %r10, (up)
+ mov %r9, 8(up)
+')
+ pop %r14
+ pop %r12
+ FUNC_EXIT()
+ ret
+
+ifelse(eval(MAX_SPECIAL>=3),1,`
+L(3): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o3): xor R32(%rcx), R32(%rcx) C clear rcx, CF, and OF
+ mulx( (dp), w0, w1) C 0 1
+ adox( %rax, w0) C 0
+ mulx( 8,(dp), %rax, w3) C 1 2
+ adcx( w1, %rax) C 1
+ adox( %rbx, %rax) C 1
+ mulx( 16,(dp), %rbx, w1) C 2 3
+ mov dinv, q C 1
+ mulx( %rax, q, w0)
+ adcx( w3, %rbx) C 2
+ adox( 16,(up), %rbx) C 2
+ adox( %rcx, w1) C 3
+ adc $0, w1 C 3
+ bt $0, R32(%r13)
+ adc w1, 24(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o3)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=4),1,`
+L(4): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o4): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ mov dinv, q
+ mulx( %rax, q, w2)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ adox( 24,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 24(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 32(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o4)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=5),1,`
+L(5): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o5): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w0, w1)
+ adox( %rax, w0)
+ mulx( 8,(dp), %rax, w3)
+ adcx( w1, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w1)
+ adcx( w3, %rbx)
+ adox( 16,(up), %rbx)
+ mulx( 24,(dp), w2, w3)
+ adcx( w1, w2)
+ mulx( 32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 24(up)
+ adox( 32,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 32(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 40(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o5)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=6),1,`
+L(6): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp C free up rdx
+ xor %r13, %r13
+ sub dn_param, un C outer loop count
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o6): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ adox( 24,(up), w0)
+ mulx( 32,(dp), w2, w3)
+ mov w0, 24(up)
+ adcx( w1, w2)
+ mulx( 40,(dp), w0, w1)
+ adox( 32,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 32(up)
+ adox( 40,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 40(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 48(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o6)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=7),1,`
+L(7): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp
+ xor %r13, %r13
+ sub dn_param, un
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o7): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w0, w1)
+ adox( %rax, w0)
+ mulx( 8,(dp), %rax, w3)
+ adcx( w1, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w1)
+ adcx( w3, %rbx)
+ mulx( 24,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 16,(up), %rbx)
+ mulx( 32,(dp), w0, w1)
+ adox( 24,(up), w2)
+ adcx( w3, w0)
+ mov w2, 24(up)
+ adox( 32,(up), w0)
+ mulx( 40,(dp), w2, w3)
+ mov w0, 32(up)
+ adcx( w1, w2)
+ mulx( 48,(dp), w0, w1)
+ adox( 40,(up), w2)
+ adcx( w3, w0)
+ mov w2, 40(up)
+ mov %rax, q
+ mulx( dinv, q, w2)
+ adox( 48,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 48(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 56(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o7)
+ jmp L(esma)
+')
+
+ifelse(eval(MAX_SPECIAL>=8),1,`
+L(8): push %rbx
+ push %r12
+ push %r13
+ push %r14
+
+ mov dp_param, dp
+ xor %r13, %r13
+ sub dn_param, un
+ mov (up), %rax
+ mov 8(up), %rbx
+ mov %rax, q
+ imul dinv, q
+L(o8): xor R32(%rcx), R32(%rcx)
+ mulx( (dp), w2, w3)
+ adox( %rax, w2)
+ mulx( 8,(dp), %rax, w1)
+ adcx( w3, %rax)
+ adox( %rbx, %rax)
+ mulx( 16,(dp), %rbx, w3)
+ adcx( w1, %rbx)
+ mulx( 24,(dp), w0, w1)
+ adox( 16,(up), %rbx)
+ adcx( w3, w0)
+ mulx( 32,(dp), w2, w3)
+ adcx( w1, w2)
+ adox( 24,(up), w0)
+ mov w0, 24(up)
+ mulx( 40,(dp), w0, w1)
+ adox( 32,(up), w2)
+ adcx( w3, w0)
+ mov w2, 32(up)
+ adox( 40,(up), w0)
+ mulx( 48,(dp), w2, w3)
+ mov w0, 40(up)
+ adcx( w1, w2)
+ mulx( 56,(dp), w0, w1)
+ adox( 48,(up), w2)
+ adcx( w3, w0)
+ mov dinv, q
+ mulx( %rax, q, w3)
+ mov w2, 48(up)
+ adox( 56,(up), w0)
+ adox( %rcx, w1)
+ mov w0, 56(up)
+ adc %rcx, w1
+ bt $0, R32(%r13)
+ adc w1, 64(up)
+ setc R8(%r13)
+ lea 8(up), up
+ dec un
+ jnz L(o8)
+ jmp L(esma)
+')
+
+L(esma):mov %rax, (up)
+ mov %rbx, 8(up)
+ mov %r13, %rax
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+
+ JUMPTABSECT
+ ALIGN(8)
+L(atab):JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ JMPENT( L(1), L(atab))
+ JMPENT( L(2), L(atab))
+ JMPENT( L(3), L(atab))
+ JMPENT( L(4), L(atab))
+ JMPENT( L(5), L(atab))
+ JMPENT( L(6), L(atab))
+ JMPENT( L(7), L(atab))
+ JMPENT( L(8), L(atab))
+ TEXT
+EPILOGUE()
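
mpn_sbpi1_bdiv_r performs schoolbook Hensel (right-to-left, 2-adic) division of {up,un} by the odd divisor {dp,dn}, using the precomputed limb dinv, a modular inverse related to dp[0], so that each quotient limb cancels the current low limb of U. The C sketch below shows only that core idea under the assumption dinv = dp[0]^-1 mod B; it is not GMP's exact contract (in particular it drops the carry/borrow return value), and the names are illustrative.

    #include <stdint.h>

    typedef uint64_t mp_limb_t;              /* 64-bit limbs assumed */

    /* rp[0..n-1] -= up[0..n-1] * v0; returns the borrow out of the top limb. */
    static mp_limb_t
    ref_submul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0)
    {
      mp_limb_t bw = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v0 + bw;
          mp_limb_t lo = (mp_limb_t) t;
          bw = (mp_limb_t) (t >> 64) + (rp[i] < lo);
          rp[i] -= lo;
        }
      return bw;
    }

    /* Zero the low un-dn limbs of U by subtracting suitable multiples of D. */
    void
    hensel_bdiv_r_sketch (mp_limb_t *up, long un,
                          const mp_limb_t *dp, long dn, mp_limb_t dinv)
    {
      for (long i = 0; i < un - dn; i++)
        {
          mp_limb_t q = up[i] * dinv;                /* q*dp[0] == up[i] (mod B) */
          mp_limb_t b = ref_submul_1 (up + i, dp, dn, q);
          for (long k = i + dn; b != 0 && k < un; k++)   /* propagate the borrow */
            {
              mp_limb_t t = up[k];
              up[k] = t - b;
              b = (t < b);
            }
        }
      /* Now up[0..un-dn-1] == 0 and up[un-dn..un-1] holds the Hensel remainder. */
    }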
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm
new file mode 100644
index 0000000..e81b01b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm
@@ -0,0 +1,839 @@
+dnl AMD64 mpn_sqr_basecase optimised for Intel Broadwell.
+
+dnl Copyright 2015, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 addmul_1
+C AMD K8,K9 n/a n/a
+C AMD K10 n/a n/a
+C AMD bd1 n/a n/a
+C AMD bd2 n/a n/a
+C AMD bd3 n/a n/a
+C AMD bd4 ? ?
+C AMD zen ? ?
+C AMD bt1 n/a n/a
+C AMD bt2 n/a n/a
+C Intel P4 n/a n/a
+C Intel PNR n/a n/a
+C Intel NHM n/a n/a
+C Intel SBR n/a n/a
+C Intel IBR n/a n/a
+C Intel HWL 1.68 n/a
+C Intel BWL 1.51 1.67-1.74
+C Intel SKL 1.52 1.63-1.71
+C Intel atom n/a n/a
+C Intel SLM n/a n/a
+C VIA nano n/a n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * We have 8 addmul_1 loops which fall into each other. The idea is to save
+C on switching code, since a circularly updated computed goto target will
+C hardly allow correct branch prediction. On 2nd thought, we now might make
+C each of the 8 loop branches be poorly predicted since they will be
+C executed fewer times for each time. With just one addmul_1 loop, the loop
+C count will change only once each 8th time.
+C * Do overlapped software pipelining.
+C * Perhaps load in shrx/sarx, eliminating separate load insn.
+C * Schedule add+stored in small n code.
+C * Try swapping adox and adcx insn, making mulx have more time to run.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+
+define(`n', `%rcx')
+define(`un_save', `%rbx')
+define(`u0', `%rdx')
+
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ cmp $2, un_param
+ jae L(gt1)
+
+ mov (up), %rdx
+ mulx( %rdx, %rax, %rdx)
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): jne L(gt2)
+
+ mov (up), %rdx
+ mov 8(up), %rcx
+ mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
+ mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
+ mov %rcx, %rdx
+ mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
+ add %r9, %r9 C W 1
+ adc %r10, %r10 C W 2
+ adc $0, %rdx C W 3
+ add %r9, %r8 C W 1
+ adc %r11, %r10 C W 2
+ adc $0, %rdx C W 3
+ mov %rax, (rp)
+ mov %r8, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt2): cmp $4, un_param
+ jae L(gt3)
+
+ push %rbx
+ mov (up), %rdx
+ mulx( 8,(up), w2, w3)
+ mulx( 16,(up), w0, w1)
+ add w3, w0
+ mov 8(up), %rdx
+ mulx( 16,(up), %rax, w3)
+ adc %rax, w1
+ adc $0, w3
+ test R32(%rbx), R32(%rbx)
+ mov (up), %rdx
+ mulx( %rdx, %rbx, %rcx)
+ mov %rbx, (rp)
+ mov 8(up), %rdx
+ mulx( %rdx, %rax, %rbx)
+ mov 16(up), %rdx
+ mulx( %rdx, %rsi, %rdx)
+ adcx( w2, w2)
+ adcx( w0, w0)
+ adcx( w1, w1)
+ adcx( w3, w3)
+ adox( w2, %rcx)
+ adox( w0, %rax)
+ adox( w1, %rbx)
+ adox( w3, %rsi)
+ mov $0, R32(%r8)
+ adox( %r8, %rdx)
+ adcx( %r8, %rdx)
+ mov %rcx, 8(rp)
+ mov %rax, 16(rp)
+ mov %rbx, 24(rp)
+ mov %rsi, 32(rp)
+ mov %rdx, 40(rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(gt3): push %rbx
+
+ lea -3(un_param), R32(un_save)
+ lea 5(un_param), R32(n)
+ mov R32(un_param), R32(%rax)
+ and $-8, R32(un_save)
+ shr $3, R32(n) C count for mul_1 loop
+	neg	un_save			C 8*count and offset for addmul_1 loops
+ and $7, R32(%rax) C clear CF for adc as side-effect
+
+ mov (up), u0
+
+ lea L(mtab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %r8
+ lea (%r8, %r10), %r10
+ jmp *%r10
+',`
+ jmp *(%r10,%rax,8)
+')
+
+L(mf0): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ lea 64(up), up
+ add w1, w2
+ jmp L(mb0)
+
+L(mf3): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mov w2, (rp)
+ mulx( 8,(up), w0, w1)
+ lea 24(up), up
+ lea 24(rp), rp
+ add w3, w0
+ jmp L(mb3)
+
+L(mf4): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ add w1, w2
+ jmp L(mb4)
+
+L(mf5): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
+ lea 40(up), up
+ lea 40(rp), rp
+ add w3, w0
+ jmp L(mb5)
+
+L(mf6): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
+ lea 48(up), up
+ lea 48(rp), rp
+ add w1, w2
+ jmp L(mb6)
+
+L(mf7): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
+ lea 56(up), up
+ lea 56(rp), rp
+ add w3, w0
+ jmp L(mb7)
+
+L(mf1): mulx( u0, w2, w3) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w0, w1)
+ mov w2, (rp)
+ lea 8(up), up
+ lea 8(rp), rp
+ add w3, w0
+ jmp L(mb1)
+
+L(mf2): mulx( u0, w0, w1) C up[0]^2
+ add u0, u0
+ mulx( 8,(up), w2, w3)
+ mov w0, (rp)
+ lea 16(up), up
+ lea 16(rp), rp
+ dec R32(n)
+ add w1, w2
+ mulx( (up), w0, w1)
+
+ ALIGN(16)
+L(top): mov w2, -8(rp)
+ adc w3, w0
+L(mb1): mulx( 8,(up), w2, w3)
+ adc w1, w2
+ lea 64(up), up
+L(mb0): mov w0, (rp)
+ mov w2, 8(rp)
+ mulx( -48,(up), w0, w1)
+ lea 64(rp), rp
+ adc w3, w0
+L(mb7): mulx( -40,(up), w2, w3)
+ mov w0, -48(rp)
+ adc w1, w2
+L(mb6): mov w2, -40(rp)
+ mulx( -32,(up), w0, w1)
+ adc w3, w0
+L(mb5): mulx( -24,(up), w2, w3)
+ mov w0, -32(rp)
+ adc w1, w2
+L(mb4): mulx( -16,(up), w0, w1)
+ mov w2, -24(rp)
+ adc w3, w0
+L(mb3): mulx( -8,(up), w2, w3)
+ adc w1, w2
+ mov w0, -16(rp)
+ dec R32(n)
+ mulx( (up), w0, w1)
+ jnz L(top)
+
+L(end): mov w2, -8(rp)
+ adc w3, w0
+C mov w0, (rp)
+C adc %rcx, w1
+C mov w1, 8(rp)
+
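+C Dispatch to the addmul feed-ins f0..f7 through L(atab), again indexed by
+C un mod 8 (still in rax); rax is then reloaded with 63, the shift count
+C used by shrx/sarx in those feed-in blocks.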
+ lea L(atab)(%rip), %r10
+ifdef(`PIC',
+` movslq (%r10,%rax,4), %r11
+ lea (%r11, %r10), %r11
+',`
+ mov (%r10,%rax,8), %r11
+')
+ mov $63, R32(%rax)
+ jmp *%r11
+
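+C Each ed*/f* block sets up the next addmul_1-style pass: the doubled
+C multiplier for the new row is rebuilt as 2*up[0] plus bit 63 of up[-1]
+C (shrx + lea), and the initial carry is up[0] masked by the sign of
+C up[-1] (sarx + and), matching the "ci" and "u0" variables referenced in
+C the comments from the corresponding C code.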
+L(ed0): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f7): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea -64(up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov (up), w1 C up[-1]
+ mov 8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ jmp L(b7)
+
+ ALIGN(16)
+L(tp0): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed0)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+L(b0): mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp0)
+
+L(ed1): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f0): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea -64(up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -8(up), w3 C up[-1]
+ mov (up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ jmp L(b0)
+
+ ALIGN(16)
+L(tp1): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed1)
+L(b1): mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp1)
+
+L(ed2): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f1): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea 8(un_save), un_save
+ lea -56(rp,un_save,8), rp
+ mov -16(up), w1 C up[-1]
+ mov -8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2) C FIXME: crossjump?
+ mulx( (up), w0, w1)
+ adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jmp L(b1)
+
+ ALIGN(16)
+L(tp2): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed2)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+L(b2): adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp2)
+
+L(ed3): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f2): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ or R32(un_save), R32(n)
+ jz L(cor3)
+ lea -56(rp,un_save,8), rp
+ mov -24(up), w3 C up[-1]
+ mov -16(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ jmp L(b2)
+
+ ALIGN(16)
+L(tp3): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed3)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+L(b3): mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp3)
+
+L(ed4): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f3): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -32(up), w1 C up[-1]
+ mov -24(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2)
+ jmp L(b3)
+
+ ALIGN(16)
+L(tp4): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed4)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+L(b4): mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp4)
+
+L(ed5): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f4): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -40(up), w3 C up[-1]
+ mov -32(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ jmp L(b4)
+
+ ALIGN(16)
+L(tp5): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed5)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+L(b5): mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp5)
+
+L(ed6): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f5): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -48(up), w1 C up[-1]
+ mov -40(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2)
+ jmp L(b5)
+
+ ALIGN(16)
+L(tp6): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed6)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+ adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+L(b6): adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp6)
+
+L(ed7): adox( (rp), w0)
+ adox( %rcx, w1) C relies on rcx = 0
+L(f6): mov w0, (rp)
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 8(rp)
+ lea (up,un_save,8), up
+ mov R32(un_save), R32(n)
+ lea -56(rp,un_save,8), rp
+ mov -56(up), w3 C up[-1]
+ mov -48(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ mulx( -40,(up), w2, w3)
+ jmp L(b6)
+
+ ALIGN(16)
+L(tp7): adox( -8,(rp), w2)
+ adcx( w3, w0)
+ mov w2, -8(rp)
+ jrcxz L(ed7)
+ mulx( 8,(up), w2, w3)
+ adox( (rp), w0)
+ lea 8(n), R32(n)
+ mov w0, (rp)
+L(b7): adcx( w1, w2)
+ mulx( 16,(up), w0, w1)
+ adcx( w3, w0)
+ adox( 8,(rp), w2)
+ mov w2, 8(rp)
+ mulx( 24,(up), w2, w3)
+ lea 64(up), up
+ adcx( w1, w2)
+ adox( 16,(rp), w0)
+ mov w0, 16(rp)
+ mulx( -32,(up), w0, w1)
+ adox( 24,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 24(rp)
+ mulx( -24,(up), w2, w3)
+ adcx( w1, w2)
+ adox( 32,(rp), w0)
+ mov w0, 32(rp)
+ mulx( -16,(up), w0, w1)
+ adox( 40,(rp), w2)
+ adcx( w3, w0)
+ mov w2, 40(rp)
+ adox( 48,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 48(rp)
+ lea 64(rp), rp
+ adcx( w1, w2)
+ mulx( (up), w0, w1)
+ jmp L(tp7)
+
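+C L(cor3): straight-line code for the final three rows, entered once the
+C "or/jz" after L(f2) finds no further full passes to run.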
+L(cor3):lea -64(rp), rp
+ mov -24(up), w3 C up[-1]
+ mov -16(up), u0 C up[0]
+ shrx( %rax, w3, w2)
+ sarx( %rax, w3, w3)
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ lea (w2,u0,2), u0 C "u0" arg in C code
+ adcx( w3, w0)
+ adox( 56,(rp), w0)
+ mulx( -8,(up), w2, w3)
+ mov w0, 56(rp)
+ adcx( w1, w2)
+ mulx( (up), %rbx, w1)
+ adox( 64,(rp), w2)
+ adcx( w3, %rbx)
+ mov w2, 64(rp)
+ adox( 72,(rp), %rbx)
+ adox( %rcx, w1) C relies on rcx = 0
+ adc %rcx, w1 C relies on rcx = 0
+ mov w1, 80(rp) C FIXME
+C wd2
+ mov -16(up), w1 C up[-1]
+ mov -8(up), u0 C up[0]
+ shrx( %rax, w1, w0)
+ sarx( %rax, w1, w1)
+ and u0, w1 C "ci" in C code
+ mulx( u0, w2, w3) C up[0]^2
+ lea (w0,u0,2), u0 C "u0" arg in C code
+ adcx( w1, w2)
+ mulx( (up), w0, %rax)
+ adox( %rbx, w2)
+ adcx( w3, w0)
+ mov w2, 72(rp)
+ adox( 80,(rp), w0)
+ adox( %rcx, %rax) C relies on rcx = 0
+ mov w0, 80(rp)
+ adc %rcx, %rax C relies on rcx = 0
+C wd1
+ mov -8(up), w3 C up[-1]
+ mov (up), u0 C up[0]
+ sar $63, w3
+ and u0, w3 C "ci" in C code
+ mulx( u0, w0, w1) C up[0]^2
+ adcx( w3, w0)
+ adox( %rax, w0)
+ mov w0, 88(rp)
+ adcx( %rcx, w1)
+ adox( %rcx, w1)
+ mov w1, 96(rp)
+
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
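+C Jump tables: L(mtab) selects the mul_1 feed-in and L(atab) the addmul
+C feed-in, both indexed by un mod 8 (note the rotated entry order).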
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf7), L(mtab))
+ JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
+L(atab):JMPENT( L(f6), L(atab))
+ JMPENT( L(f7), L(atab))
+ JMPENT( L(f0), L(atab))
+ JMPENT( L(f1), L(atab))
+ JMPENT( L(f2), L(atab))
+ JMPENT( L(f3), L(atab))
+ JMPENT( L(f4), L(atab))
+ JMPENT( L(f5), L(atab))
+ TEXT
+EPILOGUE()