Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/coreinhm')
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm      200
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm      190
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h       238
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm        196
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm       182
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm         549
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm   37
7 files changed, 1592 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm
new file mode 100644
index 0000000..eed64e7
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm
@@ -0,0 +1,200 @@
+dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
+dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
+dnl Optimised for Nehalem.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 4.75
+C Intel P4 ?
+C Intel core2 2.8-3
+C Intel NHM 2.8
+C Intel SBR 3.55
+C Intel atom ?
+C VIA nano ?
+
+C The inner loop probably runs close to optimally on Nehalem (using 4-way
+C unrolling). The rest of the code is quite crude, and could perhaps be made
+C both smaller and faster.
+
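
For reference, the operation implemented here can be written in portable C. A minimal sketch, assuming 64-bit limbs, n >= 1, and a shift count 0 < cnt < 64 (mpn conventions); ref_addlsh_n is an illustrative name, not GMP's API:

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* rp[] = up[] + (vp[] << cnt); returns the carry-out limb.
       mpn_rsblsh_n is the same skeleton with sub/sbb (ADDSUB/ADCSBB). */
    mp_limb_t
    ref_addlsh_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                  long n, unsigned cnt)
    {
      mp_limb_t shifted_in = 0, cy = 0;
      for (long i = 0; i < n; i++)
        {
          mp_limb_t sh = (vp[i] << cnt) | shifted_in; /* limb i of vp << cnt */
          shifted_in = vp[i] >> (64 - cnt);           /* bits spilling into limb i+1 */
          mp_limb_t s = up[i] + sh;
          mp_limb_t c1 = s < sh;                      /* carry from the add */
          rp[i] = s + cy;
          cy = c1 + (rp[i] < s);                      /* propagate to next limb */
        }
      return cy + shifted_in;   /* top bits of the shift plus the final carry */
    }
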
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+define(`cy', `%r9') C for _nc variant
+
+ifdef(`OPERATION_addlsh_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(IFRSB, )
+ define(func_n, mpn_addlsh_n)
+ define(func_nc, mpn_addlsh_nc)')
+ifdef(`OPERATION_rsblsh_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(IFRSB, `$1')
+ define(func_n, mpn_rsblsh_n)
+ define(func_nc, mpn_rsblsh_nc)')
+
+C mpn_rsblsh_nc is omitted below; its idea of carry-in is inconsistent with
+C refmpn_rsblsh_nc.
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
+ push %rbx
+ xor R32(%rbx), R32(%rbx) C clear CF save register
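+C The carry is kept live across the shrd work via %rbx: `add %rbx,%rbx'
+C regenerates CF from the saved value (0 or -1), and `sbb %rbx,%rbx'
+C captures the new CF as -CF for the next block.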
+L(ent): push %rbp
+ mov R32(n), R32(%rbp)
+ mov n, %rax
+
+ mov R32(cnt), R32(%rcx)
+ neg R32(%rcx)
+
+ lea -8(up,%rax,8), up
+ lea -8(vp,%rax,8), vp
+ lea -40(rp,%rax,8), rp
+ neg %rax
+
+ and $3, R32(%rbp)
+ jz L(b0)
+ cmp $2, R32(%rbp)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): xor R32(%r9), R32(%r9)
+ mov 8(vp,%rax,8), %r10
+ mov 16(vp,%rax,8), %r11
+ shrd %cl, %r10, %r9
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(up,%rax,8), %r9
+ mov 24(vp,%rax,8), %r8
+ ADCSBB 16(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $3, %rax
+ jmp L(lo3)
+
+L(b0): mov 8(vp,%rax,8), %r9
+ xor R32(%r8), R32(%r8)
+ shrd %cl, %r9, %r8
+ mov 16(vp,%rax,8), %r10
+ mov 24(vp,%rax,8), %r11
+ shrd %cl, %r10, %r9
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(up,%rax,8), %r8
+ mov %r8, 40(rp,%rax,8) C offset 40
+ ADCSBB 16(up,%rax,8), %r9
+ mov 32(vp,%rax,8), %r8
+ ADCSBB 24(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $4, %rax
+ jmp L(lo0)
+
+L(b1): mov 8(vp,%rax,8), %r8
+ add $1, %rax
+ jz L(1)
+ mov 8(vp,%rax,8), %r9
+ xor R32(%rbp), R32(%rbp)
+ jmp L(lo1)
+L(1): xor R32(%r11), R32(%r11)
+ jmp L(wd1)
+
+L(b2): xor %r10, %r10
+ mov 8(vp,%rax,8), %r11
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ mov 16(vp,%rax,8), %r8
+ ADCSBB 8(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $2, %rax
+ jz L(end)
+
+ ALIGN(16)
+L(top): mov 8(vp,%rax,8), %r9
+ mov %r11, %rbp
+L(lo2): mov %r10, 24(rp,%rax,8) C offset 24
+L(lo1): shrd %cl, %r8, %rbp
+ shrd %cl, %r9, %r8
+ mov 16(vp,%rax,8), %r10
+ mov 24(vp,%rax,8), %r11
+ shrd %cl, %r10, %r9
+ shrd %cl, %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up,%rax,8), %rbp
+ ADCSBB 8(up,%rax,8), %r8
+ mov %r8, 40(rp,%rax,8) C offset 40
+ ADCSBB 16(up,%rax,8), %r9
+ mov 32(vp,%rax,8), %r8
+ ADCSBB 24(up,%rax,8), %r10
+ sbb R32(%rbx), R32(%rbx)
+ add $4, %rax
+ mov %rbp, (rp,%rax,8) C offset 32
+L(lo0):
+L(lo3): mov %r9, 16(rp,%rax,8) C offset 48
+ jnz L(top)
+
+L(end): mov %r10, 24(rp,%rax,8)
+L(wd1): shrd %cl, %r8, %r11
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up,%rax,8), %r11
+ mov %r11, 32(rp,%rax,8) C offset 32
+ adc R32(%rax), R32(%rax) C rax is zero after loop
+ shr R8(%rcx), %r8
+ ADDSUB %r8, %rax
+IFRSB( neg %rax)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
+IFDOS(` mov 64(%rsp), %r9 ') C cy
+ push %rbx
+ neg cy
+ sbb R32(%rbx), R32(%rbx) C initialise CF save register
+ jmp L(ent)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm
new file mode 100644
index 0000000..1be829f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm
@@ -0,0 +1,190 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nehalem.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.0
+C AMD K10 4.0
+C AMD bull 5.0
+C AMD pile 4.84 5.39
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.56
+C AMD jaguar 5.30
+C Intel P4 15.7 17.2
+C Intel core2 5.15
+C Intel NHM 4.56
+C Intel SBR 3.44
+C Intel HWL 3.03
+C Intel BWL 2.77
+C Intel SKL 2.76
+C Intel atom 21
+C Intel SLM 11
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C N.B.: Be careful when editing that the loop alignment padding does not
+C become large, as we currently fall into it.
+
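The loop's net effect, sketched in C assuming 64-bit limbs and the unsigned __int128 GCC/Clang extension (ref_addmul_1 is an illustrative name; the real code differs only in its 4-way unrolling and software pipelining):

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* rp[] += up[] * v0; returns the high carry limb.  mpn_submul_1 is
       identical with subtraction and borrow (ADDSUB above). */
    mp_limb_t
    ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v0 + rp[i] + cy;
          rp[i] = (mp_limb_t) p;          /* low limb of the running sum */
          cy = (mp_limb_t) (p >> 64);     /* high limb carries into rp[i+1] */
        }
      return cy;
    }
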
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%rbx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ mov (up), %rax
+ lea -8(up,n_param,8), up
+ mov (rp), %r8
+ lea -8(rp,n_param,8), rp
+
+ test $1, R8(n_param)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n_param)
+ jnz L(b10)
+
+L(b00): mov $3, R32(n)
+ sub n_param, n
+ mul v0
+ mov $0, R32(%r11)
+ mov %r8, %r10
+ ADDSUB %rax, %r10
+ mov -8(up,n,8), %rax
+ adc %rdx, %r11
+ jmp L(lo0)
+
+L(b10): mov $1, R32(n)
+ sub n_param, n
+ mul v0
+ mov %r8, %r10
+ mov $0, R32(%r11)
+ ADDSUB %rax, %r10
+ mov 8(up,n,8), %rax
+ adc %rdx, %r11
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n_param)
+ jz L(b01)
+
+L(b11): mov $2, R32(n)
+ sub n_param, n
+ mul v0
+ ADDSUB %rax, %r8
+ mov $0, R32(%r9)
+ mov (up,n,8), %rax
+ adc %rdx, %r9
+ jmp L(lo3)
+
+L(b01): mov $0, R32(n)
+ sub n_param, n
+ xor %r11, %r11
+ add $4, n
+ jc L(end)
+
+ ALIGN(32)
+L(top): mul v0
+ ADDSUB %rax, %r8
+ mov $0, R32(%r9)
+ mov -16(up,n,8), %rax
+ adc %rdx, %r9
+L(lo1): mul v0
+ ADDSUB %r11, %r8
+ mov $0, R32(%r11)
+ mov -16(rp,n,8), %r10
+ adc $0, %r9
+ ADDSUB %rax, %r10
+ mov -8(up,n,8), %rax
+ adc %rdx, %r11
+ mov %r8, -24(rp,n,8)
+ ADDSUB %r9, %r10
+ adc $0, %r11
+L(lo0): mov -8(rp,n,8), %r8
+ mul v0
+ ADDSUB %rax, %r8
+ mov $0, R32(%r9)
+ mov (up,n,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(rp,n,8)
+ ADDSUB %r11, %r8
+ adc $0, %r9
+L(lo3): mul v0
+ mov (rp,n,8), %r10
+ mov $0, R32(%r11)
+ ADDSUB %rax, %r10
+ mov 8(up,n,8), %rax
+ adc %rdx, %r11
+ mov %r8, -8(rp,n,8)
+ ADDSUB %r9, %r10
+ adc $0, %r11
+L(lo2): mov 8(rp,n,8), %r8
+ mov %r10, (rp,n,8)
+ add $4, n
+ jnc L(top)
+
+L(end): mul v0
+ ADDSUB %rax, %r8
+ mov $0, R32(%rax)
+ adc %rdx, %rax
+ ADDSUB %r11, %r8
+ adc $0, %rax
+ mov %r8, (rp)
+
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h
new file mode 100644
index 0000000..f56c128
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h
@@ -0,0 +1,238 @@
+/* Nehalem gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 2933-3200 MHz Intel Xeon X3470 Nehalem */
+/* FFT tuning limit = 468,424,931 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
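Each threshold below is an operand size, in limbs, at which one algorithm overtakes another. Roughly how such a value is consumed (hypothetical helper names; GMP's real dispatch lives in mpn/generic/):

    #define MUL_TOOM22_THRESHOLD 18        /* the Nehalem-tuned value below */

    static void mul_basecase (long n) { (void) n; /* schoolbook O(n^2) */ }
    static void mul_toom22   (long n) { (void) n; /* Karatsuba splitting */ }

    static void
    mul_dispatch (long n)
    {
      if (n < MUL_TOOM22_THRESHOLD)
        mul_basecase (n);   /* small operands: the basecase is faster */
      else
        mul_toom22 (n);     /* at and above the threshold, Toom22 wins */
    }
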
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 10
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 17
+
+#define DIV_1_VS_MUL_1_PERCENT 301
+
+#define MUL_TOOM22_THRESHOLD 18
+#define MUL_TOOM33_THRESHOLD 59
+#define MUL_TOOM44_THRESHOLD 169
+#define MUL_TOOM6H_THRESHOLD 230
+#define MUL_TOOM8H_THRESHOLD 333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 104
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 101
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 147
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 98
+#define SQR_TOOM4_THRESHOLD 250
+#define SQR_TOOM6_THRESHOLD 351
+#define SQR_TOOM8_THRESHOLD 478
+
+#define MULMID_TOOM42_THRESHOLD 28
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 13
+
+#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 372, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31, 8}, { 511,10}, \
+ { 135,11}, { 79,10}, { 159, 9}, { 319,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
+ { 319,12}, { 95,11}, { 191,10}, { 383,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,11}, { 319,10}, { 639,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
+ { 447,10}, { 895,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 639,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,11}, { 895,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 543,11}, { 1087,12}, { 607,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 831,13}, \
+ { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 959,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1215,12}, \
+ { 2431,14}, { 639,13}, { 1343,12}, { 2687,13}, \
+ { 1407,12}, { 2815,13}, { 1471,14}, { 767,13}, \
+ { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2815,15}, { 767,14}, { 1663,13}, { 3455,14}, \
+ { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
+ { 1535,14}, { 3455,15}, { 1791,14}, { 3839,16}, \
+ { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \
+ { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,17}, { 1023,16}, \
+ { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,14}, { 15359,17}, { 2047,16}, { 4607,15}, \
+ { 9983,16}, { 5631,15}, { 11775,17}, { 3071,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 204
+#define MUL_FFT_THRESHOLD 4224
+
+#define SQR_FFT_MODF_THRESHOLD 336 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 336, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \
+ { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 135,11}, \
+ { 79, 9}, { 319, 6}, { 2687, 7}, { 1407, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \
+ { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \
+ { 319,12}, { 95,11}, { 191,10}, { 383,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,11}, \
+ { 303,10}, { 607,11}, { 319,10}, { 639,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
+ { 447,10}, { 895,11}, { 479,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,12}, \
+ { 287,11}, { 607,12}, { 319,11}, { 671,12}, \
+ { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,12}, { 447,11}, \
+ { 895,12}, { 479,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \
+ { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \
+ { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \
+ { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,12}, { 2431,14}, \
+ { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
+ { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3071,14}, { 1663,13}, \
+ { 3455,14}, { 1919,16}, { 511,15}, { 1023,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3455,15}, { 1791,14}, \
+ { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \
+ { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6655,16}, \
+ { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \
+ { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \
+ { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 218
+#define SQR_FFT_THRESHOLD 3520
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 49
+#define MULLO_MUL_N_THRESHOLD 8397
+#define SQRLO_BASECASE_THRESHOLD 10
+#define SQRLO_DC_THRESHOLD 11
+#define SQRLO_SQR_THRESHOLD 7035
+
+#define DC_DIV_QR_THRESHOLD 47
+#define DC_DIVAPPR_Q_THRESHOLD 151
+#define DC_BDIV_QR_THRESHOLD 40
+#define DC_BDIV_Q_THRESHOLD 30
+
+#define INV_MULMOD_BNM1_THRESHOLD 34
+#define INV_NEWTON_THRESHOLD 199
+#define INV_APPR_THRESHOLD 157
+
+#define BINV_NEWTON_THRESHOLD 254
+#define REDC_1_TO_REDC_N_THRESHOLD 48
+
+#define MU_DIV_QR_THRESHOLD 1334
+#define MU_DIVAPPR_Q_THRESHOLD 1334
+#define MUPI_DIV_QR_THRESHOLD 83
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1308
+
+#define POWM_SEC_TABLE 1,64,66,452,1486
+
+#define GET_STR_DC_THRESHOLD 11
+#define GET_STR_PRECOMPUTE_THRESHOLD 18
+#define SET_STR_DC_THRESHOLD 141
+#define SET_STR_PRECOMPUTE_THRESHOLD 1023
+
+#define FAC_DSC_THRESHOLD 182
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD2_DIV1_METHOD 5 /* 2.91% faster than 3 */
+#define HGCD_THRESHOLD 116
+#define HGCD_APPR_THRESHOLD 164
+#define HGCD_REDUCE_THRESHOLD 2205
+#define GCD_DC_THRESHOLD 321
+#define GCDEXT_DC_THRESHOLD 358
+#define JACOBI_BASE_METHOD 4 /* 0.12% faster than 1 */
+
+/* Tuneup completed successfully, took 452116 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm
new file mode 100644
index 0000000..a5a63e4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm
@@ -0,0 +1,196 @@
+dnl AMD64 mpn_hamdist -- hamming distance.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 3.26
+C AMD bd1 4.2
+C AMD bd2 4.2
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 1.15
+C AMD bobcat 7.29
+C AMD jaguar 2.53
+C Intel P4 n/a
+C Intel core2 n/a
+C Intel NHM 2.03
+C Intel SBR 1.66
+C Intel IBR 1.62
+C Intel HWL 1.50
+C Intel BWL 1.50
+C Intel SKL 1.50
+C Intel atom n/a
+C Intel SLM 2.55
+C VIA nano n/a
+
+C TODO
+C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
+C Intel hardware. Perhaps mix such a loop with popcnt instructions.
+C * The random placement of the L0, L1, L2, etc blocks is due to branch
+C   shortening.  More work could be done there.
+C * Combine the accumulators rax and rcx into one register to save some
+C   bookkeeping and a push/pop pair.  Unfortunately this causes a slight
+C   slowdown for at least NHM and SBR.
+
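The routine computes the population count of the XOR of two n-limb operands, splitting the running total across rax and rcx (see the TODO above) to hide popcnt latency. A minimal C reference, assuming 64-bit limbs and GCC/Clang builtins:

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* Hamming distance: bit positions where {up,n} and {vp,n} differ. */
    mp_limb_t
    ref_hamdist (const mp_limb_t *up, const mp_limb_t *vp, long n)
    {
      mp_limb_t sum = 0;
      for (long i = 0; i < n; i++)
        sum += (mp_limb_t) __builtin_popcountll (up[i] ^ vp[i]);
      return sum;
    }
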
+define(`up', `%rdi')
+define(`vp', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`sum', `lea ($1,$2), $2')
+define(`sum', `add $1, $2')
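+C With m4, the second define() wins: `sum' expands to a plain add insn.
+C The lea variant above is kept as a drop-in alternative.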
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_hamdist)
+ FUNC_ENTRY(3)
+ push %rbx
+ push %rbp
+
+ mov (up), %r10
+ xor (vp), %r10
+
+ mov R32(n), R32(%r8)
+ and $3, R32(%r8)
+
+ xor R32(%rcx), R32(%rcx)
+ .byte 0xf3,0x49,0x0f,0xb8,0xc2 C popcnt %r10,%rax
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%r8,4), %r8
+ add %r9, %r8
+ jmp *%r8
+',`
+ jmp *(%r9,%r8,8)
+')
+
+L(3): mov 8(up), %r10
+ mov 16(up), %r11
+ xor 8(vp), %r10
+ xor 16(vp), %r11
+ xor R32(%rbp), R32(%rbp)
+ sub $4, n
+ jle L(x3)
+ mov 24(up), %r8
+ mov 32(up), %r9
+ add $24, up
+ add $24, vp
+ jmp L(e3)
+
+L(0): mov 8(up), %r9
+ xor 8(vp), %r9
+ mov 16(up), %r10
+ mov 24(up), %r11
+ xor R32(%rbx), R32(%rbx)
+ xor 16(vp), %r10
+ xor 24(vp), %r11
+ add $32, up
+ add $32, vp
+ sub $4, n
+ jle L(x4)
+
+ ALIGN(16)
+L(top):
+L(e0): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp
+ mov (up), %r8
+ mov 8(up), %r9
+ sum( %rbx, %rax)
+L(e3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx
+ xor (vp), %r8
+ xor 8(vp), %r9
+ sum( %rbp, %rcx)
+L(e2): .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp
+ mov 16(up), %r10
+ mov 24(up), %r11
+ add $32, up
+ sum( %rbx, %rax)
+L(e1): .byte 0xf3,0x49,0x0f,0xb8,0xd8 C popcnt %r8,%rbx
+ xor 16(vp), %r10
+ xor 24(vp), %r11
+ add $32, vp
+ sum( %rbp, %rcx)
+ sub $4, n
+ jg L(top)
+
+L(x4): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp
+ sum( %rbx, %rax)
+L(x3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx
+ sum( %rbp, %rcx)
+ .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp
+ sum( %rbx, %rax)
+ sum( %rbp, %rcx)
+L(x2): add %rcx, %rax
+L(x1): pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(2): mov 8(up), %r11
+ xor 8(vp), %r11
+ sub $2, n
+ jle L(n2)
+ mov 16(up), %r8
+ mov 24(up), %r9
+ xor R32(%rbx), R32(%rbx)
+ xor 16(vp), %r8
+ xor 24(vp), %r9
+ add $16, up
+ add $16, vp
+ jmp L(e2)
+L(n2): .byte 0xf3,0x49,0x0f,0xb8,0xcb C popcnt %r11,%rcx
+ jmp L(x2)
+
+L(1): dec n
+ jle L(x1)
+ mov 8(up), %r8
+ mov 16(up), %r9
+ xor 8(vp), %r8
+ xor 16(vp), %r9
+ xor R32(%rbp), R32(%rbp)
+ mov 24(up), %r10
+ mov 32(up), %r11
+ add $40, up
+ add $8, vp
+ jmp L(e1)
+
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm
new file mode 100644
index 0000000..0a3c867
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm
@@ -0,0 +1,182 @@
+dnl AMD64 mpn_popcount -- population count.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 1.39
+C AMD bd1 4
+C AMD bd2 4
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 0.72
+C AMD bobcat 5.78
+C AMD jaguar 1.27
+C Intel P4 n/a
+C Intel core2 n/a
+C Intel NHM 1.04
+C Intel SBR 1.02
+C Intel IBR 1.0
+C Intel HWL 1.0
+C Intel BWL 1.0
+C Intel SKL 1.0
+C Intel atom n/a
+C Intel SLM 1.34
+C VIA nano n/a
+
+C TODO
+C * We could approach 0.5 c/l for AMD Zen with more unrolling. That would
+C not cause any additional feed-in overhead as we already use a jump table.
+C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
+C Intel hardware. Perhaps mix such a loop with popcnt instructions.
+C * The random placement of the L0, L1, L2, etc blocks is due to branch
+C   shortening.
+
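As with hamdist, the count is split across two accumulators (rax and rcx) and folded at the end. The semantics in C, assuming 64-bit limbs and GCC/Clang builtins:

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* Population count over an n-limb operand. */
    mp_limb_t
    ref_popcount (const mp_limb_t *up, long n)
    {
      mp_limb_t sum = 0;
      for (long i = 0; i < n; i++)
        sum += (mp_limb_t) __builtin_popcountll (up[i]);
      return sum;
    }
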
+define(`up', `%rdi')
+define(`n', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+ FUNC_ENTRY(2)
+
+ mov R32(n), R32(%r8)
+ and $7, R32(%r8)
+
+ .byte 0xf3,0x48,0x0f,0xb8,0x07 C popcnt (up), %rax
+ xor R32(%rcx), R32(%rcx)
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%r8,4), %r8
+ add %r9, %r8
+ jmp *%r8
+',`
+ jmp *(%r9,%r8,8)
+')
+
+L(3): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 8(up), %r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 16(up), %r11
+ add $24, up
+ sub $8, n
+ jg L(e34)
+ add %r10, %rax
+ add %r11, %rax
+L(s1): FUNC_EXIT()
+ ret
+
+L(1): sub $8, n
+ jle L(s1)
+ .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 C popcnt 8(up), %r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 C popcnt 16(up), %r9
+ add $8, up
+ jmp L(e12)
+
+L(7): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 0x8(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 0x10(%rdi),%r11
+ add $-8, up
+ jmp L(e07)
+
+L(0): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx
+ .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11
+ jmp L(e07)
+
+L(4): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx
+ .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11
+ add $32, up
+ sub $8, n
+ jle L(x4)
+
+ ALIGN(16)
+L(top):
+L(e34): .byte 0xf3,0x4c,0x0f,0xb8,0x07 C popcnt (%rdi),%r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%r9
+ add %r10, %rcx
+ add %r11, %rax
+L(e12): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11
+ add %r8, %rcx
+ add %r9, %rax
+L(e07): .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x20 C popcnt 0x20(%rdi),%r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x28 C popcnt 0x28(%rdi),%r9
+ add %r10, %rcx
+ add %r11, %rax
+L(e56): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x30 C popcnt 0x30(%rdi),%r10
+ .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x38 C popcnt 0x38(%rdi),%r11
+ add $64, up
+ add %r8, %rcx
+ add %r9, %rax
+ sub $8, n
+ jg L(top)
+
+L(x4): add %r10, %rcx
+ add %r11, %rax
+L(x2): add %rcx, %rax
+
+ FUNC_EXIT()
+ ret
+
+L(2): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx
+ sub $8, n
+ jle L(x2)
+ .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 C popcnt 0x10(%rdi),%r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 C popcnt 0x18(%rdi),%r9
+ add $16, up
+ jmp L(e12)
+
+L(5): .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 C popcnt 0x8(%rdi),%r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 C popcnt 0x10(%rdi),%r9
+ add $-24, up
+ jmp L(e56)
+
+L(6): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx
+ .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 C popcnt 0x10(%rdi),%r8
+ .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 C popcnt 0x18(%rdi),%r9
+ add $-16, up
+ jmp L(e56)
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
+ JMPENT( L(4), L(tab))
+ JMPENT( L(5), L(tab))
+ JMPENT( L(6), L(tab))
+ JMPENT( L(7), L(tab))
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm
new file mode 100644
index 0000000..fc71c1b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm
@@ -0,0 +1,549 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Nehalem and Westmere.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single out the basecases before the pushes.
+
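mpn_redc_1 performs Montgomery reduction with a one-limb inverse: each outer pass computes q0 = up[0] * u0inv mod B, adds q0 * m so the low limb cancels, stashes the high carry where the cancelled limb sat (the `mov %rdx, (up,n,8)' stores below), and finally folds the stashed carries into the top half with mpn_add_n. A minimal C sketch, assuming 64-bit limbs and unsigned __int128 (illustrative, and like the real routine it clobbers up[]):

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* up[] holds 2n limbs; mp[] the n-limb modulus; u0inv = -1/mp[0] mod 2^64.
       Returns the carry; the caller subtracts mp[] when it is set. */
    mp_limb_t
    ref_redc_1 (mp_limb_t *rp, mp_limb_t *up, const mp_limb_t *mp,
                long n, mp_limb_t u0inv)
    {
      for (long i = 0; i < n; i++)
        {
          mp_limb_t q0 = up[i] * u0inv;   /* makes up[i] + q0 * mp[0] == 0 mod B */
          mp_limb_t cy = 0;
          for (long j = 0; j < n; j++)    /* up[i..i+n-1] += q0 * mp[] */
            {
              unsigned __int128 p = (unsigned __int128) q0 * mp[j] + up[i + j] + cy;
              up[i + j] = (mp_limb_t) p;
              cy = (mp_limb_t) (p >> 64);
            }
          up[i] = cy;                     /* stash the carry in the zeroed slot */
        }
      mp_limb_t cy = 0;                   /* rp[] = up[n..2n) + stashed carries */
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 s = (unsigned __int128) up[n + i] + up[i] + cy;
          rp[i] = (mp_limb_t) s;
          cy = (mp_limb_t) (s >> 64);
        }
      return cy;
    }
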
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea (up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 3(n), i
+ mov (mp,n,8), %rax
+ mov (up,n,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r9
+ mul q0
+ mov $0, R32(%r11)
+ mov 8(up,n,8), %rbx
+ add %rax, %rbx
+ mov 16(mp,n,8), %rax
+ adc %rdx, %r11
+ add %r9, %rbx
+ adc $0, %r11
+ mov 16(up,n,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov 24(mp,n,8), %rax
+ adc %rdx, %r9
+ mov %rbx, 8(up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov -16(mp,i,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbp
+ mov $0, R32(%r11)
+ mov -16(up,i,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -24(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov -8(up,i,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov (mp,i,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(up,i,8)
+L(e1): add %r11, %rbp
+ adc $0, %r9
+ mul q0
+ mov (up,i,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -8(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov 8(up,i,8), %rbp
+ mov %r10, (up,i,8)
+ add $4, i
+ jnc L(tp1)
+
+L(ed1): mul q0
+ add %rax, %rbp
+ adc $0, %rdx
+ add %r11, %rbp
+ adc $0, %rdx
+ mov %rbp, I(-8(up),-24(up,i,8))
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 5(n), i
+ mov (mp,n,8), %rax
+ mov (up,n,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r9
+ mul q0
+ mov 8(up,n,8), %rbx
+ mov $0, R32(%r11)
+ add %rax, %rbx
+ mov 16(mp,n,8), %rax
+ adc %rdx, %r11
+ add %r9, %rbx
+ adc $0, %r11
+ mov 16(up,n,8), %rbp
+ mov %rbx, 8(up,n,8)
+ imul u0inv, %rbx C next q limb
+C jmp L(tp3)
+
+ ALIGNx
+L(tp3): mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov -16(mp,i,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbp
+ mov $0, R32(%r11)
+ mov -16(up,i,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -24(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov -8(up,i,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov (mp,i,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(up,i,8)
+ add %r11, %rbp
+ adc $0, %r9
+ mul q0
+ mov (up,i,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -8(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov 8(up,i,8), %rbp
+ mov %r10, (up,i,8)
+ add $4, i
+ jnc L(tp3)
+
+L(ed3): mul q0
+ add %rax, %rbp
+ adc $0, %rdx
+ add %r11, %rbp
+ adc $0, %rdx
+ mov %rbp, I(-8(up),-24(up,i,8))
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea (up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+IFSTD(` sub $8, %rsp ')
+IFDOS(` sub $40, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0):
+L(otp0):lea 2(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov $0, R32(%r11)
+ mov (up,n,8), %r10
+ add %rax, %r10
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r11
+ mov 8(up,n,8), %rbx
+ mul q0
+ add %rax, %rbx
+ mov $0, R32(%r9)
+ mov 16(mp,n,8), %rax
+ adc %rdx, %r9
+ add %r11, %rbx
+ adc $0, %r9
+ mul q0
+ mov 16(up,n,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 24(mp,n,8), %rax
+ adc %rdx, %r11
+ mov %rbx, 8(up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e0)
+
+ ALIGNx
+L(tp0): mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov -16(mp,i,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbp
+ mov $0, R32(%r11)
+ mov -16(up,i,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -24(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov -8(up,i,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov (mp,i,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(up,i,8)
+ add %r11, %rbp
+ adc $0, %r9
+ mul q0
+ mov (up,i,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -8(up,i,8)
+L(e0): add %r9, %r10
+ adc $0, %r11
+ mov 8(up,i,8), %rbp
+ mov %r10, (up,i,8)
+ add $4, i
+ jnc L(tp0)
+
+L(ed0): mul q0
+ add %rax, %rbp
+ adc $0, %rdx
+ add %r11, %rbp
+ adc $0, %rdx
+ mov %rbp, I(-8(up),-24(up,i,8))
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp0)
+ jmp L(cj)
+
+L(b2): cmp $-2, R32(n)
+ jz L(n2)
+
+L(otp2):lea 4(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov (up,n,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,n,8), %rax
+ adc %rdx, %r11
+ mov 8(up,n,8), %rbx
+ mul q0
+ add %rax, %rbx
+ mov $0, R32(%r9)
+ mov 16(mp,n,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbx
+ mov $0, R32(%r11)
+ mov 16(up,n,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov 24(mp,n,8), %rax
+ adc %rdx, %r11
+ mov %rbx, 8(up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e2)
+
+ ALIGNx
+L(tp2): mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov -16(mp,i,8), %rax
+ adc %rdx, %r9
+ mul q0
+ add %r11, %rbp
+ mov $0, R32(%r11)
+ mov -16(up,i,8), %r10
+ adc $0, %r9
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -24(up,i,8)
+L(e2): add %r9, %r10
+ adc $0, %r11
+ mov -8(up,i,8), %rbp
+ mul q0
+ add %rax, %rbp
+ mov $0, R32(%r9)
+ mov (mp,i,8), %rax
+ adc %rdx, %r9
+ mov %r10, -16(up,i,8)
+ add %r11, %rbp
+ adc $0, %r9
+ mul q0
+ mov (up,i,8), %r10
+ mov $0, R32(%r11)
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc %rdx, %r11
+ mov %rbp, -8(up,i,8)
+ add %r9, %r10
+ adc $0, %r11
+ mov 8(up,i,8), %rbp
+ mov %r10, (up,i,8)
+ add $4, i
+ jnc L(tp2)
+
+L(ed2): mul q0
+ add %rax, %rbp
+ adc $0, %rdx
+ add %r11, %rbp
+ adc $0, %rdx
+ mov %rbp, I(-8(up),-24(up,i,8))
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp2)
+ jmp L(cj)
+
+L(n1): mov (mp_param), %rax
+ mul q0
+ add -8(up), %rax
+ adc (up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n2): mov (mp_param), %rax
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov -8(up), %r10
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, q0
+ imul u0inv, q0 C next q0
+ mov -16(mp), %rax
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov (up), %r14
+ mul q0
+ add %rax, %r14
+ adc $0, %rdx
+ add %r9, %r14
+ adc $0, %rdx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc 8(up), %rdx
+ mov %r14, (rp)
+ mov %rdx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n3): mov -24(mp), %rax
+ mov -24(up), %r10
+ mul q0
+ add %rax, %r10
+ mov -16(mp), %rax
+ mov %rdx, %r11
+ adc $0, %r11
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ add %r11, %rbp
+ mov -8(up), %r10
+ adc $0, %r9
+ mul q0
+ mov %rbp, q0
+ imul u0inv, q0 C next q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ mov %rbp, -16(up)
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, -8(up)
+ mov %r11, -24(up) C up[0]
+ lea 8(up), up C up++
+ dec j
+ jnz L(n3)
+
+ mov -48(up), %rdx
+ mov -40(up), %rbx
+ xor R32(%rax), R32(%rax)
+ add %rbp, %rdx
+ adc %r10, %rbx
+ adc -8(up), %r11
+ mov %rdx, (rp)
+ mov %rbx, 8(rp)
+ mov %r11, 16(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
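
The included fastsse routine copies entry `which' out of a table of n-limb entries while touching every entry, so the memory access pattern is independent of the secret index. The idea in portable C (a minimal sketch assuming 64-bit limbs; ref_sec_tabselect is an illustrative name):

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* Constant-time select: rp[] = tab[which], scanning all nents entries. */
    void
    ref_sec_tabselect (mp_limb_t *rp, const mp_limb_t *tab,
                       long n, long nents, long which)
    {
      for (long i = 0; i < n; i++)
        rp[i] = 0;
      for (long k = 0; k < nents; k++)
        {
          mp_limb_t mask = -(mp_limb_t) (k == which); /* all-ones iff k == which */
          for (long i = 0; i < n; i++)
            rp[i] |= tab[k * n + i] & mask;
        }
    }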