Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/coreisbr')
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm        224
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm       54
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm       56
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm      173
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm       215
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm          203
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm       212
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm       174
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm       200
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm         37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm           37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h        241
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm           37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm          37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm           199
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm           167
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm    407
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm  384
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm          546
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm      193
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm           37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm    37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm    484
23 files changed, 4354 insertions(+), 0 deletions(-)
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm
new file mode 100644
index 0000000..21f0bf4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm
@@ -0,0 +1,224 @@
+dnl AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR 2.93 this
+C Intel IBR 2.66 this
+C Intel HWL 2.5 2.15
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C This code is the result of running a code generation and optimisation tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rbx')
+define(`v1', `%rbp')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+define(`X0', `%r12')
+define(`X1', `%r13')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_addmul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up), %rax
+
+ mov n_param, n
+ neg n
+
+ lea (up,n_param,8), up
+ lea 8(rp,n_param,8), rp
+ mul v0
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): mov -8(rp,n,8), X0
+ mov %rdx, w1
+ add %rax, X0
+ adc $0, w1
+ mov (up,n,8), %rax
+ xor w0, w0
+ xor w3, w3
+ test $2, R8(n)
+ jnz L(b10)
+
+L(b00): nop				C this nop makes the loop go faster on SBR!
+ mul v1
+ mov (rp,n,8), X1
+ jmp L(lo0)
+
+L(b10): lea -2(n), n
+ jmp L(lo2)
+
+L(bx1): mov -8(rp,n,8), X1
+ mov %rdx, w3
+ add %rax, X1
+ adc $0, w3
+ mov (up,n,8), %rax
+ xor w1, w1
+ xor w2, w2
+ test $2, R8(n)
+ jz L(b11)
+
+L(b01): mov (rp,n,8), X0
+ inc n
+ jmp L(lo1)
+
+L(b11): dec n
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top):
+L(lo1): mul v1
+ mov %rdx, w0 C 1
+ add %rax, X0 C 0
+ adc $0, w0 C 1
+ add w1, X1 C 3
+ adc $0, w3 C 0
+ add w2, X0 C 0
+ adc $0, w0 C 1
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, X0 C 0
+ mov %rdx, w1 C 1
+ adc $0, w1 C 1
+ mov (up,n,8), %rax
+ mul v1
+ mov X1, -16(rp,n,8) C 3
+ mov (rp,n,8), X1 C 1
+ add w3, X0 C 0
+ adc $0, w1 C 1
+L(lo0): mov %rdx, w2 C 2
+ mov X0, -8(rp,n,8) C 0
+ add %rax, X1 C 1
+ adc $0, w2 C 2
+ mov 8(up,n,8), %rax
+ add w0, X1 C 1
+ adc $0, w2 C 2
+ mul v0
+ add %rax, X1 C 1
+ mov %rdx, w3 C 2
+ adc $0, w3 C 2
+ mov 8(up,n,8), %rax
+L(lo3): mul v1
+ add w1, X1 C 1
+ mov 8(rp,n,8), X0 C 2
+ adc $0, w3 C 2
+ mov %rdx, w0 C 3
+ add %rax, X0 C 2
+ adc $0, w0 C 3
+ mov 16(up,n,8), %rax
+ mul v0
+ add w2, X0 C 2
+ mov X1, (rp,n,8) C 1
+ mov %rdx, w1 C 3
+ adc $0, w0 C 3
+ add %rax, X0 C 2
+ adc $0, w1 C 3
+ mov 16(up,n,8), %rax
+ add w3, X0 C 2
+ adc $0, w1 C 3
+L(lo2): mul v1
+ mov 16(rp,n,8), X1 C 3
+ add %rax, X1 C 3
+ mov %rdx, w2 C 4
+ adc $0, w2 C 4
+ mov 24(up,n,8), %rax
+ mov X0, 8(rp,n,8) C 2
+ mul v0
+ add w0, X1 C 3
+ mov %rdx, w3 C 4
+ adc $0, w2 C 4
+ add %rax, X1 C 3
+ mov 24(up,n,8), %rax
+ mov 24(rp,n,8), X0 C 0 useless but harmless final read
+ adc $0, w3 C 4
+ add $4, n
+ jnc L(top)
+
+L(end): mul v1
+ add w1, X1
+ adc $0, w3
+ add w2, %rax
+ adc $0, %rdx
+ mov X1, I(-16(rp),-16(rp,n,8))
+ add w3, %rax
+ adc $0, %rdx
+ mov %rax, I(-8(rp),-8(rp,n,8))
+ mov %rdx, %rax
+
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
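[Editor's note] For reference, the entry point this file implements has simple C-level semantics: mpn_addmul_2 multiplies {up,n} by the two-limb value {vp,2}, accumulates into rp, and returns the top carry limb. A minimal sketch built from the public mpn_addmul_1, mirroring GMP's refmpn_addmul_N; the helper name ref_addmul_2 is ours, not GMP API, and n >= 1 with n+1 limbs writable at rp is assumed:

#include <gmp.h>

/* Sketch of mpn_addmul_2 semantics: {rp,n+1} plus the returned limb
   together receive {rp,n} + {up,n} * {vp,2}.  Two addmul_1 passes,
   offset by one limb; not the tuned pipelined code in this file. */
static mp_limb_t
ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
{
  rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);     /* rp[0..n-1] += up*vp[0] */
  return mpn_addmul_1 (rp + 1, up, n, vp[1]);  /* rp[1..n]   += up*vp[1] */
}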
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm
new file mode 100644
index 0000000..2319a80
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm
@@ -0,0 +1,54 @@
+dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_addlsh1_n)
+ define(func_nc, mpn_addlsh1_nc)')
+ifdef(`OPERATION_rsblsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsblsh1_n)
+ define(func_nc, mpn_rsblsh1_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+include_mpn(`x86_64/coreisbr/aorrlshC_n.asm')
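[Editor's note] As a reference for what this wrapper selects (the lsh2 wrapper that follows is analogous with shift count 2), the semantics can be written with two public mpn calls. A sketch with our own helper name, assuming rp does not alias vp and n >= 1:

#include <gmp.h>

/* Sketch of mpn_addlsh1_n: rp[] = up[] + (vp[] << 1), returning the
   carry, which combines the shifted-out top bit of vp and the add
   carry (0..2).  The real code fuses both passes into one loop. */
static mp_limb_t
ref_addlsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mp_limb_t sh = mpn_lshift (rp, vp, n, 1);  /* rp = vp << 1; sh = bit out */
  return sh + mpn_add_n (rp, rp, up, n);     /* rp += up; combine carries */
}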
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm
new file mode 100644
index 0000000..3b7bb22
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm
@@ -0,0 +1,56 @@
+dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_addlsh2_n)
+ define(func_nc, mpn_addlsh2_nc)')
+ifdef(`OPERATION_rsblsh2_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsblsh2_n)
+ define(func_nc, mpn_rsblsh2_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C mpn_rsblsh2_nc removed below; its idea of carry-in is inconsistent with
+C refmpn_rsblsh2_nc
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n)
+include_mpn(`x86_64/coreisbr/aorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm
new file mode 100644
index 0000000..23ace41
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm
@@ -0,0 +1,173 @@
+dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
+dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
+
+dnl Copyright 2009-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 3.25
+C Intel NHM 4
+C Intel SBR 2 (or 1.95 when L(top)'s alignment = 16 (mod 32))
+C Intel atom ?
+C VIA nano ?
+
+C This code probably runs close to optimally on Sandy Bridge (using 4-way
+C unrolling). It also runs reasonably well on Core 2, but it runs poorly on
+C all other processors, including Nehalem.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbp
+ mov cy, %rax
+ neg %rax C set msb on carry
+ xor R32(%rbp), R32(%rbp) C limb carry
+ mov (vp), %r8
+ shrd $RSH, %r8, %rbp
+ mov R32(n), R32(%r9)
+ and $3, R32(%r9)
+ je L(b00)
+ cmp $2, R32(%r9)
+ jc L(b01)
+ je L(b10)
+ jmp L(b11)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbp
+ xor R32(%rbp), R32(%rbp) C limb carry
+ mov (vp), %r8
+ shrd $RSH, %r8, %rbp
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): mov 8(vp), %r9
+ shrd $RSH, %r9, %r8
+ mov 16(vp), %r10
+ shrd $RSH, %r10, %r9
+ add R32(%rax), R32(%rax) C init carry flag
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ mov %rbp, (rp)
+ mov %r8, 8(rp)
+ mov %r9, 16(rp)
+ mov %r10, %rbp
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry flag
+ sub $3, n
+ ja L(top)
+ jmp L(end)
+
+L(b01): add R32(%rax), R32(%rax) C init carry flag
+ ADCSBB (up), %rbp
+ mov %rbp, (rp)
+ mov %r8, %rbp
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry flag
+ sub $1, n
+ ja L(top)
+ jmp L(end)
+
+L(b10): mov 8(vp), %r9
+ shrd $RSH, %r9, %r8
+ add R32(%rax), R32(%rax) C init carry flag
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ mov %rbp, (rp)
+ mov %r8, 8(rp)
+ mov %r9, %rbp
+ lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry flag
+ sub $2, n
+ ja L(top)
+ jmp L(end)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+ shrd $RSH, %r8, %rbp
+L(b00): mov 8(vp), %r9
+ shrd $RSH, %r9, %r8
+ mov 16(vp), %r10
+ shrd $RSH, %r10, %r9
+ mov 24(vp), %r11
+ shrd $RSH, %r11, %r10
+ lea 32(vp), vp
+ add R32(%rax), R32(%rax) C restore carry flag
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ mov %rbp, (rp)
+ mov %r8, 8(rp)
+ mov %r9, 16(rp)
+ mov %r10, 24(rp)
+ mov %r11, %rbp
+ lea 32(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry flag
+ sub $4, n
+ jnz L(top)
+
+L(end): shr $RSH, %rbp
+ add R32(%rax), R32(%rax) C restore carry flag
+ ADCSBB $0, %rbp
+ mov %rbp, %rax
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm
new file mode 100644
index 0000000..db8ee68
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm
@@ -0,0 +1,215 @@
+dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
+dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
+dnl Optimised for Sandy Bridge.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 5.25
+C Intel P4 ?
+C Intel core2 3.1
+C Intel NHM 3.95
+C Intel SBR 2.75
+C Intel atom ?
+C VIA nano ?
+
+C The inner loop probably runs close to optimally on Sandy Bridge (using 4-way
+C unrolling). The rest of the code is quite crude, and could perhaps be made
+C both smaller and faster.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+define(`cy', `%r9') C for _nc variant
+
+ifdef(`OPERATION_addlsh_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(IFRSB, )
+ define(func_n, mpn_addlsh_n)
+ define(func_nc, mpn_addlsh_nc)')
+ifdef(`OPERATION_rsblsh_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(IFRSB, `$1')
+ define(func_n, mpn_rsblsh_n)
+ define(func_nc, mpn_rsblsh_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C mpn_rsblsh_nc removed below; its idea of carry-in is inconsistent with
+C refmpn_rsblsh_nc
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
+ push %rbx
+ xor R32(%rbx), R32(%rbx) C clear CF save register
+L(ent): push %rbp
+ mov R32(n), R32(%rbp)
+ mov n, %rax
+ mov R32(cnt), R32(%rcx)
+ neg R32(%rcx)
+ and $3, R32(%rbp)
+ jz L(b0)
+ lea -32(vp,%rbp,8), vp
+ lea -32(up,%rbp,8), up
+ lea -32(rp,%rbp,8), rp
+ cmp $2, R32(%rbp)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): xor %r8, %r8
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ shrd R8(%rcx), %r9, %r8
+ shrd R8(%rcx), %r10, %r9
+ mov 24(vp), %r11
+ shrd R8(%rcx), %r11, %r10
+ sub $3, %rax
+ jz L(3)
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ jmp L(lo3)
+L(3): add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ jmp L(wd3)
+
+L(b0): mov (vp), %r8
+ mov 8(vp), %r9
+ xor R32(%rbp), R32(%rbp)
+ jmp L(lo0)
+
+L(b1): xor %r10, %r10
+ mov 24(vp), %r11
+ shrd R8(%rcx), %r11, %r10
+ sub $1, %rax
+ jz L(1)
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ mov (vp), %r8
+ jmp L(lo1)
+L(1): add R32(%rbx), R32(%rbx)
+ ADCSBB 24(up), %r10
+ jmp L(wd1)
+
+L(b2): xor %r9, %r9
+ mov 16(vp), %r10
+ shrd R8(%rcx), %r10, %r9
+ mov 24(vp), %r11
+ shrd R8(%rcx), %r11, %r10
+ sub $2, %rax
+ jz L(2)
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ lea 32(up), up
+ jmp L(lo2)
+L(2): add R32(%rbx), R32(%rbx)
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ jmp L(wd2)
+
+ ALIGN(32) C 16-byte alignment is not enough!
+L(top): shrd R8(%rcx), %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ lea 32(vp), vp
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ mov %rbp, (rp)
+ lea 32(up), up
+L(lo3): mov %r8, 8(rp)
+L(lo2): mov %r9, 16(rp)
+ mov (vp), %r8
+L(lo1): mov %r10, 24(rp)
+ mov 8(vp), %r9
+ mov %r11, %rbp
+ lea 32(rp), rp
+ sbb R32(%rbx), R32(%rbx)
+L(lo0): shrd R8(%rcx), %r8, %rbp
+ mov 16(vp), %r10
+ shrd R8(%rcx), %r9, %r8
+ shrd R8(%rcx), %r10, %r9
+ mov 24(vp), %r11
+ sub $4, %rax
+ jg L(top)
+
+ shrd R8(%rcx), %r11, %r10
+ add R32(%rbx), R32(%rbx)
+ ADCSBB (up), %rbp
+ ADCSBB 8(up), %r8
+ ADCSBB 16(up), %r9
+ ADCSBB 24(up), %r10
+ mov %rbp, (rp)
+L(wd3): mov %r8, 8(rp)
+L(wd2): mov %r9, 16(rp)
+L(wd1): mov %r10, 24(rp)
+ adc R32(%rax), R32(%rax) C rax is zero after loop
+ shr R8(%rcx), %r11
+ ADDSUB %r11, %rax
+IFRSB( neg %rax)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ') C cnt
+IFDOS(` mov 64(%rsp), %r9 ') C cy
+ push %rbx
+ neg cy
+ sbb R32(%rbx), R32(%rbx) C initialise CF save register
+ jmp L(ent)
+EPILOGUE()
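[Editor's note] The loop above forms limb i of vp << cnt as (vp[i] << cnt) | (vp[i-1] >> (64 - cnt)); the shrd instructions compute exactly that, using the negated count held in %cl. A per-limb C sketch of the addlsh_n semantics under that formulation (helper name ours; 0 < cnt < GMP_NUMB_BITS assumed so neither shift is undefined):

#include <gmp.h>

/* Sketch of mpn_addlsh_n: rp[] = up[] + (vp[] << cnt), returning the
   carry limb (shifted-out high bits of vp plus the add carry).  prev
   carries vp[i-1] across iterations, as the rotating registers do in
   the unrolled loop. */
static mp_limb_t
ref_addlsh_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n,
              unsigned int cnt)
{
  mp_limb_t cy = 0, prev = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t v = (vp[i] << cnt) | (prev >> (GMP_NUMB_BITS - cnt));
      mp_limb_t s = up[i] + v;
      mp_limb_t c1 = s < v;          /* carry from up[i] + v */
      s += cy;
      cy = c1 + (s < cy);            /* fold in the incoming carry */
      rp[i] = s;
      prev = vp[i];
    }
  return cy + (prev >> (GMP_NUMB_BITS - cnt));
}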
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm
new file mode 100644
index 0000000..61fee3e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm
@@ -0,0 +1,203 @@
+dnl AMD64 mpn_add_n, mpn_sub_n optimised for Sandy Bridge, Ivy Bridge, and
+dnl Haswell.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1.75\2.52
+C AMD K10 1.5
+C AMD bd1 1.69\2.25
+C AMD bd2 1.65
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen 1.5
+C AMD bt1 2.67
+C AMD bt2 2.16
+C Intel P4 11.54
+C Intel PNR 5
+C Intel NHM 5.5
+C Intel SBR 1.54
+C Intel IBR 1.5
+C Intel HWL 1.32
+C Intel BWL 1.07
+C Intel SKL 1.21
+C Intel atom 4.3
+C Intel SLM 3
+C VIA nano ?
+
+C The loop of this code was manually written. It runs close to optimally on
+C Intel SBR, IBR, and HWL as far as we know, except for the fluctuation problems.
+C It also runs slightly faster on average on AMD bd1 and bd2.
+C
+C No micro-optimisation has been done.
+C
+C N.B.! The loop alignment padding insns are executed. If editing the code,
+C make sure the padding does not become excessive. It is now a 4-byte nop.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ xor %r8, %r8
+
+L(ent): mov R32(n), R32(%rax)
+ shr $2, n
+
+ test $1, R8(%rax)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(%rax)
+ jnz L(b10)
+
+L(b00): neg %r8
+ mov (up), %r8
+ mov 8(up), %r9
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+ mov 16(up), %r10
+ mov 24(up), %r11
+ lea 32(up), up
+ ADCSBB 16(vp), %r10
+ ADCSBB 24(vp), %r11
+ lea 32(vp), vp
+ lea -16(rp), rp
+ jmp L(lo0)
+
+L(b10): neg %r8
+ mov (up), %r10
+ mov 8(up), %r11
+ ADCSBB 0(vp), %r10
+ ADCSBB 8(vp), %r11
+ jrcxz L(e2)
+ mov 16(up), %r8
+ mov 24(up), %r9
+ lea 16(up), up
+ ADCSBB 16(vp), %r8
+ ADCSBB 24(vp), %r9
+ lea 16(vp), vp
+C lea (rp), rp
+ jmp L(lo2)
+
+L(e2): mov %r10, (rp)
+ mov %r11, 8(rp)
+ setc R8(%rax)
+ FUNC_EXIT()
+ ret
+
+L(bx1): test $2, R8(%rax)
+ jnz L(b11)
+
+L(b01): neg %r8
+ mov (up), %r11
+ ADCSBB (vp), %r11
+ jrcxz L(e1)
+ mov 8(up), %r8
+ mov 16(up), %r9
+ lea 8(up), up
+ lea -8(rp), rp
+ ADCSBB 8(vp), %r8
+ ADCSBB 16(vp), %r9
+ lea 8(vp), vp
+ jmp L(lo1)
+
+L(e1): mov %r11, (rp)
+ setc R8(%rax)
+ FUNC_EXIT()
+ ret
+
+L(b11): neg %r8
+ mov (up), %r9
+ ADCSBB (vp), %r9
+ mov 8(up), %r10
+ mov 16(up), %r11
+ lea 24(up), up
+ ADCSBB 8(vp), %r10
+ ADCSBB 16(vp), %r11
+ lea 24(vp), vp
+ mov %r9, (rp)
+ lea 8(rp), rp
+ jrcxz L(end)
+
+ ALIGN(32)
+L(top): mov (up), %r8
+ mov 8(up), %r9
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+L(lo2): mov %r10, (rp)
+L(lo1): mov %r11, 8(rp)
+ mov 16(up), %r10
+ mov 24(up), %r11
+ lea 32(up), up
+ ADCSBB 16(vp), %r10
+ ADCSBB 24(vp), %r11
+ lea 32(vp), vp
+L(lo0): mov %r8, 16(rp)
+L(lo3): mov %r9, 24(rp)
+ lea 32(rp), rp
+ dec n
+ jnz L(top)
+
+L(end): mov R32(n), R32(%rax) C zero rax
+ mov %r10, (rp)
+ mov %r11, 8(rp)
+ setc R8(%rax)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ jmp L(ent)
+EPILOGUE()
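[Editor's note] The semantics being unrolled here are just a carry-chained limb-wise add. A C sketch (helper name ours) of what mpn_add_n computes, keeping the carry in a variable where the asm keeps it in CF across the 4-way iterations:

#include <gmp.h>

/* Sketch of mpn_add_n: rp[] = up[] + vp[], returning the carry-out.
   For mpn_sub_n, flip the add to a subtract and the carry to a borrow. */
static mp_limb_t
ref_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mp_limb_t cy = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t s = up[i] + vp[i];
      mp_limb_t c1 = s < up[i];   /* carry from the limb add */
      s += cy;
      cy = c1 + (s < cy);         /* at most one of the two fires */
      rp[i] = s;
    }
  return cy;
}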
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm
new file mode 100644
index 0000000..b4c1572
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm
@@ -0,0 +1,212 @@
+dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Sandy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.27
+C AMD K10 4.27 4.54
+C AMD bull 4.76
+C AMD pile 4.55
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.30
+C AMD jaguar 5.28
+C Intel P4 16.2 17.1
+C Intel core2 5.26
+C Intel NHM 5.09
+C Intel SBR 3.21
+C Intel IBR 2.96
+C Intel HWL 2.81
+C Intel BWL 2.76
+C Intel SKL 2.76
+C Intel atom 21.5
+C Intel SLM 9.5
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjörn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%rbx')
+
+define(`I',`$1')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+IFDOS(` define(`up', ``%rsi'')') dnl
+IFDOS(` define(`rp', ``%rcx'')') dnl
+IFDOS(` define(`v0', ``%r9'')') dnl
+IFDOS(` define(`r9', ``rdi'')') dnl
+IFDOS(` define(`n_param',``%r8'')') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax
+ push %rbx
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+
+ test $1, R8(n_param)
+ jnz L(b13)
+
+L(b02): xor R32(%r11), R32(%r11)
+ test $2, R8(n_param)
+ jnz L(b2)
+
+L(b0): mov $1, R32(n)
+ sub n_param, n
+ mul v0
+ mov %rdx, %r9
+ mov -8(rp,n,8), %r8
+ jmp L(e0)
+
+ ALIGN(16)
+L(b2): mov $-1, n
+ sub n_param, n
+ mul v0
+ mov 8(rp,n,8), %r8
+ mov %rdx, %r9
+ jmp L(e2)
+
+ ALIGN(16)
+L(b13): xor R32(%r9), R32(%r9)
+ test $2, R8(n_param)
+ jnz L(b3)
+
+L(b1): mov $2, R32(n)
+ sub n_param, n
+ jns L(1)
+ mul v0
+ mov -16(rp,n,8), %r10
+ mov %rdx, %r11
+ jmp L(e1)
+
+ ALIGN(16)
+L(b3): xor R32(n), R32(n)
+ sub n_param, n
+ mul v0
+ mov (rp,n,8), %r10
+ jmp L(e3)
+
+ ALIGN(32)
+L(top): mul v0
+ mov -16(rp,n,8), %r10
+ ADDSUB %r11, %r8
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %r8, -24(rp,n,8)
+L(e1): ADDSUB %rax, %r10
+ mov -8(up,n,8), %rax
+ adc $0, %r11
+ mul v0
+ ADDSUB %r9, %r10
+ mov %rdx, %r9
+ mov -8(rp,n,8), %r8
+ adc $0, %r11
+ mov %r10, -16(rp,n,8)
+L(e0): ADDSUB %rax, %r8
+ adc $0, %r9
+ mov (up,n,8), %rax
+ mul v0
+ mov (rp,n,8), %r10
+ ADDSUB %r11, %r8
+ mov %r8, -8(rp,n,8)
+ adc $0, %r9
+L(e3): mov %rdx, %r11
+ ADDSUB %rax, %r10
+ mov 8(up,n,8), %rax
+ adc $0, %r11
+ mul v0
+ mov 8(rp,n,8), %r8
+ ADDSUB %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (rp,n,8)
+ adc $0, %r11
+L(e2): ADDSUB %rax, %r8
+ adc $0, %r9
+ mov 16(up,n,8), %rax
+ add $4, n
+ jnc L(top)
+
+L(end): mul v0
+ mov I(-8(rp),-16(rp,n,8)), %r10
+ ADDSUB %r11, %r8
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %r8, I(-16(rp),-24(rp,n,8))
+ ADDSUB %rax, %r10
+ adc $0, %r11
+ ADDSUB %r9, %r10
+ adc $0, %r11
+ mov %r10, I(-8(rp),-16(rp,n,8))
+ mov %r11, %rax
+
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+
+ ALIGN(16)
+L(1): mul v0
+ ADDSUB %rax, -8(rp)
+ mov %rdx, %rax
+ adc $0, %rax
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+EPILOGUE()
+ASM_END()
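[Editor's note] Semantics sketch for the two entry points: {rp,n} gets {rp,n} plus or minus {up,n} * v0, and the high limb (carry or borrow) is returned. The helper name is ours; unsigned __int128 (GCC/Clang, 64-bit limbs, no nails) is assumed for the double-limb product:

#include <gmp.h>

/* Sketch of mpn_addmul_1: {rp,n} += {up,n} * v0, returns the high limb.
   mpn_submul_1 is the same with the accumulation subtracted. */
static mp_limb_t
ref_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
  mp_limb_t cy = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (mp_limb_t) t;       /* low limb */
      cy = (mp_limb_t) (t >> 64);  /* high limb propagates as carry */
    }
  return cy;
}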
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm
new file mode 100644
index 0000000..43abcc8
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm
@@ -0,0 +1,174 @@
+dnl AMD64 mpn_cnd_add_n.
+
+dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR 3.0
+C Intel NHM 3.75
+C Intel SBR 1.93
+C Intel IBR 1.89
+C Intel HWL 1.78
+C Intel BWL 1.50
+C Intel SKL 1.50
+C Intel atom
+C Intel SLM 4.0
+C VIA nano
+
+C NOTES
+C * It might seem natural to use the cmov insn here, but since this function
+C is supposed to have the exact same execution pattern for cnd true and
+C false, and since cmov's documentation is not clear about whether it
+C actually reads both source operands and writes the register for a false
+C condition, we cannot use it.
+
+C INPUT PARAMETERS
+define(`cnd_arg', `%rdi') dnl rcx
+define(`rp', `%rsi') dnl rdx
+define(`up', `%rdx') dnl r8
+define(`vp', `%rcx') dnl r9
+define(`n', `%r8') dnl rsp+40
+
+define(`cnd', `%rbx')
+
+define(ADDSUB, add)
+define(ADCSBB, adc)
+define(func, mpn_cnd_add_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_cnd_add_n)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), R32(%r8)')
+ push %rbx
+
+ neg cnd_arg
+ sbb cnd, cnd C make cnd mask
+
+ test $1, R8(n)
+ jz L(x0)
+L(x1): test $2, R8(n)
+ jz L(b1)
+
+L(b3): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ and cnd, %rdi
+ and cnd, %r9
+ and cnd, %r10
+ ADDSUB (up), %rdi
+ mov %rdi, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ sub $3, n
+ jnz L(top)
+ jmp L(end)
+
+L(x0): xor R32(%rax), R32(%rax)
+ test $2, R8(n)
+ jz L(top)
+
+L(b2): mov (vp), %rdi
+ mov 8(vp), %r9
+ and cnd, %rdi
+ and cnd, %r9
+ ADDSUB (up), %rdi
+ mov %rdi, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ sub $2, n
+ jnz L(top)
+ jmp L(end)
+
+L(b1): mov (vp), %rdi
+ and cnd, %rdi
+ ADDSUB (up), %rdi
+ mov %rdi, (rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ dec n
+ jz L(end)
+
+ ALIGN(16)
+L(top): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ mov 24(vp), %r11
+ lea 32(vp), vp
+ and cnd, %rdi
+ and cnd, %r9
+ and cnd, %r10
+ and cnd, %r11
+ add R32(%rax), R32(%rax) C restore carry
+ ADCSBB (up), %rdi
+ mov %rdi, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ ADCSBB 24(up), %r11
+ lea 32(up), up
+ mov %r11, 24(rp)
+ lea 32(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry
+ sub $4, n
+ jnz L(top)
+
+L(end): neg R32(%rax)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
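[Editor's note] The NOTES above motivate the branch-free conditioning; in C terms the trick is to turn cnd into an all-zeros/all-ones mask, as the neg/sbb pair does above, and AND it into every vp limb so the instruction stream is identical for both condition values. A sketch with our own helper name:

#include <gmp.h>

/* Sketch of mpn_cnd_add_n: rp[] = up[] + (cnd ? vp[] : 0), with the
   same execution pattern either way.  mask is 0 or ~0 with no branch
   on cnd (the asm forms it with neg + sbb). */
static mp_limb_t
ref_cnd_add_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp,
               mp_size_t n)
{
  mp_limb_t mask = -(mp_limb_t) (cnd != 0);
  mp_limb_t cy = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t v = vp[i] & mask;  /* vp[i] or 0, no data-dependent branch */
      mp_limb_t s = up[i] + v;
      mp_limb_t c1 = s < v;
      s += cy;
      cy = c1 + (s < cy);
      rp[i] = s;
    }
  return cy;
}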
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm
new file mode 100644
index 0000000..f55492b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm
@@ -0,0 +1,200 @@
+dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR 3.0
+C Intel NHM 2.75
+C Intel SBR 2.15
+C Intel IBR 1.96
+C Intel HWL 2.0
+C Intel BWL 1.65
+C Intel SKL 1.65
+C Intel atom
+C Intel SLM 4.5
+C VIA nano
+
+C NOTES
+C * It might seem natural to use the cmov insn here, but since this function
+C is supposed to have the exact same execution pattern for cnd true and
+C false, and since cmov's documentation is not clear about whether it
+C actually reads both source operands and writes the register for a false
+C condition, we cannot use it.
+C * Given that we have a dedicated cnd_add_n, it might look strange that this
+C file provides cnd_add_n and not just cnd_sub_n. But that's harmless, and
+C this file's generality might come in handy for some pipeline.
+
+C INPUT PARAMETERS
+define(`cnd_arg', `%rdi') dnl rcx
+define(`rp', `%rsi') dnl rdx
+define(`up', `%rdx') dnl r8
+define(`vp', `%rcx') dnl r9
+define(`n', `%r8') dnl rsp+40
+
+define(`cnd', `%rbx')
+
+ifdef(`OPERATION_cnd_add_n',`
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n',`
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), R32(%r8)')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ neg cnd_arg
+ sbb cnd, cnd C make cnd mask
+
+ test $1, R8(n)
+ jz L(x0)
+L(x1): test $2, R8(n)
+ jz L(b1)
+
+L(b3): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ and cnd, %rdi
+ mov (up), %r12
+ and cnd, %r9
+ mov 8(up), %r13
+ and cnd, %r10
+ mov 16(up), %rbp
+ ADDSUB %rdi, %r12
+ mov %r12, (rp)
+ ADCSBB %r9, %r13
+ mov %r13, 8(rp)
+ ADCSBB %r10, %rbp
+ mov %rbp, 16(rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ sub $3, n
+ jnz L(top)
+ jmp L(end)
+
+L(x0): xor R32(%rax), R32(%rax)
+ test $2, R8(n)
+ jz L(top)
+
+L(b2): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov (up), %r12
+ and cnd, %rdi
+ mov 8(up), %r13
+ and cnd, %r9
+ ADDSUB %rdi, %r12
+ mov %r12, (rp)
+ ADCSBB %r9, %r13
+ mov %r13, 8(rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ sub $2, n
+ jnz L(top)
+ jmp L(end)
+
+L(b1): mov (vp), %rdi
+ mov (up), %r12
+ and cnd, %rdi
+ ADDSUB %rdi, %r12
+ mov %r12, (rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ dec n
+ jz L(end)
+
+ ALIGN(16)
+L(top): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ mov 24(vp), %r11
+ lea 32(vp), vp
+ and cnd, %rdi
+ mov (up), %r12
+ and cnd, %r9
+ mov 8(up), %r13
+ and cnd, %r10
+ mov 16(up), %rbp
+ and cnd, %r11
+ add R32(%rax), R32(%rax) C restore carry
+ mov 24(up), %rax
+ lea 32(up), up
+ ADCSBB %rdi, %r12
+ mov %r12, (rp)
+ ADCSBB %r9, %r13
+ mov %r13, 8(rp)
+ ADCSBB %r10, %rbp
+ mov %rbp, 16(rp)
+ ADCSBB %r11, %rax
+ mov %rax, 24(rp)
+ lea 32(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry
+ sub $4, n
+ jnz L(top)
+
+L(end): neg R32(%rax)
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm
new file mode 100644
index 0000000..d9f371f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_divrem_1
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_divrem_1 mpn_preinv_divrem_1)
+include_mpn(`x86_64/divrem_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_11.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h
new file mode 100644
index 0000000..36f4512
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h
@@ -0,0 +1,241 @@
+/* Sandy Bridge gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3400-3800 MHz Intel Xeon E3-1270 Sandy Bridge */
+/* FFT tuning limit = 468,152,320 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 30
+
+#define DIV_1_VS_MUL_1_PERCENT 298
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 65
+#define MUL_TOOM44_THRESHOLD 154
+#define MUL_TOOM6H_THRESHOLD 254
+#define MUL_TOOM8H_THRESHOLD 333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 148
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 93
+#define SQR_TOOM4_THRESHOLD 248
+#define SQR_TOOM6_THRESHOLD 342
+#define SQR_TOOM8_THRESHOLD 462
+
+#define MULMID_TOOM42_THRESHOLD 36
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 15
+
+#define MUL_FFT_MODF_THRESHOLD 396 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 396, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 79,11}, { 47,10}, \
+ { 95,12}, { 31,11}, { 63,10}, { 135,11}, \
+ { 79,10}, { 159, 9}, { 319,10}, { 167,11}, \
+ { 95, 7}, { 1535, 8}, { 831,10}, { 223, 9}, \
+ { 447,11}, { 127,10}, { 255, 9}, { 511,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 303,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 351,10}, { 703,11}, { 367,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,12}, { 223,11}, { 447,10}, { 895,11}, \
+ { 479,13}, { 127,12}, { 255,11}, { 543,12}, \
+ { 287,11}, { 607,12}, { 319,11}, { 639,12}, \
+ { 351,11}, { 703,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,11}, { 895,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 543,11}, \
+ { 1087,12}, { 607,13}, { 319,12}, { 735,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,13}, { 639,12}, { 1279,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \
+ { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
+ { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \
+ { 767,14}, { 1535,13}, { 3071,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1919,16}, { 511,15}, \
+ { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \
+ { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \
+ { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \
+ { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \
+ { 4095,15}, { 8191,16}, { 4607,15}, { 9983,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 219
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 336 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 336, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 255, 6}, \
+ { 4351, 7}, { 2303, 8}, { 1215,12}, { 95,11}, \
+ { 191,10}, { 383,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,10}, { 607,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,10}, { 703,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,10}, { 831,12}, \
+ { 223,11}, { 447,10}, { 895,11}, { 479,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,11}, { 895,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 607,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 831,13}, \
+ { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \
+ { 1279,13}, { 703,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 831,12}, { 1663,13}, { 959,14}, \
+ { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \
+ { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \
+ { 2815,13}, { 1471,14}, { 767,13}, { 1599,12}, \
+ { 3199,13}, { 1663,14}, { 895,13}, { 1791,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \
+ { 1407,13}, { 2815,15}, { 767,14}, { 1535,13}, \
+ { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \
+ { 1791,16}, { 511,15}, { 1023,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \
+ { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \
+ { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \
+ { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \
+ { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 210
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 62
+#define MULLO_MUL_N_THRESHOLD 8907
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 66
+#define SQRLO_SQR_THRESHOLD 6440
+
+#define DC_DIV_QR_THRESHOLD 52
+#define DC_DIVAPPR_Q_THRESHOLD 172
+#define DC_BDIV_QR_THRESHOLD 46
+#define DC_BDIV_Q_THRESHOLD 92
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 170
+#define INV_APPR_THRESHOLD 167
+
+#define BINV_NEWTON_THRESHOLD 228
+#define REDC_1_TO_REDC_2_THRESHOLD 36
+#define REDC_2_TO_REDC_N_THRESHOLD 55
+
+#define MU_DIV_QR_THRESHOLD 1387
+#define MU_DIVAPPR_Q_THRESHOLD 1387
+#define MUPI_DIV_QR_THRESHOLD 77
+#define MU_BDIV_QR_THRESHOLD 1187
+#define MU_BDIV_Q_THRESHOLD 1442
+
+#define POWM_SEC_TABLE 1,16,191,452,1297
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 21
+#define SET_STR_DC_THRESHOLD 1160
+#define SET_STR_PRECOMPUTE_THRESHOLD 2043
+
+#define FAC_DSC_THRESHOLD 426
+#define FAC_ODD_THRESHOLD 24
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD2_DIV1_METHOD 5 /* 0.74% faster than 3 */
+#define HGCD_THRESHOLD 96
+#define HGCD_APPR_THRESHOLD 60
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 465
+#define GCDEXT_DC_THRESHOLD 345
+#define JACOBI_BASE_METHOD 1 /* 32.22% faster than 4 */
+
+/* Tuneup completed successfully, took 276198 seconds */
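[Editor's note] These constants are crossover points: GMP's internal dispatch compares the operand size in limbs against them to pick an algorithm. A standalone illustration of the pattern (the names printed here label GMP internals for illustration; the real dispatch lives in mpn/generic/mul.c and friends):

#include <stdio.h>

/* Which multiplication range an n-limb operand falls into, using the
   Sandy Bridge values tuned above (20, 65, 154 limbs). */
static const char *
mul_range (long n)
{
  if (n < 20)  return "basecase";        /* < MUL_TOOM22_THRESHOLD */
  if (n < 65)  return "toom22";          /* < MUL_TOOM33_THRESHOLD */
  if (n < 154) return "toom33";          /* < MUL_TOOM44_THRESHOLD */
  return "toom44 .. toom8h .. FFT";      /* higher thresholds apply */
}

int
main (void)
{
  long sizes[] = { 10, 40, 100, 5000 };
  for (int i = 0; i < 4; i++)
    printf ("%4ld limbs -> %s\n", sizes[i], mul_range (sizes[i]));
  return 0;
}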
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm
new file mode 100644
index 0000000..a1cbc31
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshift optimised for Intel Sandy Bridge.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm
new file mode 100644
index 0000000..ac90edb
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshiftc optimised for Intel Sandy Bridge.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm
new file mode 100644
index 0000000..a43a117
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm
@@ -0,0 +1,199 @@
+dnl X86-64 mpn_mul_1 optimised for Intel Sandy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013, 2017 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD excavator
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2
+C Intel NHM
+C Intel SBR 2.49
+C Intel IBR 2.32
+C Intel HWL 2.44
+C Intel BWL 2.43
+C Intel SKL 2.47
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up_param',`%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+define(`cin', `%r8') C stack
+
+define(`up', `%rsi') C same as up_param
+define(`n', `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFDOS(` define(`rp', `%rcx')')
+IFDOS(` define(`up_param',`%rdx')')
+IFDOS(` define(`n_param', `%r8')')
+IFDOS(` define(`v0', `%r9')')
+IFDOS(` define(`cin', `48(%rsp)')')
+
+IFDOS(` define(`up', `%rsi')')
+IFDOS(` define(`n', `%r8')')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+IFDOS(` push %rsi ')
+ mov (up_param), %rax
+IFSTD(` mov n_param, n ')
+ lea (up_param,n_param,8), up
+ lea -8(rp,n_param,8), rp
+ neg n
+ mul v0
+
+ test $1, R8(n)
+ jz L(x0)
+L(x1): mov %rax, %r11
+ mov %rdx, %r10
+ test $2, R8(n)
+ jnz L(01)
+
+L(11): mov 8(up,n,8), %rax
+ dec n
+ jmp L(L3)
+
+L(01): inc n
+ jnz L(L1)
+ mov %rax, (rp)
+ mov %rdx, %rax
+IFDOS(` pop %rsi ')
+ ret
+
+L(x0): mov %rax, %r10
+ mov %rdx, %r11
+ mov 8(up,n,8), %rax
+ test $2, R8(n)
+ jz L(L0)
+
+L(10): add $-2, n
+ jmp L(L2)
+
+ ALIGN(8)
+L(top): mov %rdx, %r10
+ add %rax, %r11
+L(L1): mov 0(up,n,8), %rax
+ adc $0, %r10
+ mul v0
+ add %rax, %r10
+ mov %r11, 0(rp,n,8)
+ mov 8(up,n,8), %rax
+ mov %rdx, %r11
+L(L0c): adc $0, %r11
+L(L0): mul v0
+ mov %r10, 8(rp,n,8)
+ add %rax, %r11
+ mov %rdx, %r10
+L(L3c): mov 16(up,n,8), %rax
+ adc $0, %r10
+L(L3): mul v0
+ mov %r11, 16(rp,n,8)
+ mov %rdx, %r11
+ add %rax, %r10
+L(L2c): mov 24(up,n,8), %rax
+ adc $0, %r11
+L(L2): mul v0
+ mov %r10, 24(rp,n,8)
+ add $4, n
+ jnc L(top)
+
+L(end): add %rax, %r11
+ mov %rdx, %rax
+ adc $0, %rax
+ mov %r11, (rp)
+
+IFDOS(` pop %rsi ')
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(` push %rsi ')
+ mov (up_param), %rax
+IFSTD(` mov n_param, n ')
+ lea (up_param,n_param,8), up
+ lea -8(rp,n_param,8), rp
+ neg n
+ mul v0
+
+ test $1, R8(n)
+ jz L(x0c)
+L(x1c): mov %rax, %r11
+ mov %rdx, %r10
+ test $2, R8(n)
+ jnz L(01c)
+
+L(11c): add cin, %r11
+ dec n
+ jmp L(L3c)
+
+L(01c): add cin, %r11
+ inc n
+ jnz L(L1)
+ mov %r11, (rp)
+ mov %rdx, %rax
+ adc $0, %rax
+IFDOS(` pop %rsi ')
+ ret
+
+L(x0c): mov %rax, %r10
+ mov %rdx, %r11
+ test $2, R8(n)
+ jz L(00c)
+
+L(10c): add $-2, n
+ add cin, %r10
+ jmp L(L2c)
+
+L(00c): add cin, %r10
+ mov 8(up,n,8), %rax
+ jmp L(L0c)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm
new file mode 100644
index 0000000..781534d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm
@@ -0,0 +1,167 @@
+dnl AMD64 mpn_mul_2 optimised for Intel Sandy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C AMD K8,K9 8.03
+C AMD K10 8.03
+C AMD bull 9.19
+C AMD pile 9.16
+C AMD steam
+C AMD excavator
+C AMD bobcat 10.6
+C AMD jaguar 11.0
+C Intel P4 26.0
+C Intel core2 8.73
+C Intel NHM 8.55
+C Intel SBR 5.15
+C Intel IBR 4.57
+C Intel HWL 4.08
+C Intel BWL 4.10
+C Intel SKL 4.14
+C Intel atom 39.5
+C Intel SLM 26.3
+C VIA nano
+
+C This code is the result of running a code generation and optimisation tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rbx')
+define(`v1', `%rbp')
+
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
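+C A rough C-level sketch of the mpn_mul_2 semantics, for orientation only;
+C not part of the GMP source.  The name ref_mul_2 is hypothetical; it is
+C expressed with the public mpn_mul_1 and mpn_addmul_1 primitives:
+C
+C     mp_limb_t
+C     ref_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C     {
+C       rp[n] = mpn_mul_1 (rp, up, n, vp[0]);       /* u * v0 */
+C       return mpn_addmul_1 (rp + 1, up, n, vp[1]); /* u * v1, shifted up */
+C     }
+C
+C The loop below instead folds both limb products into a single pass over
+C u, which is what buys the cycles/limb figures tabulated above.
+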
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up), %rax
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+
+ test $1, R8(n_param)
+ jnz L(b1)
+
+L(b0): mov $0, R32(n)
+ sub n_param, n
+ xor w0, w0
+ mul v0
+ mov %rax, w2
+ mov %rdx, w1
+ mov (up,n,8), %rax
+ jmp L(lo0)
+
+L(b1): mov $1, R32(n)
+ sub n_param, n
+ xor w2, w2
+ mul v0
+ mov %rax, w0
+ mov %rdx, w3
+ mov -8(up,n,8), %rax
+ mul v1
+ jmp L(lo1)
+
+ ALIGN(32)
+L(top): mul v0
+ add %rax, w0 C 1
+ mov %rdx, w3 C 2
+ adc $0, w3 C 2
+ mov -8(up,n,8), %rax
+ mul v1
+ add w1, w0 C 1
+ adc $0, w3 C 2
+L(lo1): add %rax, w2 C 2
+ mov w0, -8(rp,n,8) C 1
+ mov %rdx, w0 C 3
+ adc $0, w0 C 3
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w2 C 2
+ mov %rdx, w1 C 3
+ adc $0, w1 C 3
+ add w3, w2 C 2
+ mov (up,n,8), %rax
+ adc $0, w1 C 1
+L(lo0): mul v1
+ mov w2, (rp,n,8) C 2
+ add %rax, w0 C 3
+ mov %rdx, w2 C 4
+ mov 8(up,n,8), %rax
+ adc $0, w2 C 4
+ add $2, n
+ jnc L(top)
+
+L(end): mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+ mov I(-8(up),-8(up,n,8)), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, I(-8(rp),-8(rp,n,8))
+ adc $0, %rdx
+ add w3, w2
+ mov w2, I((rp),(rp,n,8))
+ adc $0, %rdx
+ mov %rdx, %rax
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm
new file mode 100644
index 0000000..35fd1cc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm
@@ -0,0 +1,407 @@
+dnl AMD64 mpn_mul_basecase optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR 2.5 2.5 - 2.95
+C Intel IBR 2.4 2.3 - 2.68
+C Intel HWL 2.35 2.0 - 2.5
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Fix the addmul_2 fluctuation affecting SBR.
+C * Improve feed-in code, avoiding zeroing of many registers and dummy adds in
+C the loops at the expense of code size.
+C * Adjoin a mul_3, avoiding slow mul_1 for odd vn.
+C * Consider replacing the 2-way mul_2 code with 4-way code, for a very slight
+C speedup.
+C * Further micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+
+define(`un', `%rbx')
+
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`n', `%rbp')
+define(`v0', `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
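+C A rough C-level sketch of the mpn_mul_basecase semantics (un >= vn >= 1),
+C for orientation only; not part of the GMP source.  The schoolbook product
+C is one mul pass followed by vn-1 addmul passes; ref_mul_basecase is a
+C hypothetical name:
+C
+C     void
+C     ref_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
+C                       mp_srcptr vp, mp_size_t vn)
+C     {
+C       rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
+C       for (mp_size_t i = 1; i < vn; i++)
+C         rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
+C     }
+C
+C The code below processes v two limbs at a time instead (a mul_1 or mul_2
+C feed-in, then addmul_2 passes), halving the number of passes over u.
+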
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %rbx
+ push %rbp
+ mov un_param, un C free up rdx
+ neg un
+
+ mov (up), %rax C shared for mul_1 and mul_2
+ lea (up,un_param,8), up C point at operand end
+ lea (rp,un_param,8), rp C point at rp[un-1]
+
+ mov (vp), v0 C shared for mul_1 and mul_2
+ mul v0 C shared for mul_1 and mul_2
+
+ test $1, R8(vn)
+ jz L(do_mul_2)
+
+L(do_mul_1):
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ...
+ mov %rdx, w1
+ mov 8(up,un,8), %rax
+ test $2, R8(un)
+ jnz L(m110)
+
+L(m100):lea 2(un), n C un = 4, 8, 12, ...
+ jmp L(m1l0)
+
+L(m110):lea (un), n C un = 2, 6, 10, ...
+ jmp L(m1l2)
+
+L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ...
+ mov %rdx, w0
+ test $2, R8(un)
+ jz L(m111)
+
+L(m101):lea 3(un), n C un = 1, 5, 9, ...
+ test n, n
+ js L(m1l1)
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(m111):lea 1(un), n C un = 3, 7, 11, ...
+ mov 8(up,un,8), %rax
+ jmp L(m1l3)
+
+ ALIGN(16) C FIXME
+L(m1tp):mov %rdx, w0
+ add %rax, w1
+L(m1l1):mov -16(up,n,8), %rax
+ adc $0, w0
+ mul v0
+ add %rax, w0
+ mov w1, -24(rp,n,8)
+ mov -8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(m1l0):mul v0
+ mov w0, -16(rp,n,8)
+ add %rax, w1
+ mov %rdx, w0
+ mov (up,n,8), %rax
+ adc $0, w0
+L(m1l3):mul v0
+ mov w1, -8(rp,n,8)
+ mov %rdx, w1
+ add %rax, w0
+ mov 8(up,n,8), %rax
+ adc $0, w1
+L(m1l2):mul v0
+ mov w0, (rp,n,8)
+ add $4, n
+ jnc L(m1tp)
+
+L(m1ed):add %rax, w1
+ adc $0, %rdx
+ mov w1, I(-8(rp),-24(rp,n,8))
+ mov %rdx, I((rp),-16(rp,n,8))
+
+ dec R32(vn)
+ jz L(ret2)
+
+ lea 8(vp), vp
+ lea 8(rp), rp
+ push %r12
+ push %r13
+ push %r14
+ jmp L(do_addmul)
+
+L(do_mul_2):
+define(`v1', `%r14')
+ push %r12
+ push %r13
+ push %r14
+
+ mov 8(vp), v1
+
+ test $1, R8(un)
+ jnz L(m2b1)
+
+L(m2b0):lea (un), n
+ xor w0, w0
+ mov %rax, w2
+ mov %rdx, w1
+ jmp L(m2l0)
+
+L(m2b1):lea 1(un), n
+ xor w1, w1
+ xor w2, w2
+ mov %rax, w0
+ mov %rdx, w3
+ jmp L(m2l1)
+
+ ALIGN(32)
+L(m2tp):mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+L(m2l1):mov -8(up,n,8), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, -8(rp,n,8)
+ mov %rdx, w0
+ adc $0, w0
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w2
+ mov %rdx, w1
+ adc $0, w1
+ add w3, w2
+L(m2l0):mov (up,n,8), %rax
+ adc $0, w1
+ mul v1
+ mov w2, (rp,n,8)
+ add %rax, w0
+ mov %rdx, w2
+ mov 8(up,n,8), %rax
+ adc $0, w2
+ add $2, n
+ jnc L(m2tp)
+
+L(m2ed):mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+ mov I(-8(up),-8(up,n,8)), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, I(-8(rp),-8(rp,n,8))
+ adc $0, %rdx
+ add w3, w2
+ mov w2, I((rp),(rp,n,8))
+ adc $0, %rdx
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ add $-2, R32(vn)
+ jz L(ret5)
+ lea 16(vp), vp
+ lea 16(rp), rp
+
+
+L(do_addmul):
+ push %r15
+ push vn C save vn in new stack slot
+define(`vn', `(%rsp)')
+define(`X0', `%r14')
+define(`X1', `%r15')
+define(`v1', `%r8')
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+ mov (up,un,8), %rax
+ mul v0
+ test $1, R8(un)
+ jnz L(a1x1)
+
+L(a1x0):mov (rp,un,8), X0
+ xor w0, w0
+ mov %rdx, w1
+ test $2, R8(un)
+ jnz L(a110)
+
+L(a100):lea 2(un), n C un = 4, 8, 12, ...
+ add %rax, X0
+ adc $0, w1
+ mov (up,un,8), %rax
+ mul v1
+ mov 8(rp,un,8), X1
+ jmp L(lo0)
+
+L(a110):lea (un), n C un = 2, 6, 10, ...
+ xor w3, w3
+ jmp L(lo2)
+
+L(a1x1):mov (rp,un,8), X1
+ xor w2, w2
+ xor w1, w1
+ test $2, R8(un)
+ jz L(a111)
+
+L(a101):lea 3(un), n C un = 1, 5, 9, ...
+ mov %rdx, w3
+ add %rax, X1
+ mov (up,un,8), %rax
+ mov 8(rp,un,8), X0
+ adc $0, w3
+ jmp L(top)
+
+L(a111):lea 1(un), n C un = 3, 7, 11, ...
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top): mul v1
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ add w1, X1
+ adc $0, w3
+ add w2, X0
+ adc $0, w0
+ mov -16(up,n,8), %rax
+ mul v0
+ add %rax, X0
+ mov %rdx, w1
+ adc $0, w1
+ mov -16(up,n,8), %rax
+ mul v1
+ mov X1, -24(rp,n,8)
+ mov -8(rp,n,8), X1
+ add w3, X0
+ adc $0, w1
+L(lo0): mov %rdx, w2
+ mov X0, -16(rp,n,8)
+ add %rax, X1
+ adc $0, w2
+ mov -8(up,n,8), %rax
+ add w0, X1
+ adc $0, w2
+ mul v0
+L(lo3): add %rax, X1
+ mov %rdx, w3
+ adc $0, w3
+ mov -8(up,n,8), %rax
+ mul v1
+ add w1, X1
+ mov (rp,n,8), X0
+ adc $0, w3
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ mov (up,n,8), %rax
+ mul v0
+ add w2, X0
+ mov X1, -8(rp,n,8)
+ mov %rdx, w1
+ adc $0, w0
+L(lo2): add %rax, X0
+ adc $0, w1
+ mov (up,n,8), %rax
+ add w3, X0
+ adc $0, w1
+ mul v1
+ mov 8(rp,n,8), X1
+ add %rax, X1
+ mov %rdx, w2
+ adc $0, w2
+ mov 8(up,n,8), %rax
+ mov X0, (rp,n,8)
+ mul v0
+ add w0, X1
+ mov %rdx, w3
+ adc $0, w2
+ add %rax, X1
+ mov 8(up,n,8), %rax
+ mov 16(rp,n,8), X0 C useless but harmless in final iter
+ adc $0, w3
+ add $4, n
+ jnc L(top)
+
+L(end): mul v1
+ add w1, X1
+ adc $0, w3
+ add w2, %rax
+ adc $0, %rdx
+ mov X1, I(-8(rp),-24(rp,n,8))
+ add w3, %rax
+ adc $0, %rdx
+ mov %rax, I((rp),-16(rp,n,8))
+ mov %rdx, I(8(rp),-8(rp,n,8))
+
+ addl $-2, vn
+ lea 16(vp), vp
+ lea 16(rp), rp
+ jnz L(outer)
+
+ pop %rax C deallocate vn slot
+ pop %r15
+L(ret5):pop %r14
+ pop %r13
+ pop %r12
+L(ret2):pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm
new file mode 100644
index 0000000..a41a8ac
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm
@@ -0,0 +1,384 @@
+dnl AMD64 mpn_mullo_basecase optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_2 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR 2.5 2.95
+C Intel IBR 2.3 2.68
+C Intel HWL 2.0 2.5
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Implement proper cor2, replacing current cor0.
+C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?)
+C * Micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r8')
+define(`X0', `%r14')
+define(`X1', `%r15')
+
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`i', `%rbp')
+define(`v0', `%r9')
+define(`v1', `%rbx')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
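+C A rough C-level sketch of the mpn_mullo_basecase semantics, for
+C orientation only; not part of the GMP source.  Only the n low limbs of
+C {up,n} * {vp,n} are wanted, so each pass over u is truncated and every
+C carry out of limb n-1 is discarded; ref_mullo_basecase is a hypothetical
+C name:
+C
+C     void
+C     ref_mullo_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+C                         mp_size_t n)
+C     {
+C       mpn_mul_1 (rp, up, n, vp[0]);          /* carry out discarded */
+C       for (mp_size_t i = 1; i < n; i++)
+C         mpn_addmul_1 (rp + i, up, n - i, vp[i]);
+C     }
+C
+C The code below does the same with addmul_2 passes, plus imul-based
+C corner code for the last one or two limbs of v.
+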
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+
+ mov (up), %rax
+ mov vp_param, vp
+
+ cmp $4, n
+ jb L(small)
+
+ mov (vp_param), v0
+ push %rbx
+ lea (rp,n,8), rp C point rp at R[n]
+ push %rbp
+ lea (up,n,8), up C point up right after U's end
+ push %r12
+ neg n
+ push %r13
+ mul v0
+ mov 8(vp), v1
+
+ test $1, R8(n)
+ jnz L(m2b1)
+
+L(m2b0):lea (n), i
+ xor w0, w0
+ mov %rax, w2
+ mov %rdx, w1
+ jmp L(m2l0)
+
+L(m2b1):lea 1(n), i
+ xor w1, w1
+ xor w2, w2
+ mov %rax, w0
+ mov %rdx, w3
+ jmp L(m2l1)
+
+ ALIGN(32)
+L(m2tp):mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+L(m2l1):mov -8(up,i,8), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, -8(rp,i,8)
+ mov %rdx, w0
+ adc $0, w0
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov %rdx, w1
+ adc $0, w1
+ add w3, w2
+L(m2l0):mov (up,i,8), %rax
+ adc $0, w1
+ mul v1
+ mov w2, (rp,i,8)
+ add %rax, w0
+ mov %rdx, w2 C FIXME: dead in last iteration
+ mov 8(up,i,8), %rax
+ adc $0, w2 C FIXME: dead in last iteration
+ add $2, i
+ jnc L(m2tp)
+
+L(m2ed):imul v0, %rax
+ add w0, %rax
+ add w1, %rax
+ mov %rax, I(-8(rp),-8(rp,i,8))
+
+ add $2, n
+ lea 16(vp), vp
+ lea -16(up), up
+ cmp $-2, n
+ jge L(cor1)
+
+ push %r14
+ push %r15
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+ mov (up,n,8), %rax
+ mul v0
+ test $1, R8(n)
+ jnz L(a1x1)
+
+L(a1x0):mov (rp,n,8), X1
+ xor w2, w2
+ xor w1, w1
+ test $2, R8(n)
+ jnz L(a110)
+
+L(a100):lea 1(n), i
+ jmp L(lo0)
+
+L(a110):lea 3(n), i
+ mov %rdx, w3
+ add %rax, X1
+ mov (up,n,8), %rax
+ mov 8(rp,n,8), X0
+ adc $0, w3
+ jmp L(lo2)
+
+L(a1x1):mov (rp,n,8), X0
+ xor w0, w0
+ mov %rdx, w1
+ test $2, R8(n)
+ jz L(a111)
+
+L(a101):lea 2(n), i
+ add %rax, X0
+ adc $0, w1
+ mov (up,n,8), %rax
+ mul v1
+ mov 8(rp,n,8), X1
+ jmp L(lo1)
+
+L(a111):lea (n), i
+ xor w3, w3
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top):
+L(lo2): mul v1
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ add w1, X1
+ adc $0, w3
+ add w2, X0
+ adc $0, w0
+ mov -16(up,i,8), %rax
+ mul v0
+ add %rax, X0
+ mov %rdx, w1
+ adc $0, w1
+ mov -16(up,i,8), %rax
+ mul v1
+ mov X1, -24(rp,i,8)
+ mov -8(rp,i,8), X1
+ add w3, X0
+ adc $0, w1
+L(lo1): mov %rdx, w2
+ mov X0, -16(rp,i,8)
+ add %rax, X1
+ adc $0, w2
+ mov -8(up,i,8), %rax
+ add w0, X1
+ adc $0, w2
+ mul v0
+L(lo0): add %rax, X1
+ mov %rdx, w3
+ adc $0, w3
+ mov -8(up,i,8), %rax
+ mul v1
+ add w1, X1
+ mov (rp,i,8), X0
+ adc $0, w3
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ mov (up,i,8), %rax
+ mul v0
+ add w2, X0
+ mov X1, -8(rp,i,8)
+ mov %rdx, w1
+ adc $0, w0
+L(lo3): add %rax, X0
+ adc $0, w1
+ mov (up,i,8), %rax
+ add w3, X0
+ adc $0, w1
+ mul v1
+ mov 8(rp,i,8), X1
+ add %rax, X1
+ mov %rdx, w2
+ adc $0, w2
+ mov 8(up,i,8), %rax
+ mov X0, (rp,i,8)
+ mul v0
+ add w0, X1
+ mov %rdx, w3
+ adc $0, w2
+ add %rax, X1
+ mov 8(up,i,8), %rax
+ mov 16(rp,i,8), X0
+ adc $0, w3
+ add $4, i
+ jnc L(top)
+
+L(end): imul v1, %rax
+ add %rax, X0
+ add w1, X1
+ adc $0, w3
+ add w2, X0
+ mov I(-8(up),-16(up,i,8)), %rax
+ imul v0, %rax
+ add X0, %rax
+ mov X1, I(-16(rp),-24(rp,i,8))
+ add w3, %rax
+ mov %rax, I(-8(rp),-16(rp,i,8))
+
+ add $2, n
+ lea 16(vp), vp
+ lea -16(up), up
+ cmp $-2, n
+ jl L(outer)
+
+ pop %r15
+ pop %r14
+
+ jnz L(cor0)
+
+L(cor1):mov (vp), v0
+ mov 8(vp), v1
+ mov -16(up), %rax
+ mul v0 C u0 x v2
+ add -16(rp), %rax C FIXME: rp[0] still available in reg?
+ adc -8(rp), %rdx C FIXME: rp[1] still available in reg?
+ mov -8(up), %r10
+ imul v0, %r10
+ mov -16(up), %r11
+ imul v1, %r11
+ mov %rax, -16(rp)
+ add %r10, %r11
+ add %rdx, %r11
+ mov %r11, -8(rp)
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(cor0):mov (vp), %r11
+ imul -8(up), %r11
+ add %rax, %r11
+ mov %r11, -8(rp)
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(small):
+ cmp $2, n
+ jae L(gt1)
+L(n1): imul (vp_param), %rax
+ mov %rax, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp_param), %r9
+ mul %r9
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp_param), %r9
+ mul %r9 C u0 x v0
+ mov %rax, (rp)
+ mov %rdx, %r10
+ mov 8(up), %rax
+ mul %r9 C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r11
+ mov (up), %rax
+ mul %r11 C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r11 C u1 x v1
+ add %r11, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm
new file mode 100644
index 0000000..f0dbe07
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm
@@ -0,0 +1,546 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR 3.24
+C Intel IBR 3.04
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single basecases out before the pushes.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
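+C A rough C-level sketch of the Montgomery reduction performed here, close
+C to mpn/generic/redc_1.c; for orientation only.  u0inv is -1/m[0] mod B,
+C so each quotient limb q zeroes the current low limb of u; a nail-free
+C build is assumed and ref_redc_1 is a hypothetical name:
+C
+C     mp_limb_t
+C     ref_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n,
+C                 mp_limb_t u0inv)
+C     {
+C       for (mp_size_t j = 0; j < n; j++)
+C         {
+C           mp_limb_t q = up[0] * u0inv;           /* mod B */
+C           up[0] = mpn_addmul_1 (up, mp, n, q);   /* low limb now zero;
+C                                                     save the carry there */
+C           up++;
+C         }
+C       return mpn_add_n (rp, up, up - n, n);      /* fold saved carries */
+C     }
+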
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea 8(mp_param,n,8), mp
+ lea 8(up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 1(n), i
+ mov -8(mp,n,8), %rax
+ mul q0
+ mov -8(up,n,8), %r10
+ mov %rdx, %r11
+ add %rax, %r10
+ mov (mp,n,8), %rax
+ adc $0, %r11
+ mul q0
+ mov %rdx, %r9
+ mov (up,n,8), %rbx
+ add %rax, %rbx
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbx
+ mov %rbx, -8(up,i,8) C next low remainder limb
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): mul q0
+ mov -16(up,i,8), %r10
+ add %r11, %rbp
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbp, -24(up,i,8)
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %r10
+ mov %rdx, %r9
+ mov -8(up,i,8), %rbp
+ adc $0, %r11
+ mov %r10, -16(up,i,8)
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbp
+ mov %rbp, -8(up,i,8)
+ adc $0, %r9
+L(e1): mov %rdx, %r11
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,i,8), %rbp
+ add %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (up,i,8)
+ adc $0, %r11
+ add %rax, %rbp
+ adc $0, %r9
+ mov 16(mp,i,8), %rax
+ add $4, i
+ jnc L(tp1)
+
+L(ed1): mul q0
+ mov I(-16(up),-16(up,i,8)), %r10
+ add %r11, %rbp
+ adc $0, %r9
+ mov %rbp, I(-24(up),-24(up,i,8))
+ add %rax, %r10
+ adc $0, %rdx
+ add %r9, %r10
+ adc $0, %rdx
+ mov %r10, I(-16(up),-16(up,i,8))
+ mov %rdx, -8(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 3(n), i
+ mov -8(mp,n,8), %rax
+ mul q0
+ mov -8(up,n,8), %r10
+ mov %rdx, %r11
+ add %rax, %r10
+ mov (mp,n,8), %rax
+ adc $0, %r11
+ mul q0
+ mov (up,n,8), %rbx
+ mov %rdx, %r9
+ add %rax, %rbx
+ adc $0, %r9
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov 8(up,n,8), %r10
+ add %r11, %rbx
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbx, (up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e3)
+
+ ALIGNx
+L(tp3): mul q0
+ mov -16(up,i,8), %r10
+ add %r11, %rbp
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbp, -24(up,i,8)
+L(e3): add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %r10
+ mov %rdx, %r9
+ mov -8(up,i,8), %rbp
+ adc $0, %r11
+ mov %r10, -16(up,i,8)
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbp
+ mov %rbp, -8(up,i,8)
+ adc $0, %r9
+ mov %rdx, %r11
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,i,8), %rbp
+ add %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (up,i,8)
+ adc $0, %r11
+ add %rax, %rbp
+ adc $0, %r9
+ mov 16(mp,i,8), %rax
+ add $4, i
+ jnc L(tp3)
+
+L(ed3): mul q0
+ mov I(-16(up),-16(up,i,8)), %r10
+ add %r11, %rbp
+ adc $0, %r9
+ mov %rbp, I(-24(up),-24(up,i,8))
+ add %rax, %r10
+ adc $0, %rdx
+ add %r9, %r10
+ adc $0, %rdx
+ mov %r10, I(-16(up),-16(up,i,8))
+ mov %rdx, -8(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea -8(up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea -8(up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+IFSTD(` sub $8, %rsp ')
+IFDOS(` sub $40, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0):
+L(otp0):lea (n), i
+ mov -8(mp,n,8), %rax
+ mul q0
+ mov %rdx, %r9
+ mov -8(up,n,8), %rbp
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,n,8), %rax
+ mul q0
+ mov (up,n,8), %rbx
+ mov %rdx, %r11
+ add %rax, %rbx
+ mov 8(mp,n,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,n,8), %rbp
+ add %r9, %rbx
+ mov %rdx, %r9
+ mov %rbx, (up,n,8)
+ adc $0, %r11
+ imul u0inv, %rbx C next q limb
+ jmp L(e0)
+
+ ALIGNx
+L(tp0): mul q0
+ mov -16(up,i,8), %r10
+ add %r11, %rbp
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbp, -24(up,i,8)
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %r10
+ mov %rdx, %r9
+ mov -8(up,i,8), %rbp
+ adc $0, %r11
+ mov %r10, -16(up,i,8)
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbp
+ mov %rbp, -8(up,i,8)
+ adc $0, %r9
+ mov %rdx, %r11
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,i,8), %rbp
+ add %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (up,i,8)
+ adc $0, %r11
+L(e0): add %rax, %rbp
+ adc $0, %r9
+ mov 16(mp,i,8), %rax
+ add $4, i
+ jnc L(tp0)
+
+L(ed0): mul q0
+ mov I(-16(up),-16(up,i,8)), %r10
+ add %r11, %rbp
+ adc $0, %r9
+ mov %rbp, I(-24(up),-24(up,i,8))
+ add %rax, %r10
+ adc $0, %rdx
+ add %r9, %r10
+ adc $0, %rdx
+ mov %r10, I(-16(up),-16(up,i,8))
+ mov %rdx, -8(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp0)
+ jmp L(cj)
+
+L(b2): cmp $-2, R32(n)
+ jz L(n2)
+
+L(otp2):lea 2(n), i
+ mov -8(mp,n,8), %rax
+ mul q0
+ mov -8(up,n,8), %rbp
+ mov %rdx, %r9
+ add %rax, %rbp
+ adc $0, %r9
+ mov (mp,n,8), %rax
+ mul q0
+ mov (up,n,8), %rbx
+ mov %rdx, %r11
+ add %rax, %rbx
+ mov 8(mp,n,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %rbx
+ mov %rdx, %r9
+ mov 8(up,n,8), %rbp
+ adc $0, %r11
+ mov %rbx, (up,n,8)
+ imul u0inv, %rbx C next q limb
+ jmp L(e2)
+
+ ALIGNx
+L(tp2): mul q0
+ mov -16(up,i,8), %r10
+ add %r11, %rbp
+ mov %rdx, %r11
+ adc $0, %r9
+ mov %rbp, -24(up,i,8)
+ add %rax, %r10
+ mov -8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ add %r9, %r10
+ mov %rdx, %r9
+ mov -8(up,i,8), %rbp
+ adc $0, %r11
+ mov %r10, -16(up,i,8)
+L(e2): add %rax, %rbp
+ adc $0, %r9
+ mov (mp,i,8), %rax
+ mul q0
+ mov (up,i,8), %r10
+ add %r11, %rbp
+ mov %rbp, -8(up,i,8)
+ adc $0, %r9
+ mov %rdx, %r11
+ add %rax, %r10
+ mov 8(mp,i,8), %rax
+ adc $0, %r11
+ mul q0
+ mov 8(up,i,8), %rbp
+ add %r9, %r10
+ mov %rdx, %r9
+ mov %r10, (up,i,8)
+ adc $0, %r11
+ add %rax, %rbp
+ adc $0, %r9
+ mov 16(mp,i,8), %rax
+ add $4, i
+ jnc L(tp2)
+
+L(ed2): mul q0
+ mov I(-16(up),-16(up,i,8)), %r10
+ add %r11, %rbp
+ adc $0, %r9
+ mov %rbp, I(-24(up),-24(up,i,8))
+ add %rax, %r10
+ adc $0, %rdx
+ add %r9, %r10
+ adc $0, %rdx
+ mov %r10, I(-16(up),-16(up,i,8))
+ mov %rdx, -8(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp2)
+ jmp L(cj)
+
+L(n1): mov (mp_param), %rax
+ mul q0
+ add -16(up), %rax
+ adc -8(up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n2): mov (mp_param), %rax
+ mov -24(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -16(mp), %rax
+ mov -16(up), %r10
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, q0
+ imul u0inv, q0 C next q0
+ mov -24(mp), %rax
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -16(mp), %rax
+ mov -8(up), %r14
+ mul q0
+ add %rax, %r14
+ adc $0, %rdx
+ add %r9, %r14
+ adc $0, %rdx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc (up), %rdx
+ mov %r14, (rp)
+ mov %rdx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n3): mov -32(mp), %rax
+ mov -32(up), %r10
+ mul q0
+ add %rax, %r10
+ mov -24(mp), %rax
+ mov %rdx, %r11
+ adc $0, %r11
+ mov -24(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -16(mp), %rax
+ add %r11, %rbp
+ mov -16(up), %r10
+ adc $0, %r9
+ mul q0
+ mov %rbp, q0
+ imul u0inv, q0 C next q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ mov %rbp, -24(up)
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, -16(up)
+ mov %r11, -32(up) C up[0]
+ lea 8(up), up C up++
+ dec j
+ jnz L(n3)
+ jmp L(cj)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm
new file mode 100644
index 0000000..fd2eaea
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm
@@ -0,0 +1,193 @@
+dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Sandy Bridge.
+
+dnl Copyright 2003, 2005, 2009-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 4.25
+C Intel P4 21.5
+C Intel core2 3.2
+C Intel NHM 3.87
+C Intel SBR 2.05
+C Intel atom ?
+C VIA nano 44.9
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_rsh1add_n)
+ define(func_nc, mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsh1sub_n)
+ define(func_nc, mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
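+C A rough C-level sketch of the rsh1add case, for orientation only; not
+C part of the GMP source (ref_rsh1add_n is a hypothetical name).  The
+C return value is the bit shifted out at the bottom, and the add's carry
+C becomes the new top bit; the sub variant is analogous with mpn_sub_n and
+C the borrow:
+C
+C     mp_limb_t
+C     ref_rsh1add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+C     {
+C       mp_limb_t cy  = mpn_add_n (rp, up, vp, n);
+C       mp_limb_t ret = rp[0] & 1;
+C       mpn_rshift (rp, rp, n, 1);
+C       rp[n - 1] |= cy << (GMP_NUMB_BITS - 1);
+C       return ret;
+C     }
+C
+C The loop below fuses the add and the shift, using shrd to shift each
+C limb down by one as soon as its higher neighbour is available.
+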
+ASM_START()
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+
+ neg %r8 C set C flag from parameter
+ mov (up), %rbp
+ ADCSBB (vp), %rbp
+
+ jmp L(ent)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %rbp
+ ADDSUB (vp), %rbp
+L(ent):
+ sbb R32(%rbx), R32(%rbx) C save cy
+ mov R32(%rbp), R32(%rax)
+ and $1, R32(%rax) C return value
+
+ mov R32(n), R32(%r11)
+ and $3, R32(%r11)
+
+ cmp $1, R32(%r11)
+ je L(do) C jump if n = 1 5 9 ...
+
+L(n1): cmp $2, R32(%r11)
+ jne L(n2) C jump unless n = 2 6 10 ...
+ add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up), %r10
+ ADCSBB 8(vp), %r10
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ sbb R32(%rbx), R32(%rbx) C save cy
+
+ shrd $1, %r10, %rbp
+ mov %rbp, -8(rp)
+ jmp L(cj1)
+
+L(n2): cmp $3, R32(%r11)
+ jne L(n3) C jump unless n = 3 7 11 ...
+ add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up), %r9
+ mov 16(up), %r10
+ ADCSBB 8(vp), %r9
+ ADCSBB 16(vp), %r10
+ lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ sbb R32(%rbx), R32(%rbx) C save cy
+
+ shrd $1, %r9, %rbp
+ mov %rbp, -16(rp)
+ jmp L(cj2)
+
+L(n3): dec n C come here for n = 4 8 12 ...
+ add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up), %r8
+ mov 16(up), %r9
+ ADCSBB 8(vp), %r8
+ ADCSBB 16(vp), %r9
+ mov 24(up), %r10
+ ADCSBB 24(vp), %r10
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ sbb R32(%rbx), R32(%rbx) C save cy
+
+ shrd $1, %r8, %rbp
+ mov %rbp, -24(rp)
+ shrd $1, %r9, %r8
+ mov %r8, -16(rp)
+L(cj2): shrd $1, %r10, %r9
+ mov %r9, -8(rp)
+L(cj1): mov %r10, %rbp
+
+L(do):
+ shr $2, n C 4
+ je L(end) C 2
+ ALIGN(16)
+L(top): add R32(%rbx), R32(%rbx) C restore cy
+
+ mov 8(up), %r8
+ mov 16(up), %r9
+ ADCSBB 8(vp), %r8
+ ADCSBB 16(vp), %r9
+ mov 24(up), %r10
+ mov 32(up), %r11
+ ADCSBB 24(vp), %r10
+ ADCSBB 32(vp), %r11
+
+ lea 32(up), up
+ lea 32(vp), vp
+
+ sbb R32(%rbx), R32(%rbx) C save cy
+
+ shrd $1, %r8, %rbp
+ mov %rbp, (rp)
+ shrd $1, %r9, %r8
+ mov %r8, 8(rp)
+ shrd $1, %r10, %r9
+ mov %r9, 16(rp)
+ shrd $1, %r11, %r10
+ mov %r10, 24(rp)
+
+ dec n
+ mov %r11, %rbp
+ lea 32(rp), rp
+ jne L(top)
+
+L(end): shrd $1, %rbx, %rbp
+ mov %rbp, (rp)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm
new file mode 100644
index 0000000..4c1c0d4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_rshift optimised for Intel Sandy Bridge.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm
new file mode 100644
index 0000000..46a3612
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm
@@ -0,0 +1,484 @@
+dnl AMD64 mpn_sqr_basecase optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
+C AMD K8,K9 ? ? ?
+C AMD K10 ? ? ?
+C AMD bull ? ? ?
+C AMD pile ? ? ?
+C AMD steam ? ? ?
+C AMD bobcat ? ? ?
+C AMD jaguar ? ? ?
+C Intel P4 ? ? ?
+C Intel core ? ? ?
+C Intel NHM ? ? ?
+C Intel SBR 2.57 2.93 3.0
+C Intel IBR 2.35 2.66 3.0
+C Intel HWL 2.02 2.5 2.5
+C Intel BWL ? ? ?
+C Intel atom ? ? ?
+C VIA nano ? ? ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
+C that the sqr_diag_addlsh1 loop was manually written.
+
+C TODO
+C * Replace current unoptimised sqr_diag_addlsh1 loop, 2.5 c/l should be easy.
+C * Streamline pointer updates.
+C * Perhaps suppress a few more xor insns in feed-in code.
+C * Make sure we write no dead registers in feed-in code.
+C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch
+C out for negative sizes being zero-extended, though.
+C * The straight-line code for n <= 3 comes from the K8 code, and might be
+C quite sub-optimal here. Write specific code, and add code for n = 4.
+C * The mul_2 loop has a 10 insn common sequence in the loop start and the
+C wind-down code. Try re-rolling it.
+C * This file has been subject to only basic micro-optimisation.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
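+C A rough C-level sketch of the decomposition used below (n >= 2), for
+C orientation only; not part of the GMP source.  The off-diagonal triangle
+C is computed once, then doubled and combined with the diagonal squares,
+C which is what the sqr_diag_addlsh1 loop does in one pass.  umul_ppmm is
+C from longlong.h; ref_sqr_basecase and the tp scratch are assumptions for
+C brevity:
+C
+C     void
+C     ref_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
+C     {
+C       mp_limb_t tp[2 * n];                  /* triangle scratch (VLA) */
+C       tp[n] = mpn_mul_1 (tp + 1, up + 1, n - 1, up[0]);
+C       for (mp_size_t i = 1; i < n - 1; i++)
+C         tp[n + i] = mpn_addmul_1 (tp + 2*i + 1, up + i + 1,
+C                                   n - 1 - i, up[i]);
+C       for (mp_size_t i = 0; i < n; i++)     /* diagonal squares */
+C         umul_ppmm (rp[2*i + 1], rp[2*i], up[i], up[i]);
+C       tp[0] = 0;
+C       tp[2*n - 1] = mpn_lshift (tp + 1, tp + 1, 2*n - 2, 1);
+C       mpn_add_n (rp, rp, tp, 2 * n);        /* rp += 2 * triangle */
+C     }
+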
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ cmp $2, un_param
+ jae L(gt1)
+
+ mov (up), %rax
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): jne L(gt2)
+
+ mov (up), %rax
+ mov %rax, %r8
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, %r9
+ mul %rax
+ mov %rax, %r10
+ mov %r11, %rax
+ mov %rdx, %r11
+ mul %r8
+ xor %r8, %r8
+ add %rax, %r9
+ adc %rdx, %r10
+ adc %r8, %r11
+ add %rax, %r9
+ mov %r9, 8(rp)
+ adc %rdx, %r10
+ mov %r10, 16(rp)
+ adc %r8, %r11
+ mov %r11, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt2): cmp $4, un_param
+ jae L(gt3)
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%r10')
+define(`w2', `%r11')
+
+ mov (up), %rax
+ mov %rax, %r10
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, 8(rp)
+ mul %rax
+ mov 16(up), %rcx
+ mov %rax, 16(rp)
+ mov %rcx, %rax
+ mov %rdx, 24(rp)
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov %r11, %rax
+ mul %r10
+ mov %rax, %r8
+ mov %rcx, %rax
+ mov %rdx, %r9
+ mul %r10
+ xor %r10, %r10
+ add %rax, %r9
+ mov %r11, %rax
+ mov %r10, %r11
+ adc %rdx, %r10
+
+ mul %rcx
+ add %rax, %r10
+ adc %r11, %rdx
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %rdx, %rdx
+ adc %r11, %r11
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %rdx, 32(rp)
+ adc %r11, 40(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt3):
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%rbx')
+define(`w3', `%rbp')
+define(`un', `%r12')
+define(`n', `%rcx')
+
+define(`X0', `%r13')
+define(`X1', `%r14')
+
+L(do_mul_2):
+ mov (up), v0
+ push %rbx
+ lea (rp,un_param,8), rp C point rp at R[un]
+ mov 8(up), %rax
+ push %rbp
+ lea (up,un_param,8), up C point up right after U's end
+ mov %rax, v1
+ push %r12
+ mov $1, R32(un) C free up rdx
+ push %r13
+ sub un_param, un
+ push %r14
+ push un
+ mul v0
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ test $1, R8(un)
+ jnz L(m2b1)
+
+L(m2b0):lea 2(un), n
+ xor R32(w1), R32(w1) C FIXME
+ xor R32(w2), R32(w2) C FIXME
+ mov %rdx, w0
+ jmp L(m2l0)
+
+L(m2b1):lea 1(un), n
+ xor R32(w3), R32(w3) C FIXME
+ xor R32(w0), R32(w0) C FIXME
+ mov %rdx, w2
+ jmp L(m2l1)
+
+ ALIGN(32)
+L(m2tp):
+L(m2l0):mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+ mov -8(up,n,8), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, -8(rp,n,8)
+ mov %rdx, w0
+ adc $0, w0
+ mov (up,n,8), %rax
+L(m2l1):mul v0
+ add %rax, w2
+ mov %rdx, w1
+ adc $0, w1
+ add w3, w2
+ mov (up,n,8), %rax
+ adc $0, w1
+ mul v1
+ mov w2, (rp,n,8)
+ add %rax, w0
+ mov %rdx, w2
+ mov 8(up,n,8), %rax
+ adc $0, w2
+ add $2, n
+ jnc L(m2tp)
+
+L(m2ed):mul v0
+ add %rax, w0
+ mov %rdx, w3
+ adc $0, w3
+ mov I(-8(up),-8(up,n,8)), %rax
+ mul v1
+ add w1, w0
+ adc $0, w3
+ add %rax, w2
+ mov w0, I(-8(rp),-8(rp,n,8))
+ adc $0, %rdx
+ add w3, w2
+ mov w2, I((rp),(rp,n,8))
+ adc $0, %rdx
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ add $2, un C decrease |un|
+
+L(do_addmul_2):
+L(outer):
+ lea 16(rp), rp
+ cmp $-2, R32(un) C jump if un in {-1,0} FIXME jump if un in {-2,1}
+ jge L(corner) C FIXME: move to before the lea above
+
+ mov -8(up,un,8), v0
+ mov (up,un,8), %rax
+ mov %rax, v1
+ mul v0
+ test $1, R8(un)
+ jnz L(a1x1)
+
+L(a1x0):mov (rp,un,8), X0
+ xor w0, w0
+ mov 8(rp,un,8), X1
+ add %rax, X0
+ mov %rdx, w1
+ adc $0, w1
+ xor w2, w2
+ mov X0, (rp,un,8)
+ mov 8(up,un,8), %rax
+ test $2, R8(un)
+ jnz L(a110)
+
+L(a100):lea 2(un), n C un = 4, 8, 12, ...
+ jmp L(lo0)
+
+L(a110):lea (un), n C un = 2, 6, 10, ...
+ jmp L(lo2)
+
+L(a1x1):mov (rp,un,8), X1
+ xor w2, w2
+ mov 8(rp,un,8), X0
+ add %rax, X1
+ mov %rdx, w3
+ adc $0, w3
+ xor w0, w0
+ mov 8(up,un,8), %rax
+ test $2, R8(un)
+ jz L(a111)
+
+L(a101):lea 3(un), n C un = 1, 5, 9, ...
+ jmp L(lo1)
+
+L(a111):lea 1(un), n C un = 3, 7, 11, ...
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top): mul v1
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ add w1, X1
+ adc $0, w3
+ add w2, X0
+ adc $0, w0
+ mov -16(up,n,8), %rax
+L(lo1): mul v0
+ add %rax, X0
+ mov %rdx, w1
+ adc $0, w1
+ mov -16(up,n,8), %rax
+ mul v1
+ mov X1, -24(rp,n,8)
+ mov -8(rp,n,8), X1
+ add w3, X0
+ adc $0, w1
+ mov %rdx, w2
+ mov X0, -16(rp,n,8)
+ add %rax, X1
+ adc $0, w2
+ mov -8(up,n,8), %rax
+ add w0, X1
+ adc $0, w2
+L(lo0): mul v0
+ add %rax, X1
+ mov %rdx, w3
+ adc $0, w3
+ mov -8(up,n,8), %rax
+ mul v1
+ add w1, X1
+ mov (rp,n,8), X0
+ adc $0, w3
+ mov %rdx, w0
+ add %rax, X0
+ adc $0, w0
+ mov (up,n,8), %rax
+L(lo3): mul v0
+ add w2, X0
+ mov X1, -8(rp,n,8)
+ mov %rdx, w1
+ adc $0, w0
+ add %rax, X0
+ adc $0, w1
+ mov (up,n,8), %rax
+ add w3, X0
+ adc $0, w1
+ mul v1
+ mov 8(rp,n,8), X1
+ add %rax, X1
+ mov %rdx, w2
+ adc $0, w2
+ mov 8(up,n,8), %rax
+ mov X0, (rp,n,8)
+L(lo2): mul v0
+ add w0, X1
+ mov %rdx, w3
+ adc $0, w2
+ add %rax, X1
+ mov 8(up,n,8), %rax
+ mov 16(rp,n,8), X0
+ adc $0, w3
+ add $4, n
+ jnc L(top)
+
+L(end): mul v1
+ add w1, X1
+ adc $0, w3
+ add w2, %rax
+ adc $0, %rdx
+ mov X1, I(-8(rp),-24(rp,n,8))
+ add w3, %rax
+ adc $0, %rdx
+ mov %rax, I((rp),-16(rp,n,8))
+ mov %rdx, I(8(rp),-8(rp,n,8))
+
+ add $2, un C decrease |un|
+ jmp L(outer) C loop until a small corner remains
+
+L(corner):
+ pop n
+ jg L(small_corner)
+
+ lea 8(rp), rp
+ mov -24(up), v0
+ mov -16(up), %rax
+ mov %rax, v1
+ mul v0
+ mov -24(rp), X0
+ mov -16(rp), X1
+ add %rax, X0
+ mov %rdx, w1
+ adc $0, w1
+ xor w2, w2
+ mov X0, -24(rp)
+ mov -8(up), %rax
+ mul v0
+ add $0, X1
+ mov %rdx, w3
+ adc $0, w2
+ add %rax, X1
+ mov -8(up), %rax
+ adc $0, w3
+ mul v1
+ add w1, X1
+ adc $0, w3
+ add w2, %rax
+ adc $0, %rdx
+ mov X1, -16(rp)
+ jmp L(com)
+
+L(small_corner):
+ mov -8(rp), w3
+ mov -16(up), v0
+ mov -8(up), %rax
+ mul v0
+L(com): add w3, %rax
+ adc $0, %rdx
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+
+L(sqr_diag_addlsh1):
+ mov -8(up,n,8), %rax
+ shl n
+ mul %rax
+ mov %rax, (rp,n,8)
+
+ xor R32(%rbx), R32(%rbx)
+ mov 8(rp,n,8), %r8
+ mov 16(rp,n,8), %r9
+ jmp L(dm)
+
+ ALIGN(32)
+L(dtop):add %r8, %r10
+ adc %r9, %rax
+ mov 8(rp,n,8), %r8
+ mov 16(rp,n,8), %r9
+ mov %r10, -8(rp,n,8)
+ mov %rax, (rp,n,8)
+L(dm): adc %r8, %r8
+ adc %r9, %r9
+ mov (up,n,4), %rax
+ lea (%rdx,%rbx), %r10
+ setc R8(%rbx)
+ mul %rax
+ add $2, n
+ js L(dtop)
+
+L(dend):add %r8, %r10
+ adc %r9, %rax
+ mov %r10, I(-8(rp),-8(rp,n,8))
+ mov %rax, I((rp),(rp,n,8))
+ adc %rbx, %rdx
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()