Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/atom')
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm     186
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm   238
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm   191
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/aors_n.asm       128
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm    194
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm     38
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm     38
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/com.asm           37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/copyd.asm         37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/copyi.asm         37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/dive_1.asm        37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h     222
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/lshift.asm       123
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm      127
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/mul_1.asm        147
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/mul_2.asm        190
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/popcount.asm      35
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/redc_1.asm       579
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm   287
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/rshift.asm       121
-rw-r--r--  gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm    242
21 files changed, 3234 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm
new file mode 100644
index 0000000..c1dcdc4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm
@@ -0,0 +1,186 @@
+dnl AMD64 mpn_addmul_2 optimised for Intel Atom.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom 18.8 this
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addmul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %rax
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov n_param, n
+ mul v0
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): mov %rax, w0
+ mov (up), %rax
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ lea -8(rp), rp
+ jmp L(lo0)
+
+L(b10): mov %rax, w2
+ mov (up), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ lea -16(up), up
+ lea -24(rp), rp
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): mov %rax, w3
+ mov %rdx, w0
+ mov (up), %rax
+ xor R32(w1), R32(w1)
+ lea 8(up), up
+ dec n
+ jmp L(lo1)
+
+L(b11): mov %rax, w1
+ mov (up), %rax
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ lea -8(up), up
+ lea -16(rp), rp
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top):
+L(lo1): mul v1
+ add w3, (rp)
+ mov $0, R32(w2)
+ adc %rax, w0
+ mov (up), %rax
+ adc %rdx, w1
+ mul v0
+ add %rax, w0
+ mov (up), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(lo0): mul v1
+ add w0, 8(rp)
+ adc %rax, w1
+ mov 8(up), %rax
+ mov $0, R32(w3)
+ adc %rdx, w2
+ mul v0
+ add %rax, w1
+ mov 8(up), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+L(lo3): mul v1
+ add w1, 16(rp)
+ adc %rax, w2
+ mov 16(up), %rax
+ mov $0, R32(w0)
+ adc %rdx, w3
+ mul v0
+ add %rax, w2
+ mov 16(up), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+L(lo2): mul v1
+ add w2, 24(rp)
+ adc %rax, w3
+ mov 24(up), %rax
+ adc %rdx, w0
+ mov $0, R32(w1)
+ lea 32(rp), rp
+ mul v0
+ lea 32(up), up
+ add %rax, w3
+ adc %rdx, w0
+ mov -8(up), %rax
+ adc $0, R32(w1)
+ sub $4, n
+ ja L(top)
+
+L(end): mul v1
+ add w3, (rp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(rp)
+ mov w1, %rax
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
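
For reference, mpn_addmul_2 multiplies the n-limb operand {up,n} by the two-limb
multiplier {vp[0],vp[1]} and accumulates the product into rp; the low n limbs of
rp are read-modify-write, rp[n] receives a pure store (cf. the `mov w0, 8(rp)` in
the tail above, versus the `add w3, (rp)` one line earlier), and the most
significant product limb is returned. A minimal C model of these semantics,
assuming 64-bit limbs and a compiler providing unsigned __int128 -- an
illustration, not GMP's implementation:

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* rp[0..n-1] += {up,n} * v; returns the carry limb. */
static mp_limb_t ref_addmul_1(mp_limb_t *rp, const mp_limb_t *up,
                              long n, mp_limb_t v)
{
    mp_limb_t carry = 0;
    for (long i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128)up[i] * v + rp[i] + carry;
        rp[i] = (mp_limb_t)t;
        carry = (mp_limb_t)(t >> 64);
    }
    return carry;
}

/* rp[0..n-1] += {up,n} * {vp,2}; rp[n] is write-only; returns the most
   significant product limb. */
static mp_limb_t ref_addmul_2(mp_limb_t *rp, const mp_limb_t *up,
                              long n, const mp_limb_t *vp)
{
    rp[n] = ref_addmul_1(rp, up, n, vp[0]);    /* v0 row, carry into rp[n] */
    return ref_addmul_1(rp + 1, up, n, vp[1]); /* v1 row, shifted one limb */
}

The assembly gets its speed by fusing both rows into one 4-way-unrolled loop,
keeping the carry window in the four registers w0..w3 instead of re-reading rp.
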
diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm
new file mode 100644
index 0000000..f44de19
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm
@@ -0,0 +1,238 @@
+dnl AMD64 mpn_addlsh1_n, mpn_rsblsh1_n optimised for Intel Atom.
+dnl Used also for AMD bd1.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * This code is slightly large at 433 bytes.
+C * sublsh1_n.asm and this file use the same basic pattern.
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 2.3
+C AMD bobcat ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 4.875 (4.75 is probably possible)
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_addlsh1_n)
+ define(func_nc, mpn_addlsh1_nc)')
+ifdef(`OPERATION_rsblsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsblsh1_n)
+ define(func_nc, mpn_rsblsh1_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbp
+ xor R32(%rbp), R32(%rbp)
+L(ent): mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jz L(b0)
+ cmp $2, R32(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mov (vp), %r8
+ add %r8, %r8
+ lea 8(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 8(up), up
+ lea 8(rp), rp
+ jmp L(b0)
+
+L(b2): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ lea 16(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 16(up), up
+ lea 16(rp), rp
+ jmp L(b0)
+
+L(b3): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ lea 24(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 24(up), up
+ lea 24(rp), rp
+
+L(b0): test $4, R8(n)
+ jz L(skp)
+ add R32(%rax), R32(%rax) C restore scy
+ mov (vp), %r8
+ adc %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ mov 24(vp), %r11
+ adc %r11, %r11
+ lea 32(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ ADCSBB 24(up), %r11
+ mov %r11, 24(rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ sbb R32(%rbp), R32(%rbp) C save acy
+
+L(skp): cmp $8, n
+ jl L(rtn)
+
+ push %r12
+ push %r13
+ push %r14
+ push %rbx
+ lea -64(rp), rp
+ jmp L(x)
+
+ ALIGN(16)
+L(top): add R32(%rax), R32(%rax) C restore scy
+ lea 64(rp), rp
+ mov (vp), %r8
+ adc %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ mov 24(vp), %r11
+ adc %r11, %r11
+ mov 32(vp), %r12
+ adc %r12, %r12
+ mov 40(vp), %r13
+ adc %r13, %r13
+ mov 48(vp), %r14
+ adc %r14, %r14
+ mov 56(vp), %rbx
+ adc %rbx, %rbx
+ lea 64(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ ADCSBB 24(up), %r11
+ mov %r11, 24(rp)
+ ADCSBB 32(up), %r12
+ mov %r12, 32(rp)
+ ADCSBB 40(up), %r13
+ mov %r13, 40(rp)
+ ADCSBB 48(up), %r14
+ mov %r14, 48(rp)
+ ADCSBB 56(up), %rbx
+ mov %rbx, 56(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 64(up), up
+L(x): sub $8, n
+ jge L(top)
+
+L(end): pop %rbx
+ pop %r14
+ pop %r13
+ pop %r12
+L(rtn):
+ifdef(`OPERATION_addlsh1_n',`
+ add R32(%rbp), R32(%rax)
+ neg R32(%rax)')
+ifdef(`OPERATION_rsblsh1_n',`
+ sub R32(%rax), R32(%rbp)
+ movslq R32(%rbp), %rax')
+
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbp
+ neg %r8 C set CF
+ sbb R32(%rbp), R32(%rbp) C save acy
+ jmp L(ent)
+EPILOGUE()
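
This file computes rp[] = up[] + 2*vp[] (or 2*vp[] - up[]) by interleaving two
carry chains: vp is doubled with add/adc (the "scy" shift carry) and combined
with up via ADCSBB (the "acy" add/subtract carry). Since x86 has a single carry
flag, each chain's flag is parked in a register as 0/-1 with `sbb` and restored
with `add`, exactly as the save/restore comments mark. A portable model of the
add case, reusing the mp_limb_t/__int128 setup from the addmul_2 sketch above
(illustrative only):

/* rp[] = up[] + 2*vp[], n limbs; returns the carry, 0..2 (one bit from
   doubling vp, one from the addition). */
static mp_limb_t ref_addlsh1_n(mp_limb_t *rp, const mp_limb_t *up,
                               const mp_limb_t *vp, long n)
{
    mp_limb_t shift_in = 0, carry = 0;
    for (long i = 0; i < n; i++) {
        mp_limb_t d = (vp[i] << 1) | shift_in;  /* doubled limb */
        shift_in = vp[i] >> 63;                 /* bit shifted out the top */
        unsigned __int128 t = (unsigned __int128)up[i] + d + carry;
        rp[i] = (mp_limb_t)t;
        carry = (mp_limb_t)(t >> 64);
    }
    return carry + shift_in;
}

mpn_rsblsh1_n is the same loop with the addition replaced by 2*vp[i] - up[i] and
a signed (possibly -1) return value, which is what the `movslq` in the
OPERATION_rsblsh1_n epilogue produces.
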
diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm
new file mode 100644
index 0000000..02fb29d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm
@@ -0,0 +1,191 @@
+dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+dnl Optimised for Intel Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5.75
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+
+define(`LSH', 2)
+define(`RSH', 62)
+define(M, eval(m4_lshift(1,LSH)))
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_addlsh2_n)
+ define(func_nc, mpn_addlsh2_nc)')
+ifdef(`OPERATION_rsblsh2_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsblsh2_n)
+ define(func_nc, mpn_rsblsh2_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jz L(b0) C we rely on rax = 0 at target
+ cmp $2, R32(%rax)
+ mov $0, R32(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mov (vp), %r9
+ lea (%rax,%r9,M), %rbp
+ shr $RSH, %r9
+ sub $1, n
+ lea -8(up), up
+ lea -8(rp), rp
+ jz L(cj1)
+ mov 8(vp), %r10
+ lea (%r9,%r10,M), %r9
+ shr $RSH, %r10
+ mov 16(vp), %r11
+ lea 24(vp), vp
+ mov (vp), %r8
+ lea (%r10,%r11,M), %r10
+ shr $RSH, %r11
+ add R32(%rax), R32(%rax)
+ jmp L(L1)
+
+L(b2): lea -32(rp), rp
+ mov (vp), %r8
+ lea -32(up), up
+ lea (%rax,%r8,M), %rbx
+ shr $RSH, %r8
+ mov 8(vp), %r9
+ sub $2, n
+ jle L(end)
+ jmp L(top)
+
+L(b3): lea -24(up), up
+ mov (vp), %r11
+ lea -24(rp), rp
+ mov 8(vp), %r8
+ lea (%rax,%r11,M), %r10
+ shr $RSH, %r11
+ lea 8(vp), vp
+ lea (%r11,%r8,M), %rbx
+ add $1, n
+ jmp L(L3)
+
+L(b0): lea -16(up), up
+ mov (vp), %r10
+ lea (%rax,%r10,M), %r9
+ shr $RSH, %r10
+ mov 8(vp), %r11
+ lea -16(rp), rp
+ mov 16(vp), %r8
+ lea (%r10,%r11,M), %r10
+ shr $RSH, %r11
+ add R32(%rax), R32(%rax)
+ lea 16(vp), vp
+ jmp L(L0)
+
+ ALIGN(16)
+L(top): lea (%r8,%r9,M), %rbp
+ shr $RSH, %r9
+ lea 32(up), up
+ mov 16(vp), %r10
+ lea (%r9,%r10,M), %r9
+ shr $RSH, %r10
+ mov 24(vp), %r11
+ lea 32(rp), rp
+ lea 32(vp), vp
+ mov (vp), %r8
+ lea (%r10,%r11,M), %r10
+ shr $RSH, %r11
+ add R32(%rax), R32(%rax)
+ ADCSBB (up), %rbx
+ mov %rbx, (rp)
+L(L1): ADCSBB 8(up), %rbp
+ mov %rbp, 8(rp)
+L(L0): ADCSBB 16(up), %r9
+ lea (%r11,%r8,M), %rbx
+ mov %r9, 16(rp)
+L(L3): ADCSBB 24(up), %r10
+ sbb R32(%rax), R32(%rax)
+L(L2): shr $RSH, %r8
+ mov 8(vp), %r9
+ mov %r10, 24(rp)
+ sub $4, n
+ jg L(top)
+
+L(end): lea (%r8,%r9,M), %rbp
+ shr $RSH, %r9
+ lea 32(up), up
+ lea 32(rp), rp
+ add R32(%rax), R32(%rax)
+ ADCSBB (up), %rbx
+ mov %rbx, (rp)
+L(cj1): ADCSBB 8(up), %rbp
+ mov %rbp, 8(rp)
+
+ifdef(`OPERATION_addlsh2_n',`
+ mov R32(n), R32(%rax) C zero rax
+ adc %r9, %rax')
+ifdef(`OPERATION_rsblsh2_n',`
+ sbb n, %r9 C subtract 0
+ mov %r9, %rax')
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
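
Unlike the lsh1 variant, this file forms the shifted operand without a second
adc chain: with M = 4, `lea (%r11,%r8,M)` computes r8*4 + r11 in one
instruction, folding each limb's 2-bit shift-out (obtained with `shr $62`) into
the next limb. A generic C model for any shift 0 < k < 64, using the same
typedef and __int128 assumption as above (ref_addlsh_n is a hypothetical
illustrative helper, not a GMP entry point):

/* rp[] = up[] + (vp[] << k), n limbs, 0 < k < 64; returns the carry out
   (shifted-out bits plus the addition carry, at most 2^k). */
static mp_limb_t ref_addlsh_n(mp_limb_t *rp, const mp_limb_t *up,
                              const mp_limb_t *vp, long n, unsigned k)
{
    mp_limb_t shift_in = 0, carry = 0;
    for (long i = 0; i < n; i++) {
        mp_limb_t s = (vp[i] << k) | shift_in;
        shift_in = vp[i] >> (64 - k);
        unsigned __int128 t = (unsigned __int128)up[i] + s + carry;
        rp[i] = (mp_limb_t)t;
        carry = (mp_limb_t)(t >> 64);
    }
    return carry + shift_in;
}

With k = 2 this is the operation implemented above; the lea/shr scheme trades
the second flag-juggling chain of aorrlsh1_n.asm for explicit shifts, at
5.75 c/l on Atom versus 4.875 for the shift-by-1 code.
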
diff --git a/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm
new file mode 100644
index 0000000..83b8df9
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm
@@ -0,0 +1,128 @@
+dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Atom.
+
+dnl Copyright 2011, 2017 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Marco Bodrato. Ported to 64-bit by
+dnl Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2
+C AMD K10 2
+C AMD bull 2.34\2.63
+C AMD pile 2.27\2.52
+C AMD steam
+C AMD excavator
+C AMD bobcat 2.79
+C AMD jaguar 2.78
+C Intel P4 11
+C Intel core2 7.5
+C Intel NHM 8.5
+C Intel SBR 2.11
+C Intel IBR 2.07
+C Intel HWL 1.75
+C Intel BWL 1.51
+C Intel SKL 1.52
+C Intel atom 3
+C Intel SLM 4
+C VIA nano
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func_n, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func_n, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ xor cy, cy C carry
+
+L(com): shr n C n >> 1
+ jz L(1) C n == 1
+ jc L(1m2) C n % 2 == 1
+
+L(0m2): shr cy
+ mov (up), %r10
+ lea 8(up), up
+ lea 8(vp), vp
+ lea -8(rp), rp
+ jmp L(mid)
+
+L(1): shr cy
+ mov (up), %r9
+ jmp L(end)
+
+L(1m2): shr cy
+ mov (up), %r9
+
+ ALIGN(16)
+L(top): ADCSBB (vp), %r9
+ lea 16(up), up
+ mov -8(up), %r10
+ lea 16(vp), vp
+ mov %r9, (rp)
+L(mid): ADCSBB -8(vp), %r10
+ lea 16(rp), rp
+ dec n
+ mov (up), %r9
+ mov %r10, -8(rp)
+ jnz L(top)
+
+L(end): ADCSBB (vp), %r9
+ mov $0, R32(%rax)
+ mov %r9, (rp)
+ adc R32(%rax), R32(%rax)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), cy ')
+ jmp L(com)
+EPILOGUE()
+ASM_END()
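
Both entry points funnel into L(com): func_n zeroes the carry register, func_nc
loads it from the fifth argument. `shr n` halves the loop count while its zero
and carry flags steer the n == 1 and odd-n lead-ins, and `shr cy` moves the
carry-in bit into CF so the 2-way-unrolled adc/sbb chain can consume it
directly. The underlying operation as a C model (same assumptions as the
sketches above):

/* rp[] = up[] + vp[] + cy, n limbs; returns the carry out (0 or 1).
   Models mpn_add_nc; mpn_add_n is the cy == 0 case, and the sub variants
   replace the addition with borrow-propagating subtraction. */
static mp_limb_t ref_add_nc(mp_limb_t *rp, const mp_limb_t *up,
                            const mp_limb_t *vp, long n, mp_limb_t cy)
{
    for (long i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128)up[i] + vp[i] + cy;
        rp[i] = (mp_limb_t)t;
        cy = (mp_limb_t)(t >> 64);
    }
    return cy;
}
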
diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm
new file mode 100644
index 0000000..7cbc085
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm
@@ -0,0 +1,194 @@
+dnl AMD64 mpn_addmul_1/mpn_submul_1 optimised for Intel Atom.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.5
+C AMD K10 4.5
+C AMD bull 4.73
+C AMD pile 4.60 4.80
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.48
+C AMD jaguar 5.61
+C Intel P4 16.6
+C Intel core2 5.09
+C Intel NHM 4.79
+C Intel SBR 3.88
+C Intel IBR 3.65
+C Intel HWL 3.53
+C Intel BWL 2.75
+C Intel SKL 2.76
+C Intel atom 19.4
+C Intel SLM 8
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%rbx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ mov (up), %rax
+ lea -8(up,n_param,8), up
+ lea -16(rp,n_param,8), rp
+
+ test $1, R8(n_param)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n_param)
+ jnz L(b10)
+
+L(b00): mov $1, R32(n)
+ sub n_param, n
+ mul v0
+ mov %rax, %r11
+ mov 8(up,n,8), %rax
+ mov %rdx, %r10
+ mul v0
+ mov %rax, %r8
+ mov 16(up,n,8), %rax
+ jmp L(lo0)
+
+L(b10): mov $3, R32(n)
+ sub n_param, n
+ mul v0
+ mov %rax, %r11
+ mov -8(up,n,8), %rax
+ mov %rdx, %r10
+ mul v0
+ test n, n
+ jns L(cj2)
+ mov %rax, %r8
+ mov (up,n,8), %rax
+ mov %rdx, %r9
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n_param)
+ jnz L(b11)
+
+L(b01): mov $2, R32(n)
+ sub n_param, n
+ mul v0
+ test n, n
+ jns L(cj1)
+ mov %rax, %r8
+ mov (up,n,8), %rax
+ mov %rdx, %r9
+ mul v0
+ mov %rax, %r11
+ mov 8(up,n,8), %rax
+ mov %rdx, %r10
+ jmp L(lo1)
+
+L(b11): xor R32(n), R32(n)
+ sub n_param, n
+ mul v0
+ mov %rax, %r8
+ mov 16(up,n,8), %rax
+ mov %rdx, %r9
+ mul v0
+ mov %rax, %r11
+ mov 24(up,n,8), %rax
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top): mul v0
+ ADDSUB %r8, -16(rp,n,8)
+ mov %rax, %r8
+ mov (up,n,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+L(lo2): mul v0
+ ADDSUB %r11, -8(rp,n,8)
+ mov %rax, %r11
+ mov 8(up,n,8), %rax
+ adc %r10, %r8
+ mov %rdx, %r10
+ adc $0, %r9
+L(lo1): mul v0
+ ADDSUB %r8, (rp,n,8)
+ mov %rax, %r8
+ adc %r9, %r11
+ mov 16(up,n,8), %rax
+ adc $0, %r10
+L(lo0): mov %rdx, %r9
+ mul v0
+ ADDSUB %r11, 8(rp,n,8)
+ mov %rax, %r11
+ adc %r10, %r8
+ mov 24(up,n,8), %rax
+ adc $0, %r9
+L(lo3): add $4, n
+ mov %rdx, %r10
+ js L(top)
+
+L(end): mul v0
+ ADDSUB %r8, -16(rp,n,8)
+ adc %r9, %r11
+ adc $0, %r10
+L(cj2): ADDSUB %r11, -8(rp,n,8)
+ adc %r10, %rax
+ adc $0, %rdx
+L(cj1): ADDSUB %rax, (rp,n,8)
+ mov $0, R32(%rax)
+ adc %rdx, %rax
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
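
The ADDSUB macro instantiates this file as either mpn_addmul_1
(rp[] += up[] * v0) or mpn_submul_1 (rp[] -= up[] * v0), each returning the
final carry/borrow limb. Note how n is biased negative up front so the loop can
count upward toward zero while addressing through (rp,n,8)/(up,n,8). The
subtractive case in portable C (the additive case was sketched under addmul_2;
same limb/__int128 assumptions):

/* rp[0..n-1] -= {up,n} * v; returns the borrow limb. */
static mp_limb_t ref_submul_1(mp_limb_t *rp, const mp_limb_t *up,
                              long n, mp_limb_t v)
{
    mp_limb_t borrow = 0;
    for (long i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128)up[i] * v + borrow;
        mp_limb_t plo = (mp_limb_t)p;
        mp_limb_t phi = (mp_limb_t)(p >> 64);
        mp_limb_t r = rp[i] - plo;
        phi += r > rp[i];        /* borrow out of the limb subtraction */
        rp[i] = r;
        borrow = phi;
    }
    return borrow;
}
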
diff --git a/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm b/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm
new file mode 100644
index 0000000..fcb9a0f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm
@@ -0,0 +1,38 @@
+dnl X86-64 mpn_cnd_add_n.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n)
+include_mpn(`x86_64/coreisbr/cnd_add_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm
new file mode 100644
index 0000000..9eee1c1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm
@@ -0,0 +1,38 @@
+dnl X86-64 mpn_cnd_sub_n.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_cnd_sub_n)
+include_mpn(`x86_64/coreisbr/cnd_sub_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/com.asm b/gmp-6.3.0/mpn/x86_64/atom/com.asm
new file mode 100644
index 0000000..6b6460f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/com.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_com optimised for Intel Atom.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/copyd.asm b/gmp-6.3.0/mpn/x86_64/atom/copyd.asm
new file mode 100644
index 0000000..e309279
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd optimised for Intel Atom.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/copyi.asm b/gmp-6.3.0/mpn/x86_64/atom/copyi.asm
new file mode 100644
index 0000000..00ec3c2
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi optimised for Intel Atom.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm b/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm
new file mode 100644
index 0000000..d9ba5fe
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_divexact_1)
+include_mpn(`x86_64/nano/dive_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h
new file mode 100644
index 0000000..2cd90f6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* Intel Atom/64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#define SHLD_SLOW 1
+#define SHRD_SLOW 1
+
+/* 1600 MHz Diamondville (Atom 330) */
+/* FFT tuning limit = 50,646,641 */
+/* Generated by tuneup.c, 2019-10-16, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 16
+
+#define DIV_1_VS_MUL_1_PERCENT 201
+
+#define MUL_TOOM22_THRESHOLD 12
+#define MUL_TOOM33_THRESHOLD 74
+#define MUL_TOOM44_THRESHOLD 106
+#define MUL_TOOM6H_THRESHOLD 155
+#define MUL_TOOM8H_THRESHOLD 212
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 58
+
+#define SQR_BASECASE_THRESHOLD 5
+#define SQR_TOOM2_THRESHOLD 22
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 130
+#define SQR_TOOM6_THRESHOLD 159
+#define SQR_TOOM8_THRESHOLD 236
+
+#define MULMID_TOOM42_THRESHOLD 16
+
+#define MULMOD_BNM1_THRESHOLD 9
+#define SQRMOD_BNM1_THRESHOLD 9
+
+#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 220, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \
+ { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \
+ { 17, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \
+ { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \
+ { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \
+ { 25,10}, { 7, 9}, { 15, 8}, { 33, 9}, \
+ { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \
+ { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \
+ { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \
+ { 39, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \
+ { 71, 9}, { 143, 8}, { 287,10}, { 79,11}, \
+ { 47,10}, { 95, 9}, { 191,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \
+ { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \
+ { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \
+ { 191, 9}, { 383,10}, { 207,11}, { 111,10}, \
+ { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319,11}, { 175,10}, { 351,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415,11}, { 223,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 287,10}, { 575,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 351,12}, \
+ { 191,11}, { 383,10}, { 767,12}, { 223,11}, \
+ { 447,13}, { 127,12}, { 255,11}, { 511,12}, \
+ { 287,11}, { 575,12}, { 319,11}, { 639,12}, \
+ { 351,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 575,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 767,13}, \
+ { 447,14}, { 255,13}, { 511,12}, { 1023,13}, \
+ { 575,12}, { 1151,13}, { 703,14}, { 383,13}, \
+ { 831,12}, { 1663,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \
+ { 1407,12}, { 2815,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1407,13}, { 2815,15}, { 767,14}, { 1791,16}, \
+ { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,15}, { 1535,14}, { 16384,15}, \
+ { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \
+ { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+ {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 169
+#define MUL_FFT_THRESHOLD 2240
+
+#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 184, 5}, { 11, 6}, { 13, 7}, { 7, 6}, \
+ { 15, 7}, { 8, 6}, { 17, 7}, { 13, 8}, \
+ { 7, 7}, { 17, 8}, { 9, 7}, { 19, 8}, \
+ { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \
+ { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \
+ { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \
+ { 23,10}, { 15, 9}, { 39,10}, { 23, 9}, \
+ { 47,11}, { 15,10}, { 31, 9}, { 63, 8}, \
+ { 127, 7}, { 255,10}, { 39, 8}, { 159,10}, \
+ { 47, 9}, { 95, 8}, { 191,11}, { 31,10}, \
+ { 63, 9}, { 127, 8}, { 255, 7}, { 511,10}, \
+ { 71, 9}, { 143, 8}, { 287, 7}, { 575, 9}, \
+ { 159, 8}, { 319,11}, { 47,10}, { 95, 9}, \
+ { 191, 8}, { 383,12}, { 31,11}, { 63,10}, \
+ { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \
+ { 287, 8}, { 575,10}, { 159, 9}, { 319, 8}, \
+ { 639,10}, { 175, 9}, { 351,11}, { 95,10}, \
+ { 191, 9}, { 383,11}, { 111,10}, { 223, 9}, \
+ { 447,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \
+ { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \
+ { 351,12}, { 95,11}, { 191,10}, { 383, 9}, \
+ { 767,11}, { 223,10}, { 447,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 287,10}, \
+ { 575,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 351,12}, { 191,11}, { 383,10}, { 767,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 511,12}, { 287,11}, { 575,12}, { 319,11}, \
+ { 639,12}, { 351,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 447,14}, { 127,13}, { 255,12}, \
+ { 575,13}, { 319,12}, { 703,13}, { 383,12}, \
+ { 767,13}, { 447,14}, { 255,13}, { 511,12}, \
+ { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \
+ { 383,13}, { 831,12}, { 1663,15}, { 255,14}, \
+ { 511,13}, { 1151,14}, { 639,13}, { 1407,12}, \
+ { 2815,14}, { 767,13}, { 1663,14}, { 895,13}, \
+ { 1791,15}, { 511,14}, { 1023,13}, { 2047,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1407,13}, \
+ { 2815,15}, { 767,14}, { 1791,16}, { 511,15}, \
+ { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,15}, { 1535,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 172
+#define SQR_FFT_THRESHOLD 1728
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 33
+#define MULLO_MUL_N_THRESHOLD 4392
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 85
+#define SQRLO_SQR_THRESHOLD 3176
+
+#define DC_DIV_QR_THRESHOLD 34
+#define DC_DIVAPPR_Q_THRESHOLD 119
+#define DC_BDIV_QR_THRESHOLD 31
+#define DC_BDIV_Q_THRESHOLD 76
+
+#define INV_MULMOD_BNM1_THRESHOLD 22
+#define INV_NEWTON_THRESHOLD 149
+#define INV_APPR_THRESHOLD 123
+
+#define BINV_NEWTON_THRESHOLD 179
+#define REDC_1_TO_REDC_2_THRESHOLD 24
+#define REDC_2_TO_REDC_N_THRESHOLD 39
+
+#define MU_DIV_QR_THRESHOLD 807
+#define MU_DIVAPPR_Q_THRESHOLD 807
+#define MUPI_DIV_QR_THRESHOLD 77
+#define MU_BDIV_QR_THRESHOLD 748
+#define MU_BDIV_Q_THRESHOLD 807
+
+#define POWM_SEC_TABLE 1,22,114,326,1486
+
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 30
+#define SET_STR_DC_THRESHOLD 381
+#define SET_STR_PRECOMPUTE_THRESHOLD 1565
+
+#define FAC_DSC_THRESHOLD 960
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD2_DIV1_METHOD 3 /* 5.86% faster than 4 */
+#define HGCD_THRESHOLD 88
+#define HGCD_APPR_THRESHOLD 88
+#define HGCD_REDUCE_THRESHOLD 1182
+#define GCD_DC_THRESHOLD 241
+#define GCDEXT_DC_THRESHOLD 192
+#define JACOBI_BASE_METHOD 3 /* 9.43% faster than 2 */
+
+/* Tuneup completed successfully, took 193098 seconds */
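
These constants are not used by the assembly files at all; they tell GMP's
generic C layer where, on this particular microarchitecture, each algorithm
overtakes the previous one (limb-count crossovers for Toom/FFT multiplication,
division, GCD, radix conversion, and so on). The dispatch pattern they feed
looks roughly like the following sketch -- simplified, with hypothetical helper
names standing in for GMP's internal routines, and using the thresholds tuned
above:

/* Hypothetical helpers; real GMP uses mpn_mul_basecase, mpn_toom22_mul,
   mpn_mul_fft, etc., and more tiers than shown. */
void mul_basecase(mp_limb_t *, const mp_limb_t *, const mp_limb_t *, long);
void mul_toom22  (mp_limb_t *, const mp_limb_t *, const mp_limb_t *, long);
void mul_toom_up (mp_limb_t *, const mp_limb_t *, const mp_limb_t *, long);
void mul_fft     (mp_limb_t *, const mp_limb_t *, const mp_limb_t *, long);

void mul_n_dispatch(mp_limb_t *rp, const mp_limb_t *ap,
                    const mp_limb_t *bp, long n)
{
    if (n < MUL_TOOM22_THRESHOLD)        /* < 12 limbs on this Atom */
        mul_basecase(rp, ap, bp, n);     /* schoolbook O(n^2) */
    else if (n < MUL_TOOM33_THRESHOLD)   /* < 74 */
        mul_toom22(rp, ap, bp, n);       /* Karatsuba */
    else if (n < MUL_FFT_THRESHOLD)      /* < 2240 */
        mul_toom_up(rp, ap, bp, n);      /* Toom-3/4/6.5/8.5 tiers */
    else
        mul_fft(rp, ap, bp, n);          /* FFT multiplication */
}
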
diff --git a/gmp-6.3.0/mpn/x86_64/atom/lshift.asm b/gmp-6.3.0/mpn/x86_64/atom/lshift.asm
new file mode 100644
index 0000000..1b37d5d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/lshift.asm
@@ -0,0 +1,123 @@
+dnl AMD64 mpn_lshift -- mpn left shift, optimised for Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 4.5
+C VIA nano ?
+
+C TODO
+C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times
+C larger.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshift)
+ FUNC_ENTRY(4)
+ lea -8(up,n,8), up
+ lea -8(rp,n,8), rp
+ shr R32(n)
+ mov (up), %rax
+ jnc L(evn)
+
+ mov %rax, %r11
+ shl R8(%rcx), %r11
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ test n, n
+ jnz L(gt1)
+ mov %r11, (rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): mov -8(up), %r8
+ mov %r8, %r10
+ shr R8(%rcx), %r8
+ jmp L(lo1)
+
+L(evn): mov %rax, %r10
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ mov -8(up), %r9
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ neg R8(%rcx)
+ dec n
+ lea 8(rp), rp
+ lea -8(up), up
+ jz L(end)
+
+ ALIGN(8)
+L(top): shl R8(%rcx), %r10
+ or %r10, %r9
+ shl R8(%rcx), %r11
+ neg R8(%rcx)
+ mov -8(up), %r8
+ mov %r8, %r10
+ mov %r9, -8(rp)
+ shr R8(%rcx), %r8
+ lea -16(rp), rp
+L(lo1): mov -16(up), %r9
+ or %r11, %r8
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ lea -16(up), up
+ neg R8(%rcx)
+ mov %r8, (rp)
+ dec n
+ jg L(top)
+
+L(end): shl R8(%rcx), %r10
+ or %r10, %r9
+ shl R8(%rcx), %r11
+ mov %r9, -8(rp)
+ mov %r11, -16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
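
mpn_lshift shifts {up,n} left by cnt bits (1 <= cnt <= 63) and returns the bits
shifted out of the top limb. The code walks from the most significant limb
downward (note the initial `lea -8(up,n,8), up`), which permits in-place use
with rp equal to up, and the repeated `neg R8(%rcx)` toggles the count register
between cnt and 64-cnt (shift counts are taken mod 64) so one register serves
both shift directions. A C model under the same assumptions as the earlier
sketches:

/* rp[] = up[] << cnt, n >= 1 limbs, 0 < cnt < 64; returns the shifted-out
   high bits.  High-to-low order, like the assembly. */
static mp_limb_t ref_lshift(mp_limb_t *rp, const mp_limb_t *up,
                            long n, unsigned cnt)
{
    unsigned tnc = 64 - cnt;
    mp_limb_t retval = up[n - 1] >> tnc;
    for (long i = n - 1; i > 0; i--)
        rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
    rp[0] = up[0] << cnt;
    return retval;
}
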
diff --git a/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm
new file mode 100644
index 0000000..7385f8f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm
@@ -0,0 +1,127 @@
+dnl AMD64 mpn_lshiftc -- mpn left shift with complement, optimised for Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5
+C VIA nano ?
+
+C TODO
+C * Consider using 4-way unrolling. We reach 4.5 c/l, but the code is 2.5
+C times larger.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+ FUNC_ENTRY(4)
+ lea -8(up,n,8), up
+ lea -8(rp,n,8), rp
+ shr R32(n)
+ mov (up), %rax
+ jnc L(evn)
+
+ mov %rax, %r11
+ shl R8(%rcx), %r11
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ test n, n
+ jnz L(gt1)
+ not %r11
+ mov %r11, (rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): mov -8(up), %r8
+ mov %r8, %r10
+ shr R8(%rcx), %r8
+ jmp L(lo1)
+
+L(evn): mov %rax, %r10
+ neg R8(%rcx)
+ shr R8(%rcx), %rax
+ mov -8(up), %r9
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ neg R8(%rcx)
+ lea 8(rp), rp
+ lea -8(up), up
+ jmp L(lo0)
+
+C ALIGN(16)
+L(top): shl R8(%rcx), %r10
+ or %r10, %r9
+ shl R8(%rcx), %r11
+ not %r9
+ neg R8(%rcx)
+ mov -8(up), %r8
+ lea -16(rp), rp
+ mov %r8, %r10
+ shr R8(%rcx), %r8
+ mov %r9, 8(rp)
+L(lo1): or %r11, %r8
+ mov -16(up), %r9
+ mov %r9, %r11
+ shr R8(%rcx), %r9
+ lea -16(up), up
+ neg R8(%rcx)
+ not %r8
+ mov %r8, (rp)
+L(lo0): dec n
+ jg L(top)
+
+L(end): shl R8(%rcx), %r10
+ or %r10, %r9
+ not %r9
+ shl R8(%rcx), %r11
+ not %r11
+ mov %r9, -8(rp)
+ mov %r11, -16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
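
mpn_lshiftc is mpn_lshift with every stored limb one's-complemented -- the extra
`not` instructions account for the roughly 0.5 c/l difference on Atom -- while
the return value, the bits shifted out of the top limb, is left uncomplemented.
Model (same assumptions as above):

/* rp[] = ~(up[] << cnt) limb-wise, 0 < cnt < 64; returns the
   (uncomplemented) bits shifted out of the top limb. */
static mp_limb_t ref_lshiftc(mp_limb_t *rp, const mp_limb_t *up,
                             long n, unsigned cnt)
{
    unsigned tnc = 64 - cnt;
    mp_limb_t retval = up[n - 1] >> tnc;
    for (long i = n - 1; i > 0; i--)
        rp[i] = ~((up[i] << cnt) | (up[i - 1] >> tnc));
    rp[0] = ~(up[0] << cnt);
    return retval;
}
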
diff --git a/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm b/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm
new file mode 100644
index 0000000..a0dcf1e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm
@@ -0,0 +1,147 @@
+dnl AMD64 mpn_mul_1 optimised for Intel Atom.
+
+dnl Copyright 2003-2005, 2007, 2008, 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 3.03
+C AMD K10 3.03
+C AMD bull 4.74
+C AMD pile 4.56
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.56 6.04
+C AMD jaguar 5.55 5.84
+C Intel P4 13.05
+C Intel core2 4.03
+C Intel NHM 3.80
+C Intel SBR 2.75
+C Intel IBR 2.69
+C Intel HWL 2.50
+C Intel BWL 2.55
+C Intel SKL 2.57
+C Intel atom 17.3
+C Intel SLM 14.7
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+ FUNC_ENTRY(4)
+ xor %r8, %r8
+L(com): mov (up), %rax
+ lea -16(up,n_param,8), up
+ lea -8(rp,n_param,8), rp
+ test $1, R8(n_param)
+ jnz L(bx1)
+
+L(bx0): mov %r8, %r9
+ test $2, R8(n_param)
+ jnz L(b10)
+
+L(b00): mov $2, R32(n)
+ sub n_param, n
+ jmp L(lo0)
+
+L(bx1): test $2, R8(n_param)
+ jnz L(b11)
+
+L(b01): mov $3, R32(n)
+ sub n_param, n
+ mul v0
+ cmp $2, n
+ jnz L(lo1)
+ jmp L(cj1)
+
+L(b11): mov $1, R32(n)
+ sub n_param, n
+ jmp L(lo3)
+
+L(b10): xor R32(n), R32(n)
+ sub n_param, n
+ jmp L(lo2)
+
+L(top): mul v0
+ mov %r9, -24(rp,n,8)
+L(lo1): xor %r9d, %r9d
+ add %rax, %r8
+ mov (up,n,8), %rax
+ adc %rdx, %r9
+ mov %r8, -16(rp,n,8)
+L(lo0): xor %r8d, %r8d
+ mul v0
+ add %rax, %r9
+ mov 8(up,n,8), %rax
+ adc %rdx, %r8
+ mov %r9, -8(rp,n,8)
+L(lo3): xor %r9d, %r9d
+ mul v0
+ add %rax, %r8
+ mov 16(up,n,8), %rax
+ adc %rdx, %r9
+ mov %r8, (rp,n,8)
+L(lo2): xor %r8d, %r8d
+ mul v0
+ add %rax, %r9
+ mov 24(up,n,8), %rax
+ adc %rdx, %r8
+ add $4, n
+ js L(top)
+
+L(end): mul v0
+ mov %r9, -8(rp)
+L(cj1): add %rax, %r8
+ mov $0, R32(%rax)
+ adc %rdx, %rax
+ mov %r8, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1c)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ jmp L(com)
+EPILOGUE()
+ASM_END()
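
mpn_mul_1 writes {up,n} * v0 to rp and returns the high limb; mpn_mul_1c enters
the shared L(com) path with a caller-supplied carry in %r8 at the point where
mpn_mul_1 has just zeroed it, so one loop serves both entry points. A combined
C model (same limb/__int128 assumptions as the earlier sketches):

/* rp[] = up[] * v + cy, n limbs; returns the high limb.  cy == 0 gives
   mpn_mul_1, a nonzero cy gives mpn_mul_1c. */
static mp_limb_t ref_mul_1c(mp_limb_t *rp, const mp_limb_t *up,
                            long n, mp_limb_t v, mp_limb_t cy)
{
    for (long i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128)up[i] * v + cy;
        rp[i] = (mp_limb_t)t;
        cy = (mp_limb_t)(t >> 64);
    }
    return cy;
}
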
diff --git a/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm b/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm
new file mode 100644
index 0000000..4bc22cd
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm
@@ -0,0 +1,190 @@
+dnl AMD64 mpn_mul_2 optimised for Intel Atom.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C AMD K8,K9 5.78
+C AMD K10 5.78
+C AMD bull 9.10
+C AMD pile 9.17
+C AMD steam
+C AMD excavator
+C AMD bobcat 11.3
+C AMD jaguar 10.9
+C Intel P4 24.6
+C Intel core2 8.06
+C Intel NHM 7.65
+C Intel SBR 6.28
+C Intel IBR 6.10
+C Intel HWL 6.09
+C Intel BWL 4.73
+C Intel SKL 4.77
+C Intel atom 35.3
+C Intel SLM 25.6
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %rax
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov n_param, n
+ mul v0
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): mov %rax, w0
+ mov (up), %rax
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ lea -8(rp), rp
+ jmp L(lo0)
+
+L(b10): mov %rax, w2
+ mov (up), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ lea -16(up), up
+ lea -24(rp), rp
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): mov %rax, w3
+ mov %rdx, w0
+ mov (up), %rax
+ xor R32(w1), R32(w1)
+ lea 8(up), up
+ dec n
+ jmp L(lo1)
+
+L(b11): mov %rax, w1
+ mov (up), %rax
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ lea -8(up), up
+ lea -16(rp), rp
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top):
+L(lo1): mul v1
+ add %rax, w0
+ mov (up), %rax
+ mov $0, R32(w2)
+ mov w3, (rp)
+ adc %rdx, w1
+ mul v0
+ add %rax, w0
+ mov (up), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(lo0): mul v1
+ add %rax, w1
+ mov 8(up), %rax
+ mov w0, 8(rp)
+ adc %rdx, w2
+ mul v0
+ add %rax, w1
+ mov 8(up), %rax
+ adc %rdx, w2
+ mov $0, R32(w3)
+ adc $0, R32(w3)
+L(lo3): mul v1
+ add %rax, w2
+ mov 16(up), %rax
+ mov w1, 16(rp)
+ mov $0, R32(w0)
+ adc %rdx, w3
+ mul v0
+ add %rax, w2
+ mov 16(up), %rax
+ adc %rdx, w3
+L(lo2): mov $0, R32(w1)
+ mov w2, 24(rp)
+ adc $0, R32(w0)
+ mul v1
+ add %rax, w3
+ mov 24(up), %rax
+ lea 32(up), up
+ adc %rdx, w0
+ mul v0
+ lea 32(rp), rp
+ add %rax, w3
+ adc %rdx, w0
+ mov -8(up), %rax
+ adc $0, R32(w1)
+ sub $4, n
+ ja L(top)
+
+L(end): mul v1
+ mov w3, (rp)
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, 8(rp)
+ mov w1, %rax
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
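
mpn_mul_2 stores the low n+1 limbs of {up,n} * {vp,2} at rp and returns the top
limb. Unlike addmul_2 it overwrites rp instead of accumulating into it: compare
the plain `mov w3, (rp)` stores here with addmul_2's `add w3, (rp)`. In terms of
the reference helpers sketched earlier (ref_mul_1c and ref_addmul_1 above;
illustrative only):

/* rp[0..n] = low n+1 limbs of {up,n} * {vp,2}; returns the top limb. */
static mp_limb_t ref_mul_2(mp_limb_t *rp, const mp_limb_t *up,
                           long n, const mp_limb_t *vp)
{
    rp[n] = ref_mul_1c(rp, up, n, vp[0], 0);   /* v0 row fills rp[0..n]  */
    return ref_addmul_1(rp + 1, up, n, vp[1]); /* v1 row, one limb up    */
}
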
diff --git a/gmp-6.3.0/mpn/x86_64/atom/popcount.asm b/gmp-6.3.0/mpn/x86_64/atom/popcount.asm
new file mode 100644
index 0000000..fb14dd3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/popcount.asm
@@ -0,0 +1,35 @@
+dnl x86-64 mpn_popcount.
+
+dnl Copyright 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm b/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm
new file mode 100644
index 0000000..62b9a84
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm
@@ -0,0 +1,579 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Atom.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat 5.0
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single basecases out before the pushes.
+C * Make lead-in code for the inner loops be more similar.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+define(`w0', `%rbp')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea (up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 1(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, %rbp
+ mov 8(mp,n,8), %rax
+ mov %rdx, %r9
+ mul q0
+ mov %rax, %rbx
+ mov 16(mp,n,8), %rax
+ mov %rdx, %r10
+ mul q0
+ add (up,n,8), %rbp
+ mov %rax, %rbp
+ adc %r9, %rbx
+ mov 24(mp,n,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 32(mp,n,8), %rax
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): mul q0
+ add %rbp, -24(up,i,8)
+ mov %rax, %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add %r11, -16(up,i,8)
+ mov %rax, %r11
+ mov 8(mp,i,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+ mul q0
+ add %rbp, -8(up,i,8)
+ mov %rax, %rbp
+ adc %r9, %r11
+ mov 16(mp,i,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add %r11, (up,i,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 24(mp,i,8), %rax
+ adc $0, %r9
+L(e1): add $4, i
+ mov %rdx, %r10
+ js L(tp1)
+
+L(ed1): mul q0
+ add %rbp, I(-24(up),-24(up,i,8))
+ adc %r9, %r11
+ adc $0, %r10
+ add %r11, I(-16(up),-16(up,i,8))
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, I(-8(up),-8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 3(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, %rbp
+ mov 8(mp,n,8), %rax
+ mov %rdx, %r9
+ mul q0
+ mov %rax, %rbx
+ mov 16(mp,n,8), %rax
+ mov %rdx, %r10
+ mul q0
+ add (up,n,8), %rbp
+ mov %rax, %rbp
+ mov 24(mp,n,8), %rax
+ adc %r9, %rbx
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ mov %rax, %r11
+ mov 32(mp,n,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ jmp L(e3)
+
+ ALIGNx
+L(tp3): mul q0
+ add %rbp, -24(up,i,8)
+ mov %rax, %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add %r11, -16(up,i,8)
+ mov %rax, %r11
+ mov 8(mp,i,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+L(e3): mul q0
+ add %rbp, -8(up,i,8)
+ mov %rax, %rbp
+ adc %r9, %r11
+ mov 16(mp,i,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add %r11, (up,i,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 24(mp,i,8), %rax
+ adc $0, %r9
+ add $4, i
+ mov %rdx, %r10
+ js L(tp3)
+
+L(ed3): mul q0
+ add %rbp, I(-24(up),-24(up,i,8))
+ adc %r9, %r11
+ adc $0, %r10
+ add %r11, I(-16(up),-16(up,i,8))
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, I(-8(up),-8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea (up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+IFSTD(` sub $8, %rsp ')
+IFDOS(` sub $40, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0): cmp $-4, R32(n)
+ jz L(n4)
+
+L(otp0):lea 4(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, %r11
+ mov 8(mp,n,8), %rax
+ mov %rdx, %r10
+ mul q0
+ mov %rax, %rbx
+ mov 16(mp,n,8), %rax
+ mov %rdx, %r9
+ mul q0
+ add (up,n,8), %r11
+ mov %rax, %r11
+ adc %r10, %rbx
+ mov 24(mp,n,8), %rax
+ adc $0, %r9
+ mov %rdx, %r10
+ mul q0
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ mov %rax, %rbp
+ mov 32(mp,n,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ imul u0inv, %rbx C next q limb
+ jmp L(e0)
+
+ ALIGNx
+L(tp0): mul q0
+ add %rbp, -24(up,i,8)
+ mov %rax, %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+L(e0): mul q0
+ add %r11, -16(up,i,8)
+ mov %rax, %r11
+ mov 8(mp,i,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+ mul q0
+ add %rbp, -8(up,i,8)
+ mov %rax, %rbp
+ adc %r9, %r11
+ mov 16(mp,i,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+ mul q0
+ add %r11, (up,i,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 24(mp,i,8), %rax
+ adc $0, %r9
+ add $4, i
+ mov %rdx, %r10
+ js L(tp0)
+
+L(ed0): mul q0
+ add %rbp, I(-24(up),-24(up,i,8))
+ adc %r9, %r11
+ adc $0, %r10
+ add %r11, I(-16(up),-16(up,i,8))
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, I(-8(up),-8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp0)
+ jmp L(cj)
+
+L(b2): cmp $-2, R32(n)
+ jz L(n2)
+
+L(otp2):lea 2(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, %r11
+ mov 8(mp,n,8), %rax
+ mov %rdx, %r10
+ mul q0
+ mov %rax, %rbx
+ mov 16(mp,n,8), %rax
+ mov %rdx, %r9
+ mul q0
+ add (up,n,8), %r11
+ mov %rax, %r11
+ adc %r10, %rbx
+ mov 24(mp,n,8), %rax
+ adc $0, %r9
+ mov %rdx, %r10
+ mul q0
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ mov %rax, %rbp
+ mov 32(mp,n,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ imul u0inv, %rbx C next q limb
+ jmp L(e2)
+
+ ALIGNx
+L(tp2): mul q0
+ add %rbp, -24(up,i,8)
+ mov %rax, %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %rdx, %r9
+ adc $0, %r10
+ mul q0
+ add %r11, -16(up,i,8)
+ mov %rax, %r11
+ mov 8(mp,i,8), %rax
+ adc %r10, %rbp
+ mov %rdx, %r10
+ adc $0, %r9
+ mul q0
+ add %rbp, -8(up,i,8)
+ mov %rax, %rbp
+ adc %r9, %r11
+ mov 16(mp,i,8), %rax
+ adc $0, %r10
+ mov %rdx, %r9
+L(e2): mul q0
+ add %r11, (up,i,8)
+ mov %rax, %r11
+ adc %r10, %rbp
+ mov 24(mp,i,8), %rax
+ adc $0, %r9
+ add $4, i
+ mov %rdx, %r10
+ js L(tp2)
+
+L(ed2): mul q0
+ add %rbp, I(-24(up),-24(up,i,8))
+ adc %r9, %r11
+ adc $0, %r10
+ add %r11, I(-16(up),-16(up,i,8))
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, I(-8(up),-8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp2)
+ jmp L(cj)
+
+L(n1): mov (mp_param), %rax
+ mul q0
+ add -8(up), %rax
+ adc (up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n2): mov (mp_param), %rax
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov -8(up), %r10
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, q0
+ imul u0inv, q0 C next q0
+ mov -16(mp), %rax
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov (up), %r14
+ mul q0
+ add %rax, %r14
+ adc $0, %rdx
+ add %r9, %r14
+ adc $0, %rdx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc 8(up), %rdx
+ mov %r14, (rp)
+ mov %rdx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n3): mov -24(mp), %rax
+ mov -24(up), %r10
+ mul q0
+ add %rax, %r10
+ mov -16(mp), %rax
+ mov %rdx, %r11
+ adc $0, %r11
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ add %r11, %rbp
+ mov -8(up), %r10
+ adc $0, %r9
+ mul q0
+ mov %rbp, q0
+ imul u0inv, q0 C next q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ mov %rbp, -16(up)
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, -8(up)
+ mov %r11, -24(up) C up[0]
+ lea 8(up), up C up++
+ dec j
+ jnz L(n3)
+
+ mov -48(up), %rdx
+ mov -40(up), %rbx
+ xor R32(%rax), R32(%rax)
+ add %rbp, %rdx
+ adc %r10, %rbx
+ adc -8(up), %r11
+ mov %rdx, (rp)
+ mov %rbx, 8(rp)
+ mov %r11, 16(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n4): mov -32(mp), %rax
+ mul q0
+ mov %rax, %r11
+ mov -24(mp), %rax
+ mov %rdx, %r10
+ mul q0
+ mov %rax, %rbx
+ mov -16(mp), %rax
+ mov %rdx, %r9
+ mul q0
+ add -32(up), %r11
+ mov %rax, %r11
+ adc %r10, %rbx
+ mov -8(mp), %rax
+ adc $0, %r9
+ mov %rdx, %r10
+ mul q0
+ add -24(up), %rbx
+ mov %rbx, -24(up)
+ adc %r9, %r11
+ adc $0, %r10
+ imul u0inv, %rbx C next q limb
+ add %r11, -16(up)
+ adc %r10, %rax
+ adc $0, %rdx
+ add %rax, -8(up)
+ adc $0, %rdx
+ mov %rdx, -32(up) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ dec j
+ lea 8(up), up C up++
+ jnz L(n4)
+ jmp L(cj)
+EPILOGUE()
+ASM_END()
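For readers unfamiliar with the routine: mpn_redc_1 performs limb-by-limb
Montgomery reduction. Given the 2n-limb value {up,2n}, the n-limb modulus
{mp,n} and u0inv = -1/mp[0] mod 2^64, it computes {up,2n} * 2^(-64n) mod
{mp,n} and returns a carry that the caller resolves with a conditional
subtraction. The asm above is an unrolled form of the logic in
mpn/generic/redc_1.c; a minimal sketch of that logic, under the illustrative
name redc_1_ref:

    /* Sketch of one-limb-at-a-time Montgomery reduction (cf.
       mpn/generic/redc_1.c). Each step cancels the low limb and saves the
       addmul carry in the slot just vacated, exactly as the asm stores %rdx
       at (up,n,8); the saved carries are folded in by the final mpn_add_n. */
    #include <gmp.h>

    mp_limb_t
    redc_1_ref (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t u0inv)
    {
      for (mp_size_t j = 0; j < n; j++)
        {
          mp_limb_t q = up[0] * u0inv;                 /* quotient limb, mod B */
          mp_limb_t cy = mpn_addmul_1 (up, mp, n, q);  /* zeroes up[0] */
          up[0] = cy;                                  /* stash the carry limb */
          up++;
        }
      return mpn_add_n (rp, up, up - n, n);   /* high half + saved carries */
    }
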
diff --git a/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm
new file mode 100644
index 0000000..6f5f638
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm
@@ -0,0 +1,287 @@
+dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Schedule the loop less aggressively. It is now almost surely
+C   overscheduled, resulting in large feed-in and wind-down code.
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5.25
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_rsh1add_n)
+ define(func_nc, mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsh1sub_n)
+ define(func_nc, mpn_rsh1sub_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), %r15
+ ADDSUB (vp), %r15
+ sbb R32(%rbx), R32(%rbx)
+ xor R32(%rax), R32(%rax)
+ shr %r15
+ adc R32(%rax), R32(%rax) C return value
+
+ mov R32(n), R32(%rbp)
+ and $3, R32(%rbp)
+ jz L(b0)
+ cmp $2, R32(%rbp)
+ jae L(b23)
+
+L(b1): dec n
+ jnz L(gt1)
+ shl $63, %rbx
+ add %rbx, %r15
+ mov %r15, (rp)
+ jmp L(cj1)
+L(gt1): lea 24(up), up
+ lea 24(vp), vp
+ mov -16(up), %r9
+ add R32(%rbx), R32(%rbx)
+ mov -8(up), %r10
+ lea 24(rp), rp
+ mov (up), %r11
+ ADCSBB -16(vp), %r9
+ ADCSBB -8(vp), %r10
+ mov %r15, %r12
+ ADCSBB (vp), %r11
+ mov %r9, %r13
+ sbb R32(%rbx), R32(%rbx)
+ mov %r11, %r15
+ mov %r10, %r14
+ shl $63, %r11
+ shl $63, %r10
+ shl $63, %r9
+ or %r9, %r12
+ shr %r13
+ mov 8(up), %r8
+ shr %r14
+ or %r10, %r13
+ shr %r15
+ or %r11, %r14
+ sub $4, n
+ jz L(cj5)
+L(gt5): mov 16(up), %r9
+ add R32(%rbx), R32(%rbx)
+ mov 24(up), %r10
+ ADCSBB 8(vp), %r8
+ mov %r15, %rbp
+ mov 32(up), %r11
+ jmp L(lo1)
+
+L(b23): jnz L(b3)
+ mov 8(up), %r8
+ sub $2, n
+ jnz L(gt2)
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(vp), %r8
+ mov %r8, %r12
+ jmp L(cj2)
+L(gt2): mov 16(up), %r9
+ add R32(%rbx), R32(%rbx)
+ mov 24(up), %r10
+ ADCSBB 8(vp), %r8
+ mov %r15, %rbp
+ mov 32(up), %r11
+ ADCSBB 16(vp), %r9
+ lea 32(up), up
+ ADCSBB 24(vp), %r10
+ mov %r9, %r13
+ ADCSBB 32(vp), %r11
+ mov %r8, %r12
+ jmp L(lo2)
+
+L(b3): lea 40(up), up
+ lea 8(vp), vp
+ mov %r15, %r14
+ add R32(%rbx), R32(%rbx)
+ mov -32(up), %r11
+ ADCSBB 0(vp), %r11
+ lea 8(rp), rp
+ sbb R32(%rbx), R32(%rbx)
+ mov %r11, %r15
+ shl $63, %r11
+ mov -24(up), %r8
+ shr %r15
+ or %r11, %r14
+ sub $3, n
+ jnz L(gt3)
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(vp), %r8
+ jmp L(cj3)
+L(gt3): mov -16(up), %r9
+ add R32(%rbx), R32(%rbx)
+ mov -8(up), %r10
+ ADCSBB 8(vp), %r8
+ mov %r15, %rbp
+ mov (up), %r11
+ ADCSBB 16(vp), %r9
+ ADCSBB 24(vp), %r10
+ mov %r8, %r12
+ jmp L(lo3)
+
+L(b0): lea 48(up), up
+ lea 16(vp), vp
+ add R32(%rbx), R32(%rbx)
+ mov -40(up), %r10
+ lea 16(rp), rp
+ mov -32(up), %r11
+ ADCSBB -8(vp), %r10
+ mov %r15, %r13
+ ADCSBB (vp), %r11
+ sbb R32(%rbx), R32(%rbx)
+ mov %r11, %r15
+ mov %r10, %r14
+ shl $63, %r11
+ shl $63, %r10
+ mov -24(up), %r8
+ shr %r14
+ or %r10, %r13
+ shr %r15
+ or %r11, %r14
+ sub $4, n
+ jnz L(gt4)
+ add R32(%rbx), R32(%rbx)
+ ADCSBB 8(vp), %r8
+ jmp L(cj4)
+L(gt4): mov -16(up), %r9
+ add R32(%rbx), R32(%rbx)
+ mov -8(up), %r10
+ ADCSBB 8(vp), %r8
+ mov %r15, %rbp
+ mov (up), %r11
+ ADCSBB 16(vp), %r9
+ jmp L(lo0)
+
+ ALIGN(8)
+L(top): mov 16(up), %r9
+ shr %r14
+ or %r10, %r13
+ shr %r15
+ or %r11, %r14
+ add R32(%rbx), R32(%rbx)
+ mov 24(up), %r10
+ mov %rbp, (rp)
+ ADCSBB 8(vp), %r8
+ mov %r15, %rbp
+ lea 32(rp), rp
+ mov 32(up), %r11
+L(lo1): ADCSBB 16(vp), %r9
+ lea 32(up), up
+ mov %r12, -24(rp)
+L(lo0): ADCSBB 24(vp), %r10
+ mov %r8, %r12
+ mov %r13, -16(rp)
+L(lo3): ADCSBB 32(vp), %r11
+ mov %r9, %r13
+ mov %r14, -8(rp)
+L(lo2): sbb R32(%rbx), R32(%rbx)
+ shl $63, %r8
+ mov %r11, %r15
+ shr %r12
+ mov %r10, %r14
+ shl $63, %r9
+ lea 32(vp), vp
+ shl $63, %r10
+ or %r8, %rbp
+ shl $63, %r11
+ or %r9, %r12
+ shr %r13
+ mov 8(up), %r8
+ sub $4, n
+ jg L(top)
+
+L(end): shr %r14
+ or %r10, %r13
+ shr %r15
+ or %r11, %r14
+ mov %rbp, (rp)
+ lea 32(rp), rp
+L(cj5): add R32(%rbx), R32(%rbx)
+ ADCSBB 8(vp), %r8
+ mov %r12, -24(rp)
+L(cj4): mov %r13, -16(rp)
+L(cj3): mov %r8, %r12
+ mov %r14, -8(rp)
+L(cj2): sbb R32(%rbx), R32(%rbx)
+ shl $63, %r8
+ shr %r12
+ or %r8, %r15
+ shl $63, %rbx
+ add %rbx, %r12
+ mov %r15, (rp)
+ mov %r12, 8(rp)
+L(cj1): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
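Expressed with stock mpn primitives, mpn_rsh1add_n computes
{rp,n} = ({up,n} + {vp,n}) >> 1 and returns the bit shifted out at the low
end; mpn_rsh1sub_n does the same with subtraction. A naive two-pass sketch
(rsh1add_n_ref is an illustrative name, not a GMP symbol; the asm above fuses
everything into one 4-way unrolled pass):

    /* Sketch only: add, then shift right one bit, reinserting the add carry
       as the new most significant bit. Assumes no nail bits. */
    #include <gmp.h>

    mp_limb_t
    rsh1add_n_ref (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
    {
      mp_limb_t cy = mpn_add_n (rp, up, vp, n);   /* carry = future msb */
      mp_limb_t lo = rp[0] & 1;                   /* bit shifted out below */
      mpn_rshift (rp, rp, n, 1);
      rp[n - 1] |= cy << (GMP_NUMB_BITS - 1);     /* carry becomes the msb */
      return lo;
    }
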
diff --git a/gmp-6.3.0/mpn/x86_64/atom/rshift.asm b/gmp-6.3.0/mpn/x86_64/atom/rshift.asm
new file mode 100644
index 0000000..29c027d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/rshift.asm
@@ -0,0 +1,121 @@
+dnl AMD64 mpn_rshift -- mpn right shift, optimised for Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 4.5
+C VIA nano ?
+
+C TODO
+C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times
+C larger.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_rshift)
+ FUNC_ENTRY(4)
+ shr R32(n)
+ mov (up), %rax
+ jnc L(evn)
+
+ mov %rax, %r11
+ shr R8(cnt), %r11
+ neg R8(cnt)
+ shl R8(cnt), %rax
+ test n, n
+ jnz L(gt1)
+ mov %r11, (rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): mov 8(up), %r8
+ mov %r8, %r10
+ shl R8(cnt), %r8
+ jmp L(lo1)
+
+L(evn): mov %rax, %r10
+ neg R8(cnt)
+ shl R8(cnt), %rax
+ mov 8(up), %r9
+ mov %r9, %r11
+ shl R8(cnt), %r9
+ neg R8(cnt)
+ dec n
+ lea -8(rp), rp
+ lea 8(up), up
+ jz L(end)
+
+ ALIGN(8)
+L(top): shr R8(cnt), %r10
+ or %r10, %r9
+ shr R8(cnt), %r11
+ neg R8(cnt)
+ mov 8(up), %r8
+ mov %r8, %r10
+ mov %r9, 8(rp)
+ shl R8(cnt), %r8
+ lea 16(rp), rp
+L(lo1): mov 16(up), %r9
+ or %r11, %r8
+ mov %r9, %r11
+ shl R8(cnt), %r9
+ lea 16(up), up
+ neg R8(cnt)
+ mov %r8, (rp)
+ dec n
+ jg L(top)
+
+L(end): shr R8(cnt), %r10
+ or %r10, %r9
+ shr R8(cnt), %r11
+ mov %r9, 8(rp)
+ mov %r11, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
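Functionally, mpn_rshift shifts {up,n} right by cnt bits (1 <= cnt < 64
here), stores the n-limb result at rp, and returns the shifted-out bits
left-justified in a limb, which is what the %rax setup in the prologue above
produces. A plain C sketch under the illustrative name rshift_ref:

    /* Sketch: each result limb combines the high bits of up[i] with the low
       bits of up[i+1]. Assumes 1 <= cnt < GMP_NUMB_BITS and no nail bits. */
    #include <gmp.h>

    mp_limb_t
    rshift_ref (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
    {
      mp_limb_t out = up[0] << (GMP_NUMB_BITS - cnt);   /* bits shifted out */
      for (mp_size_t i = 0; i < n - 1; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (GMP_NUMB_BITS - cnt));
      rp[n - 1] = up[n - 1] >> cnt;
      return out;
    }
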
diff --git a/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm
new file mode 100644
index 0000000..1306acd
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm
@@ -0,0 +1,242 @@
+dnl AMD64 mpn_sublsh1_n optimised for Intel Atom.
+dnl Used also for AMD bd1.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * This code is slightly large at 501 bytes.
+C * aorrlsh1_n.asm and this file use the same basic pattern.
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 2.3
+C AMD bobcat ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5 (4.875 is probably possible)
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sublsh1_n)
+ FUNC_ENTRY(4)
+ push %rbp
+ push %r15
+ xor R32(%rbp), R32(%rbp)
+L(ent): mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jz L(b0)
+ cmp $2, R32(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mov (vp), %r8
+ add %r8, %r8
+ lea 8(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 8(up), up
+ lea 8(rp), rp
+ jmp L(b0)
+
+L(b2): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ lea 16(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ mov 8(up), %r15
+ sbb %r9, %r15
+ mov %r15, 8(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 16(up), up
+ lea 16(rp), rp
+ jmp L(b0)
+
+L(b3): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ lea 24(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ mov 8(up), %r15
+ sbb %r9, %r15
+ mov %r15, 8(rp)
+ mov 16(up), %r15
+ sbb %r10, %r15
+ mov %r15, 16(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 24(up), up
+ lea 24(rp), rp
+
+L(b0): test $4, R8(n)
+ jz L(skp)
+ add R32(%rax), R32(%rax) C restore scy
+ mov (vp), %r8
+ adc %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ mov 24(vp), %r11
+ adc %r11, %r11
+ lea 32(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ mov (up), %r15
+ sbb %r8, %r15
+ mov %r15, (rp)
+ mov 8(up), %r15
+ sbb %r9, %r15
+ mov %r15, 8(rp)
+ mov 16(up), %r15
+ sbb %r10, %r15
+ mov %r15, 16(rp)
+ mov 24(up), %r15
+ sbb %r11, %r15
+ mov %r15, 24(rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ sbb R32(%rbp), R32(%rbp) C save acy
+
+L(skp): cmp $8, n
+ jl L(rtn)
+
+ push %r12
+ push %r13
+ push %r14
+ push %rbx
+ lea -64(rp), rp
+ jmp L(x)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+ add R32(%rax), R32(%rax)
+ lea 64(vp), vp
+ adc %r8, %r8
+ mov -56(vp), %r9
+ adc %r9, %r9
+ mov -48(vp), %r10
+ adc %r10, %r10
+ mov -40(vp), %r11
+ adc %r11, %r11
+ mov -32(vp), %r12
+ adc %r12, %r12
+ mov -24(vp), %r13
+ adc %r13, %r13
+ mov -16(vp), %r14
+ adc %r14, %r14
+ mov -8(vp), %r15
+ adc %r15, %r15
+ sbb R32(%rax), R32(%rax)
+ add R32(%rbp), R32(%rbp)
+ mov (up), %rbp
+ lea 64(rp), rp
+ mov 8(up), %rbx
+ sbb %r8, %rbp
+ mov 32(up), %r8
+ mov %rbp, (rp)
+ sbb %r9, %rbx
+ mov 16(up), %rbp
+ mov %rbx, 8(rp)
+ sbb %r10, %rbp
+ mov 24(up), %rbx
+ mov %rbp, 16(rp)
+ sbb %r11, %rbx
+ mov %rbx, 24(rp)
+ sbb %r12, %r8
+ mov 40(up), %r9
+ mov %r8, 32(rp)
+ sbb %r13, %r9
+ mov 48(up), %rbp
+ mov %r9, 40(rp)
+ sbb %r14, %rbp
+ mov 56(up), %rbx
+ mov %rbp, 48(rp)
+ sbb %r15, %rbx
+ lea 64(up), up
+ mov %rbx, 56(rp)
+ sbb R32(%rbp), R32(%rbp)
+L(x): sub $8, n
+ jge L(top)
+
+L(end): pop %rbx
+ pop %r14
+ pop %r13
+ pop %r12
+L(rtn):
+ add R32(%rbp), R32(%rax)
+ neg R32(%rax)
+
+ pop %r15
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+PROLOGUE(mpn_sublsh1_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbp
+ push %r15
+ neg %r8 C set CF
+ sbb R32(%rbp), R32(%rbp) C save acy
+ jmp L(ent)
+EPILOGUE()
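For reference, mpn_sublsh1_n computes {rp,n} = {up,n} - 2*{vp,n} and returns
the combined borrow, which can be 0, 1 or 2; the add/neg pair at L(rtn) is
what merges the shift carry (%rax) and the subtraction borrow (%rbp) into
that return value. A naive two-pass sketch with the illustrative name
sublsh1_n_ref (the asm interleaves both passes and keeps the two carry chains
live throughout):

    /* Sketch only: shift first, subtract second. mpn_sub_n permits the
       destination to coincide with an operand, so rp doubles as scratch. */
    #include <gmp.h>

    mp_limb_t
    sublsh1_n_ref (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
    {
      mp_limb_t sh = mpn_lshift (rp, vp, n, 1);  /* rp = 2*vp, sh = bit out */
      mp_limb_t bw = mpn_sub_n (rp, up, rp, n);  /* rp = up - 2*vp */
      return sh + bw;                            /* total borrow, 0..2 */
    }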