aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/x86_64/bd1
diff options
context:
space:
mode:
authorDuncan Wilkie <antigravityd@gmail.com>2023-11-18 06:11:09 -0600
committerDuncan Wilkie <antigravityd@gmail.com>2023-11-18 06:11:09 -0600
commit11da511c784eca003deb90c23570f0873954e0de (patch)
treee14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/x86_64/bd1
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/bd1')
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/README11
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm235
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm38
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm190
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/com.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/copyd.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/copyi.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h265
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm206
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm193
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm195
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm416
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/popcount.asm191
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm37
18 files changed, 2236 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/README b/gmp-6.3.0/mpn/x86_64/bd1/README
new file mode 100644
index 0000000..ccd210e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/README
@@ -0,0 +1,11 @@
+This directory contains code for AMD bulldozer including its piledriver update.
+
+We currently make limited use of SIMD instructions, both via the MPN_PATH and
+via inclusion of x86_64/fastsse files.
+
+The bd1 cores share one SIMD/FPU pipeline for two integer units. This probably
+means that an all-core GMP load (such as a HPC load) might run slower if there
+is significant SIMD dependency.
+
+We should perhaps allow a special 'bd1nosimd' pseudo cpu-name excluding any
+SIMD code.
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm
new file mode 100644
index 0000000..b54e91a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm
@@ -0,0 +1,235 @@
+dnl AMD64 mpn_addmul_2 optimised for AMD Bulldozer.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1 4.2
+C AMD bd2 4.4
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bt1
+C AMD bt2
+C Intel P4
+C Intel PNR
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rbx')
+define(`v1', `%rbp')
+define(`X0', `%r12')
+define(`X1', `%r13')
+
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_addmul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up), %rax
+ mov $0, R32(w2) C abuse w2
+
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+ sub n_param, w2
+ mul v0
+
+ test $1, R8(w2)
+ jnz L(bx1)
+
+L(bx0): mov %rdx, X0
+ mov %rax, X1
+ test $2, R8(w2)
+ jnz L(b10)
+
+L(b00): lea (w2), n C un = 4, 8, 12, ...
+ mov (up,w2,8), %rax
+ mov (rp,w2,8), w3
+ mul v1
+ mov %rax, w0
+ mov 8(up,w2,8), %rax
+ mov %rdx, w1
+ jmp L(lo0)
+
+L(b10): lea 2(w2), n C un = 2, 6, 10, ...
+ mov (up,w2,8), %rax
+ mov (rp,w2,8), w1
+ mul v1
+ mov %rdx, w3
+ mov %rax, w2
+ mov -8(up,n,8), %rax
+ test n, n
+ jz L(end)
+ jmp L(top)
+
+L(bx1): mov %rax, X0
+ mov %rdx, X1
+ test $2, R8(w2)
+ jz L(b11)
+
+L(b01): lea 1(w2), n C un = 1, 5, 9, ...
+ mov (up,w2,8), %rax
+ mul v1
+ mov (rp,w2,8), w2
+ mov %rdx, w0
+ mov %rax, w3
+ jmp L(lo1)
+
+L(b11): lea -1(w2), n C un = 3, 7, 11, ...
+ mov (up,w2,8), %rax
+ mul v1
+ mov (rp,w2,8), w0
+ mov %rax, w1
+ mov 8(up,w2,8), %rax
+ mov %rdx, w2
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top):
+L(lo2): mul v0
+ add w1, X1
+ mov X1, -16(rp,n,8)
+ mov %rdx, X1
+ adc %rax, X0
+ adc $0, X1
+ mov -8(up,n,8), %rax
+ mul v1
+ mov -8(rp,n,8), w1
+ mov %rdx, w0
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo1): mov (up,n,8), %rax
+ mul v0
+ add w2, X0
+ mov X0, -8(rp,n,8)
+ mov %rdx, X0
+ adc %rax, X1
+ mov (up,n,8), %rax
+ adc $0, X0
+ mov (rp,n,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ mov 8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(lo0): mul v0
+ add w3, X1
+ mov X1, (rp,n,8)
+ adc %rax, X0
+ mov 8(up,n,8), %rax
+ mov %rdx, X1
+ adc $0, X1
+ mov 8(rp,n,8), w3
+ mul v1
+ add w3, w0
+ adc %rax, w1
+ mov 16(up,n,8), %rax
+ mov %rdx, w2
+ adc $0, w2
+L(lo3): mul v0
+ add w0, X0
+ mov X0, 8(rp,n,8)
+ mov %rdx, X0
+ adc %rax, X1
+ adc $0, X0
+ mov 16(up,n,8), %rax
+ mov 16(rp,n,8), w0
+ mul v1
+ mov %rdx, w3
+ add w0, w1
+ adc %rax, w2
+ adc $0, w3
+ mov 24(up,n,8), %rax
+ add $4, n
+ jnc L(top)
+
+L(end): mul v0
+ add w1, X1
+ mov X1, -16(rp)
+ mov %rdx, X1
+ adc %rax, X0
+ adc $0, X1
+ mov -8(up), %rax
+ mul v1
+ mov -8(rp), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, %rdx
+ add w2, X0
+ adc $0, X1
+ mov X0, -8(rp)
+ add w3, X1
+ mov X1, (rp)
+ adc $0, %rdx
+ mov %rdx, %rax
+
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm
new file mode 100644
index 0000000..c34a5fa
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_addlsh1_n and mpn_rsblsh1_n
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+include_mpn(`x86_64/atom/aorrlsh1_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm
new file mode 100644
index 0000000..5516c9d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm
@@ -0,0 +1,38 @@
+dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+include_mpn(`x86_64/aorrlsh_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm
new file mode 100644
index 0000000..143c42e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Silvermont.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+include_mpn(`x86_64/coreihwl/aors_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm
new file mode 100644
index 0000000..fc0d2fe
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm
@@ -0,0 +1,190 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 3.30 3.58
+C AMD K10 3.09
+C AMD bull 4.47 4.72
+C AMD pile 4.66
+C AMD steam
+C AMD excavator
+C AMD bobcat 6.30
+C AMD jaguar 6.29
+C Intel P4 17.3 17.8
+C Intel core2 5.13
+C Intel NHM 4.85
+C Intel SBR 3.83
+C Intel IBR 3.75
+C Intel HWL 3.45
+C Intel BWL 2.56
+C Intel SKL 2.53
+C Intel atom 20.3
+C Intel SLM 9
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Try to make loop run closer to 4 c/l in Bulldozer and Piledriver.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%r11')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`v0', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``r11'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax C read first u limb early
+ push %rbx
+IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %rbx ')
+ mul v0
+
+IFSTD(` mov %rbx, n ')
+
+ and $3, R32(%rbx)
+ lea -16(rp,n,8), rp
+ jz L(b0)
+ cmp $2, R32(%rbx)
+ jb L(b1)
+ jz L(b2)
+
+L(b3): mov $0, R32(%r8)
+ mov %rax, %rbx
+ mov $0, R32(%r9)
+ mov 8(up), %rax
+ mov %rdx, %r10
+ lea (up,n,8), up
+ not n
+ jmp L(L3)
+
+L(b0): mov $0, R32(%r10)
+ mov %rax, %r8
+ mov %rdx, %rbx
+ mov 8(up), %rax
+ lea (up,n,8), up
+ neg n
+ jmp L(L0)
+
+L(b1): cmp $1, n
+ jz L(n1)
+ mov %rax, %r9
+ mov 8(up), %rax
+ mov %rdx, %r8
+ mov $0, R32(%rbx)
+ lea (up,n,8), up
+ neg n
+ inc n
+ jmp L(L1)
+
+L(b2): mov $0, R32(%rbx)
+ mov %rax, %r10
+ mov %rdx, %r9
+ mov 8(up), %rax
+ mov $0, R32(%r8)
+ lea (up,n,8), up
+ neg n
+ add $2, n
+ jns L(end)
+
+ ALIGN(32)
+L(top): mul v0
+ ADDSUB %r10, (rp,n,8)
+ adc %rax, %r9
+ mov (up,n,8), %rax
+ adc %rdx, %r8
+L(L1): mul v0
+ mov $0, R32(%r10)
+ ADDSUB %r9, 8(rp,n,8)
+ adc %rax, %r8
+ adc %rdx, %rbx
+ mov 8(up,n,8), %rax
+L(L0): mul v0
+ ADDSUB %r8, 16(rp,n,8)
+ mov $0, R32(%r8)
+ adc %rax, %rbx
+ mov $0, R32(%r9)
+ mov 16(up,n,8), %rax
+ adc %rdx, %r10
+L(L3): mul v0
+ ADDSUB %rbx, 24(rp,n,8)
+ mov $0, R32(%rbx)
+ adc %rax, %r10
+ adc %rdx, %r9
+ mov 24(up,n,8), %rax
+ add $4, n
+ js L(top)
+
+L(end): mul v0
+ ADDSUB %r10, (rp)
+ adc %r9, %rax
+ adc %r8, %rdx
+L(n1): ADDSUB %rax, 8(rp)
+ adc $0, %rdx
+ mov %rdx, %rax
+
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/com.asm b/gmp-6.3.0/mpn/x86_64/bd1/com.asm
new file mode 100644
index 0000000..43f3561
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/com.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_com optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm b/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm
new file mode 100644
index 0000000..675cdc3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm b/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm
new file mode 100644
index 0000000..ceef036
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi optimised for AMD bd1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_11.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h
new file mode 100644
index 0000000..210f382
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h
@@ -0,0 +1,265 @@
+/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3600-3800 MHz Bulldozer Zambezi */
+/* FFT tuning limit = 464,627,200 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 31
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 2
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 27
+
+#define DIV_1_VS_MUL_1_PERCENT 275
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 57
+#define MUL_TOOM44_THRESHOLD 161
+#define MUL_TOOM6H_THRESHOLD 226
+#define MUL_TOOM8H_THRESHOLD 339
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 61
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 108
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 91
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 24
+#define SQR_TOOM3_THRESHOLD 85
+#define SQR_TOOM4_THRESHOLD 234
+#define SQR_TOOM6_THRESHOLD 286
+#define SQR_TOOM8_THRESHOLD 466
+
+#define MULMID_TOOM42_THRESHOLD 20
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 15
+
+#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \
+ { 28, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \
+ { 99,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 103,12}, { 31,11}, { 63, 7}, \
+ { 1023, 8}, { 543, 9}, { 303,10}, { 167,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 255,11}, { 143,10}, { 287,11}, { 159,12}, \
+ { 95,11}, { 191,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,12}, { 159,11}, { 319,10}, { 639,11}, \
+ { 351,12}, { 191,11}, { 383,10}, { 767,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 639,10}, { 1279,11}, { 671,12}, { 351,13}, \
+ { 191,12}, { 383,11}, { 767,12}, { 415,11}, \
+ { 831,12}, { 447,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \
+ { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
+ { 1343,10}, { 2687,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 767,11}, { 1535,12}, { 799,11}, \
+ { 1599,12}, { 831,13}, { 447,12}, { 895,14}, \
+ { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \
+ { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \
+ { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \
+ { 895,15}, { 255,14}, { 511,13}, { 1023,12}, \
+ { 2047,13}, { 1087,12}, { 2175,13}, { 1215,12}, \
+ { 2431,11}, { 4863,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \
+ { 3455,14}, { 895,13}, { 1919,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2815,12}, { 5631,13}, { 2943,12}, { 5887,15}, \
+ { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \
+ { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \
+ { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \
+ { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \
+ { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \
+ { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \
+ { 4479,15}, { 2303,14}, { 4863,15}, { 2559,14}, \
+ { 5247,15}, { 2815,14}, { 5887,13}, { 11775,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \
+ { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \
+ { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \
+ { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \
+ { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \
+ { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 251
+#define MUL_FFT_THRESHOLD 4544
+
+#define SQR_FFT_MODF_THRESHOLD 364 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 364, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \
+ { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 159,11}, { 95, 7}, \
+ { 1535, 8}, { 799, 7}, { 1599, 8}, { 831, 9}, \
+ { 447,10}, { 239,11}, { 127,10}, { 255,11}, \
+ { 143,10}, { 303,11}, { 159,12}, { 95,11}, \
+ { 191,10}, { 383,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 303,12}, { 159,11}, \
+ { 351,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,12}, { 223,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,12}, { 287,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 639,10}, { 1279,11}, { 671,12}, { 351,13}, \
+ { 191,12}, { 383,11}, { 767,10}, { 1535,12}, \
+ { 415,11}, { 831,12}, { 447,14}, { 127,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \
+ { 1087,10}, { 2175,12}, { 575,11}, { 1151,12}, \
+ { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \
+ { 671,11}, { 1343,12}, { 703,11}, { 1407,12}, \
+ { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 799,11}, { 1599,12}, { 831,13}, { 447,12}, \
+ { 895,14}, { 255,13}, { 511,12}, { 1023,11}, \
+ { 2047,12}, { 1087,11}, { 2175,13}, { 575,12}, \
+ { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \
+ { 383,13}, { 767,12}, { 1599,11}, { 3199,13}, \
+ { 831,12}, { 1727,11}, { 3455,13}, { 895,15}, \
+ { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \
+ { 1087,12}, { 2175,13}, { 1151,12}, { 2303,13}, \
+ { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \
+ { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \
+ { 3455,11}, { 6911,14}, { 895,13}, { 1791,12}, \
+ { 3583,13}, { 1919,12}, { 3839,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \
+ { 2943,12}, { 5887,11}, { 11775,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1791,13}, { 3583,14}, { 1919,13}, \
+ { 3839,16}, { 511,15}, { 1023,14}, { 2175,13}, \
+ { 4351,12}, { 8703,13}, { 4479,12}, { 8959,14}, \
+ { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \
+ { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \
+ { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \
+ { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \
+ { 1023,15}, { 2047,14}, { 4351,13}, { 8703,14}, \
+ { 4479,13}, { 8959,15}, { 2303,14}, { 4991,13}, \
+ { 9983,15}, { 2559,14}, { 5119,15}, { 2815,14}, \
+ { 5887,13}, { 11775,16}, { 1535,15}, { 3071,14}, \
+ { 6143,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \
+ { 4095,14}, { 8191,15}, { 4351,14}, { 8959,15}, \
+ { 4863,14}, { 9983,16}, { 2559,15}, { 5887,14}, \
+ { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \
+ { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \
+ { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \
+ { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 275
+#define SQR_FFT_THRESHOLD 3264
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 23
+#define MULLO_MUL_N_THRESHOLD 8907
+#define SQRLO_BASECASE_THRESHOLD 9
+#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD 6440
+
+#define DC_DIV_QR_THRESHOLD 52
+#define DC_DIVAPPR_Q_THRESHOLD 167
+#define DC_BDIV_QR_THRESHOLD 48
+#define DC_BDIV_Q_THRESHOLD 93
+
+#define INV_MULMOD_BNM1_THRESHOLD 38
+#define INV_NEWTON_THRESHOLD 197
+#define INV_APPR_THRESHOLD 179
+
+#define BINV_NEWTON_THRESHOLD 230
+#define REDC_1_TO_REDC_2_THRESHOLD 32
+#define REDC_2_TO_REDC_N_THRESHOLD 55
+
+#define MU_DIV_QR_THRESHOLD 1387
+#define MU_DIVAPPR_Q_THRESHOLD 1387
+#define MUPI_DIV_QR_THRESHOLD 92
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1334
+
+#define POWM_SEC_TABLE 1,22,194,434,452
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 438
+#define SET_STR_PRECOMPUTE_THRESHOLD 1254
+
+#define FAC_DSC_THRESHOLD 189
+#define FAC_ODD_THRESHOLD 26
+
+#define MATRIX22_STRASSEN_THRESHOLD 14
+#define HGCD2_DIV1_METHOD 3 /* 2.31% faster than 4 */
+#define HGCD_THRESHOLD 104
+#define HGCD_APPR_THRESHOLD 52
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 465
+#define GCDEXT_DC_THRESHOLD 283
+#define JACOBI_BASE_METHOD 4 /* 5.81% faster than 1 */
+
+/* Tuneup completed successfully, took 554602 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm b/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm
new file mode 100644
index 0000000..799cdda
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm
@@ -0,0 +1,206 @@
+dnl AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb good for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 1.51-2.0 y
+C AMD bd2 1.50-1.9 y
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen n/a
+C AMD bobcat n/a
+C AMD jaguar n/a
+C Intel P4 n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL n/a
+C Intel BWL n/a
+C Intel SKL n/a
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C TODO
+C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C intend to support old systems.
+
+C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some
+C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
+C We fall back to the core2 code.
+ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86_64/core2/hamdist.asm')
+',`
+
+define(`up', `%rdi')
+define(`vp', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_hamdist)
+ FUNC_ENTRY(3)
+ cmp $5, n
+ jl L(sma)
+
+ lea L(cnsts)(%rip), %r9
+
+ xor R32(%r10), R32(%r10)
+ test $8, R8(vp)
+ jz L(ali)
+ mov (up), %r8
+ xor (vp), %r8
+ add $8, up
+ add $8, vp
+ dec n
+ popcnt %r8, %r10
+L(ali):
+
+ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
+ `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
+ movdqa OFF1`'(%r9), %xmm7 C nibble counts table
+ movdqa OFF2`'(%r9), %xmm6 C splat shift counts
+ movdqa OFF3`'(%r9), %xmm5 C masks
+ pxor %xmm4, %xmm4
+ pxor %xmm8, %xmm8 C grand total count
+
+ mov R32(n), R32(%rax)
+ and $6, R32(%rax)
+ lea -64(up,%rax,8), up
+ lea -64(vp,%rax,8), vp
+ifdef(`PIC',`
+ movslq (%r9,%rax,2), %r11
+ add %r9, %r11
+ jmp *%r11
+',`
+ jmp *(%r9,%rax,4)
+')
+
+L(0): add $64, up
+ add $64, vp
+ sub $2, n
+
+ ALIGN(32)
+L(top): lddqu (up), %xmm0
+ pxor (vp), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm5, %xmm0
+ pand %xmm5, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(6): lddqu 16(up), %xmm0
+ pxor 16(vp), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm5, %xmm0
+ pand %xmm5, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(4): lddqu 32(up), %xmm0
+ pxor 32(vp), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm5, %xmm0
+ pand %xmm5, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0
+ .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4
+ paddb %xmm2, %xmm3
+ paddb %xmm2, %xmm4
+ paddq %xmm0, %xmm8 C sum to 2 x 64-bit counts
+L(2): mov 48(up), %r8
+ mov 56(up), %r9
+ add $64, up
+ xor 48(vp), %r8
+ xor 56(vp), %r9
+ add $64, vp
+ popcnt %r8, %r8
+ popcnt %r9, %r9
+ add %r8, %r10
+ add %r9, %r10
+ sub $8, n
+ jg L(top)
+
+ test $1, R8(n)
+ jz L(x)
+ mov (up), %r8
+ xor (vp), %r8
+ popcnt %r8, %r8
+ add %r8, %r10
+L(x): .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0
+ paddq %xmm0, %xmm8
+ pshufd $14, %xmm8, %xmm0
+ paddq %xmm8, %xmm0
+ movd %xmm0, %rax
+ add %r10, %rax
+ FUNC_EXIT()
+ ret
+
+L(sma): mov (up), %r8
+ xor (vp), %r8
+ popcnt %r8, %rax
+ dec n
+ jz L(ed)
+L(tp): mov 8(up), %r8
+ add $8, up
+ xor 8(vp), %r8
+ add $8, vp
+ popcnt %r8, %r8
+ add %r8, %rax
+ dec n
+ jnz L(tp)
+L(ed): FUNC_EXIT()
+ ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+ JMPENT( L(0), L(cnsts))
+ JMPENT( L(2), L(cnsts))
+ JMPENT( L(4), L(cnsts))
+ JMPENT( L(6), L(cnsts))
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+ .byte -4,-4,-4,-4,-4,-4,-4,-4
+ .byte -4,-4,-4,-4,-4,-4,-4,-4
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
+')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm
new file mode 100644
index 0000000..2fb097f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm
@@ -0,0 +1,193 @@
+dnl AMD64 mpn_mul_1 optimised for AMD Bulldozer.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 3.65
+C AMD K10 3.30 3.68
+C AMD bull 4.04 4.29
+C AMD pile 4.33
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.73
+C AMD jaguar 5.87
+C Intel P4 12.5
+C Intel core2 4.38
+C Intel NHM 4.28
+C Intel SBR 2.69
+C Intel IBR 2.55
+C Intel HWL 2.41
+C Intel BWL 2.49
+C Intel SKL 2.50
+C Intel atom 20.3
+C Intel SLM 7.8
+C VIA nano 4.25
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Move loop code into feed-in blocks, to save insn for zeroing regs.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0', `%rcx') C r9
+
+define(`n', `%rbx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFDOS(` define(`up', ``%rsi'') ') dnl
+IFDOS(` define(`rp', ``%rcx'') ') dnl
+IFDOS(` define(`v0', ``%r9'') ') dnl
+IFDOS(` define(`r9', ``rdi'') ') dnl
+IFDOS(` define(`n', ``%r8'') ') dnl
+IFDOS(` define(`r8', ``rbx'') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax C read first u limb early
+ push %rbx
+IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %r11 ')
+ mul v0
+
+IFSTD(` add %r8, %rax ')
+IFDOS(` add 64(%rsp), %rax ') C 40 + 3*8 (3 push insns)
+ adc $0, %rdx
+ jmp L(common)
+
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+IFDOS(``push %rsi '')
+IFDOS(``push %rdi '')
+IFDOS(``mov %rdx, %rsi '')
+
+ mov (up), %rax C read first u limb early
+ push %rbx
+IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it
+IFDOS(` mov n, %r11 ')
+ mul v0
+
+L(common):
+IFSTD(` mov %r11, n ')
+
+ and $3, R32(%r11)
+ lea -16(rp,n,8), rp
+ jz L(b0)
+ cmp $2, R32(%r11)
+ jb L(b1)
+ jz L(b2)
+
+L(b3): mov %rax, %r10
+ mov %rdx, %r11
+ mov 8(up), %rax
+ mul v0
+ lea (up,n,8), up
+ not n
+ jmp L(L3)
+
+L(b0): mov %rax, %r9
+ mov %rdx, %r10
+ mov 8(up), %rax
+ lea (up,n,8), up
+ neg n
+ jmp L(L0)
+
+L(b1): mov %rax, %r8
+ cmp $1, n
+ jz L(n1)
+ mov %rdx, %r9
+ lea (up,n,8), up
+ neg n
+ mov %r8, 16(rp,n,8)
+ inc n
+ jmp L(L1)
+
+L(b2): mov %rax, %r11
+ mov %rdx, %r8
+ mov 8(up), %rax
+ lea (up,n,8), up
+ neg n
+ add $2, n
+ jns L(end)
+
+ ALIGN(16)
+L(top): mul v0
+ mov %rdx, %r9
+ add %rax, %r8
+ adc $0, %r9
+ mov %r8, 8(rp,n,8)
+ mov %r11, (rp,n,8)
+L(L1): mov (up,n,8), %rax
+ mul v0
+ add %rax, %r9
+ mov %rdx, %r10
+ mov 8(up,n,8), %rax
+ adc $0, %r10
+L(L0): mul v0
+ add %rax, %r10
+ mov %rdx, %r11
+ mov 16(up,n,8), %rax
+ adc $0, %r11
+ mul v0
+ mov %r9, 16(rp,n,8)
+L(L3): add %rax, %r11
+ mov %r10, 24(rp,n,8)
+ mov %rdx, %r8
+ adc $0, %r8
+ add $4, n
+ mov -8(up,n,8), %rax
+ js L(top)
+
+L(end): mul v0
+ add %rax, %r8
+ adc $0, %rdx
+ mov %r11, (rp)
+L(n1): mov %r8, 8(rp)
+ mov %rdx, %rax
+
+ pop %rbx
+IFDOS(``pop %rdi '')
+IFDOS(``pop %rsi '')
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm
new file mode 100644
index 0000000..85fa7aa
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm
@@ -0,0 +1,195 @@
+dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 6.78
+C AMD K10 6.78
+C AMD bd1 8.39 8.65
+C AMD bd2 8.47
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bt1 12.1
+C AMD bt2 11.5
+C Intel P4 24.0
+C Intel PNR 8.14
+C Intel NHM 7.78
+C Intel SBR 6.34
+C Intel IBR 6.15
+C Intel HWL 6.04
+C Intel BWL 4.33
+C Intel SKL 4.41
+C Intel atom 39.5
+C Intel SLM 27.8
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %rax
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+
+ mov n_param, n
+ mul v0
+ neg n
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): mov %rax, w0
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ mov (up,n,8), %rax
+ jmp L(lo0)
+
+L(b10): mov %rax, w2
+ mov %rdx, w3
+ mov (up,n,8), %rax
+ xor R32(w0), R32(w0)
+ mul v1
+ add $-2, n
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n)
+ jz L(b11)
+
+L(b01): mov %rax, w3
+ mov %rdx, w0
+ mov (up,n,8), %rax
+ mul v1
+ xor R32(w1), R32(w1)
+ inc n
+ jmp L(lo1)
+
+L(b11): mov %rax, w1
+ mov %rdx, w2
+ mov (up,n,8), %rax
+ xor R32(w3), R32(w3)
+ dec n
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top): mov -8(up,n,8), %rax
+ mul v1
+ mov w2, -16(rp,n,8)
+L(lo1): add %rax, w0
+ mov w3, -8(rp,n,8)
+ adc %rdx, w1
+ mov (up,n,8), %rax
+ mul v0
+ mov $0, R32(w2)
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mov (up,n,8), %rax
+L(lo0): mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,n,8), %rax
+ mul v0
+ add %rax, w1
+ mov w0, (rp,n,8)
+ mov $0, R32(w3)
+ mov 8(up,n,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+L(lo3): mul v1
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ adc %rdx, w3
+ mul v0
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ mov $0, R32(w0)
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov w1, 8(rp,n,8)
+L(lo2): add %rax, w3
+ adc %rdx, w0
+ mov 24(up,n,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ mov $0, R32(w1)
+ adc $0, R32(w1)
+ add $4, n
+ jnc L(top)
+
+L(end): mov -8(up), %rax
+ mul v1
+ mov w2, -16(rp)
+ add %rax, w0
+ mov w3, -8(rp)
+ adc %rdx, w1
+ mov w0, (rp)
+ mov w1, %rax
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm
new file mode 100644
index 0000000..e47ba58
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm
@@ -0,0 +1,416 @@
+dnl AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull ~4.8 ~4.55 - ~4.3
+C AMD pile ~4.6 ~4.55 - ~4.55
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Merge bull-specific mul_1, if it is not slower the TOOM22 range.
+C Alternatively, we could tweak the present code (which was loopmixed for a
+C different CPU).
+C * Merge faster mul_2, such as the one in the same directory as this file.
+C * Further micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+
+define(`un', `%rbx')
+
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`n', `%rbp')
+define(`v0', `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %rbx
+ push %rbp
+ mov un_param, un C free up rdx
+ neg un
+
+ mov (up), %rax C shared for mul_1 and mul_2
+ lea (up,un_param,8), up C point at operand end
+ lea (rp,un_param,8), rp C point at rp[un-1]
+
+ mov (vp), v0 C shared for mul_1 and mul_2
+ mul v0 C shared for mul_1 and mul_2
+
+ test $1, R8(vn)
+ jz L(do_mul_2)
+
+L(do_mul_1):
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ...
+ mov %rdx, w1
+ mov 8(up,un,8), %rax
+ test $2, R8(un)
+ jnz L(m110)
+
+L(m100):lea 2(un), n C un = 4, 8, 12, ...
+ jmp L(m1l0)
+
+L(m110):lea (un), n C un = 2, 6, 10, ...
+ jmp L(m1l2)
+
+L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ...
+ mov %rdx, w0
+ test $2, R8(un)
+ jz L(m111)
+
+L(m101):lea 3(un), n C un = 1, 5, 9, ...
+ test n, n
+ js L(m1l1)
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(m111):lea 1(un), n C un = 3, 7, 11, ...
+ mov 8(up,un,8), %rax
+ jmp L(m1l3)
+
+ ALIGN(16)
+L(m1tp):mov %rdx, w0
+ add %rax, w1
+L(m1l1):mov -16(up,n,8), %rax
+ adc $0, w0
+ mul v0
+ add %rax, w0
+ mov w1, -24(rp,n,8)
+ mov -8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(m1l0):mul v0
+ mov w0, -16(rp,n,8)
+ add %rax, w1
+ mov %rdx, w0
+ mov (up,n,8), %rax
+ adc $0, w0
+L(m1l3):mul v0
+ mov w1, -8(rp,n,8)
+ mov %rdx, w1
+ add %rax, w0
+ mov 8(up,n,8), %rax
+ adc $0, w1
+L(m1l2):mul v0
+ mov w0, (rp,n,8)
+ add $4, n
+ jnc L(m1tp)
+
+L(m1ed):add %rax, w1
+ adc $0, %rdx
+ mov w1, I(-8(rp),-24(rp,n,8))
+ mov %rdx, I((rp),-16(rp,n,8))
+
+ dec R32(vn)
+ jz L(ret2)
+
+ lea 8(vp), vp
+ lea 8(rp), rp
+ push %r12
+ push %r13
+ push %r14
+ jmp L(do_addmul)
+
+L(do_mul_2):
+define(`v1', `%r14')
+ push %r12
+ push %r13
+ push %r14
+
+ mov 8(vp), v1
+
+ test $1, R8(un)
+ jnz L(m2b1)
+
+L(m2b0):lea (un), n
+ mov %rax, w2 C 0
+ mov (up,un,8), %rax
+ mov %rdx, w1 C 1
+ mul v1
+ mov %rax, w0 C 1
+ mov w2, (rp,un,8) C 0
+ mov 8(up,un,8), %rax
+ mov %rdx, w2 C 2
+ jmp L(m2l0)
+
+L(m2b1):lea 1(un), n
+ mov %rax, w0 C 1
+ mov %rdx, w3 C 2
+ mov (up,un,8), %rax
+ mul v1
+ mov w0, (rp,un,8) C 1
+ mov %rdx, w0 C 3
+ mov %rax, w2 C 0
+ mov 8(up,un,8), %rax
+ jmp L(m2l1)
+
+ ALIGN(32)
+L(m2tp):add %rax, w2 C 0
+ mov (up,n,8), %rax
+ adc $0, w0 C 1
+L(m2l1):mul v0
+ add %rax, w2 C 0
+ mov (up,n,8), %rax
+ mov %rdx, w1 C 1
+ adc $0, w1 C 1
+ mul v1
+ add w3, w2 C 0
+ adc $0, w1 C 1
+ add %rax, w0 C 1
+ mov w2, (rp,n,8) C 0
+ mov 8(up,n,8), %rax
+ mov %rdx, w2 C 2
+ adc $0, w2 C 2
+L(m2l0):mul v0
+ add %rax, w0 C 1
+ mov %rdx, w3 C 2
+ adc $0, w3 C 2
+ add w1, w0 C 1
+ adc $0, w3 C 2
+ mov 8(up,n,8), %rax
+ mul v1
+ add $2, n
+ mov w0, -8(rp,n,8) C 1
+ mov %rdx, w0 C 3
+ jnc L(m2tp)
+
+L(m2ed):add %rax, w2
+ adc $0, %rdx
+ add w3, w2
+ adc $0, %rdx
+ mov w2, I((rp),(rp,n,8))
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+ add $-2, R32(vn)
+ jz L(ret5)
+
+ lea 16(vp), vp
+ lea 16(rp), rp
+
+
+L(do_addmul):
+ push %r15
+ push vn C save vn in new stack slot
+define(`vn', `(%rsp)')
+define(`X0', `%r14')
+define(`X1', `%r15')
+define(`v1', `%r8')
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov (up,un,8), %rax
+ mul v0
+
+ test $1, R8(un)
+ jnz L(bx1)
+
+L(bx0): mov %rax, X1
+ mov (up,un,8), %rax
+ mov %rdx, X0
+ mul v1
+ test $2, R8(un)
+ jnz L(b10)
+
+L(b00): lea (un), n C un = 4, 8, 12, ...
+ mov (rp,un,8), w3
+ mov %rax, w0
+ mov 8(up,un,8), %rax
+ mov %rdx, w1
+ jmp L(lo0)
+
+L(b10): lea 2(un), n C un = 2, 6, 10, ...
+ mov (rp,un,8), w1
+ mov %rdx, w3
+ mov %rax, w2
+ mov 8(up,un,8), %rax
+ jmp L(lo2)
+
+L(bx1): mov %rax, X0
+ mov (up,un,8), %rax
+ mov %rdx, X1
+ mul v1
+ test $2, R8(un)
+ jz L(b11)
+
+L(b01): lea 1(un), n C un = 1, 5, 9, ...
+ mov (rp,un,8), w2
+ mov %rdx, w0
+ mov %rax, w3
+ jmp L(lo1)
+
+L(b11): lea -1(un), n C un = 3, 7, 11, ...
+ mov (rp,un,8), w0
+ mov %rax, w1
+ mov 8(up,un,8), %rax
+ mov %rdx, w2
+ jmp L(lo3)
+
+ ALIGN(32)
+L(top):
+L(lo2): mul v0
+ add w1, X1
+ mov X1, -16(rp,n,8)
+ mov %rdx, X1
+ adc %rax, X0
+ adc $0, X1
+ mov -8(up,n,8), %rax
+ mul v1
+ mov -8(rp,n,8), w1
+ mov %rdx, w0
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo1): mov (up,n,8), %rax
+ mul v0
+ add w2, X0
+ mov X0, -8(rp,n,8)
+ mov %rdx, X0
+ adc %rax, X1
+ mov (up,n,8), %rax
+ adc $0, X0
+ mov (rp,n,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ mov 8(up,n,8), %rax
+ mov %rdx, w1
+ adc $0, w1
+L(lo0): mul v0
+ add w3, X1
+ mov X1, (rp,n,8)
+ adc %rax, X0
+ mov 8(up,n,8), %rax
+ mov %rdx, X1
+ adc $0, X1
+ mov 8(rp,n,8), w3
+ mul v1
+ add w3, w0
+ adc %rax, w1
+ mov 16(up,n,8), %rax
+ mov %rdx, w2
+ adc $0, w2
+L(lo3): mul v0
+ add w0, X0
+ mov X0, 8(rp,n,8)
+ mov %rdx, X0
+ adc %rax, X1
+ adc $0, X0
+ mov 16(up,n,8), %rax
+ mov 16(rp,n,8), w0
+ mul v1
+ mov %rdx, w3
+ add w0, w1
+ adc %rax, w2
+ adc $0, w3
+ mov 24(up,n,8), %rax
+ add $4, n
+ jnc L(top)
+
+L(end): mul v0
+ add w1, X1
+ mov X1, I(-16(rp),-16(rp,n,8))
+ mov %rdx, X1
+ adc %rax, X0
+ adc $0, X1
+ mov I(-8(up),-8(up,n,8)), %rax
+ mul v1
+ mov I(-8(rp),-8(rp,n,8)), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, %rdx
+ add w2, X0
+ adc $0, X1
+ mov X0, I(-8(rp),-8(rp,n,8))
+ add w3, X1
+ mov X1, I((rp),(rp,n,8))
+ adc $0, %rdx
+ mov %rdx, I(8(rp),8(rp,n,8))
+
+
+ addl $-2, vn
+ lea 16(vp), vp
+ lea 16(rp), rp
+ jnz L(outer)
+
+ pop %rax C deallocate vn slot
+ pop %r15
+L(ret5):pop %r14
+ pop %r13
+ pop %r12
+L(ret2):pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm b/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm
new file mode 100644
index 0000000..7b084f4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm
@@ -0,0 +1,191 @@
+dnl AMD64 SSSE3/XOP mpn_popcount -- population count.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb good for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 1.27 y
+C AMD bd2 1.24 y
+C AMD bd3 ?
+C AMD bd4 1.22
+C AMD zen n/a
+C AMD bobcat n/a
+C AMD jaguar n/a
+C Intel P4 n/a
+C Intel CNR n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL n/a
+C Intel BWL n/a
+C Intel SKL n/a
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C TODO
+C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C intend to support old systems.
+
+C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some
+C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
+C We fall back to the core2 code.
+ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/core2/popcount.asm')
+',`
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+ FUNC_ENTRY(3)
+ lea L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
+ `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
+ movdqa OFF1`'(%r9), %xmm7 C nibble counts table
+ movdqa OFF2`'(%r9), %xmm6 C splat shift counts
+ movdqa OFF3`'(%r9), %xmm9 C masks
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5 C 0-reg
+ pxor %xmm8, %xmm8 C grand total count
+
+ xor R32(%rdx), R32(%rdx)
+
+ mov R32(n), R32(%rax)
+ and $7, R32(%rax)
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ add %r9, %rax
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+
+L(1): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up),%rdx
+ add $8, up
+ dec n
+ jnz L(top)
+ mov %rdx, %rax
+ FUNC_EXIT()
+ ret
+
+L(2): add $-48, up
+ jmp L(e2)
+
+L(3): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx
+ add $-40, up
+ jmp L(e2)
+
+L(4): add $-32, up
+ jmp L(e4)
+
+L(5): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx
+ add $-24, up
+ jmp L(e4)
+
+L(6): add $-16, up
+ jmp L(e6)
+
+L(7): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx
+ add $-8, up
+ jmp L(e6)
+
+ ALIGN(32)
+L(top): lddqu (up), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm9, %xmm0
+ pand %xmm9, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1, %xmm7, %xmm7, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e6): lddqu 16(up), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm9, %xmm0
+ pand %xmm9, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e4): lddqu 32(up), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm9, %xmm0
+ pand %xmm9, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0, %xmm7, %xmm7, %xmm2
+ .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5
+ .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4
+ paddb %xmm2, %xmm4
+L(e2): popcnt 48(up), %r8
+ popcnt 56(up), %r9
+ add $64, up
+ paddq %xmm5, %xmm8 C sum to 2 x 64-bit counts
+ add %r8, %rdx
+ add %r9, %rdx
+ sub $8, n
+ jg L(top)
+
+ .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5
+ paddq %xmm5, %xmm8
+ pshufd $14, %xmm8, %xmm0
+ paddq %xmm8, %xmm0
+ movd %xmm0, %rax
+ add %rdx, %rax
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+ JMPENT( L(top), L(cnsts))
+ JMPENT( L(1), L(cnsts))
+ JMPENT( L(2), L(cnsts))
+ JMPENT( L(3), L(cnsts))
+ JMPENT( L(4), L(cnsts))
+ JMPENT( L(5), L(cnsts))
+ JMPENT( L(6), L(cnsts))
+ JMPENT( L(7), L(cnsts))
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+ .byte -4,-4,-4,-4,-4,-4,-4,-4
+ .byte -4,-4,-4,-4,-4,-4,-4,-4
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
+')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm
new file mode 100644
index 0000000..4ba673d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_sublsh1_n
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc)
+include_mpn(`x86_64/atom/sublsh1_n.asm')