aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/x86_64/bt1
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/bt1')
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm159
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm191
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/copyd.asm91
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/copyi.asm94
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm119
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm37
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h230
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm241
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm486
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm507
-rw-r--r--gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm565
11 files changed, 2720 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm b/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm
new file mode 100644
index 0000000..9b6b5c7
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm
@@ -0,0 +1,159 @@
+dnl AMD64 mpn_add_n, mpn_sub_n optimised for bobcat.
+
+dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1.77
+C AMD K10 1.76\1.82
+C AMD bd1 1.67\2.12
+C AMD bd2 1.62\1.82
+C AMD bd3
+C AMD bd4 1.55\2.2
+C AMD zen
+C AMD bt1 2.54
+C AMD bt2 2
+C Intel P4 11
+C Intel PNR 4.76
+C Intel NHM 5.27
+C Intel SBR 2
+C Intel IBR 1.94
+C Intel HWL 1.63
+C Intel BWL 1.51
+C Intel SKL 1.51
+C Intel atom 3.56
+C Intel SLM 4
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ xor %r8, %r8
+L(ent): test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): shr $2, n
+ neg %r8
+ mov $3, R32(%rax)
+ mov (up), %r10
+ mov 8(up), %r11
+ jmp L(lo0)
+
+L(b10): shr $2, n
+ neg %r8
+ mov $1, R32(%rax)
+ mov (up), %r8
+ mov 8(up), %r9
+ jrcxz L(cj2)
+ jmp L(top)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): shr $2, n
+ neg %r8
+ mov $0, R32(%rax)
+ mov (up), %r9
+ jrcxz L(cj1)
+ mov 8(up), %r10
+ jmp L(lo1)
+
+ ALIGN(8)
+L(b11): inc n
+ shr $2, n
+ neg %r8
+ mov $2, R32(%rax)
+ mov (up), %r11
+ jmp L(lo3)
+
+ ALIGN(4)
+L(top): mov 8(up,%rax,8), %r10
+ ADCSBB -8(vp,%rax,8), %r8
+ mov %r8, -8(rp,%rax,8)
+L(lo1): mov 16(up,%rax,8), %r11
+ ADCSBB (vp,%rax,8), %r9
+ lea 4(%rax), %rax
+ mov %r9, -32(rp,%rax,8)
+L(lo0): ADCSBB -24(vp,%rax,8), %r10
+ mov %r10, -24(rp,%rax,8)
+L(lo3): ADCSBB -16(vp,%rax,8), %r11
+ dec n
+ mov -8(up,%rax,8), %r8
+ mov %r11, -16(rp,%rax,8)
+L(lo2): mov (up,%rax,8), %r9
+ jnz L(top)
+
+L(cj2): ADCSBB -8(vp,%rax,8), %r8
+ mov %r8, -8(rp,%rax,8)
+L(cj1): ADCSBB (vp,%rax,8), %r9
+ mov %r9, (rp,%rax,8)
+
+ mov $0, R32(%rax)
+ adc $0, R32(%rax)
+
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ jmp L(ent)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm
new file mode 100644
index 0000000..41e1d8a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm
@@ -0,0 +1,191 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.52 old measurement
+C AMD K10 4.51 old measurement
+C AMD bd1 4.66 old measurement
+C AMD bd2 4.57 old measurement
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen ?
+C AMD bt1 5.04
+C AMD bt2 5.07
+C Intel P4 16.8 18.6 old measurement
+C Intel PNR 5.59 old measurement
+C Intel NHM 5.39 old measurement
+C Intel SBR 3.93 old measurement
+C Intel IBR 3.59 old measurement
+C Intel HWL 3.61 old measurement
+C Intel BWL 2.76 old measurement
+C Intel SKL 2.77 old measurement
+C Intel atom 23 old measurement
+C Intel SLM 8 old measurement
+C Intel GLM ?
+C VIA nano 5.63 old measurement
+
+C The ALIGNment here might look completely ad-hoc. They are not.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+define(`v0', `%rcx')
+C Standard allocations
+define(`n', `%rbx')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C DOS64 parameters
+IFDOS(` define(`rp', `%rcx') ') dnl
+IFDOS(` define(`up', `%rsi') ') dnl
+IFDOS(` define(`n_param', `%r8') ') dnl
+IFDOS(` define(`v0', `%r9') ') dnl
+C DOS64 allocations
+IFDOS(` define(`n', `%rbx') ') dnl
+IFDOS(` define(`w0', `%r8') ') dnl
+IFDOS(` define(`w1', `%rdi') ') dnl
+IFDOS(` define(`w2', `%r10') ') dnl
+IFDOS(` define(`w3', `%r11') ') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(func)
+IFDOS(` push %rsi ')
+IFDOS(` push %rdi ')
+IFDOS(` mov %rdx, %rsi ')
+
+ push %rbx
+ mov (up), %rax
+
+ lea (rp,n_param,8), rp
+ lea (up,n_param,8), up
+ mov n_param, n
+
+ test $1, R8(n_param)
+ jne L(bx1)
+
+L(bx0): mul v0
+ neg n
+ mov %rax, w0
+ mov %rdx, w1
+ test $2, R8(n)
+ jne L(L2)
+
+L(b00): add $2, n
+ jmp L(L0)
+
+ ALIGN(16)
+L(bx1): mul v0
+ test $2, R8(n)
+ je L(b01)
+
+L(b11): mov %rax, w2
+ mov %rdx, w3
+ neg n
+ inc n
+ jmp L(L3)
+
+ ALIGN(16)
+L(b01): sub $3, n
+ jc L(n1)
+ mov %rax, w2
+ mov %rdx, w3
+ neg n
+
+ ALIGN(16)
+L(top): mov -16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ ADDSUB w2, -24(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(L0): mov -8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ ADDSUB w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(L3): mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ ADDSUB w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(L2): mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ ADDSUB w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ add $4, n
+ js L(top)
+
+L(end): xor R32(%rax), R32(%rax)
+ ADDSUB w2, -8(rp)
+ adc w3, %rax
+ pop %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+ ret
+
+ ALIGN(32)
+L(n1): ADDSUB %rax, -8(rp)
+ mov $0, R32(%rax)
+ adc %rdx, %rax
+ pop %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm b/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm
new file mode 100644
index 0000000..877714e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm
@@ -0,0 +1,91 @@
+dnl AMD64 mpn_copyd optimised for AMD bobcat.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1
+C AMD K10 1-2 (alignment fluctuations)
+C AMD bd1 ?
+C AMD bobcat 1.5
+C Intel P4 2.8
+C Intel core2 1
+C Intel NHM 1-1.25
+C Intel SBR 1
+C Intel atom 2.87
+C VIA nano 2
+
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
+
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_copyd)
+ FUNC_ENTRY(3)
+ sub $4, n
+ jl L(end)
+ ALIGN(16)
+L(top): mov 24(up,n,8), %r8
+ mov %r8, 24(rp,n,8)
+ mov 16(up,n,8), %r8
+ mov %r8, 16(rp,n,8)
+ mov 8(up,n,8), %r8
+ mov %r8, 8(rp,n,8)
+ mov (up,n,8), %r8
+ mov %r8, (rp,n,8)
+L(ent): sub $4, n
+ jge L(top)
+
+L(end): cmp $-4, R32(n)
+ jz L(ret)
+ mov 24(up,n,8), %r8
+ mov %r8, 24(rp,n,8)
+ cmp $-3, R32(n)
+ jz L(ret)
+ mov 16(up,n,8), %r8
+ mov %r8, 16(rp,n,8)
+ cmp $-2, R32(n)
+ jz L(ret)
+ mov 8(up,n,8), %r8
+ mov %r8, 8(rp,n,8)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm b/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm
new file mode 100644
index 0000000..ee0f578
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm
@@ -0,0 +1,94 @@
+dnl AMD64 mpn_copyi optimised for AMD bobcat.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1
+C AMD K10 1-2 (alignment fluctuations)
+C AMD bd1 ?
+C AMD bobcat 1.5
+C Intel P4 2.8
+C Intel core2 1
+C Intel NHM 1-1.25
+C Intel SBR 1
+C Intel atom 2.87
+C VIA nano 2
+
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
+
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_copyi)
+ FUNC_ENTRY(3)
+ lea -32(up,n,8), up
+ lea -32(rp,n,8), rp
+ neg n
+ add $4, n
+ jg L(end)
+ ALIGN(16)
+L(top): mov (up,n,8), %r8
+ mov %r8, (rp,n,8)
+ mov 8(up,n,8), %r8
+ mov %r8, 8(rp,n,8)
+ mov 16(up,n,8), %r8
+ mov %r8, 16(rp,n,8)
+ mov 24(up,n,8), %r8
+ mov %r8, 24(rp,n,8)
+L(ent): add $4, n
+ jle L(top)
+
+L(end): cmp $4, R32(n)
+ jz L(ret)
+ mov (up,n,8), %r8
+ mov %r8, (rp,n,8)
+ cmp $3, R32(n)
+ jz L(ret)
+ mov 8(up,n,8), %r8
+ mov %r8, 8(rp,n,8)
+ cmp $2, R32(n)
+ jz L(ret)
+ mov 16(up,n,8), %r8
+ mov %r8, 16(rp,n,8)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm
new file mode 100644
index 0000000..ef53392
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm
@@ -0,0 +1,119 @@
+dnl AMD64 mpn_gcd_11 -- 1 x 1 gcd.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD bt1 5.4
+C AMD bt2 ?
+C AMD zn1 ?
+C AMD zn2 ?
+C Intel P4 ?
+C Intel CNR ?
+C Intel PNR ?
+C Intel NHM ?
+C Intel WSM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C Intel GLM ?
+C Intel GLM+ ?
+C VIA nano ?
+
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 8)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0', `%rdi')
+define(`v0', `%rsi')
+
+define(`cnt', `%rcx')
+define(`s0', `%rax')
+define(`t0', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+ FUNC_ENTRY(2)
+ LEA( ctz_table, %r10)
+ mov v0, t0
+ sub u0, t0
+ jz L(end)
+
+ ALIGN(16)
+L(top): mov u0, s0
+ sub v0, u0
+ cmovc t0, u0 C u = |u - v|
+ cmovc s0, v0 C v = min(u,v)
+ and $MASK, R32(t0)
+ movzbl (%r10,t0), R32(cnt)
+ jz L(count_better)
+L(shr): shr R8(cnt), u0
+ mov v0, t0
+ sub u0, t0
+ jnz L(top)
+
+L(end): mov v0, %rax
+ C rdx = 0 for the benefit of internal gcd_22 call
+ FUNC_EXIT()
+ ret
+
+L(count_better):
+ bsf u0, cnt
+ jmp L(shr)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm
new file mode 100644
index 0000000..c9f221e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_22.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_22)
+include_mpn(`x86_64/gcd_22.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h
new file mode 100644
index 0000000..977a209
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h
@@ -0,0 +1,230 @@
+/* AMD Bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 1600 MHz AMD Bobcat/Zacate */
+/* FFT tuning limit = 110,472,704 */
+/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 31
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 71
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 20
+
+#define DIV_1_VS_MUL_1_PERCENT 270
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 66
+#define MUL_TOOM44_THRESHOLD 190
+#define MUL_TOOM6H_THRESHOLD 274
+#define MUL_TOOM8H_THRESHOLD 381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 127
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 131
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 101
+#define SQR_TOOM4_THRESHOLD 278
+#define SQR_TOOM6_THRESHOLD 372
+#define SQR_TOOM8_THRESHOLD 478
+
+#define MULMID_TOOM42_THRESHOLD 22
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 13
+
+#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 444, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \
+ { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \
+ { 71,10}, { 39, 9}, { 83, 5}, { 1343, 4}, \
+ { 2687, 5}, { 1407, 6}, { 735, 7}, { 415, 8}, \
+ { 223,10}, { 79,11}, { 47,10}, { 103,12}, \
+ { 31,11}, { 63,10}, { 135,11}, { 79,10}, \
+ { 167,11}, { 95,10}, { 191,11}, { 111,12}, \
+ { 63,11}, { 127,10}, { 255,11}, { 143,10}, \
+ { 287, 9}, { 575,11}, { 159,12}, { 95,11}, \
+ { 191,10}, { 383,11}, { 207,10}, { 415,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,10}, { 575,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,12}, { 223,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 607,12}, { 319,11}, \
+ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 607,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 831,13}, \
+ { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 959,15}, \
+ { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \
+ { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
+ { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \
+ { 4479,15}, { 2303,14}, { 4991,15}, { 2559,14}, \
+ { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \
+ { 3327,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 183
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 380, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 25, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \
+ { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \
+ { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \
+ { 31,10}, { 63, 6}, { 1087, 7}, { 575, 8}, \
+ { 303, 9}, { 159,10}, { 103,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \
+ { 79,10}, { 159, 9}, { 319,11}, { 95,10}, \
+ { 191, 9}, { 383,11}, { 111,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,12}, { 223,11}, { 447,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 607,13}, { 319,12}, { 703,13}, \
+ { 383,12}, { 831,13}, { 447,12}, { 895,14}, \
+ { 255,13}, { 511,12}, { 1023,13}, { 703,14}, \
+ { 383,13}, { 831,12}, { 1663,13}, { 895,15}, \
+ { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \
+ { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \
+ { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \
+ { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \
+ { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,15}, { 1535,14}, { 3455,15}, { 1791,14}, \
+ { 3839,16}, { 1023,15}, { 2047,14}, { 4479,15}, \
+ { 2303,14}, { 4863,15}, { 2559,14}, { 5247,15}, \
+ { 2815,14}, { 5887,16}, { 1535,15}, { 3327,14}, \
+ { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 186
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 42
+#define MULLO_MUL_N_THRESHOLD 10950
+#define SQRLO_BASECASE_THRESHOLD 7
+#define SQRLO_DC_THRESHOLD 100
+#define SQRLO_SQR_THRESHOLD 7293
+
+#define DC_DIV_QR_THRESHOLD 70
+#define DC_DIVAPPR_Q_THRESHOLD 204
+#define DC_BDIV_QR_THRESHOLD 59
+#define DC_BDIV_Q_THRESHOLD 148
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 246
+#define INV_APPR_THRESHOLD 236
+
+#define BINV_NEWTON_THRESHOLD 252
+#define REDC_1_TO_REDC_2_THRESHOLD 67
+#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
+
+#define MU_DIV_QR_THRESHOLD 1589
+#define MU_DIVAPPR_Q_THRESHOLD 1589
+#define MUPI_DIV_QR_THRESHOLD 108
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define POWM_SEC_TABLE 1,16,194,960,1603,1811,2499
+
+#define GET_STR_DC_THRESHOLD 20
+#define GET_STR_PRECOMPUTE_THRESHOLD 34
+#define SET_STR_DC_THRESHOLD 345
+#define SET_STR_PRECOMPUTE_THRESHOLD 1787
+
+#define FAC_DSC_THRESHOLD 781
+#define FAC_ODD_THRESHOLD 104
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD2_DIV1_METHOD 3 /* 3.20% faster than 5 */
+#define HGCD_THRESHOLD 110
+#define HGCD_APPR_THRESHOLD 50
+#define HGCD_REDUCE_THRESHOLD 2681
+#define GCD_DC_THRESHOLD 474
+#define GCDEXT_DC_THRESHOLD 293
+#define JACOBI_BASE_METHOD 2 /* 9.38% faster than 1 */
+
+/* Tuneup completed successfully, took 358881 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm
new file mode 100644
index 0000000..4394d6e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm
@@ -0,0 +1,241 @@
+dnl AMD64 mpn_mul_1 optimised for AMD bt1/bt2.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2019 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.53 old measurement
+C AMD K10 4.53 old measurement
+C AMD bd1 4.56 old measurement
+C AMD bd2 4.47 old measurement
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen ?
+C AMD bt1 5.12
+C AMD bt2 5.17
+C Intel P4 12.6 old measurement
+C Intel PNR 4.53 old measurement
+C Intel NHM 4.36 old measurement
+C Intel SBR 3.0 old measurement
+C Intel IBR 2.55 old measurement
+C Intel HWL 2.28 old measurement
+C Intel BWL 2.36 old measurement
+C Intel SKL 2.39 old measurement
+C Intel atom 21.0 old measurement
+C Intel SLM 9 old measurement
+C Intel GLM ?
+C VIA nano ?
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+define(`v0', `%rcx')
+define(`cy', `%r8')
+C Standard allocations
+define(`n', `%rbx')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C DOS64 parameters
+IFDOS(` define(`rp', `%rcx') ') dnl
+IFDOS(` define(`up', `%rsi') ') dnl
+IFDOS(` define(`n_param', `%r8') ') dnl
+IFDOS(` define(`v0', `%r9') ') dnl
+IFDOS(` define(`cy', `56(%rsp)')') dnl
+C DOS64 allocations
+IFDOS(` define(`n', `%rbx') ') dnl
+IFDOS(` define(`w0', `%r8') ') dnl
+IFDOS(` define(`w1', `%rdi') ') dnl
+IFDOS(` define(`w2', `%r10') ') dnl
+IFDOS(` define(`w3', `%r11') ') dnl
+
+ ALIGN(64)
+PROLOGUE(mpn_mul_1)
+IFDOS(` push %rsi ')
+IFDOS(` push %rdi ')
+IFDOS(` mov %rdx, %rsi ')
+
+ push %rbx
+ mov (up), %rax
+
+ lea (rp,n_param,8), rp
+ lea (up,n_param,8), up
+ mov n_param, n
+
+ test $1, R8(n_param)
+ jne L(bx1)
+
+L(bx0): mul v0
+ neg n
+ mov %rax, w0
+ mov %rdx, w1
+ test $2, R8(n)
+ jne L(L2)
+
+L(b00): add $2, n
+ jmp L(L0)
+
+ ALIGN(16)
+L(b11): mov %rax, w2
+ mov %rdx, w3
+ neg n
+ inc n
+ jmp L(L3)
+
+ ALIGN(16)
+L(bx1): mul v0
+ test $2, R8(n)
+ jne L(b11)
+
+L(b01): sub $3, n
+ jc L(n1)
+ mov %rax, w2
+ mov %rdx, w3
+ neg n
+
+ ALIGN(16)
+L(top): mov -16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, -24(rp,n,8)
+ add w3, w0
+ adc $0, w1
+L(L0): mov -8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+L(L3): mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, -8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+L(L2): mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, (rp,n,8)
+ add w1, w2
+ adc $0, w3
+ add $4, n
+ js L(top)
+
+L(end): mov w2, -8(rp)
+ mov w3, %rax
+ pop %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+ ret
+
+ ALIGN(32)
+L(n1): mov %rax, -8(rp)
+ mov %rdx, %rax
+ pop %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+ ret
+EPILOGUE()
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_mul_1c)
+IFDOS(` push %rsi ')
+IFDOS(` push %rdi ')
+IFDOS(` mov %rdx, %rsi ')
+ mov cy, w2
+ push %rbx
+ mov (up), %rax
+
+ lea (rp,n_param,8), rp
+ lea (up,n_param,8), up
+ mov n_param, n
+
+ test $1, R8(n_param)
+ jne L(cx1)
+
+L(cx0): mul v0
+ neg n
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, w0
+ adc $0, w1
+ test $2, R8(n)
+ jne L(L2)
+
+L(c00): add $2, n
+ jmp L(L0)
+
+ ALIGN(16)
+L(cx1): mul v0
+ test $2, R8(n)
+ je L(c01)
+
+L(c11): neg n
+ inc n
+ add %rax, w2
+ mov %rdx, w3
+ adc $0, w3
+ jmp L(L3)
+
+L(c01): cmp $1, n
+ jz L(m1)
+ neg n
+ add $3, n
+ add %rax, w2
+ mov %rdx, w3
+ adc $0, w3
+ jmp L(top)
+
+ ALIGN(32)
+L(m1): add %rax, w2
+ mov %rdx, %rax
+ mov w2, -8(rp)
+ adc $0, %rax
+ pop %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm
new file mode 100644
index 0000000..e7d46bf
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm
@@ -0,0 +1,486 @@
+dnl AMD64 mpn_mul_basecase optimised for AMD bobcat.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.5
+C AMD K10 4.5
+C AMD bd1 4.75
+C AMD bobcat 5
+C Intel P4 17.7
+C Intel core2 5.5
+C Intel NHM 5.43
+C Intel SBR 3.92
+C Intel atom 23
+C VIA nano 5.63
+
+C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
+C multiply insn bandwidth, without any apparent loop branch exit pipeline
+C replays experienced on K8. The structure is unusual: it falls into mul_1 in
+C the same way for all n, then it splits into 4 different wind-down blocks and
+C 4 separate addmul_1 loops.
+C
+C We have not tried using the same addmul_1 loops with a switch into feed-in
+C code, as we do in other basecase implementations. Doing that could save
+C substantial code volume, but would also probably add some overhead.
+
+C TODO
+C * Tune un < 3 code.
+C * Fix slowdown for un=vn=3 (67->71) compared to default code.
+C * This is 1263 bytes, compared to 1099 bytes for default code. Consider
+C combining addmul loops like that code. Tolerable slowdown?
+C * Lots of space could be saved by replacing the "switch" code by gradual
+C jumps out from mul_1 winddown code, perhaps with no added overhead.
+C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+C Standard allocations
+define(`un', `%rbx')
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%r12')
+define(`w3', `%r13')
+define(`n', `%rbp')
+define(`v0', `%r9')
+
+C Temp macro for allowing control over indexing.
+C Define to return $1 for more conservative ptr handling.
+define(`X',`$2')
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ mov (up), %rax
+ mov (vp), v0
+
+ cmp $2, un_param
+ ja L(ge3)
+ jz L(u2)
+
+ mul v0 C u0 x v0
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(u2): mul v0 C u0 x v0
+ mov %rax, (rp)
+ mov 8(up), %rax
+ mov %rdx, w0
+ mul v0
+ add %rax, w0
+ mov %rdx, w1
+ adc $0, w1
+ cmp $1, R32(vn)
+ jnz L(u2v2)
+ mov w0, 8(rp)
+ mov w1, 16(rp)
+ FUNC_EXIT()
+ ret
+
+L(u2v2):mov 8(vp), v0
+ mov (up), %rax
+ mul v0
+ add %rax, w0
+ mov w0, 8(rp)
+ mov %rdx, %r8 C CAUTION: r8 realloc
+ adc $0, %r8
+ mov 8(up), %rax
+ mul v0
+ add w1, %r8
+ adc $0, %rdx
+ add %r8, %rax
+ adc $0, %rdx
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+
+L(ge3): push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ lea 8(vp), vp
+
+ lea -24(rp,un_param,8), rp
+ lea -24(up,un_param,8), up
+ xor R32(un), R32(un)
+ mov $2, R32(n)
+ sub un_param, un
+ sub un_param, n
+
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(L3)
+
+ ALIGN(16)
+L(top): mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, -8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, (rp,n,8)
+ add w1, w2
+ adc $0, w3
+L(L3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, 8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(top)
+
+ mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+
+C Switch on n into right addmul_l loop
+ test n, n
+ jz L(r2)
+ cmp $2, R32(n)
+ ja L(r3)
+ jz L(r0)
+ jmp L(r1)
+
+
+L(r3): mov w2, X(-8(rp,n,8),16(rp))
+ mov w3, X((rp,n,8),24(rp))
+ add $2, un
+
+C outer loop(3)
+L(to3): dec vn
+ jz L(ret)
+ mov (vp), v0
+ mov 8(up,un,8), %rax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov un, n
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(al3)
+
+ ALIGN(16)
+L(ta3): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(al3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta3)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+ jmp L(to3)
+
+
+L(r2): mov X(0(up,n,8),(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),-8(rp))
+ add w3, w0
+ adc $0, w1
+ mov X(8(up,n,8),8(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, X((rp,n,8),(rp))
+ add w1, w2
+ adc $0, w3
+ mov X(16(up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(8(rp,n,8),8(rp))
+ add w3, w0
+ adc $0, w1
+ mov w0, X(16(rp,n,8),16(rp))
+ adc $0, w3
+ mov w1, X(24(rp,n,8),24(rp))
+ inc un
+
+C outer loop(2)
+L(to2): dec vn
+ jz L(ret)
+ mov (vp), v0
+ mov 16(up,un,8), %rax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov un, n
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ jmp L(al2)
+
+ ALIGN(16)
+L(ta2): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(al2): mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta2)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+ jmp L(to2)
+
+
+L(r1): mov X(0(up,n,8),8(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),(rp))
+ add w3, w0
+ adc $0, w1
+ mov X(8(up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, X((rp,n,8),8(rp))
+ add w1, w2
+ adc $0, w3
+ mov w2, X(8(rp,n,8),16(rp))
+ mov w3, X(16(rp,n,8),24(rp))
+ add $4, un
+
+C outer loop(1)
+L(to1): dec vn
+ jz L(ret)
+ mov (vp), v0
+ mov -8(up,un,8), %rax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov un, n
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(al1)
+
+ ALIGN(16)
+L(ta1): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(al1): mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta1)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+ jmp L(to1)
+
+
+L(r0): mov X((up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),8(rp))
+ add w3, w0
+ adc $0, w1
+ mov w0, X((rp,n,8),16(rp))
+ mov w1, X(8(rp,n,8),24(rp))
+ add $3, un
+
+C outer loop(0)
+L(to0): dec vn
+ jz L(ret)
+ mov (vp), v0
+ mov (up,un,8), %rax
+ lea 8(vp), vp
+ lea 8(rp), rp
+ mov un, n
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ jmp L(al0)
+
+ ALIGN(16)
+L(ta0): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(al0): mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta0)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+ jmp L(to0)
+
+
+L(ret): pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm
new file mode 100644
index 0000000..d55b1e5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm
@@ -0,0 +1,507 @@
+dnl X86-64 mpn_redc_1 optimised for AMD bobcat.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat 5.0
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single basecases out before the pushes.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+define(`w0', `%rbp')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea (up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(bx0)
+
+L(bx1): test $2, R8(n)
+ jz L(b3)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+
+L(otp1):lea 1(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w1
+ add (up,n,8), w2
+ adc w3, %rbx
+ adc $0, w1
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ adc w1, w2
+ adc $0, w3
+ imul u0inv, %rbx C next q limb
+ jmp L(e1)
+
+ ALIGNx
+L(tp1): add w0, -16(up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov (mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (up,i,8)
+ adc w1, w2
+ adc $0, w3
+L(e1): mov 16(mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, i
+ js L(tp1)
+
+L(ed1): add w0, I(-16(up),-16(up,i,8))
+ adc w1, w2
+ adc $0, w3
+ add w2, I(-8(up),-8(up,i,8))
+ adc $0, w3
+ mov w3, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b3): cmp $-3, R32(n)
+ jz L(n3)
+
+L(otp3):lea 3(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w1
+ add (up,n,8), w2
+ adc w3, %rbx
+ adc $0, w1
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ adc w1, w2
+ adc $0, w3
+ imul u0inv, %rbx C next q limb
+ jmp L(e3)
+
+ ALIGNx
+L(tp3): add w0, -16(up,i,8)
+ adc w1, w2
+ adc $0, w3
+L(e3): mov (mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, i
+ js L(tp3)
+
+L(ed3): add w0, I(-16(up),-16(up,i,8))
+ adc w1, w2
+ adc $0, w3
+ add w2, I(-8(up),-8(up,i,8))
+ adc $0, w3
+ mov w3, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp3)
+C jmp L(cj)
+
+L(cj):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea (up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+IFSTD(` sub $8, %rsp ')
+IFDOS(` sub $40, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(bx0): test $2, R8(n)
+ jnz L(b2)
+
+L(b0):
+L(otp0):lea (n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w3
+ add (up,n,8), w0
+ adc w1, %rbx
+ adc $0, w3
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ adc w3, w0
+ adc $0, w1
+ imul u0inv, %rbx C next q limb
+ jmp L(e0)
+
+ ALIGNx
+L(tp0): add w0, -16(up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov (mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+L(e0): mov 24(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, i
+ js L(tp0)
+
+L(ed0): add w0, I(-16(up),-16(up,i,8))
+ adc w1, w2
+ adc $0, w3
+ add w2, I(-8(up),-8(up,i,8))
+ adc $0, w3
+ mov w3, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp0)
+ jmp L(cj)
+
+L(b2): cmp $-2, R32(n)
+ jz L(n2)
+
+L(otp2):lea 2(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ mov 8(mp,n,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, w3
+ add (up,n,8), w0
+ adc w1, %rbx
+ adc $0, w3
+ mov 16(mp,n,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add 8(up,n,8), %rbx
+ mov %rbx, 8(up,n,8)
+ adc w3, w0
+ adc $0, w1
+ imul u0inv, %rbx C next q limb
+ jmp L(e2)
+
+ ALIGNx
+L(tp2): add w0, -16(up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov (mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+L(e2): mov 8(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (up,i,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(mp,i,8), %rax
+ mul q0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(up,i,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(mp,i,8), %rax
+ mul q0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, i
+ js L(tp2)
+
+L(ed2): add w0, I(-16(up),-16(up,i,8))
+ adc w1, w2
+ adc $0, w3
+ add w2, I(-8(up),-8(up,i,8))
+ adc $0, w3
+ mov w3, (up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp2)
+ jmp L(cj)
+
+L(n1): mov (mp_param), %rax
+ mul q0
+ add -8(up), %rax
+ adc (up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n2): mov (mp_param), %rax
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov -8(up), %r10
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, q0
+ imul u0inv, q0 C next q0
+ mov -16(mp), %rax
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov (up), %r14
+ mul q0
+ add %rax, %r14
+ adc $0, %rdx
+ add %r9, %r14
+ adc $0, %rdx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc 8(up), %rdx
+ mov %r14, (rp)
+ mov %rdx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n3): mov -24(mp), %rax
+ mov -24(up), %r10
+ mul q0
+ add %rax, %r10
+ mov -16(mp), %rax
+ mov %rdx, %r11
+ adc $0, %r11
+ mov -16(up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ add %r11, %rbp
+ mov -8(up), %r10
+ adc $0, %r9
+ mul q0
+ mov %rbp, q0
+ imul u0inv, q0 C next q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ mov %rbp, -16(up)
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, -8(up)
+ mov %r11, -24(up) C up[0]
+ lea 8(up), up C up++
+ dec j
+ jnz L(n3)
+
+ mov -48(up), %rdx
+ mov -40(up), %rbx
+ xor R32(%rax), R32(%rax)
+ add %rbp, %rdx
+ adc %r10, %rbx
+ adc -8(up), %r11
+ mov %rdx, (rp)
+ mov %rbx, 8(rp)
+ mov %r11, 16(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm
new file mode 100644
index 0000000..0e417a1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm
@@ -0,0 +1,565 @@
+dnl AMD64 mpn_sqr_basecase optimised for AMD bobcat.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.5
+C AMD K10 4.5
+C AMD bd1 4.75
+C AMD bobcat 5
+C Intel P4 17.7
+C Intel core2 5.5
+C Intel NHM 5.43
+C Intel SBR 3.92
+C Intel atom 23
+C VIA nano 5.63
+
+C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the
+C multiply insn bandwidth, without any apparent loop branch exit pipeline
+C replays experienced on K8. The structure is unusual: it falls into mul_1 in
+C the same way for all n, then it splits into 4 different wind-down blocks and
+C 4 separate addmul_1 loops.
+C
+C We have not tried using the same addmul_1 loops with a switch into feed-in
+C code, as we do in other basecase implementations. Doing that could save
+C substantial code volume, but would also probably add some overhead.
+
+C TODO
+C * Tune un < 4 code.
+C * Perhaps implement a larger final corner (it is now 2 x 1).
+C * Lots of space could be saved by replacing the "switch" code by gradual
+C jumps out from mul_1 winddown code, perhaps with no added overhead.
+C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+C Standard allocations
+define(`un', `%rbx')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+define(`n', `%rbp')
+define(`v0', `%rcx')
+
+C Temp macro for allowing control over indexing.
+C Define to return $1 for more conservative ptr handling.
+define(`X',`$2')
+dnl define(`X',`$1')
+
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ mov (up), %rax
+
+ cmp $2, R32(un_param)
+ jae L(ge2)
+
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(ge2): mov (up), v0
+ jnz L(g2)
+
+ mul %rax
+ mov %rax, (rp)
+ mov 8(up), %rax
+ mov %rdx, w0
+ mul v0
+ add %rax, w0
+ mov %rdx, w1
+ adc $0, w1
+ mov 8(up), v0
+ mov (up), %rax
+ mul v0
+ add %rax, w0
+ mov w0, 8(rp)
+ mov %rdx, w0 C CAUTION: r8 realloc
+ adc $0, w0
+ mov 8(up), %rax
+ mul v0
+ add w1, w0
+ adc $0, %rdx
+ add w0, %rax
+ adc $0, %rdx
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(g2): cmp $3, R32(un_param)
+ ja L(g3)
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ mov 8(up), %rax
+ mul %rax
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ mov 16(up), %rax
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov (up), v0
+ mov 8(up), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov 16(up), %rax
+ mul v0
+ xor R32(w2), R32(w2)
+ add %rax, w1
+ adc %rdx, w2
+
+ mov 8(up), v0
+ mov 16(up), %rax
+ mul v0
+ xor R32(w3), R32(w3)
+ add %rax, w2
+ adc %rdx, w3
+ add w0, w0
+ adc w1, w1
+ adc w2, w2
+ adc w3, w3
+ mov $0, R32(v0)
+ adc v0, v0
+ add w0, 8(rp)
+ adc w1, 16(rp)
+ adc w2, 24(rp)
+ adc w3, 32(rp)
+ adc v0, 40(rp)
+ FUNC_EXIT()
+ ret
+
+L(g3): push %rbx
+ push %rbp
+
+ mov 8(up), %rax
+ lea -24(rp,un_param,8), rp
+ lea -24(up,un_param,8), up
+ neg un_param
+ push un_param C for sqr_diag_addlsh1
+ lea (un_param), un
+ lea 3(un_param), n
+
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(L3)
+
+ ALIGN(16)
+L(top): mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, -8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, (rp,n,8)
+ add w1, w2
+ adc $0, w3
+L(L3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, 8(rp,n,8)
+ add w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(top)
+
+ mov w0, -16(rp,n,8)
+ add w1, w2
+ adc $0, w3
+
+ test n, n
+ jz L(r2)
+ cmp $2, R32(n)
+ ja L(r3)
+ jz L(r0)
+
+
+L(r1): mov X((up,n,8),8(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),(rp))
+ add w3, w0
+ adc $0, w1
+ mov X(8(up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, X((rp,n,8),8(rp))
+ add w1, w2
+ adc $0, w3
+ mov w2, X(8(rp,n,8),16(rp))
+ mov w3, X(16(rp,n,8),24(rp))
+ add $5, un
+ jmp L(to0)
+
+L(r2): mov X((up,n,8),(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),-8(rp))
+ add w3, w0
+ adc $0, w1
+ mov X(8(up,n,8),8(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ mov w0, X((rp,n,8),(rp))
+ add w1, w2
+ adc $0, w3
+ mov X(16(up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(8(rp,n,8),8(rp))
+ add w3, w0
+ adc $0, w1
+ mov w0, X(16(rp,n,8),16(rp))
+ adc $0, w3
+ mov w1, X(24(rp,n,8),24(rp))
+ add $6, un
+ jmp L(to1)
+
+L(r3): mov w2, X(-8(rp,n,8),16(rp))
+ mov w3, X((rp,n,8),24(rp))
+ add $3, un
+ jmp L(to2)
+
+L(r0): mov X((up,n,8),16(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov w2, X(-8(rp,n,8),8(rp))
+ add w3, w0
+ adc $0, w1
+ mov w0, X((rp,n,8),16(rp))
+ mov w1, X(8(rp,n,8),24(rp))
+ add $4, un
+C jmp L(to3)
+C fall through into main loop
+
+
+L(outer):
+ mov un, n
+ mov (up,un,8), v0
+ mov 8(up,un,8), %rax
+ lea 8(rp), rp
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(al3)
+
+ ALIGN(16)
+L(ta3): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(al3): mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta3)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+
+
+L(to2): mov un, n
+ cmp $-4, R32(un)
+ jnc L(end)
+ add $4, un
+ mov 8(up,n,8), v0
+ mov 16(up,n,8), %rax
+ lea 8(rp), rp
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ jmp L(al2)
+
+ ALIGN(16)
+L(ta2): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(al2): mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta2)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+
+
+L(to1): mov un, n
+ mov -16(up,un,8), v0
+ mov -8(up,un,8), %rax
+ lea 8(rp), rp
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ jmp L(al1)
+
+ ALIGN(16)
+L(ta1): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+L(al1): mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta1)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+
+
+L(to0): mov un, n
+ mov -8(up,un,8), v0
+ mov (up,un,8), %rax
+ lea 8(rp), rp
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ jmp L(al0)
+
+ ALIGN(16)
+L(ta0): add w0, -16(rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov (up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, -8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+L(al0): mov 8(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, (rp,n,8)
+ adc w1, w2
+ adc $0, w3
+ mov 16(up,n,8), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ add w2, 8(rp,n,8)
+ adc w3, w0
+ adc $0, w1
+ mov 24(up,n,8), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add $4, n
+ js L(ta0)
+
+ add w0, X(-16(rp,n,8),8(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(-8(rp,n,8),16(rp))
+ adc $0, w3
+ mov w3, X((rp,n,8),24(rp))
+ jmp L(outer)
+
+
+L(end): mov X(8(up,un,8),(up)), v0
+ mov X(16(up,un,8),8(up)), %rax
+ mul v0
+ mov %rax, w0
+ mov %rdx, w1
+ mov X(24(up,un,8),16(up)), %rax
+ mul v0
+ mov %rax, w2
+ mov %rdx, w3
+ add w0, X(24(rp,un,8),16(rp))
+ adc w1, w2
+ adc $0, w3
+ add w2, X(32(rp,un,8),24(rp))
+ adc $0, w3
+ mov X(16(up,un,8),8(up)), v0
+ mov X(24(up,un,8),16(up)), %rax
+ mul v0
+ add %rax, w3
+ mov w3, X(40(rp,un,8),32(rp))
+ adc $0, %rdx
+ mov %rdx, X(48(rp,un,8),40(rp))
+
+
+C sqr_diag_addlsh1
+
+ lea 16(up), up
+ lea 40(rp), rp
+ pop n
+ lea 2(n,n), n
+
+ mov (up,n,4), %rax
+ mul %rax
+ xor R32(w2), R32(w2)
+
+ mov 8(rp,n,8), w0
+ mov %rax, (rp,n,8)
+ jmp L(lm)
+
+ ALIGN(8)
+L(tsd): add %rbx, w0
+ adc %rax, w1
+ mov w0, -8(rp,n,8)
+ mov 8(rp,n,8), w0
+ mov w1, (rp,n,8)
+L(lm): mov 16(rp,n,8), w1
+ adc w0, w0
+ adc w1, w1
+ lea (%rdx,w2), %rbx
+ mov 8(up,n,4), %rax
+ setc R8(w2)
+ mul %rax
+ add $2, n
+ js L(tsd)
+
+L(esd): add %rbx, w0
+ adc %rax, w1
+ mov w0, X(-8(rp,n,8),-8(rp))
+ mov w1, X((rp,n,8),(rp))
+ adc w2, %rdx
+ mov %rdx, X(8(rp,n,8),8(rp))
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()