Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/core2')
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm        53
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm        53
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm         38
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm      225
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/aors_n.asm           150
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm        188
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/com.asm               37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/copyd.asm             37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/copyi.asm             37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm         243
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm            93
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm           137
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h         222
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/hamdist.asm          210
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/logops_n.asm         285
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/lshift.asm           145
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm          159
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm     975
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm   427
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/popcount.asm         185
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/redc_1.asm           430
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm       169
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/rshift.asm           143
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm     37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm     984
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm         47
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm         47
-rw-r--r--  gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm        158
28 files changed, 5914 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm
new file mode 100644
index 0000000..7066bb4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm
@@ -0,0 +1,53 @@
+dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addlsh1_n)')
+ifdef(`OPERATION_rsblsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_rsblsh1_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
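
This file only parameterizes the shared x86_64/aorrlshC_n.asm body with LSH=1 and RSH=63. The operation itself, rp[] = up[] + (vp[] << 1) with a carry-out of at most 2 (rsblsh1_n analogously computes (vp[] << 1) - up[] and returns a signed borrow), can be modelled in portable C roughly as below. This is an illustrative sketch assuming 64-bit limbs; the ref_ name is hypothetical, not a GMP entry point.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* Sketch of mpn_addlsh1_n semantics: rp[] = up[] + (vp[] << 1),
   returning the carry-out, which is at most 2. */
limb ref_addlsh1_n (limb *rp, const limb *up, const limb *vp, size_t n)
{
  limb cy = 0, hi = 0;                 /* hi = bit shifted out of vp */
  for (size_t i = 0; i < n; i++)
    {
      limb sh = (vp[i] << 1) | hi;     /* LSH = 1 */
      hi = vp[i] >> 63;                /* RSH = 63 */
      limb s = up[i] + sh;
      limb c = s < up[i];              /* carry from the addition */
      rp[i] = s + cy;
      cy = c + (rp[i] < s);            /* carry from adding cy */
    }
  return cy + hi;
}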
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm
new file mode 100644
index 0000000..5065120
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm
@@ -0,0 +1,53 @@
+dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_addlsh2_n)')
+ifdef(`OPERATION_rsblsh2_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_rsblsh2_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm
new file mode 100644
index 0000000..57abf31
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm
@@ -0,0 +1,38 @@
+dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V*2^k +- U.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/coreinhm/aorrlsh_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm
new file mode 100644
index 0000000..3f875ae
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm
@@ -0,0 +1,225 @@
+dnl Core 2 mpn_add_err1_n, mpn_sub_err1_n
+
+dnl Contributed by David Harvey.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 4.14
+C Intel corei ?
+C Intel atom ?
+C VIA nano ?
+
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`ep', `%rcx')
+define(`yp', `%r8')
+define(`n', `%r9')
+define(`cy_param', `8(%rsp)')
+
+define(`el', `%rbx')
+define(`eh', `%rbp')
+define(`t0', `%r10')
+define(`t1', `%r11')
+define(`t2', `%r12')
+define(`t3', `%r13')
+define(`w0', `%r14')
+define(`w1', `%r15')
+
+ifdef(`OPERATION_add_err1_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_err1_n)')
+ifdef(`OPERATION_sub_err1_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_err1_n)')
+
+MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ mov cy_param, %rax
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ lea (rp,n,8), rp
+
+ mov R32(n), R32(%r10)
+ and $3, R32(%r10)
+ jz L(0mod4)
+ cmp $2, R32(%r10)
+ jc L(1mod4)
+ jz L(2mod4)
+L(3mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ xor R32(t0), R32(t0)
+ xor R32(t1), R32(t1)
+ lea -24(yp,n,8), yp
+ neg n
+
+ shr $1, %al C restore carry
+ mov (up,n,8), w0
+ mov 8(up,n,8), w1
+ ADCSBB (vp,n,8), w0
+ mov w0, (rp,n,8)
+ cmovc 16(yp), el
+ ADCSBB 8(vp,n,8), w1
+ mov w1, 8(rp,n,8)
+ cmovc 8(yp), t0
+ mov 16(up,n,8), w0
+ ADCSBB 16(vp,n,8), w0
+ mov w0, 16(rp,n,8)
+ cmovc (yp), t1
+ setc %al C save carry
+ add t0, el
+ adc $0, eh
+ add t1, el
+ adc $0, eh
+
+ add $3, n
+ jnz L(loop)
+ jmp L(end)
+
+ ALIGN(16)
+L(0mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ lea (yp,n,8), yp
+ neg n
+ jmp L(loop)
+
+ ALIGN(16)
+L(1mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ lea -8(yp,n,8), yp
+ neg n
+
+ shr $1, %al C restore carry
+ mov (up,n,8), w0
+ ADCSBB (vp,n,8), w0
+ mov w0, (rp,n,8)
+ cmovc (yp), el
+ setc %al C save carry
+
+ add $1, n
+ jnz L(loop)
+ jmp L(end)
+
+ ALIGN(16)
+L(2mod4):
+ xor R32(el), R32(el)
+ xor R32(eh), R32(eh)
+ xor R32(t0), R32(t0)
+ lea -16(yp,n,8), yp
+ neg n
+
+ shr $1, %al C restore carry
+ mov (up,n,8), w0
+ mov 8(up,n,8), w1
+ ADCSBB (vp,n,8), w0
+ mov w0, (rp,n,8)
+ cmovc 8(yp), el
+ ADCSBB 8(vp,n,8), w1
+ mov w1, 8(rp,n,8)
+ cmovc (yp), t0
+ setc %al C save carry
+ add t0, el
+ adc $0, eh
+
+ add $2, n
+ jnz L(loop)
+ jmp L(end)
+
+ ALIGN(32)
+L(loop):
+ mov (up,n,8), w0
+ shr $1, %al C restore carry
+ mov -8(yp), t0
+ mov $0, R32(t3)
+ ADCSBB (vp,n,8), w0
+ cmovnc t3, t0
+ mov w0, (rp,n,8)
+ mov 8(up,n,8), w1
+ mov 16(up,n,8), w0
+ ADCSBB 8(vp,n,8), w1
+ mov -16(yp), t1
+ cmovnc t3, t1
+ mov -24(yp), t2
+ mov w1, 8(rp,n,8)
+ ADCSBB 16(vp,n,8), w0
+ cmovnc t3, t2
+ mov 24(up,n,8), w1
+ ADCSBB 24(vp,n,8), w1
+ cmovc -32(yp), t3
+ setc %al C save carry
+ add t0, el
+ adc $0, eh
+ add t1, el
+ adc $0, eh
+ add t2, el
+ adc $0, eh
+ lea -32(yp), yp
+ mov w0, 16(rp,n,8)
+ add t3, el
+ adc $0, eh
+ add $4, n
+ mov w1, -8(rp,n,8)
+ jnz L(loop)
+
+L(end):
+ mov el, (ep)
+ mov eh, 8(ep)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+EPILOGUE()
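
The err1 variants perform an ordinary add/sub with carry, plus a two-limb "error" accumulator. As the cmovc/yp addressing above reads, a carry (borrow) out of limb position i adds yp[n-1-i] into the 128-bit value stored at {ep,2}. A portable C model of that reading (an illustrative sketch, not GMP's generic code; 64-bit limbs assumed):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* rp[] = up[] + vp[] + cy; {ep,2} accumulates yp[n-1-i] for every
   limb position i that produces a carry-out.  Returns final carry. */
limb ref_add_err1_n (limb *rp, const limb *up, const limb *vp,
                     limb *ep, const limb *yp, size_t n, limb cy)
{
  limb el = 0, eh = 0;
  for (size_t i = 0; i < n; i++)
    {
      limb s = up[i] + vp[i];
      limb c = s < up[i];
      rp[i] = s + cy;
      cy = c + (rp[i] < s);
      if (cy)                          /* carry out of position i */
        {
          el += yp[n - 1 - i];         /* 128-bit accumulation */
          eh += el < yp[n - 1 - i];
        }
    }
  ep[0] = el;
  ep[1] = eh;
  return cy;
}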
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm
new file mode 100644
index 0000000..f9e0039
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm
@@ -0,0 +1,150 @@
+dnl Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem.
+
+dnl Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2
+C AMD K10 1.93\2
+C AMD bull 1.62\2.1
+C AMD pile 1.6\1.7
+C AMD steam
+C AMD excavator
+C AMD bobcat 2.79
+C AMD jaguar 2.54
+C Intel P4 10
+C Intel core2 2
+C Intel NHM 2
+C Intel SBR 2
+C Intel IBR 1.95
+C Intel HWL 1.72
+C Intel BWL 1.54
+C Intel SKL 1.52
+C Intel atom 9
+C Intel SLM 6.5
+C VIA nano 3
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ xor %r8, %r8
+L(start):
+ mov (up), %r10
+ mov (vp), %r11
+
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ lea (rp,n,8), rp
+ mov R32(n), R32(%rax)
+ neg n
+ and $3, R32(%rax)
+ je L(b00)
+ add %rax, n C clear low rcx bits for jrcxz
+ cmp $2, R32(%rax)
+ jl L(b01)
+ je L(b10)
+
+L(b11): neg %r8 C set cy
+ jmp L(e11)
+
+L(b00): neg %r8 C set cy
+ mov %r10, %r8
+ mov %r11, %r9
+ lea 4(n), n
+ jmp L(e00)
+
+ nop
+ nop
+ nop
+L(b01): neg %r8 C set cy
+ jmp L(top)
+
+L(b10): neg %r8 C set cy
+ mov %r10, %r8
+ mov %r11, %r9
+ jmp L(e10)
+
+L(end): ADCSBB %r11, %r10
+ mov %r10, -8(rp)
+ mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0
+ adc R32(%rax), R32(%rax)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(top): jrcxz L(end)
+ mov (up,n,8), %r8
+ mov (vp,n,8), %r9
+ lea 4(n), n
+ ADCSBB %r11, %r10
+ mov %r10, -40(rp,n,8)
+L(e00): mov -24(up,n,8), %r10
+ mov -24(vp,n,8), %r11
+ ADCSBB %r9, %r8
+ mov %r8, -32(rp,n,8)
+L(e11): mov -16(up,n,8), %r8
+ mov -16(vp,n,8), %r9
+ ADCSBB %r11, %r10
+ mov %r10, -24(rp,n,8)
+L(e10): mov -8(up,n,8), %r10
+ mov -8(vp,n,8), %r11
+ ADCSBB %r9, %r8
+ mov %r8, -16(rp,n,8)
+ jmp L(top)
+EPILOGUE()
+
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ jmp L(start)
+EPILOGUE()
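
The loop above is four-way unrolled and uses jrcxz on the bias-adjusted index for loop control (hence the "clear low rcx bits for jrcxz" comment), keeping the carry chain alive in ADCSBB across iterations. Functionally it is plain multi-limb add/sub with carry-in and carry-out, roughly the following portable C (a sketch; the ref_ name is illustrative):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* rp[] = up[] + vp[] + cy (cy in {0,1}); returns the carry-out.
   For mpn_sub_nc, replace both additions by subtractions. */
limb ref_add_nc (limb *rp, const limb *up, const limb *vp, size_t n, limb cy)
{
  for (size_t i = 0; i < n; i++)
    {
      limb s = up[i] + cy;
      limb c = s < cy;                 /* carry from adding cy */
      rp[i] = s + vp[i];
      cy = c + (rp[i] < s);            /* carry from adding vp[i] */
    }
  return cy;
}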
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm
new file mode 100644
index 0000000..a7a5d6e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm
@@ -0,0 +1,188 @@
+dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".
+
+dnl Copyright 2003-2005, 2007-2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 4.52
+C AMD K10 4.01
+C AMD bull 4.98
+C AMD pile 4.83
+C AMD steam
+C AMD excavator
+C AMD bobcat 5.56
+C AMD jaguar 5.54
+C Intel P4 16.3 17.3
+C Intel core2 4.32 4.61
+C Intel NHM 5.08
+C Intel SBR 4.04
+C Intel IBR 3.95
+C Intel HWL 3.66
+C Intel BWL 2.87
+C Intel SKL 2.79
+C Intel atom 20.6
+C Intel SLM 7.6
+C VIA nano 5.25
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`v0', `%rcx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+ define(`func_1c', `mpn_addmul_1c')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+ define(`func_1c', `mpn_submul_1c')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ C For DOS, on the stack we have four saved registers, return address,
+ C space for four register arguments, and finally the carry input.
+
+IFDOS(` define(`carry_in', `72(%rsp)')') dnl
+IFSTD(` define(`carry_in', `%r8')') dnl
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_1c)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ lea (%rdx), %rbx
+ neg %rbx
+
+ mov (up), %rax
+ mov (rp), %r10
+
+ lea -16(rp,%rdx,8), rp
+ lea (up,%rdx,8), up
+ mul %rcx
+ add carry_in, %rax
+ adc $0, %rdx
+ jmp L(start_nc)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+ lea (%rdx), %rbx
+ neg %rbx
+
+ mov (up), %rax
+ mov (rp), %r10
+
+ lea -16(rp,%rdx,8), rp
+ lea (up,%rdx,8), up
+ mul %rcx
+
+L(start_nc):
+ test $1, R8(%rbx)
+ jnz L(odd)
+
+ lea (%rax), %r11
+ mov 8(up,%rbx,8), %rax
+ lea (%rdx), %rbp
+ mul %rcx
+ add $2, %rbx
+ jz L(n2)
+
+ lea (%rax), %r8
+ mov (up,%rbx,8), %rax
+ lea (%rdx), %r9
+ jmp L(mid)
+
+ ALIGN(8)
+L(odd): inc %rbx
+ jz L(n1)
+
+ lea (%rax), %r8
+ mov (up,%rbx,8), %rax
+ lea (%rdx), %r9
+ mul %rcx
+ lea (%rax), %r11
+ mov 8(up,%rbx,8), %rax
+ lea (%rdx), %rbp
+ jmp L(e)
+
+ ALIGN(16)
+L(top): mul %rcx
+ ADDSUB %r8, %r10
+ lea (%rax), %r8
+ mov (up,%rbx,8), %rax
+ adc %r9, %r11
+ mov %r10, -8(rp,%rbx,8)
+ mov (rp,%rbx,8), %r10
+ lea (%rdx), %r9
+ adc $0, %rbp
+L(mid): mul %rcx
+ ADDSUB %r11, %r10
+ lea (%rax), %r11
+ mov 8(up,%rbx,8), %rax
+ adc %rbp, %r8
+ mov %r10, (rp,%rbx,8)
+ mov 8(rp,%rbx,8), %r10
+ lea (%rdx), %rbp
+ adc $0, %r9
+L(e): add $2, %rbx
+ js L(top)
+
+ mul %rcx
+ ADDSUB %r8, %r10
+ adc %r9, %r11
+ mov %r10, -8(rp)
+ adc %rbx, %rbp C rbx = 0
+L(n2): mov (rp), %r10
+ ADDSUB %r11, %r10
+ adc %rbp, %rax
+ mov %r10, (rp)
+ adc %rbx, %rdx C rbx = 0
+L(n1): mov 8(rp), %r10
+ ADDSUB %rax, %r10
+ mov %r10, 8(rp)
+ mov R32(%rbx), R32(%rax) C rbx = 0
+ adc %rdx, %rax
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
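
mpn_addmul_1/mpn_submul_1 multiply {up,n} by the single limb v0 and add/subtract the product into {rp,n}, returning the high limb of the running carry; the asm pipelines two mul results per iteration to hide latency. A C model of the semantics using the compiler's 128-bit type (sketch; assumes GCC/Clang __int128):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* rp[] += up[] * v0; the return value is the final carry limb.
   (For submul_1, subtract the product limb from rp[i] instead and
   propagate a borrow.) */
limb ref_addmul_1 (limb *rp, const limb *up, size_t n, limb v0)
{
  limb cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (limb) t;
      cy = (limb) (t >> 64);
    }
  return cy;
}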
diff --git a/gmp-6.3.0/mpn/x86_64/core2/com.asm b/gmp-6.3.0/mpn/x86_64/core2/com.asm
new file mode 100644
index 0000000..d7d9f79
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/com.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_com.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/copyd.asm b/gmp-6.3.0/mpn/x86_64/core2/copyd.asm
new file mode 100644
index 0000000..57ea0e5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/copyi.asm b/gmp-6.3.0/mpn/x86_64/core2/copyi.asm
new file mode 100644
index 0000000..f0c7607
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm
new file mode 100644
index 0000000..1b3f139
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm
@@ -0,0 +1,243 @@
+dnl x86-64 mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 2004, 2005, 2007-2010, 2012, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C norm unorm frac
+C AMD K8,K9 15 15 12
+C AMD K10 15 15 12
+C Intel P4 44 44 43
+C Intel core2 24 24 19.5
+C Intel corei 19 19 18
+C Intel atom 51 51 36
+C VIA nano 46 44 22.5
+
+C mp_limb_t
+C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
+C mp_srcptr np, mp_size_t nn, mp_limb_t d)
+
+C mp_limb_t
+C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
+C mp_srcptr np, mp_size_t nn, mp_limb_t d,
+C mp_limb_t dinv, int cnt)
+
+C INPUT PARAMETERS
+define(`qp', `%rdi')
+define(`fn_param', `%rsi')
+define(`up_param', `%rdx')
+define(`un_param', `%rcx')
+define(`d', `%r8')
+define(`dinv', `%r9') C only for mpn_preinv_divrem_1
+C shift passed on stack C only for mpn_preinv_divrem_1
+
+define(`cnt', `%rcx')
+define(`up', `%rsi')
+define(`fn', `%r12')
+define(`un', `%rbx')
+
+
+C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C cnt qp d dinv
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFSTD(`define(`CNTOFF', `40($1)')')
+IFDOS(`define(`CNTOFF', `104($1)')')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ xor R32(%rax), R32(%rax)
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov fn_param, fn
+ mov un_param, un
+ add fn_param, un_param
+ mov up_param, up
+
+ lea -8(qp,un_param,8), qp
+
+ mov CNTOFF(%rsp), R8(cnt)
+ shl R8(cnt), d
+ jmp L(ent)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ xor R32(%rax), R32(%rax)
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov fn_param, fn
+ mov un_param, un
+ add fn_param, un_param
+ mov up_param, up
+ je L(ret)
+
+ lea -8(qp,un_param,8), qp
+ xor R32(%rbp), R32(%rbp)
+
+L(unnormalized):
+ test un, un
+ je L(44)
+ mov -8(up,un,8), %rax
+ cmp d, %rax
+ jae L(44)
+ mov %rbp, (qp)
+ mov %rax, %rbp
+ lea -8(qp), qp
+ je L(ret)
+ dec un
+L(44):
+ bsr d, %rcx
+ not R32(%rcx)
+ sal R8(%rcx), d
+ sal R8(%rcx), %rbp
+
+ push %rcx
+IFSTD(` push %rdi ')
+IFSTD(` push %rsi ')
+ push %r8
+IFSTD(` sub $8, %rsp ')
+IFSTD(` mov d, %rdi ')
+IFDOS(` sub $40, %rsp ')
+IFDOS(` mov d, %rcx ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_invert_limb)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+ pop %r8
+IFSTD(` pop %rsi ')
+IFSTD(` pop %rdi ')
+ pop %rcx
+
+ mov %rax, dinv
+ mov %rbp, %rax
+ test un, un
+ je L(frac)
+
+L(ent): mov -8(up,un,8), %rbp
+ shr R8(%rcx), %rax
+ shld R8(%rcx), %rbp, %rax
+ sub $2, un
+ js L(end)
+
+ ALIGN(16)
+L(top): lea 1(%rax), %r11
+ mul dinv
+ mov (up,un,8), %r10
+ shld R8(%rcx), %r10, %rbp
+ mov %rbp, %r13
+ add %rax, %r13
+ adc %r11, %rdx
+ mov %rdx, %r11
+ imul d, %rdx
+ sub %rdx, %rbp
+ lea (d,%rbp), %rax
+ sub $8, qp
+ cmp %r13, %rbp
+ cmovc %rbp, %rax
+ adc $-1, %r11
+ cmp d, %rax
+ jae L(ufx)
+L(uok): dec un
+ mov %r11, 8(qp)
+ mov %r10, %rbp
+ jns L(top)
+
+L(end): lea 1(%rax), %r11
+ sal R8(%rcx), %rbp
+ mul dinv
+ add %rbp, %rax
+ adc %r11, %rdx
+ mov %rax, %r11
+ mov %rdx, %r13
+ imul d, %rdx
+ sub %rdx, %rbp
+ mov d, %rax
+ add %rbp, %rax
+ cmp %r11, %rbp
+ cmovc %rbp, %rax
+ adc $-1, %r13
+ cmp d, %rax
+ jae L(efx)
+L(eok): mov %r13, (qp)
+ sub $8, qp
+ jmp L(frac)
+
+L(ufx): sub d, %rax
+ inc %r11
+ jmp L(uok)
+L(efx): sub d, %rax
+ inc %r13
+ jmp L(eok)
+
+L(frac):mov d, %rbp
+ neg %rbp
+ jmp L(fent)
+
+ ALIGN(16) C K8-K10 P6-CNR P6-NHM P4
+L(ftop):mul dinv C 0,12 0,17 0,17
+ add %r11, %rdx C 5 8 10
+ mov %rax, %r11 C 4 8 3
+ mov %rdx, %r13 C 6 9 11
+ imul %rbp, %rdx C 6 9 11
+ mov d, %rax C
+ add %rdx, %rax C 10 14 14
+ cmp %r11, %rdx C 10 14 14
+ cmovc %rdx, %rax C 11 15 15
+ adc $-1, %r13 C
+ mov %r13, (qp) C
+ sub $8, qp C
+L(fent):lea 1(%rax), %r11 C
+ dec fn C
+ jns L(ftop) C
+
+ shr R8(%rcx), %rax
+L(ret): pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ FUNC_EXIT()
+ ret
+EPILOGUE()
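
Per the prototypes quoted above, mpn_divrem_1 produces nn integer quotient limbs plus fn additional "fraction" limbs, i.e. it divides {np,nn} * B^fn by d where B = 2^64, returning the remainder; mpn_preinv_divrem_1 does the same given a precomputed inverse and shift count (here the asm instead multiplies by the inverse from mpn_invert_limb rather than dividing). A schoolbook C model of the semantics (sketch; assumes GCC/Clang __int128 and d != 0):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* Quotient limbs qp[0 .. nn+fn-1], least significant first; qp[0..fn-1]
   are the fraction limbs.  Returns the remainder, 0 <= r < d. */
limb ref_divrem_1 (limb *qp, size_t fn, const limb *np, size_t nn, limb d)
{
  limb r = 0;
  for (size_t i = nn + fn; i-- > 0; )
    {
      unsigned __int128 num =
        ((unsigned __int128) r << 64) | (i >= fn ? np[i - fn] : 0);
      qp[i] = (limb) (num / d);        /* one 128/64 division per limb */
      r = (limb) (num % d);
    }
  return r;
}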
diff --git a/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm
new file mode 100644
index 0000000..b00451f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm
@@ -0,0 +1,93 @@
+dnl AMD64 mpn_gcd_11 optimised for Intel CNR, PNR, SBR, IBR.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit (approx)
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD bt1 ?
+C AMD bt2 ?
+C AMD zn1 ?
+C AMD zn2 ?
+C Intel P4 ?
+C Intel CNR 4.22 *
+C Intel PNR 4.22 *
+C Intel NHM 4.97
+C Intel WSM 5.17
+C Intel SBR 4.83 *
+C Intel IBR 4.16 *
+C Intel HWL 3.84
+C Intel BWL 3.76
+C Intel SKL 3.83
+C Intel atom ?
+C Intel SLM ?
+C Intel GLM ?
+C Intel GLM+ ?
+C VIA nano ?
+
+define(`u0', `%rdi')
+define(`v0', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+ FUNC_ENTRY(2)
+ jmp L(odd)
+
+ ALIGN(16)
+L(top): cmovc %rdx, u0 C u = |u - v|
+ cmovc %rax, v0 C v = min(u,v)
+ shr R8(%rcx), u0
+L(odd): mov v0, %rdx
+ sub u0, %rdx C v - u
+ bsf %rdx, %rcx
+ mov u0, %rax
+ sub v0, u0 C u - v
+ jnz L(top)
+
+L(end): C rax = result
+ C rdx = 0 for the benefit of internal gcd_22 call
+ FUNC_EXIT()
+ ret
+EPILOGUE()
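
The loop is the classic right-shift binary GCD on two odd limbs: each iteration replaces u by |u - v| shifted right by its trailing-zero count, and v by min(u,v), as the cmovc comments say. A C model (sketch; inputs must be odd and nonzero; __builtin_ctzll is a GCC/Clang builtin):

#include <stdint.h>

typedef uint64_t limb;

limb ref_gcd_11 (limb u, limb v)
{
  while (u != v)
    {
      if (u < v) { limb t = u; u = v; v = t; }   /* v = min(u,v) */
      u -= v;                                    /* u = |u - v|, even */
      u >>= __builtin_ctzll (u);                 /* make u odd again */
    }
  return u;
}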
diff --git a/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm
new file mode 100644
index 0000000..b5aa73b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm
@@ -0,0 +1,137 @@
+dnl AMD64 mpn_gcd_22. Assumes useful bsf, useful shrd, no tzcnt, no shlx.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD bt1 ?
+C AMD bt2 ?
+C AMD zn1 ?
+C AMD zn2 ?
+C Intel P4 ?
+C Intel CNR 8.7
+C Intel PNR 8.7
+C Intel NHM 9.2
+C Intel WSM 9.2
+C Intel SBR 9.1
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C Intel GLM ?
+C Intel GLM+ ?
+C VIA nano ?
+
+
+define(`u1', `%rdi')
+define(`u0', `%rsi')
+define(`v1', `%rdx')
+define(`v0_param', `%rcx')
+
+define(`v0', `%rax')
+define(`cnt', `%rcx')
+
+define(`s0', `%r8')
+define(`s1', `%r9')
+define(`t0', `%r10')
+define(`t1', `%r11')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+ FUNC_ENTRY(4)
+ mov v0_param, v0
+
+ ALIGN(16)
+L(top): mov v0, t0
+ sub u0, t0
+ jz L(lowz) C jump when low limb result = 0
+ mov v1, t1
+ sbb u1, t1
+
+ mov u0, s0
+ mov u1, s1
+
+ bsf t0, cnt
+
+ sub v0, u0
+ sbb v1, u1
+
+L(bck): cmovc t0, u0 C u = |u - v|
+ cmovc t1, u1 C u = |u - v|
+ cmovc s0, v0 C v = min(u,v)
+ cmovc s1, v1 C v = min(u,v)
+
+ shrd R8(cnt), u1, u0
+ shr R8(cnt), u1
+
+ mov v1, t1
+ or u1, t1
+ jnz L(top)
+
+L(gcd_11):
+ mov v0, %rdi
+C mov u0, %rsi
+ TCALL( mpn_gcd_11)
+
+L(lowz):C We come here when v0 - u0 = 0
+ C 1. If v1 - u1 = 0, then gcd is u = v.
+ C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+ mov v1, t0
+ sub u1, t0
+ je L(end)
+
+ xor t1, t1
+ mov u0, s0
+ mov u1, s1
+ bsf t0, cnt
+ mov u1, u0
+ xor u1, u1
+ sub v1, u0
+ jmp L(bck)
+
+L(end): C mov v0, %rax
+ C mov v1, %rdx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
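
gcd_22 is the same subtract-and-shift idea lifted to two-limb (128-bit) operands, with the L(lowz) path handling a difference whose low limb is zero and a tail call into gcd_11 once the high limbs vanish. A compact C model of the idea, not a transcription of the register scheduling (sketch; odd inputs assumed; __int128 is a GCC/Clang extension):

#include <stdint.h>

typedef uint64_t limb;
typedef unsigned __int128 dlimb;

dlimb ref_gcd_22 (dlimb u, dlimb v)
{
  while (u != v)
    {
      if (u < v) { dlimb t = u; u = v; v = t; }  /* v = min(u,v) */
      u -= v;                                    /* even, nonzero */
      limb lo = (limb) u;
      if (lo == 0) { u >>= 64; lo = (limb) u; }  /* cf. L(lowz) */
      u >>= __builtin_ctzll (lo);
    }
  return u;
}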
diff --git a/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h
new file mode 100644
index 0000000..44f1494
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* Core 2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3000 MHz Penryn */
+/* FFT tuning limit = 116,220,984 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 3
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 16
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 26
+
+#define DIV_1_VS_MUL_1_PERCENT 284
+
+#define MUL_TOOM22_THRESHOLD 24
+#define MUL_TOOM33_THRESHOLD 65
+#define MUL_TOOM44_THRESHOLD 184
+#define MUL_TOOM6H_THRESHOLD 256
+#define MUL_TOOM8H_THRESHOLD 381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 79
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 102
+#define SQR_TOOM4_THRESHOLD 160
+#define SQR_TOOM6_THRESHOLD 366
+#define SQR_TOOM8_THRESHOLD 478
+
+#define MULMID_TOOM42_THRESHOLD 32
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 368 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 368, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \
+ { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \
+ { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \
+ { 83,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 79,11}, { 47,10}, { 95,12}, { 31, 9}, \
+ { 255,10}, { 135,11}, { 79,10}, { 159, 9}, \
+ { 319,11}, { 95,10}, { 191, 9}, { 383,11}, \
+ { 111,12}, { 63,11}, { 127,10}, { 271,11}, \
+ { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \
+ { 159,10}, { 319,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,10}, { 415,13}, { 63,12}, \
+ { 127,11}, { 271,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 319,10}, { 639,11}, { 351,12}, \
+ { 191,11}, { 415,12}, { 223,11}, { 479,13}, \
+ { 127,12}, { 255,11}, { 543,12}, { 287,11}, \
+ { 607,12}, { 319,11}, { 639,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 479,14}, { 127,13}, \
+ { 255,12}, { 575,13}, { 319,12}, { 703,13}, \
+ { 383,12}, { 799,13}, { 447,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \
+ { 1151,13}, { 703,14}, { 383,13}, { 831,12}, \
+ { 1663,13}, { 959,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \
+ { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \
+ { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \
+ { 4863,14}, { 1279,13}, { 2559,14}, { 1407,13}, \
+ { 2815,15}, { 767,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1791,16}, { 511,15}, { 1023,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \
+ { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \
+ { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \
+ { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \
+ { 4991,15}, { 2815,14}, { 5887,13}, { 11775,16}, \
+ { 1535,15}, { 3327,14}, { 6911,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 176
+#define MUL_FFT_THRESHOLD 4736
+
+#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 308, 5}, { 17, 6}, { 23, 7}, { 12, 6}, \
+ { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \
+ { 31,10}, { 79,11}, { 47,12}, { 31,11}, \
+ { 63,10}, { 127, 9}, { 255,11}, { 79,10}, \
+ { 159, 6}, { 2559, 7}, { 1343, 6}, { 2687, 7}, \
+ { 1407, 9}, { 383,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319,11}, { 175,12}, { 95,11}, { 191,10}, \
+ { 383,11}, { 207,10}, { 415,13}, { 63,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \
+ { 543,11}, { 287,10}, { 575,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 351,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,12}, { 223,11}, \
+ { 479,13}, { 127,12}, { 255,11}, { 543,12}, \
+ { 287,11}, { 575,12}, { 319,11}, { 639,12}, \
+ { 351,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 575,13}, \
+ { 319,12}, { 703,13}, { 383,12}, { 799,13}, \
+ { 447,12}, { 895,14}, { 255,13}, { 511,12}, \
+ { 1023,13}, { 575,12}, { 1151,13}, { 639,12}, \
+ { 1279,13}, { 703,14}, { 383,13}, { 767,12}, \
+ { 1535,13}, { 959,15}, { 255,14}, { 511,13}, \
+ { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1407,14}, { 767,13}, \
+ { 1599,12}, { 3199,13}, { 1663,14}, { 895,15}, \
+ { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \
+ { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \
+ { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,16}, { 511,15}, { 1023,14}, { 2303,13}, \
+ { 4607,14}, { 2431,13}, { 4863,15}, { 1279,14}, \
+ { 2943,13}, { 5887,12}, { 11775,15}, { 1535,14}, \
+ { 3455,15}, { 1791,14}, { 3583,13}, { 7167,14}, \
+ { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \
+ { 2303,14}, { 4863,15}, { 2815,14}, { 5887,13}, \
+ { 11775,16}, { 1535,15}, { 3071,14}, { 6143,15}, \
+ { 3327,14}, { 6911,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 183
+#define SQR_FFT_THRESHOLD 3520
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 67
+#define MULLO_MUL_N_THRESHOLD 9174
+#define SQRLO_BASECASE_THRESHOLD 10
+#define SQRLO_DC_THRESHOLD 11
+#define SQRLO_SQR_THRESHOLD 7035
+
+#define DC_DIV_QR_THRESHOLD 53
+#define DC_DIVAPPR_Q_THRESHOLD 163
+#define DC_BDIV_QR_THRESHOLD 46
+#define DC_BDIV_Q_THRESHOLD 76
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 158
+#define INV_APPR_THRESHOLD 167
+
+#define BINV_NEWTON_THRESHOLD 248
+#define REDC_1_TO_REDC_N_THRESHOLD 44
+
+#define MU_DIV_QR_THRESHOLD 1187
+#define MU_DIVAPPR_Q_THRESHOLD 1210
+#define MUPI_DIV_QR_THRESHOLD 73
+#define MU_BDIV_QR_THRESHOLD 1017
+#define MU_BDIV_Q_THRESHOLD 1187
+
+#define POWM_SEC_TABLE 1,64,105,579,1486
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 17
+#define SET_STR_DC_THRESHOLD 134
+#define SET_STR_PRECOMPUTE_THRESHOLD 1752
+
+#define FAC_DSC_THRESHOLD 351
+#define FAC_ODD_THRESHOLD 27
+
+#define MATRIX22_STRASSEN_THRESHOLD 18
+#define HGCD2_DIV1_METHOD 3 /* 2.14% faster than 5 */
+#define HGCD_THRESHOLD 118
+#define HGCD_APPR_THRESHOLD 161
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 416
+#define GCDEXT_DC_THRESHOLD 351
+#define JACOBI_BASE_METHOD 4 /* 3.56% faster than 1 */
+
+/* Tuneup completed successfully, took 132491 seconds */
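
These tuned constants are crossover points: generic GMP code compares the operand size against a *_THRESHOLD and picks whichever algorithm is cheaper on this microarchitecture. A toy illustration of that dispatch pattern (hypothetical names, not GMP internals):

#include <stdio.h>

#define MUL_TOOM22_THRESHOLD 24     /* value tuned above for 3 GHz Penryn */

/* Toy dispatcher in the style the thresholds are consumed. */
static const char *pick_mul (long n_limbs)
{
  return n_limbs < MUL_TOOM22_THRESHOLD
         ? "basecase (schoolbook, O(n^2))"
         : "toom22 (Karatsuba split)";
}

int main (void)
{
  for (long n = 8; n <= 64; n *= 2)
    printf ("%2ld limbs -> %s\n", n, pick_mul (n));
  return 0;
}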
diff --git a/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm
new file mode 100644
index 0000000..ded7b67
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm
@@ -0,0 +1,210 @@
+dnl AMD64 SSSE3 mpn_hamdist -- hamming distance.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb good for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 n/a
+C Intel CNR 4.50 y
+C Intel PNR 3.28 y
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C VIA nano ?
+
+C TODO
+C * This was hand-written without too much thought about optimal insn
+C selection; check to see if it can be improved.
+C * Consider doing some instruction scheduling.
+
+define(`up', `%rdi')
+define(`vp', `%rsi')
+define(`n', `%rdx')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_hamdist)
+ lea L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
+ `define(`OFF1',64) define(`OFF2',80)')
+ movdqa OFF1`'(%r9), %xmm7
+ movdqa OFF2`'(%r9), %xmm6
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm8, %xmm8
+
+ mov R32(n), R32(%rax)
+ and $7, R32(%rax)
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ add %r9, %rax
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+
+L(1): movq (up), %xmm1
+ add $8, up
+ movq (vp), %xmm10
+ add $8, vp
+ pxor %xmm10, %xmm1
+ jmp L(e1)
+
+L(2): add $-48, up
+ add $-48, vp
+ jmp L(e2)
+
+L(3): movq (up), %xmm1
+ add $-40, up
+ movq (vp), %xmm10
+ add $-40, vp
+ pxor %xmm10, %xmm1
+ jmp L(e3)
+
+L(4): add $-32, up
+ add $-32, vp
+ jmp L(e4)
+
+L(5): movq (up), %xmm1
+ add $-24, up
+ movq (vp), %xmm10
+ add $-24, vp
+ pxor %xmm10, %xmm1
+ jmp L(e5)
+
+L(6): add $-16, up
+ add $-16, vp
+ jmp L(e6)
+
+L(7): movq (up), %xmm1
+ add $-8, up
+ movq (vp), %xmm10
+ add $-8, vp
+ pxor %xmm10, %xmm1
+ jmp L(e7)
+
+ ALIGN(32)
+L(top): lddqu (up), %xmm1
+ lddqu (vp), %xmm10
+ pxor %xmm10, %xmm1
+L(e7): movdqa %xmm6, %xmm0 C copy mask register
+ movdqa %xmm7, %xmm2 C copy count register
+ movdqa %xmm7, %xmm3 C copy count register
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e6): lddqu 16(up), %xmm1
+ lddqu 16(vp), %xmm10
+ pxor %xmm10, %xmm1
+L(e5): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e4): lddqu 32(up), %xmm1
+ lddqu 32(vp), %xmm10
+ pxor %xmm10, %xmm1
+L(e3): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e2): lddqu 48(up), %xmm1
+ add $64, up
+ lddqu 48(vp), %xmm10
+ add $64, vp
+ pxor %xmm10, %xmm1
+L(e1): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts
+ paddb %xmm2, %xmm3
+ paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts
+ movdqa %xmm3, %xmm4
+ sub $8, n
+ jg L(top)
+
+ psadbw %xmm5, %xmm4
+ paddq %xmm4, %xmm8
+ pshufd $14, %xmm8, %xmm0
+ paddq %xmm8, %xmm0
+ movd %xmm0, %rax
+ ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+ JMPENT( L(top), L(cnsts))
+ JMPENT( L(1), L(cnsts))
+ JMPENT( L(2), L(cnsts))
+ JMPENT( L(3), L(cnsts))
+ JMPENT( L(4), L(cnsts))
+ JMPENT( L(5), L(cnsts))
+ JMPENT( L(6), L(cnsts))
+ JMPENT( L(7), L(cnsts))
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
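
The kernel XORs 16 bytes of each operand, then counts bits with the SSSE3 pshufb nibble-table trick: the first 16 bytes of L(cnsts) hold the popcounts of the values 0..15, looked up once for the low and once for the high nibble of every byte, with psadbw/paddq folding the byte counts into 64-bit totals. A scalar C model of the same table technique (sketch, assuming 64-bit limbs; the ref_ name is illustrative):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

static const uint8_t nibble_pc[16] =     /* same bytes as in L(cnsts) */
  { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };

limb ref_hamdist (const limb *up, const limb *vp, size_t n)
{
  limb dist = 0;
  for (size_t i = 0; i < n; i++)
    {
      limb x = up[i] ^ vp[i];            /* differing bits */
      for (int b = 0; b < 64; b += 8)
        {
          uint8_t byte = (uint8_t) (x >> b);
          dist += nibble_pc[byte & 0xf] + nibble_pc[byte >> 4];
        }
    }
  return dist;
}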
diff --git a/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm b/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm
new file mode 100644
index 0000000..5ff174c
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm
@@ -0,0 +1,285 @@
+dnl AMD64 logops.
+
+dnl Copyright 2004-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C c/l c/l c/l good
+C var-1 var-2 var-3 for cpu?
+C AMD K8,K9
+C AMD K10 1.52 1.75 1.75 n
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD bt1 2.67 ~2.79 ~2.79 =
+C AMD bt2 2.15 2.65 2.65 n
+C AMD zen 1.5 1.5 1.5 =
+C Intel P4
+C Intel PNR 2.0 2.0 2.0 =
+C Intel NHM 2.0 2.0 2.0 =
+C Intel SBR 1.5 1.5 1.5 y
+C Intel IBR 1.47 1.48 1.48 y
+C Intel HWL 1.11 1.35 1.35 y
+C Intel BWL 1.09 1.30 1.30 y
+C Intel SKL 1.21 1.27 1.27 y
+C Intel atom 3.31 3.57 3.57 y
+C Intel SLM 3.0 3.0 3.0 =
+C VIA nano
+
+ifdef(`OPERATION_and_n',`
+ define(`func',`mpn_and_n')
+ define(`VARIANT_1')
+ define(`LOGOP',`and')')
+ifdef(`OPERATION_andn_n',`
+ define(`func',`mpn_andn_n')
+ define(`VARIANT_2')
+ define(`LOGOP',`and')')
+ifdef(`OPERATION_nand_n',`
+ define(`func',`mpn_nand_n')
+ define(`VARIANT_3')
+ define(`LOGOP',`and')')
+ifdef(`OPERATION_ior_n',`
+ define(`func',`mpn_ior_n')
+ define(`VARIANT_1')
+ define(`LOGOP',`or')')
+ifdef(`OPERATION_iorn_n',`
+ define(`func',`mpn_iorn_n')
+ define(`VARIANT_2')
+ define(`LOGOP',`or')')
+ifdef(`OPERATION_nior_n',`
+ define(`func',`mpn_nior_n')
+ define(`VARIANT_3')
+ define(`LOGOP',`or')')
+ifdef(`OPERATION_xor_n',`
+ define(`func',`mpn_xor_n')
+ define(`VARIANT_1')
+ define(`LOGOP',`xor')')
+ifdef(`OPERATION_xnor_n',`
+ define(`func',`mpn_xnor_n')
+ define(`VARIANT_2')
+ define(`LOGOP',`xor')')
+
+define(`addptr', `lea $1($2), $2')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+
+ifdef(`VARIANT_1',`
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (vp), %r8
+ mov R32(%rcx), R32(%rax)
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): LOGOP (up), %r8
+ mov %r8, (rp)
+ inc n
+ addptr( -8, up)
+ addptr( -8, vp)
+ addptr( -8, rp)
+ jmp L(e11)
+L(b10): add $2, n
+ addptr( -16, up)
+ addptr( -16, vp)
+ addptr( -16, rp)
+ jmp L(e10)
+L(b01): LOGOP (up), %r8
+ mov %r8, (rp)
+ dec n
+ jz L(ret)
+ addptr( 8, up)
+ addptr( 8, vp)
+ addptr( 8, rp)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+L(b00): mov 8(vp), %r9
+ LOGOP (up), %r8
+ LOGOP 8(up), %r9
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+L(e11): mov 16(vp), %r8
+L(e10): mov 24(vp), %r9
+ addptr( 32, vp)
+ LOGOP 16(up), %r8
+ LOGOP 24(up), %r9
+ addptr( 32, up)
+ mov %r8, 16(rp)
+ mov %r9, 24(rp)
+ addptr( 32, rp)
+ sub $4, n
+ jnz L(top)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
+')
+
+ifdef(`VARIANT_2',`
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (vp), %r8
+ not %r8
+ mov R32(%rcx), R32(%rax)
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): LOGOP (up), %r8
+ mov %r8, (rp)
+ inc n
+ addptr( -8, up)
+ addptr( -8, vp)
+ addptr( -8, rp)
+ jmp L(e11)
+L(b10): add $2, n
+ addptr( -16, up)
+ addptr( -16, vp)
+ addptr( -16, rp)
+ jmp L(e10)
+L(b01): LOGOP (up), %r8
+ mov %r8, (rp)
+ dec n
+ jz L(ret)
+ addptr( 8, up)
+ addptr( 8, vp)
+ addptr( 8, rp)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+ not %r8
+L(b00): mov 8(vp), %r9
+ not %r9
+ LOGOP (up), %r8
+ LOGOP 8(up), %r9
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+L(e11): mov 16(vp), %r8
+ not %r8
+L(e10): mov 24(vp), %r9
+ not %r9
+ addptr( 32, vp)
+ LOGOP 16(up), %r8
+ LOGOP 24(up), %r9
+ addptr( 32, up)
+ mov %r8, 16(rp)
+ mov %r9, 24(rp)
+ addptr( 32, rp)
+ sub $4, n
+ jnz L(top)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
+')
+
+ifdef(`VARIANT_3',`
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (vp), %r8
+ mov R32(%rcx), R32(%rax)
+ and $3, R32(%rax)
+ je L(b00)
+ cmp $2, R32(%rax)
+ jc L(b01)
+ je L(b10)
+
+L(b11): LOGOP (up), %r8
+ not %r8
+ mov %r8, (rp)
+ inc n
+ addptr( -8, up)
+ addptr( -8, vp)
+ addptr( -8, rp)
+ jmp L(e11)
+L(b10): add $2, n
+ addptr( -16, up)
+ addptr( -16, vp)
+ addptr( -16, rp)
+ jmp L(e10)
+L(b01): LOGOP (up), %r8
+ not %r8
+ mov %r8, (rp)
+ dec n
+ jz L(ret)
+ addptr( 8, up)
+ addptr( 8, vp)
+ addptr( 8, rp)
+
+ ALIGN(16)
+L(top): mov (vp), %r8
+L(b00): mov 8(vp), %r9
+ LOGOP (up), %r8
+ not %r8
+ LOGOP 8(up), %r9
+ not %r9
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+L(e11): mov 16(vp), %r8
+L(e10): mov 24(vp), %r9
+ addptr( 32, vp)
+ LOGOP 16(up), %r8
+ not %r8
+ LOGOP 24(up), %r9
+ addptr( 32, up)
+ not %r9
+ mov %r8, 16(rp)
+ mov %r9, 24(rp)
+ addptr( 32, rp)
+ sub $4, n
+ jnz L(top)
+
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
+')
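
One source file yields all eight mpn logical operations: VARIANT_1 applies LOGOP directly (and/ior/xor), VARIANT_2 complements the vp operand first (andn/iorn/xnor), and VARIANT_3 complements the result (nand/nior). Two representative variants in C (sketch; the ref_ names are illustrative):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* VARIANT_2 with LOGOP = and:  rp[] = up[] & ~vp[]  (mpn_andn_n) */
void ref_andn_n (limb *rp, const limb *up, const limb *vp, size_t n)
{
  for (size_t i = 0; i < n; i++)
    rp[i] = up[i] & ~vp[i];
}

/* VARIANT_3 with LOGOP = or:  rp[] = ~(up[] | vp[])  (mpn_nior_n) */
void ref_nior_n (limb *rp, const limb *up, const limb *vp, size_t n)
{
  for (size_t i = 0; i < n; i++)
    rp[i] = ~(up[i] | vp[i]);
}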
diff --git a/gmp-6.3.0/mpn/x86_64/core2/lshift.asm b/gmp-6.3.0/mpn/x86_64/core2/lshift.asm
new file mode 100644
index 0000000..9016a71
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/lshift.asm
@@ -0,0 +1,145 @@
+dnl x86-64 mpn_lshift optimised for Conroe/Penryn and Nehalem.
+
+dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2 1.32
+C Intel NHM 1.30 (drops to 2.5 for n > 256)
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshift)
+ FUNC_ENTRY(4)
+
+ xor R32(%rax), R32(%rax)
+
+ test $1, R8(n)
+ jnz L(bx1)
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): lea -8(up,n,8), up
+ lea 16(rp,n,8), rp
+ mov (up), %r10
+ mov -8(up), %r11
+ shld R8(cnt), %r10, %rax
+ mov -16(up), %r8
+ shr $2, n
+ jmp L(00)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): lea -16(up,n,8), up
+ lea 8(rp,n,8), rp
+ mov 8(up), %r9
+ shld R8(cnt), %r9, %rax
+ shr $2, n
+ jz L(1)
+ mov (up), %r10
+ mov -8(up), %r11
+ jmp L(01)
+
+L(b10): lea -24(up,n,8), up
+ lea (rp,n,8), rp
+ mov 16(up), %r8
+ mov 8(up), %r9
+ shld R8(cnt), %r8, %rax
+ shr $2, n
+ jz L(2)
+ mov (up), %r10
+ jmp L(10)
+
+ ALIGN(16)
+L(b11): lea -32(up,n,8), up
+ lea -8(rp,n,8), rp
+ mov 24(up), %r11
+ mov 16(up), %r8
+ mov 8(up), %r9
+ shld R8(cnt), %r11, %rax
+ shr $2, n
+ jz L(end)
+
+ ALIGN(16)
+L(top): shld R8(cnt), %r8, %r11
+ mov (up), %r10
+ mov %r11, (rp)
+L(10): shld R8(cnt), %r9, %r8
+ mov -8(up), %r11
+ mov %r8, -8(rp)
+L(01): shld R8(cnt), %r10, %r9
+ mov -16(up), %r8
+ mov %r9, -16(rp)
+L(00): shld R8(cnt), %r11, %r10
+ mov -24(up), %r9
+ add $-32, up
+ mov %r10, -24(rp)
+ add $-32, rp
+ dec n
+ jnz L(top)
+
+L(end): shld R8(cnt), %r8, %r11
+ mov %r11, (rp)
+L(2): shld R8(cnt), %r9, %r8
+ mov %r8, -8(rp)
+L(1): shl R8(cnt), %r9
+ mov %r9, -16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
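What this routine computes, as a hedged C reference (64-bit limbs assumed;
GMP's portable version lives in mpn/generic/lshift.c): {rp,n} gets {up,n}
shifted left by cnt bits, 1 <= cnt <= 63, and the bits shifted out of the
top limb are returned. Walking from the high end, as the shld chain above
does, permits the rp >= up overlap that the mpn interface allows:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static mp_limb_t
    lshift_ref (mp_limb_t *rp, const mp_limb_t *up, size_t n, unsigned cnt)
    {
      unsigned tnc = 64 - cnt;              /* complementary shift count */
      mp_limb_t retval = up[n - 1] >> tnc;  /* bits shifted out on top */
      for (size_t i = n - 1; i > 0; i--)
        rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
      rp[0] = up[0] << cnt;
      return retval;
    }

mpn_rshift, later in this diff, is the mirror image of the same 4-way
unrolled recurrence, using shrd and walking from the low end instead.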
diff --git a/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm
new file mode 100644
index 0000000..c428f13
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm
@@ -0,0 +1,159 @@
+dnl x86-64 mpn_lshiftc optimised for Conroe/Penryn and Nehalem.
+
+dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2 1.52
+C Intel NHM 1.78 (just 2.15 for n < 256)
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+C TODO
+C * This runs poorly on Nehalem compared to plain lshift, in particular for
+C n < 256.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+ FUNC_ENTRY(4)
+
+ xor R32(%rax), R32(%rax)
+
+ test $1, R8(n)
+ jnz L(bx1)
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): lea -8(up,n,8), up
+ lea 16(rp,n,8), rp
+ mov (up), %r10
+ mov -8(up), %r11
+ shld R8(cnt), %r10, %rax
+ mov -16(up), %r8
+ shr $2, n
+ shld R8(cnt), %r11, %r10
+ jmp L(00)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): lea -16(up,n,8), up
+ lea 8(rp,n,8), rp
+ mov 8(up), %r9
+ shld R8(cnt), %r9, %rax
+ shr $2, n
+ jz L(1)
+ mov (up), %r10
+ mov -8(up), %r11
+ shld R8(cnt), %r10, %r9
+ jmp L(01)
+
+L(b10): lea -24(up,n,8), up
+ lea (rp,n,8), rp
+ mov 16(up), %r8
+ mov 8(up), %r9
+ shld R8(cnt), %r8, %rax
+ shr $2, n
+ jz L(2)
+ mov (up), %r10
+ shld R8(cnt), %r9, %r8
+ jmp L(10)
+
+ ALIGN(16)
+L(b11): lea -32(up,n,8), up
+ lea -8(rp,n,8), rp
+ mov 24(up), %r11
+ mov 16(up), %r8
+ mov 8(up), %r9
+ shld R8(cnt), %r11, %rax
+ shr $2, n
+ jz L(end)
+
+ ALIGN(16)
+L(top): shld R8(cnt), %r8, %r11
+ mov (up), %r10
+ not %r11
+ shld R8(cnt), %r9, %r8
+ mov %r11, (rp)
+L(10): mov -8(up), %r11
+ not %r8
+ shld R8(cnt), %r10, %r9
+ mov %r8, -8(rp)
+L(01): mov -16(up), %r8
+ not %r9
+ shld R8(cnt), %r11, %r10
+ mov %r9, -16(rp)
+L(00): mov -24(up), %r9
+ not %r10
+ add $-32, up
+ mov %r10, -24(rp)
+ add $-32, rp
+ dec n
+ jnz L(top)
+
+L(end): shld R8(cnt), %r8, %r11
+ not %r11
+ mov %r11, (rp)
+L(2): shld R8(cnt), %r9, %r8
+ not %r8
+ mov %r8, -8(rp)
+L(1): shl R8(cnt), %r9
+ not %r9
+ mov %r9, -16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
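mpn_lshiftc is the same shift with a complemented result: each stored limb
is the ones' complement of the shifted limb, while the return value (the
bits shifted out) is left uncomplemented. The extra not per limb accounts
for the higher cycles/limb figures above. A minimal C sketch under the same
assumptions as the lshift one:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static mp_limb_t
    lshiftc_ref (mp_limb_t *rp, const mp_limb_t *up, size_t n, unsigned cnt)
    {
      unsigned tnc = 64 - cnt;
      mp_limb_t retval = up[n - 1] >> tnc;   /* returned uncomplemented */
      for (size_t i = n - 1; i > 0; i--)
        rp[i] = ~((up[i] << cnt) | (up[i - 1] >> tnc));
      rp[0] = ~(up[0] << cnt);
      return retval;
    }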
diff --git a/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm
new file mode 100644
index 0000000..d16be85
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm
@@ -0,0 +1,975 @@
+dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
+dnl It also seems good for Conroe/Wolfdale.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core 4.0 4.0 - 4.18-4.25
+C Intel NHM 3.75 3.8 - 4.06-4.2
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C Code structure:
+C
+C
+C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4)
+C | | | |
+C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) |
+C | / | / | / | /
+C | / | / | / | /
+C | / | / | / | /
+C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_
+C _____ _____ _____ _____
+C / \ / \ / \ / \
+C \|/ | \|/ | \|/ | \|/ |
+C am_2(0m4) | am_2(1m4) | am_2(2m4) | am_2(3m4) |
+C \ /|\ \ /|\ \ /|\ \ /|\
+C \_____/ \_____/ \_____/ \_____/
+
+C TODO
+C * Tune. None done so far.
+C * Currently 2687 bytes; making it smaller would be nice.
+C * Implement some basecases, say for un < 4.
+C * Try zeroing with xor in m2 loops.
+C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
+C between loop header and wind-down code.
+C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This saves a byte.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+C Define this to $1 to use the late loop index variable as zero, or to $2
+C to use an explicit $0.
+define(`Z',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp_param', `%rcx') C FIXME reallocate vp to rcx but watch performance!
+define(`vn_param', `%r8')
+
+define(`un', `%r9')
+define(`vn', `(%rsp)')
+
+define(`v0', `%r10')
+define(`v1', `%r11')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r12')
+define(`i', `%r13')
+define(`vp', `%r14')
+
+define(`X0', `%r8')
+define(`X1', `%r15')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
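+C The MOV macro above is a scheduling knob: each call site is tagged with
+C a distinct power of two ($3), and it expands to a plain mov when that
+C bit is clear in the N mask, or to an equivalent lea when it is set;
+C N = 85 (binary 1010101) thus flips every other tagged site to lea.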
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ mov (up), %rax C shared for mul_1 and mul_2
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov (vp_param), v0 C shared for mul_1 and mul_2
+
+ xor un, un
+ sub un_param, un C un = -un_param
+
+ lea (up,un_param,8), up
+ lea (rp,un_param,8), rp
+
+ mul v0 C shared for mul_1 and mul_2
+
+ test $1, R8(vn_param)
+ jz L(m2)
+
+ lea 8(vp_param), vp C FIXME: delay until known needed
+
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):test $2, R8(un)
+ jnz L(m1s2)
+
+L(m1s0):
+ lea (un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ lea L(do_am0)(%rip), %rbp
+ jmp L(m1e0)
+
+L(m1s2):
+ lea 2(un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ mul v0
+ lea L(do_am2)(%rip), %rbp
+ test i, i
+ jnz L(m1e2)
+ add %rax, w0
+ adc $0, %rdx
+ mov w0, I(-8(rp),8(rp,un,8))
+ mov %rdx, I((rp),16(rp,un,8))
+ jmp L(ret2)
+
+L(m1x1):test $2, R8(un)
+ jz L(m1s3)
+
+L(m1s1):
+ lea 1(un), i
+ mov %rax, (rp,un,8)
+ test i, i
+ jz L(1)
+ mov 8(up,un,8), %rax
+ mov %rdx, w1 C FIXME: Use lea?
+ lea L(do_am1)(%rip), %rbp
+ jmp L(m1e1)
+L(1): mov %rdx, I((rp),8(rp,un,8))
+ jmp L(ret2)
+
+L(m1s3):
+ lea -1(un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w1 C FIXME: Use lea?
+ lea L(do_am3)(%rip), %rbp
+ jmp L(m1e3)
+
+ ALIGNx
+L(m1top):
+ mul v0
+ mov w1, -16(rp,i,8)
+L(m1e2):xor R32(w1), R32(w1)
+ add %rax, w0
+ mov (up,i,8), %rax
+ adc %rdx, w1
+ mov w0, -8(rp,i,8)
+L(m1e1):xor R32(w0), R32(w0)
+ mul v0
+ add %rax, w1
+ mov 8(up,i,8), %rax
+ adc %rdx, w0
+ mov w1, (rp,i,8)
+L(m1e0):xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w0
+ mov 16(up,i,8), %rax
+ adc %rdx, w1
+ mov w0, 8(rp,i,8)
+L(m1e3):xor R32(w0), R32(w0)
+ mul v0
+ add %rax, w1
+ mov 24(up,i,8), %rax
+ adc %rdx, w0
+ add $4, i
+ js L(m1top)
+
+ mul v0
+ mov w1, I(-16(rp),-16(rp,i,8))
+ add %rax, w0
+ adc $0, %rdx
+ mov w0, I(-8(rp),-8(rp,i,8))
+ mov %rdx, I((rp),(rp,i,8))
+
+ dec vn_param
+ jz L(ret2)
+ lea -8(rp), rp
+ jmp *%rbp
+
+L(m2):
+ mov 8(vp_param), v1
+ lea 16(vp_param), vp C FIXME: delay until known needed
+
+ test $1, R8(un)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(un)
+ jnz L(b10)
+
+L(b00): lea (un), i
+ mov %rax, (rp,un,8)
+ mov %rdx, w1 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov $0, R32(w2)
+ jmp L(m2e0)
+
+L(b10): lea -2(un), i
+ mov %rax, w2 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov %rdx, w3 C FIXME: Use lea?
+ mov $0, R32(w0)
+ jmp L(m2e2)
+
+L(bx1): test $2, R8(un)
+ jz L(b11)
+
+L(b01): lea 1(un), i
+ mov %rax, (rp,un,8)
+ mov (up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ mov $0, R32(w1)
+ jmp L(m2e1)
+
+L(b11): lea -1(un), i
+ mov %rax, w1 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov %rdx, w2 C FIXME: Use lea?
+ mov $0, R32(w3)
+ jmp L(m2e3)
+
+ ALIGNx
+L(m2top0):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+L(m2e0):mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top0)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, I((rp),(rp,i,8))
+ mov w1, I(8(rp),8(rp,i,8))
+
+ add $-2, vn_param
+ jz L(ret2)
+
+L(do_am0):
+ push %r15
+ push vn_param
+
+L(olo0):
+ mov (vp), v0
+ mov 8(vp), v1
+ lea 16(vp), vp
+ lea 16(rp), rp
+ mov (up,un,8), %rax
+C lea 0(un), i
+ mov un, i
+ mul v0
+ mov %rax, X0
+ mov (up,un,8), %rax
+ MOV( %rdx, X1, 2)
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,un,8), w2
+ mov %rax, w3
+ jmp L(lo0)
+
+ ALIGNx
+L(am2top0):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo0): mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top0)
+
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add w2, X0
+ mov X0, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ addl $-2, vn
+ jnz L(olo0)
+
+L(ret): pop %rax
+ pop %r15
+L(ret2):pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+
+ ALIGNx
+L(m2top1):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(m2e1):mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top1)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, I((rp),(rp,i,8))
+ mov w1, I(8(rp),8(rp,i,8))
+
+ add $-2, vn_param
+ jz L(ret2)
+
+L(do_am1):
+ push %r15
+ push vn_param
+
+L(olo1):
+ mov (vp), v0
+ mov 8(vp), v1
+ lea 16(vp), vp
+ lea 16(rp), rp
+ mov (up,un,8), %rax
+ lea 1(un), i
+ mul v0
+ mov %rax, X1
+ MOV( %rdx, X0, 128)
+ mov (up,un,8), %rax
+ mov (rp,un,8), w1
+ mul v1
+ mov %rax, w2
+ mov 8(up,un,8), %rax
+ MOV( %rdx, w3, 1)
+ jmp L(lo1)
+
+ ALIGNx
+L(am2top1):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+L(lo1): mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top1)
+
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add w2, X0
+ mov X0, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ addl $-2, vn
+ jnz L(olo1)
+
+ pop %rax
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+
+ ALIGNx
+L(m2top2):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+L(m2e2):mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top2)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, I((rp),(rp,i,8))
+ mov w1, I(8(rp),8(rp,i,8))
+
+ add $-2, vn_param
+ jz L(ret2)
+
+L(do_am2):
+ push %r15
+ push vn_param
+
+L(olo2):
+ mov (vp), v0
+ mov 8(vp), v1
+ lea 16(vp), vp
+ lea 16(rp), rp
+ mov (up,un,8), %rax
+ lea -2(un), i
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 32)
+ mov (up,un,8), %rax
+ mov (rp,un,8), w0
+ mul v1
+ mov %rax, w1
+ lea (%rdx), w2
+ mov 8(up,un,8), %rax
+ jmp L(lo2)
+
+ ALIGNx
+L(am2top2):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+L(lo2): mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top2)
+
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add w2, X0
+ mov X0, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ addl $-2, vn
+ jnz L(olo2)
+
+ pop %rax
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+
+ ALIGNx
+L(m2top3):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+L(m2e3):mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top3)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, I((rp),(rp,i,8))
+ mov w1, I(8(rp),8(rp,i,8))
+
+ add $-2, vn_param
+ jz L(ret2)
+
+L(do_am3):
+ push %r15
+ push vn_param
+
+L(olo3):
+ mov (vp), v0
+ mov 8(vp), v1
+ lea 16(vp), vp
+ lea 16(rp), rp
+ mov (up,un,8), %rax
+ lea -1(un), i
+ mul v0
+ mov %rax, X1
+ MOV( %rdx, X0, 8)
+ mov (up,un,8), %rax
+ mov (rp,un,8), w3
+ mul v1
+ mov %rax, w0
+ MOV( %rdx, w1, 16)
+ mov 8(up,un,8), %rax
+ jmp L(lo3)
+
+ ALIGNx
+L(am2top3):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+L(lo3): mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top3)
+
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add w2, X0
+ mov X0, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ addl $-2, vn
+ jnz L(olo3)
+
+ pop %rax
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
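Functionally, mpn_mul_basecase is plain schoolbook multiplication:
{rp, un+vn} = {up,un} * {vp,vn}, with un >= vn >= 1. The asm consumes v
one limb (the m_1 feed-in) or two limbs (the am_2 outer loops) at a time;
the sketch below uses the simpler one-limb recurrence instead. A hedged
reference only, assuming a compiler with unsigned __int128 for the
64x64->128 product that the mul instruction provides:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* {rp,n} += v * {up,n}, returning the carry limb */
    static mp_limb_t
    addmul_1_ref (mp_limb_t *rp, const mp_limb_t *up, size_t n, mp_limb_t v)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v + rp[i] + cy;
          rp[i] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
      return cy;
    }

    static void
    mul_basecase_ref (mp_limb_t *rp, const mp_limb_t *up, size_t un,
                      const mp_limb_t *vp, size_t vn)
    {
      for (size_t i = 0; i < un + vn; i++)
        rp[i] = 0;
      for (size_t j = 0; j < vn; j++)
        rp[un + j] = addmul_1_ref (rp + j, up, un, vp[j]);
    }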
diff --git a/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm
new file mode 100644
index 0000000..0f03d86
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm
@@ -0,0 +1,427 @@
+dnl AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_2 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core 4.0 4.18-4.25
+C Intel NHM 3.75 4.06-4.2
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Implement proper cor2, replacing current cor0.
+C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?)
+C * Micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n_param', `%rcx')
+
+define(`v0', `%r10')
+define(`v1', `%r11')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r12')
+define(`n', `%r9')
+define(`i', `%r13')
+define(`vp', `%r8')
+
+define(`X0', `%r14')
+define(`X1', `%r15')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+
+ mov (up), %rax
+ mov vp_param, vp
+
+ cmp $4, n_param
+ jb L(small)
+
+ mov (vp_param), v0
+ push %rbx
+ lea (rp,n_param,8), rp C point rp at R[un]
+ push %rbp
+ lea (up,n_param,8), up C point up right after U's end
+ push %r12
+ mov $0, R32(n) C FIXME
+ sub n_param, n
+ push %r13
+ mul v0
+ mov 8(vp), v1
+
+ test $1, R8(n_param)
+ jnz L(m2x1)
+
+L(m2x0):test $2, R8(n_param)
+ jnz L(m2b2)
+
+L(m2b0):lea (n), i
+ mov %rax, (rp,n,8)
+ mov %rdx, w1
+ mov (up,n,8), %rax
+ xor R32(w2), R32(w2)
+ jmp L(m2e0)
+
+L(m2b2):lea -2(n), i
+ mov %rax, w2
+ mov (up,n,8), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ jmp L(m2e2)
+
+L(m2x1):test $2, R8(n_param)
+ jnz L(m2b3)
+
+L(m2b1):lea 1(n), i
+ mov %rax, (rp,n,8)
+ mov (up,n,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ jmp L(m2e1)
+
+L(m2b3):lea -1(n), i
+ xor R32(w3), R32(w3)
+ mov %rax, w1
+ mov %rdx, w2
+ mov (up,n,8), %rax
+ jmp L(m2e3)
+
+ ALIGNx
+L(m2tp):mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(m2e1):mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+L(m2e0):mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+L(m2e3):mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+L(m2e2):mul v1
+ mov $0, R32(w1) C FIXME: dead in last iteration
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0 C FIXME: dead in last iteration
+ add $4, i
+ js L(m2tp)
+
+L(m2ed):imul v0, %rax
+ add w3, %rax
+ mov %rax, I(-8(rp),-8(rp,i,8))
+
+ add $2, n
+ lea 16(vp), vp
+ lea -16(up), up
+ cmp $-2, n
+ jge L(cor1)
+
+ push %r14
+ push %r15
+
+L(outer):
+ mov (vp), v0
+ mov 8(vp), v1
+ mov (up,n,8), %rax
+ mul v0
+ test $1, R8(n)
+ jnz L(a1x1)
+
+L(a1x0):mov %rax, X1
+ MOV( %rdx, X0, 8)
+ mov (up,n,8), %rax
+ mul v1
+ test $2, R8(n)
+ jnz L(a110)
+
+L(a100):lea (n), i
+ mov (rp,n,8), w3
+ mov %rax, w0
+ MOV( %rdx, w1, 16)
+ jmp L(lo0)
+
+L(a110):lea 2(n), i
+ mov (rp,n,8), w1
+ mov %rax, w2
+ mov 8(up,n,8), %rax
+ MOV( %rdx, w3, 1)
+ jmp L(lo2)
+
+L(a1x1):mov %rax, X0
+ MOV( %rdx, X1, 2)
+ mov (up,n,8), %rax
+ mul v1
+ test $2, R8(n)
+ jz L(a111)
+
+L(a101):lea 1(n), i
+ MOV( %rdx, w0, 4)
+ mov (rp,n,8), w2
+ mov %rax, w3
+ jmp L(lo1)
+
+L(a111):lea -1(n), i
+ MOV( %rdx, w2, 64)
+ mov %rax, w1
+ mov (rp,n,8), w0
+ mov 8(up,n,8), %rax
+ jmp L(lo3)
+
+ ALIGNx
+L(top): mul v1
+ add w0, w1
+ adc %rax, w2
+ mov -8(up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+L(lo2): mul v0
+ add w1, X1
+ mov X1, -16(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov -8(up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov -8(rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo1): mov (up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, -8(rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov (up,i,8), %rax
+ mov (rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+L(lo0): mov 8(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, (rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 8(rp,i,8), w3
+ adc $0, X1
+ mov 8(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 16(up,i,8), %rax
+ adc $0, w2
+L(lo3): mul v0
+ add w0, X0
+ mov X0, 8(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 16(up,i,8), %rax
+ mov 16(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(top)
+
+L(end): imul v1, %rax
+ add w0, w1
+ adc %rax, w2
+ mov I(-8(up),-8(up,i,8)), %rax
+ imul v0, %rax
+ add w1, X1
+ mov X1, I(-16(rp),-16(rp,i,8))
+ adc X0, %rax
+ mov I(-8(rp),-8(rp,i,8)), w1
+ add w1, w2
+ add w2, %rax
+ mov %rax, I(-8(rp),-8(rp,i,8))
+
+ add $2, n
+ lea 16(vp), vp
+ lea -16(up), up
+ cmp $-2, n
+ jl L(outer)
+
+ pop %r15
+ pop %r14
+
+ jnz L(cor0)
+
+L(cor1):mov (vp), v0
+ mov 8(vp), v1
+ mov -16(up), %rax
+ mul v0 C u0 x v2
+ add -16(rp), %rax C FIXME: rp[0] still available in reg?
+ adc -8(rp), %rdx C FIXME: rp[1] still available in reg?
+ mov -8(up), %rbx
+ imul v0, %rbx
+ mov -16(up), %rcx
+ imul v1, %rcx
+ mov %rax, -16(rp)
+ add %rbx, %rcx
+ add %rdx, %rcx
+ mov %rcx, -8(rp)
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(cor0):mov (vp), %r11
+ imul -8(up), %r11
+ add %rax, %r11
+ mov %r11, -8(rp)
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(small):
+ cmp $2, n_param
+ jae L(gt1)
+L(n1): imul (vp_param), %rax
+ mov %rax, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp_param), %r9
+ mul %r9
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp_param), %r9
+ mul %r9 C u0 x v0
+ mov %rax, (rp)
+ mov %rdx, %r10
+ mov 8(up), %rax
+ mul %r9 C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r11
+ mov (up), %rax
+ mul %r11 C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r11 C u1 x v1
+ add %r11, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
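mpn_mullo_basecase computes only the low half of the product:
{rp,n} = ({up,n} * {vp,n}) mod B^n, with B = 2^64 here. That is why the
wind-down and cor0/cor1 code above can finish the last column or two with
imul: the high halves of those products are never needed. The L(small)
path handles n < 4 directly. A hedged sketch, assuming unsigned __int128:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static void
    mullo_ref (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
               size_t n)
    {
      for (size_t i = 0; i < n; i++)
        rp[i] = 0;
      for (size_t j = 0; j < n; j++)
        {
          mp_limb_t cy = 0;
          for (size_t i = 0; i + j < n; i++)   /* truncate at limb n */
            {
              unsigned __int128 t =
                (unsigned __int128) up[i] * vp[j] + rp[i + j] + cy;
              rp[i + j] = (mp_limb_t) t;
              cy = (mp_limb_t) (t >> 64);
            }
          /* carries past limb n-1 are simply discarded, mod B^n */
        }
    }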
diff --git a/gmp-6.3.0/mpn/x86_64/core2/popcount.asm b/gmp-6.3.0/mpn/x86_64/core2/popcount.asm
new file mode 100644
index 0000000..3de69d8
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/popcount.asm
@@ -0,0 +1,185 @@
+dnl AMD64 SSSE3 mpn_popcount -- population count.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb good for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 1.79-1.91 n
+C AMD bd2 1.73-1.85 n
+C AMD bd3 ?
+C AMD bd4 1.73-1.85 n
+C AMD zen 1.47 n
+C AMD bobcat 8.0 n
+C AMD jaguar 4.78 n
+C Intel P4 n/a
+C Intel CNR 3.75
+C Intel PNR 2.61 y
+C Intel NHM 2.03 n
+C Intel SBR 1.87 n
+C Intel IBR 1.52-1.58 n
+C Intel HWL 1.52-1.58 n
+C Intel BWL 1.52-1.58 n
+C Intel SKL 1.51 n
+C Intel atom 12.3 n
+C Intel SLM 9.1 n
+C VIA nano ?
+
+C TODO
+C * This was hand-written without too much thought about optimal insn
+C   selection; check to see if it can be improved.
+C * Consider doing some instruction scheduling.
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+ lea L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
+ `define(`OFF1',64) define(`OFF2',80)')
+ movdqa OFF1`'(%r9), %xmm7
+ movdqa OFF2`'(%r9), %xmm6
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm8, %xmm8
+
+ mov R32(n), R32(%rax)
+ and $7, R32(%rax)
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ add %r9, %rax
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+
+L(1): movq (up), %xmm1
+ add $8, up
+ jmp L(e1)
+
+L(2): add $-48, up
+ jmp L(e2)
+
+L(3): movq (up), %xmm1
+ add $-40, up
+ jmp L(e3)
+
+L(4): add $-32, up
+ jmp L(e4)
+
+L(5): movq (up), %xmm1
+ add $-24, up
+ jmp L(e5)
+
+L(6): add $-16, up
+ jmp L(e6)
+
+L(7): movq (up), %xmm1
+ add $-8, up
+ jmp L(e7)
+
+ ALIGN(32)
+L(top): lddqu (up), %xmm1
+L(e7): movdqa %xmm6, %xmm0 C copy mask register
+ movdqa %xmm7, %xmm2 C copy count register
+ movdqa %xmm7, %xmm3 C copy count register
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e6): lddqu 16(up), %xmm1
+L(e5): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e4): lddqu 32(up), %xmm1
+L(e3): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e2): lddqu 48(up), %xmm1
+ add $64, up
+L(e1): movdqa %xmm6, %xmm0
+ movdqa %xmm7, %xmm2
+ movdqa %xmm7, %xmm3
+ pand %xmm1, %xmm0
+ psrlw $4, %xmm1
+ pand %xmm6, %xmm1
+ pshufb %xmm0, %xmm2
+ pshufb %xmm1, %xmm3
+ psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts
+ paddb %xmm2, %xmm3
+ paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts
+ movdqa %xmm3, %xmm4
+ sub $8, n
+ jg L(top)
+
+ psadbw %xmm5, %xmm4
+ paddq %xmm4, %xmm8
+ pshufd $14, %xmm8, %xmm0
+ paddq %xmm8, %xmm0
+ movd %xmm0, %rax
+ ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+ JMPENT( L(top), L(cnsts))
+ JMPENT( L(1), L(cnsts))
+ JMPENT( L(2), L(cnsts))
+ JMPENT( L(3), L(cnsts))
+ JMPENT( L(4), L(cnsts))
+ JMPENT( L(5), L(cnsts))
+ JMPENT( L(6), L(cnsts))
+ JMPENT( L(7), L(cnsts))
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
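The kernel above is the classic SSSE3 nibble-table popcount: the last 32
bytes of L(cnsts) hold a 16-entry table of nibble popcounts and an 0x0f
mask; pshufb looks up 16 low and 16 high nibbles at once, paddb keeps
per-byte sums, and psadbw folds them into wide counters. The same idea as
intrinsics, a minimal sketch handling one 16-byte vector per step where
the asm does four (n assumed even; compile with -mssse3):

    #include <immintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    static uint64_t
    popcount_ssse3 (const uint64_t *up, size_t n)
    {
      const __m128i tab  = _mm_setr_epi8 (0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4);
      const __m128i mask = _mm_set1_epi8 (0x0f);
      __m128i acc = _mm_setzero_si128 ();
      for (size_t i = 0; i < n; i += 2)
        {
          __m128i x  = _mm_loadu_si128 ((const __m128i *) (up + i));
          __m128i lo = _mm_and_si128 (x, mask);
          __m128i hi = _mm_and_si128 (_mm_srli_epi16 (x, 4), mask);
          __m128i c  = _mm_add_epi8 (_mm_shuffle_epi8 (tab, lo),
                                     _mm_shuffle_epi8 (tab, hi));
          /* fold the 16 byte counts into two 64-bit lanes */
          acc = _mm_add_epi64 (acc, _mm_sad_epu8 (c, _mm_setzero_si128 ()));
        }
      acc = _mm_add_epi64 (acc, _mm_unpackhi_epi64 (acc, acc));
      return (uint64_t) _mm_cvtsi128_si64 (acc);
    }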
diff --git a/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm b/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm
new file mode 100644
index 0000000..8c296fd
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm
@@ -0,0 +1,430 @@
+dnl X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core 4.5 (fluctuating)
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * Consider inlining mpn_add_n.
+C * Single basecases out before the pushes.
+C * Keep up[i] in registers for basecases (might require pushes).
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r14')
+define(`j', `%r15')
+define(`mp', `%r12')
+define(`q0', `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C X q0' n X rp up u0i mp q0 i j
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), q0
+ mov n, j C outer loop induction var
+ lea (mp_param,n,8), mp
+ lea -16(up,n,8), up
+ neg n
+ imul u0inv, q0 C first iteration q0
+
+ test $1, R8(n)
+ jz L(b0)
+
+L(b1): cmp $-1, R32(n)
+ jz L(n1)
+ cmp $-3, R32(n)
+ jz L(n3)
+
+ push rp
+
+L(otp1):lea 3(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ lea (%rax), %rbp
+ mov 8(mp,n,8), %rax
+ lea (%rdx), %r9
+ mul q0
+ lea (%rax), %r11
+ mov 16(mp,n,8), %rax
+ mov 16(up,n,8), %r10
+ lea (%rdx), %rdi
+ mul q0
+ add %rbp, %r10
+ lea (%rax), %rbp
+ mov 24(mp,n,8), %rax
+ adc %r9, %r11
+ mov 24(up,n,8), %rbx
+ lea (%rdx), %r9
+ adc $0, %rdi
+ mul q0
+ add %r11, %rbx
+ lea (%rax), %r11
+ mov 32(mp,n,8), %rax
+ adc %rdi, %rbp
+ mov %rbx, 24(up,n,8)
+ mov 32(up,n,8), %r10
+ lea (%rdx), %rdi
+ adc $0, %r9
+ imul u0inv, %rbx C next q limb
+ add $2, i
+ jns L(ed1)
+
+ ALIGNx
+L(tp1): mul q0
+ add %rbp, %r10
+ lea (%rax), %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %r10, -8(up,i,8)
+ mov (up,i,8), %r10
+ lea (%rdx), %r9
+ adc $0, %rdi
+ mul q0
+ add %r11, %r10
+ lea (%rax), %r11
+ mov 8(mp,i,8), %rax
+ adc %rdi, %rbp
+ mov %r10, (up,i,8)
+ mov 8(up,i,8), %r10
+ lea (%rdx), %rdi
+ adc $0, %r9
+ add $2, i
+ js L(tp1)
+
+L(ed1): mul q0
+ add %rbp, %r10
+ adc %r9, %r11
+ mov %r10, I(-8(up),-8(up,i,8))
+ mov I((up),(up,i,8)), %r10
+ adc $0, %rdi
+ add %r11, %r10
+ adc %rdi, %rax
+ mov %r10, I((up),(up,i,8))
+ mov I(8(up),8(up,i,8)), %r10
+ adc $0, %rdx
+ add %rax, %r10
+ mov %r10, I(8(up),8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, 16(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp1)
+ jmp L(cj)
+
+L(b0): cmp $-2, R32(n)
+ jz L(n2)
+ cmp $-4, R32(n)
+ jz L(n4)
+
+ push rp
+
+L(otp0):lea 4(n), i
+ mov (mp,n,8), %rax
+ mul q0
+ lea (%rax), %r11
+ mov 8(mp,n,8), %rax
+ lea (%rdx), %rdi
+ mul q0
+ lea (%rax), %rbp
+ mov 16(mp,n,8), %rax
+ mov 16(up,n,8), %r10
+ lea (%rdx), %r9
+ mul q0
+ add %r11, %r10
+ lea (%rax), %r11
+ mov 24(mp,n,8), %rax
+ adc %rdi, %rbp
+ mov 24(up,n,8), %rbx
+ lea (%rdx), %rdi
+ adc $0, %r9
+ mul q0
+ add %rbp, %rbx
+ lea (%rax), %rbp
+ mov 32(mp,n,8), %rax
+ adc %r9, %r11
+ mov %rbx, 24(up,n,8)
+ mov 32(up,n,8), %r10
+ lea (%rdx), %r9
+ adc $0, %rdi
+ imul u0inv, %rbx C next q limb
+ jmp L(e0)
+
+ ALIGNx
+L(tp0): mul q0
+ add %rbp, %r10
+ lea (%rax), %rbp
+ mov (mp,i,8), %rax
+ adc %r9, %r11
+ mov %r10, -8(up,i,8)
+ mov (up,i,8), %r10
+ lea (%rdx), %r9
+ adc $0, %rdi
+L(e0): mul q0
+ add %r11, %r10
+ lea (%rax), %r11
+ mov 8(mp,i,8), %rax
+ adc %rdi, %rbp
+ mov %r10, (up,i,8)
+ mov 8(up,i,8), %r10
+ lea (%rdx), %rdi
+ adc $0, %r9
+ add $2, i
+ js L(tp0)
+
+L(ed0): mul q0
+ add %rbp, %r10
+ adc %r9, %r11
+ mov %r10, I(-8(up),-8(up,i,8))
+ mov I((up),(up,i,8)), %r10
+ adc $0, %rdi
+ add %r11, %r10
+ adc %rdi, %rax
+ mov %r10, I((up),(up,i,8))
+ mov I(8(up),8(up,i,8)), %r10
+ adc $0, %rdx
+ add %rax, %r10
+ mov %r10, I(8(up),8(up,i,8))
+ adc $0, %rdx
+ mov %rdx, 16(up,n,8) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(otp0)
+
+L(cj): lea 16(up), up C FIXME
+ pop rp
+L(add_n):
+IFSTD(` lea (up,n,8), up C param 2: up
+ lea (up,n,8), %rdx C param 3: up - n
+ neg R32(n) ') C param 4: n
+
+IFDOS(` lea (up,n,8), %rdx C param 2: up
+ lea (%rdx,n,8), %r8 C param 3: up - n
+ neg R32(n)
+ mov n, %r9 C param 4: n
+ mov rp, %rcx ') C param 1: rp
+
+IFSTD(` sub $8, %rsp ')
+IFDOS(` sub $40, %rsp ')
+ ASSERT(nz, `test $15, %rsp')
+ CALL( mpn_add_n)
+IFSTD(` add $8, %rsp ')
+IFDOS(` add $40, %rsp ')
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(n1): mov (mp_param), %rax
+ mul q0
+ add 8(up), %rax
+ adc 16(up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+L(n2): mov (mp_param), %rax
+ mov (up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov 8(up), %r10
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, q0
+ imul u0inv, q0 C next q0
+ mov -16(mp), %rax
+ mul q0
+ add %rax, %r10
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ mov 16(up), %r14
+ mul q0
+ add %rax, %r14
+ adc $0, %rdx
+ add %r9, %r14
+ adc $0, %rdx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc 24(up), %rdx
+ mov %r14, (rp)
+ mov %rdx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n3): mov -24(mp), %rax
+ mov -8(up), %r10
+ mul q0
+ add %rax, %r10
+ mov -16(mp), %rax
+ mov %rdx, %r11
+ adc $0, %r11
+ mov (up), %rbp
+ mul q0
+ add %rax, %rbp
+ mov %rdx, %r9
+ adc $0, %r9
+ mov -8(mp), %rax
+ add %r11, %rbp
+ mov 8(up), %r10
+ adc $0, %r9
+ mul q0
+ mov %rbp, q0
+ imul u0inv, q0 C next q0
+ add %rax, %r10
+ mov %rdx, %r11
+ adc $0, %r11
+ mov %rbp, (up)
+ add %r9, %r10
+ adc $0, %r11
+ mov %r10, 8(up)
+ mov %r11, -8(up) C up[0]
+ lea 8(up), up C up++
+ dec j
+ jnz L(n3)
+
+ mov -32(up), %rdx
+ mov -24(up), %rbx
+ xor R32(%rax), R32(%rax)
+ add %rbp, %rdx
+ adc %r10, %rbx
+ adc 8(up), %r11
+ mov %rdx, (rp)
+ mov %rbx, 8(rp)
+ mov %r11, 16(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+ ALIGNx
+L(n4): mov -32(mp), %rax
+ mul q0
+ lea (%rax), %r11
+ mov -24(mp), %rax
+ lea (%rdx), %r14
+ mul q0
+ lea (%rax), %rbp
+ mov -16(mp), %rax
+ mov -16(up), %r10
+ lea (%rdx), %r9
+ mul q0
+ add %r11, %r10
+ lea (%rax), %r11
+ mov -8(mp), %rax
+ adc %r14, %rbp
+ mov -8(up), %rbx
+ lea (%rdx), %r14
+ adc $0, %r9
+ mul q0
+ add %rbp, %rbx
+ adc %r9, %r11
+ mov %rbx, -8(up)
+ mov (up), %r10
+ adc $0, %r14
+ imul u0inv, %rbx C next q limb
+ add %r11, %r10
+ adc %r14, %rax
+ mov %r10, (up)
+ mov 8(up), %r10
+ adc $0, %rdx
+ add %rax, %r10
+ mov %r10, 8(up)
+ adc $0, %rdx
+ mov %rdx, -16(up) C up[0]
+ mov %rbx, q0 C previously computed q limb -> q0
+ lea 8(up), up C up++
+ dec j
+ jnz L(n4)
+ lea 16(up), up
+ jmp L(add_n)
+EPILOGUE()
+ASM_END()
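mpn_redc_1 performs word-at-a-time Montgomery reduction: given the 2n-limb
product U = {up,2n}, the modulus M = {mp,n} and u0inv = -1/M mod B, each of
the n passes picks q so that adding q*M zeroes the current low limb, and
parks the addmul carry where the dead limb was; the out-of-line mpn_add_n
call at L(add_n) then folds those parked carries into the high half. The
return value is the final carry, which the caller reduces away. A hedged C
sketch of that structure (assuming unsigned __int128; it mirrors GMP's
generic redc_1, not this file's unrolling):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static mp_limb_t
    redc_1_ref (mp_limb_t *rp, mp_limb_t *up, const mp_limb_t *mp_,
                size_t n, mp_limb_t u0inv)
    {
      for (size_t j = 0; j < n; j++, up++)
        {
          mp_limb_t q = up[0] * u0inv;   /* q*M == -up[0] (mod B) */
          mp_limb_t cy = 0;
          for (size_t i = 0; i < n; i++)
            {
              unsigned __int128 t =
                (unsigned __int128) mp_[i] * q + up[i] + cy;
              up[i] = (mp_limb_t) t;
              cy = (mp_limb_t) (t >> 64);
            }
          up[0] = cy;                    /* up[0] became 0; park the carry */
        }
      const mp_limb_t *cp = up - n;      /* the n parked carries */
      mp_limb_t cy = 0;                  /* rp = parked carries + high half */
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] + cp[i] + cy;
          rp[i] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
      return cy;
    }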
diff --git a/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm
new file mode 100644
index 0000000..27eed37
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm
@@ -0,0 +1,169 @@
+dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Conroe/Penryn.
+
+dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 3.05
+C Intel NHM 3.3
+C Intel SBR 2.5
+C Intel atom ?
+C VIA nano ?
+
+C TODO
+C * Loopmix to approach 2.5 c/l on NHM.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_rsh1add_n)
+ define(func_nc, mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsh1sub_n)
+ define(func_nc, mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbx
+ push %rbp
+
+ neg %r8 C set C flag from parameter
+ mov (up), %r8
+ ADCSBB (vp), %r8
+ jmp L(ent)
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %r8
+ ADDSUB (vp), %r8
+L(ent): sbb R32(%rbx), R32(%rbx) C save cy
+ mov %r8, %rax
+ and $1, R32(%rax) C return value
+
+ lea (up,n,8), up
+ lea (vp,n,8), vp
+ lea (rp,n,8), rp
+ mov R32(n), R32(%rbp)
+ neg n
+ and $3, R32(%rbp)
+ jz L(b0)
+ cmp $2, R32(%rbp)
+ jae L(n1)
+
+L(b1): mov %r8, %rbp
+ inc n
+ js L(top)
+ jmp L(end)
+
+L(n1): jnz L(b3)
+ add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up,n,8), %r11
+ ADCSBB 8(vp,n,8), %r11
+ sbb R32(%rbx), R32(%rbx) C save cy
+ mov %r8, %r10
+ add $-2, n
+ jmp L(2)
+
+L(b3): add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up,n,8), %r10
+ mov 16(up,n,8), %r11
+ ADCSBB 8(vp,n,8), %r10
+ ADCSBB 16(vp,n,8), %r11
+ sbb R32(%rbx), R32(%rbx) C save cy
+ mov %r8, %r9
+ dec n
+ jmp L(3)
+
+L(b0): add R32(%rbx), R32(%rbx) C restore cy
+ mov 8(up,n,8), %r9
+ mov 16(up,n,8), %r10
+ mov 24(up,n,8), %r11
+ ADCSBB 8(vp,n,8), %r9
+ ADCSBB 16(vp,n,8), %r10
+ ADCSBB 24(vp,n,8), %r11
+ sbb R32(%rbx), R32(%rbx) C save cy
+ jmp L(4)
+
+ ALIGN(16)
+
+L(top): add R32(%rbx), R32(%rbx) C restore cy
+ mov (up,n,8), %r8
+ mov 8(up,n,8), %r9
+ mov 16(up,n,8), %r10
+ mov 24(up,n,8), %r11
+ ADCSBB (vp,n,8), %r8
+ ADCSBB 8(vp,n,8), %r9
+ ADCSBB 16(vp,n,8), %r10
+ ADCSBB 24(vp,n,8), %r11
+ sbb R32(%rbx), R32(%rbx) C save cy
+ shrd $1, %r8, %rbp
+ mov %rbp, -8(rp,n,8)
+L(4): shrd $1, %r9, %r8
+ mov %r8, (rp,n,8)
+L(3): shrd $1, %r10, %r9
+ mov %r9, 8(rp,n,8)
+L(2): shrd $1, %r11, %r10
+ mov %r10, 16(rp,n,8)
+L(1): add $4, n
+ mov %r11, %rbp
+ js L(top)
+
+L(end): shrd $1, %rbx, %rbp
+ mov %rbp, -8(rp)
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
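mpn_rsh1add_n computes {rp,n} = ({up,n} + {vp,n}) >> 1, returning the bit
shifted out at the bottom; the carry out of the addition becomes the top
bit of the result, which is what the cy value saved in %rbx feeds into the
final shrd above (rsh1sub_n is the same with subtract and borrow). A
hedged sketch, assuming unsigned __int128:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static mp_limb_t
    rsh1add_n_ref (mp_limb_t *rp, const mp_limb_t *up,
                   const mp_limb_t *vp, size_t n)
    {
      unsigned __int128 t = (unsigned __int128) up[0] + vp[0];
      mp_limb_t prev = (mp_limb_t) t;
      mp_limb_t cy = (mp_limb_t) (t >> 64);
      mp_limb_t retval = prev & 1;            /* bit shifted out */
      for (size_t i = 1; i < n; i++)
        {
          t = (unsigned __int128) up[i] + vp[i] + cy;
          mp_limb_t s = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
          rp[i - 1] = (prev >> 1) | (s << 63);
          prev = s;
        }
      rp[n - 1] = (prev >> 1) | (cy << 63);   /* carry becomes top bit */
      return retval;
    }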
diff --git a/gmp-6.3.0/mpn/x86_64/core2/rshift.asm b/gmp-6.3.0/mpn/x86_64/core2/rshift.asm
new file mode 100644
index 0000000..7578a53
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/rshift.asm
@@ -0,0 +1,143 @@
+dnl x86-64 mpn_rshift optimised for Conroe/Penryn and Nehalem.
+
+dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2 1.32
+C Intel NHM 1.30 (drops to 2.5 for n > 256)
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_rshift)
+ FUNC_ENTRY(4)
+
+ xor R32(%rax), R32(%rax)
+
+ test $1, R8(n)
+ jnz L(bx1)
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): lea 8(up), up
+ lea -24(rp), rp
+ mov -8(up), %r10
+ mov (up), %r11
+ shrd R8(cnt), %r10, %rax
+ mov 8(up), %r8
+ shr $2, n
+ jmp L(00)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): lea 16(up), up
+ lea -16(rp), rp
+ mov -16(up), %r9
+ shrd R8(cnt), %r9, %rax
+ shr $2, n
+ jz L(1)
+ mov -8(up), %r10
+ mov (up), %r11
+ jmp L(01)
+
+L(b10): lea 24(up), up
+ lea -8(rp), rp
+ mov -24(up), %r8
+ mov -16(up), %r9
+ shrd R8(cnt), %r8, %rax
+ shr $2, n
+ jz L(2)
+ mov -8(up), %r10
+ jmp L(10)
+
+L(b11): lea 32(up), up
+ mov -32(up), %r11
+ mov -24(up), %r8
+ mov -16(up), %r9
+ shrd R8(cnt), %r11, %rax
+ shr $2, n
+ jz L(end)
+
+ ALIGN(16)
+L(top): shrd R8(cnt), %r8, %r11
+ mov -8(up), %r10
+ mov %r11, (rp)
+L(10): shrd R8(cnt), %r9, %r8
+ mov (up), %r11
+ mov %r8, 8(rp)
+L(01): shrd R8(cnt), %r10, %r9
+ mov 8(up), %r8
+ mov %r9, 16(rp)
+L(00): shrd R8(cnt), %r11, %r10
+ mov 16(up), %r9
+ add $32, up
+ mov %r10, 24(rp)
+ add $32, rp
+ dec n
+ jnz L(top)
+
+L(end): shrd R8(cnt), %r8, %r11
+ mov %r11, (rp)
+L(2): shrd R8(cnt), %r9, %r8
+ mov %r8, 8(rp)
+L(1): shr R8(cnt), %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
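mpn_sec_tabselect copies entry `which` out of a table of `nents` entries of
n limbs each while touching every entry, so the memory access pattern is
independent of the secret index. The wrapped fastsse implementation does
this with vector masks; in portable C the idea looks like the sketch below
(hedged: a genuinely side-channel-silent routine must also ensure the
compiler emits no data-dependent branch for the mask):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static void
    sec_tabselect_ref (mp_limb_t *rp, const mp_limb_t *tab,
                       size_t n, size_t nents, size_t which)
    {
      for (size_t i = 0; i < n; i++)
        rp[i] = 0;
      for (size_t k = 0; k < nents; k++)
        {
          mp_limb_t mask = -(mp_limb_t) (k == which);  /* all-ones iff hit */
          for (size_t i = 0; i < n; i++)
            rp[i] |= tab[k * n + i] & mask;
        }
    }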
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm
new file mode 100644
index 0000000..a112c1b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm
@@ -0,0 +1,984 @@
+dnl X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
+dnl It also seems good for Conroe/Wolfdale.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core 4.9 4.18-4.25 3.87
+C Intel NHM 3.8 4.06-4.2 3.5
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C Code structure:
+C
+C
+C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4)
+C | | | |
+C | | | |
+C | | | |
+C \|/ \|/ \|/ \|/
+C ____________ ____________
+C / \ / \
+C \|/ \ \|/ \
+C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4)
+C \ /|\ \ /|\
+C \____________/ \____________/
+C \ /
+C \ /
+C \ /
+C tail(0m2) tail(1m2)
+C \ /
+C \ /
+C sqr_diag_addlsh1
+
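+C In formula form: the m_2/am_2 passes accumulate the off-diagonal
+C triangle T = sum_{i<j} u[i]*u[j]*B^(i+j), and the final
+C sqr_diag_addlsh1 step computes rp = 2*T + sum_i u[i]^2*B^(2i),
+C doubling the triangle and adding the diagonal squares in one
+C carry chain.
+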
+C TODO
+C * Tune. None done so far.
+C * Currently 2761 bytes; making it smaller would be nice.
+C * Consider using a jumptab-based entry sequence. One might even use a mask-
+C less sequence, if the table is large enough to support tuneup's needs.
+C The code would be, using non-PIC code,
+C lea tab(%rip),%rax; jmp *(n,%rax)
+C or,
+C lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx
+C using PIC code. The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,..
+C with the last four entries repeated a safe number of times.
+C * Consider expanding feed-in code in order to avoid zeroing registers.
+C * Zero consistently with xor.
+C * Check if using "lea (reg),reg" should be done in more places; we have some
+C explicit "mov %rax,reg" now.
+C * Try zeroing with xor in m2 loops.
+C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
+C between loop header and wind-down code.
+C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This saves a byte.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+C Define this to $1 to use the late loop index variable as zero, or to $2
+C to use an explicit $0.
+define(`Z',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+
+define(`n', `%r8')
+
+define(`v0', `%r10')
+define(`v1', `%r11')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r9')
+define(`i', `%r13')
+
+define(`X0', `%r12')
+define(`X1', `%r14')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ cmp $4, n_param
+ jl L(small)
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov (up), v0
+ mov 8(up), %rax
+ mov %rax, v1
+
+ mov $1, R32(n)
+ sub n_param, n C n = -n_param+1
+ push n
+
+ lea (up,n_param,8), up
+ lea (rp,n_param,8), rp
+
+ mul v0
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ mov %rax, (rp,n,8)
+ jnz L(b10)
+
+L(b00): lea (n), i C n = 5, 9, ...
+ mov %rdx, w1 C FIXME: Use lea?
+ xor R32(w2), R32(w2)
+ jmp L(m2e0)
+
+L(b10): lea 2(n), i C n = 7, 11, ...
+ mov 8(up,n,8), %rax
+ mov %rdx, w3 C FIXME: Use lea?
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ jmp L(m2e2)
+
+L(bx1): test $2, R8(n)
+ mov %rax, (rp,n,8)
+ jz L(b11)
+
+L(b01): lea 1(n), i C n = 6, 10, ...
+ mov %rdx, w0 C FIXME: Use lea?
+ xor R32(w1), R32(w1)
+ jmp L(m2e1)
+
+L(b11): lea -1(n), i C n = 4, 8, 12, ...
+ mov %rdx, w2 C FIXME: Use lea?
+ xor R32(w3), R32(w3)
+ jmp L(m2e3)
+
+
+ ALIGNx
+L(m2top1):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+L(m2e1):mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top1)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w0, %rax
+ adc w1, %rdx
+ mov %rax, I((rp),(rp,i,8))
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n C decrease |n|
+ jmp L(am2o3)
+
+ ALIGNx
+L(m2top3):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+L(m2e3):mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top3)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w0, %rax
+ adc w1, %rdx
+ mov %rax, I((rp),(rp,i,8))
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n C decrease |n|
+ cmp $-1, n
+ jz L(cor1) C jumps iff entry n = 4
+
+L(am2o1):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea 1(n), i
+ mul v0
+ mov %rax, X1
+ MOV( %rdx, X0, 128)
+ mov (rp,n,8), w1
+ xor R32(w2), R32(w2)
+ mov 8(up,n,8), %rax
+ xor R32(w3), R32(w3)
+ jmp L(lo1)
+
+ ALIGNx
+L(am2top1):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+L(lo1): mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top1)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+
+L(am2o3):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea -1(n), i
+ mul v0
+ mov %rax, X1
+ MOV( %rdx, X0, 8)
+ mov (rp,n,8), w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ mov 8(up,n,8), %rax
+ jmp L(lo3)
+
+ ALIGNx
+L(am2top3):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+L(lo3): mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top3)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+ cmp $-1, n
+ jnz L(am2o1)
+
+L(cor1):pop n
+ mov %rdx, w3
+ mov -16(up), v0
+ mov -8(up), %rax
+ mul v0
+ add w3, %rax
+ adc $0, %rdx
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ jmp L(sqr_diag_addlsh1)
+
+ ALIGNx
+L(m2top2):
+L(m2e2):mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top2)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w0, %rax
+ adc w1, %rdx
+ mov %rax, I((rp),(rp,i,8))
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n C decrease |n|
+ jmp L(am2o0)
+
+ ALIGNx
+L(m2top0):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+L(m2e0):mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top0)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w0, %rax
+ adc w1, %rdx
+ mov %rax, I((rp),(rp,i,8))
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n C decrease |n|
+ cmp $-2, n
+ jz L(cor2) C jumps iff entry n = 5
+
+L(am2o2):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea -2(n), i
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 32)
+ mov (rp,n,8), w0
+ xor R32(w1), R32(w1)
+ xor R32(w2), R32(w2)
+ mov 8(up,n,8), %rax
+ jmp L(lo2)
+
+ ALIGNx
+L(am2top2):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+L(lo2): mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top2)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+
+L(am2o0):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea 0(n), i
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 2)
+ xor R32(w0), R32(w0)
+ mov (rp,n,8), w2
+ xor R32(w3), R32(w3)
+ jmp L(lo0)
+
+ ALIGNx
+L(am2top0):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo0): mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top0)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+ cmp $-2, n
+ jnz L(am2o2)
+
+L(cor2):pop n
+ mov -24(up), v0
+ mov %rax, w2
+ mov %rdx, w0
+ mov -16(up), %rax
+ mov %rax, v1
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 32)
+ mov -8(up), %rax
+ mul v0
+ add w2, X0
+ mov X0, -16(rp)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov -8(up), %rax
+ adc $0, X0
+ mul v1
+ add w0, X1
+ adc $0, X0
+ mov X1, -8(rp)
+ add X0, %rax
+ mov %rax, (rp)
+ adc $0, %rdx
+ mov %rdx, 8(rp)
+ lea 8(rp), rp
+
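+C Final pass: double the cross-product sum accumulated in rp[] while adding
+C in the diagonal squares up[i]^2, producing all result limbs in one sweep.
+C Roughly, in C (a sketch of the invariant only; the real loop interleaves
+C the doubling and the two carry chains):
+C
+C	umul_ppmm (hi, lo, up[i], up[i]);
+C	rp[2*i]   = 2*rp[2*i]   + lo;	/* plus incoming carries */
+C	rp[2*i+1] = 2*rp[2*i+1] + hi;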
+L(sqr_diag_addlsh1):
+ mov -8(up,n,8), %rax
+ shl n
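+C n was just doubled, which is why up[] below is indexed with scale 4
+C rather than the usual 8.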
+ xor R32(%rbx), R32(%rbx)
+ mul %rax
+ mov 8(rp,n,8), %r11
+ lea (%rdx), %r10
+ mov 16(rp,n,8), %r9
+ add %r11, %r11
+ jmp L(dm)
+
+ ALIGNx
+L(dtop):mul %rax
+ add %r11, %r10
+ mov 8(rp,n,8), %r11
+ mov %r10, -8(rp,n,8)
+ adc %r9, %rax
+ lea (%rdx,%rbx), %r10
+ mov 16(rp,n,8), %r9
+ adc %r11, %r11
+L(dm): mov %rax, (rp,n,8)
+ mov (up,n,4), %rax
+ adc %r9, %r9
+ setc R8(%rbx)
+ add $2, n
+ js L(dtop)
+
+ mul %rax
+ add %r11, %r10
+ mov %r10, -8(rp)
+ adc %r9, %rax
+ lea (%rdx,%rbx), %r10
+ mov %rax, (rp)
+ adc $0, %r10
+ mov %r10, 8(rp)
+
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(small):
+ mov (up), %rax
+ cmp $2, n_param
+ jae L(gt1)
+L(n1):
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): jne L(gt2)
+L(n2): mov %rax, %r8
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, %r9
+ mul %rax
+ mov %rax, %r10
+ mov %r11, %rax
+ mov %rdx, %r11
+ mul %r8
+ xor %r8, %r8
+ add %rax, %r9
+ adc %rdx, %r10
+ adc %r8, %r11
+ add %rax, %r9
+ mov %r9, 8(rp)
+ adc %rdx, %r10
+ mov %r10, 16(rp)
+ adc %r8, %r11
+ mov %r11, 24(rp)
+ FUNC_EXIT()
+ ret
+
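+C n = 3: write the three diagonal squares to rp[0..5], then form the three
+C cross products, double them, and add them in at rp[1..5].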
+L(gt2):
+L(n3): mov %rax, %r10
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, 8(rp)
+ mul %rax
+ mov 16(up), %rcx
+ mov %rax, 16(rp)
+ mov %rcx, %rax
+ mov %rdx, 24(rp)
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov %r11, %rax
+ mul %r10
+ mov %rax, %r8
+ mov %rcx, %rax
+ mov %rdx, %r9
+ mul %r10
+ xor %r10, %r10
+ add %rax, %r9
+ mov %r11, %rax
+ mov %r10, %r11
+ adc %rdx, %r10
+
+ mul %rcx
+ add %rax, %r10
+ adc %r11, %rdx
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %rdx, %rdx
+ adc %r11, %r11
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %rdx, 32(rp)
+ adc %r11, 40(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm
new file mode 100644
index 0000000..46488fc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm
@@ -0,0 +1,47 @@
+dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+define(ADDSUB, sub)
+define(ADCSBB, sbb)
+define(func, mpn_sublsh1_n)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm
new file mode 100644
index 0000000..f3b1e28
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm
@@ -0,0 +1,47 @@
+dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+define(ADDSUB, sub)
+define(ADCSBB, sbb)
+define(func, mpn_sublsh2_n)
+
+MULFUNC_PROLOGUE(mpn_sublsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm
new file mode 100644
index 0000000..272700d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm
@@ -0,0 +1,158 @@
+dnl AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << C), optimised for Core 2 and
+dnl Core iN.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+C cycles/limb
+C AMD K8,K9 4.25
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 3
+C Intel NHM 3.1
+C Intel SBR 2.47
+C Intel atom ?
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
+
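+C This computes rp[] = up[] - (vp[] << LSH) and returns the borrow plus the
+C bits shifted out of vp's top limb.  A rough C equivalent (a sketch using a
+C temporary tp[], not how the loop below actually works):
+C
+C	cy  = mpn_lshift (tp, vp, n, LSH);	/* tp[] = vp[] << LSH */
+C	cy += mpn_sub_n (rp, up, tp, n);	/* rp[] = up[] - tp[] */
+C	return cy;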
+ASM_START()
+ TEXT
+ ALIGN(8)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %r12
+
+ mov R32(%rcx), R32(%rax)
+ lea 24(up,n,8), up
+ lea 24(vp,n,8), vp
+ lea 24(rp,n,8), rp
+ neg n
+
+ xor R32(%r11), R32(%r11)
+
+ mov -24(vp,n,8), %r8 C do first limb early
+ shrd $RSH, %r8, %r11
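+C shrd $RSH,new,old computes (new << LSH) | (old >> RSH), i.e. the next
+C limb of vp << LSH with the bits carried in from the limb below.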
+
+ and $3, R32(%rax)
+ je L(b0)
+ cmp $2, R32(%rax)
+ jc L(b1)
+ je L(b2)
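+C Dispatch on n mod 4: L(b0) enters the four-way unrolled loop directly,
+C while L(b1), L(b2) and L(b3) first handle 1, 2 or 3 limbs respectively.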
+
+L(b3): mov -16(vp,n,8), %r9
+ shrd $RSH, %r9, %r8
+ mov -8(vp,n,8), %r10
+ shrd $RSH, %r10, %r9
+ mov -24(up,n,8), %r12
+ ADDSUB %r11, %r12
+ mov %r12, -24(rp,n,8)
+ mov -16(up,n,8), %r12
+ ADCSBB %r8, %r12
+ mov %r12, -16(rp,n,8)
+ mov -8(up,n,8), %r12
+ ADCSBB %r9, %r12
+ mov %r12, -8(rp,n,8)
+ mov %r10, %r11
+ sbb R32(%rax), R32(%rax) C save cy
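+C (the sbb materialises the borrow as 0 or -1 in %rax; the matching
+C "add R32(%rax), R32(%rax)" inside L(top) regenerates CF from it)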
+ add $3, n
+ js L(top)
+ jmp L(end)
+
+L(b1): mov -24(up,n,8), %r12
+ ADDSUB %r11, %r12
+ mov %r12, -24(rp,n,8)
+ mov %r8, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+ inc n
+ js L(top)
+ jmp L(end)
+
+L(b2): mov -16(vp,n,8), %r9
+ shrd $RSH, %r9, %r8
+ mov -24(up,n,8), %r12
+ ADDSUB %r11, %r12
+ mov %r12, -24(rp,n,8)
+ mov -16(up,n,8), %r12
+ ADCSBB %r8, %r12
+ mov %r12, -16(rp,n,8)
+ mov %r9, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+ add $2, n
+ js L(top)
+ jmp L(end)
+
+ ALIGN(16)
+L(top): mov -24(vp,n,8), %r8
+ shrd $RSH, %r8, %r11
+L(b0): mov -16(vp,n,8), %r9
+ shrd $RSH, %r9, %r8
+ mov -8(vp,n,8), %r10
+ shrd $RSH, %r10, %r9
+ mov (vp,n,8), %rbx
+ shrd $RSH, %rbx, %r10
+
+ add R32(%rax), R32(%rax) C restore cy
+
+ mov -24(up,n,8), %r12
+ ADCSBB %r11, %r12
+ mov %r12, -24(rp,n,8)
+
+ mov -16(up,n,8), %r12
+ ADCSBB %r8, %r12
+ mov %r12, -16(rp,n,8)
+
+ mov -8(up,n,8), %r12
+ ADCSBB %r9, %r12
+ mov %r12, -8(rp,n,8)
+
+ mov (up,n,8), %r12
+ ADCSBB %r10, %r12
+ mov %r12, (rp,n,8)
+
+ mov %rbx, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+
+ add $4, n
+ js L(top)
+
+L(end): shr $RSH, %r11
+ pop %r12
+ pop %rbx
+ sub R32(%r11), R32(%rax)
+ neg R32(%rax)
+ FUNC_EXIT()
+ ret
+EPILOGUE()