Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/zen')
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm       37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm       227
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm       165
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/com.asm              37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/copyd.asm            37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/copyi.asm            37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm           37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm           37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h        280
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/hamdist.asm          38
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/lshift.asm           37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm          37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/mul_1.asm           161
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm    455
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm  299
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/popcount.asm         38
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/rshift.asm           37
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm    507
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm    482
-rw-r--r--  gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm        37
20 files changed, 3022 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm
new file mode 100644
index 0000000..803fa30
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_addlsh1_n, mpn_addlsh1_nc, mpn_rsblsh1_n, mpn_rsblsh1_nc.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+include_mpn(`x86_64/atom/aorrlsh1_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm
new file mode 100644
index 0000000..417dd0a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm
@@ -0,0 +1,227 @@
+dnl AMD64 mpn_addlsh_n, mpn_rsblsh_n.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 n/a
+C AMD bd2 n/a
+C AMD bd3 n/a
+C AMD bd4 2.31
+C AMD zn1 1.69
+C AMD zn2 1.55
+C AMD zn3 1.36
+C AMD bt1 n/a
+C AMD bt2 n/a
+C Intel P4 n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL 2.08
+C Intel BWL 1.78
+C Intel SKL 1.78
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C TODO
+C * Perhaps avoid using jrcxz by using dec n + jnz.
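+C   A hedged sketch of that alternative (an editor's assumption, not the
+C   committed code): dec writes ZF but leaves CF intact, so the ADCSBB
+C   chain's carry would survive a bottom-tested loop such as
+C
+C	dec	n		C CF preserved; ZF set once n reaches 0
+C	jnz	L(top)		C loop while n != 0
+C
+C   in place of the flag-neutral jrcxz test at the loop top.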
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cnt', `%r8')
+
+define(`tnc', `%r9')
+
+ifdef(`OPERATION_addlsh_n',`
+ define(ADCSBB, `adc')
+ define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+ define(ADCSBB, `sbb')
+ define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ mov (vp), %r10
+
+ mov R32(n), R32(%rax)
+ shr $3, n
+ xor R32(tnc), R32(tnc)
+ sub cnt, tnc
+ and $7, R32(%rax)
+
+ lea L(tab)(%rip), %r11
+ifdef(`PIC',`
+ movslq (%r11,%rax,4), %rax
+ add %r11, %rax
+ jmp *%rax
+',`
+ jmp *(%r11,%rax,8)
+')
+
+L(0): lea 32(up), up
+ lea 32(vp), vp
+ lea 32(rp), rp
+ xor R32(%r11), R32(%r11)
+ jmp L(e0)
+
+L(7): mov %r10, %r11
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ xor R32(%r10), R32(%r10)
+ jmp L(e7)
+
+L(6): lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ xor R32(%r11), R32(%r11)
+ jmp L(e6)
+
+L(5): mov %r10, %r11
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ xor R32(%r10), R32(%r10)
+ jmp L(e5)
+
+L(end): ADCSBB 24(up), %rax
+ mov %rax, -40(rp)
+ shrx( tnc, %r11, %rax)
+ ADCSBB n, %rax
+ FUNC_EXIT()
+ ret
+
+ ALIGN(32)
+L(top): jrcxz L(end)
+ mov -32(vp), %r10
+ ADCSBB 24(up), %rax
+ lea 64(up), up
+ shrx( tnc, %r11, %r11)
+ mov %rax, -40(rp)
+L(e0): dec n
+ shlx( cnt, %r10, %rax)
+ lea (%r11,%rax), %rax
+ mov -24(vp), %r11
+ ADCSBB -32(up), %rax
+ shrx( tnc, %r10, %r10)
+ mov %rax, -32(rp)
+L(e7): shlx( cnt, %r11, %rax)
+ lea (%r10,%rax), %rax
+ mov -16(vp), %r10
+ ADCSBB -24(up), %rax
+ shrx( tnc, %r11, %r11)
+ mov %rax, -24(rp)
+L(e6): shlx( cnt, %r10, %rax)
+ lea (%r11,%rax), %rax
+ mov -8(vp), %r11
+ ADCSBB -16(up), %rax
+ shrx( tnc, %r10, %r10)
+ mov %rax, -16(rp)
+L(e5): shlx( cnt, %r11, %rax)
+ lea (%r10,%rax), %rax
+ mov (vp), %r10
+ ADCSBB -8(up), %rax
+ shrx( tnc, %r11, %r11)
+ mov %rax, -8(rp)
+L(e4): shlx( cnt, %r10, %rax)
+ lea (%r11,%rax), %rax
+ mov 8(vp), %r11
+ ADCSBB (up), %rax
+ shrx( tnc, %r10, %r10)
+ mov %rax, (rp)
+L(e3): shlx( cnt, %r11, %rax)
+ lea (%r10,%rax), %rax
+ mov 16(vp), %r10
+ ADCSBB 8(up), %rax
+ shrx( tnc, %r11, %r11)
+ mov %rax, 8(rp)
+L(e2): shlx( cnt, %r10, %rax)
+ lea (%r11,%rax), %rax
+ mov 24(vp), %r11
+ ADCSBB 16(up), %rax
+ lea 64(vp), vp
+ shrx( tnc, %r10, %r10)
+ mov %rax, 16(rp)
+ lea 64(rp), rp
+L(e1): shlx( cnt, %r11, %rax)
+ lea (%r10,%rax), %rax
+ jmp L(top)
+
+L(4): xor R32(%r11), R32(%r11)
+ jmp L(e4)
+
+L(3): mov %r10, %r11
+ lea -8(up), up
+ lea -8(vp), vp
+ lea -8(rp), rp
+ xor R32(%r10), R32(%r10)
+ jmp L(e3)
+
+L(2): lea -16(up), up
+ lea -16(vp), vp
+ lea -16(rp), rp
+ xor R32(%r11), R32(%r11)
+ jmp L(e2)
+
+L(1): mov %r10, %r11
+ lea -24(up), up
+ lea 40(vp), vp
+ lea 40(rp), rp
+ xor R32(%r10), R32(%r10)
+ jmp L(e1)
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
+ JMPENT( L(4), L(tab))
+ JMPENT( L(5), L(tab))
+ JMPENT( L(6), L(tab))
+ JMPENT( L(7), L(tab))
diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm
new file mode 100644
index 0000000..89795e3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm
@@ -0,0 +1,165 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx.
+
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 -
+C AMD bd3 -
+C AMD bd4 4.3
+C AMD zen 2
+C AMD bt1 -
+C AMD bt2 -
+C Intel P4 -
+C Intel PNR -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rdx')
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`ADCSBB', `adc')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`ADCSBB', `sbb')
+ define(`func', `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ mov (up), %r8
+
+ push %rbx
+ push %r12
+ push %r13
+
+ lea (up,n_param,8), up
+ lea -32(rp,n_param,8), rp
+ mov R32(n_param), R32(%rax)
+ xchg v0_param, v0 C FIXME: is this insn fast?
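+C Should xchg prove slow, one alternative is a plain 3-mov swap (a sketch,
+C not the committed code; it assumes %r10 is still dead here, as it is
+C until the first mulx writes it):
+C
+C	mov	v0_param, %r10	C save old %rcx
+C	mov	n_param, n	C %rdx -> %rcx
+C	mov	%r10, v0	C old %rcx -> %rdx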
+
+ neg n
+
+ and $3, R8(%rax)
+ jz L(b0)
+ cmp $2, R8(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mulx( %r8, %rbx, %rax)
+ sub $-1, n
+ jz L(wd1)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ test R32(%rax), R32(%rax) C clear cy
+ jmp L(lo1)
+
+L(b0): mulx( %r8, %r9, %r8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ xor R32(%rax), R32(%rax)
+ jmp L(lo0)
+
+L(b3): mulx( %r8, %r11, %r10)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ sub $-3, n
+ jz L(wd3)
+ test R32(%rax), R32(%rax) C clear cy
+ jmp L(lo3)
+
+L(b2): mulx( %r8, %r13, %r12)
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax
+ add %r12, %rbx
+ adc $0, %rax
+ sub $-2, n
+ jz L(wd2)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ test R32(%rax), R32(%rax) C clear cy
+ jmp L(lo2)
+
+L(top): ADDSUB %r9, (rp,n,8)
+L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ ADCSBB %r11, 8(rp,n,8)
+L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ ADCSBB %r13, 16(rp,n,8)
+L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ ADCSBB %rbx, 24(rp,n,8)
+ adc %rax, %r9
+L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax C rax = carry limb
+ add $4, n
+ js L(top)
+
+L(end): ADDSUB %r9, (rp)
+L(wd3): ADCSBB %r11, 8(rp)
+L(wd2): ADCSBB %r13, 16(rp)
+L(wd1): ADCSBB %rbx, 24(rp)
+ adc n, %rax
+ pop %r13
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/com.asm b/gmp-6.3.0/mpn/x86_64/zen/com.asm
new file mode 100644
index 0000000..b34f841
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/com.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_com optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/copyd.asm b/gmp-6.3.0/mpn/x86_64/zen/copyd.asm
new file mode 100644
index 0000000..63ed237
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/copyd.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyd optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/copyi.asm b/gmp-6.3.0/mpn/x86_64/zen/copyi.asm
new file mode 100644
index 0000000..1aafaaa
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/copyi.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_copyi optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm
new file mode 100644
index 0000000..0ffb6ca
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_11.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/bd2/gcd_11.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm
new file mode 100644
index 0000000..5dfd9e3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_22.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_22)
+include_mpn(`x86_64/coreihwl/gcd_22.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h
new file mode 100644
index 0000000..05a12b3
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h
@@ -0,0 +1,280 @@
+/* AMD Zen gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions. FIXME: We should disable lib inclusion. */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3700-4300 MHz Pinnacle Ridge */
+/* FFT tuning limit = 468,514,360 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 32
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 22
+
+#define DIV_1_VS_MUL_1_PERCENT 338
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 107
+#define MUL_TOOM44_THRESHOLD 190
+#define MUL_TOOM6H_THRESHOLD 230
+#define MUL_TOOM8H_THRESHOLD 272
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 106
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 117
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 32
+#define SQR_TOOM3_THRESHOLD 114
+#define SQR_TOOM4_THRESHOLD 422
+#define SQR_TOOM6_THRESHOLD 0 /* always */
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 40
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 540 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 540, 5}, { 22, 6}, { 12, 5}, { 25, 6}, \
+ { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \
+ { 13, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 7}, { 43, 9}, { 11, 8}, { 29, 9}, \
+ { 15, 8}, { 35, 9}, { 19, 8}, { 43, 9}, \
+ { 23, 8}, { 49, 9}, { 27,10}, { 15, 9}, \
+ { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \
+ { 55,11}, { 15,10}, { 31, 9}, { 67,10}, \
+ { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \
+ { 55,11}, { 31,10}, { 79,11}, { 47,10}, \
+ { 103,12}, { 31,11}, { 63,10}, { 135,11}, \
+ { 79,10}, { 167,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 159,12}, { 95,11}, { 191,13}, \
+ { 63,12}, { 127,11}, { 255,10}, { 511,11}, \
+ { 271,10}, { 543,11}, { 287,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \
+ { 1343,11}, { 351,12}, { 191,11}, { 383,10}, \
+ { 767,11}, { 415,10}, { 831,12}, { 223,11}, \
+ { 447,13}, { 127,12}, { 255,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \
+ { 1279,11}, { 671,10}, { 1343, 9}, { 2687,12}, \
+ { 351,11}, { 703,13}, { 191,12}, { 383,11}, \
+ { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \
+ { 447,14}, { 127,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \
+ { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 671,11}, { 1343,10}, \
+ { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 895,11}, { 1791,12}, { 927,11}, \
+ { 1855,12}, { 959,11}, { 1919,10}, { 3839,13}, \
+ { 511,12}, { 1087,11}, { 2175,13}, { 575,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \
+ { 2687,13}, { 703,12}, { 1407,14}, { 383,13}, \
+ { 767,12}, { 1599,13}, { 831,12}, { 1727,11}, \
+ { 3455,13}, { 895,12}, { 1855,13}, { 959,12}, \
+ { 1919,11}, { 3839,14}, { 511,13}, { 1087,12}, \
+ { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,12}, { 2943,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \
+ { 3455,14}, { 895,13}, { 1855,12}, { 3711,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \
+ { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \
+ { 2431,13}, { 4863,15}, { 1279,14}, { 2687,13}, \
+ { 5375,14}, { 2943,13}, { 5887,15}, { 1535,14}, \
+ { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \
+ { 7679,16}, { 1023,15}, { 2047,14}, { 4479,15}, \
+ { 2303,14}, { 4991,15}, { 2559,14}, { 5247,15}, \
+ { 2815,14}, { 5887,16}, { 1535,15}, { 3327,14}, \
+ { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \
+ { 2047,15}, { 4095,14}, { 8191,15}, { 4351,14}, \
+ { 8959,15}, { 4863,16}, { 2559,15}, { 5375,14}, \
+ { 11007,15}, { 5887,14}, { 11775,16}, { 3071,15}, \
+ { 6911,16}, { 3583,15}, { 7167,14}, { 14335,15}, \
+ { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \
+ { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \
+ { 9215,14}, { 18431,15}, { 9727,14}, { 19455,15}, \
+ { 9983,14}, { 19967,16}, { 5119,15}, { 11007,16}, \
+ { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 271
+#define MUL_FFT_THRESHOLD 6272
+
+#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 404, 5}, { 13, 4}, { 27, 5}, { 21, 6}, \
+ { 11, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 14, 5}, { 29, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 25, 8}, \
+ { 13, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 29, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \
+ { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \
+ { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \
+ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \
+ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \
+ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \
+ { 135,11}, { 79,10}, { 159,11}, { 95,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \
+ { 143,10}, { 287, 9}, { 575,11}, { 159,12}, \
+ { 95,11}, { 191,13}, { 63,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 271,10}, { 543,11}, \
+ { 287,10}, { 575,11}, { 303,12}, { 159,11}, \
+ { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \
+ { 1343,11}, { 351,10}, { 703,11}, { 367,10}, \
+ { 735,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 399,10}, { 799,11}, { 415,10}, { 831,12}, \
+ { 223,11}, { 447,10}, { 895,13}, { 127,12}, \
+ { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \
+ { 1279,11}, { 671,10}, { 1343,12}, { 351,11}, \
+ { 703,10}, { 1407,11}, { 735,10}, { 1471,13}, \
+ { 191,12}, { 383,11}, { 767,10}, { 1535,11}, \
+ { 799,12}, { 415,11}, { 831,10}, { 1663,12}, \
+ { 447,11}, { 895,14}, { 127,13}, { 255,12}, \
+ { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \
+ { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \
+ { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \
+ { 1343,12}, { 703,11}, { 1407,12}, { 735,11}, \
+ { 1471,13}, { 383,12}, { 767,11}, { 1535,12}, \
+ { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \
+ { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \
+ { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1343,13}, { 703,12}, \
+ { 1471,11}, { 2943,14}, { 383,13}, { 767,12}, \
+ { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \
+ { 895,12}, { 1855,13}, { 959,15}, { 255,14}, \
+ { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \
+ { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \
+ { 1343,12}, { 2687,13}, { 1471,12}, { 2943,14}, \
+ { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \
+ { 3455,14}, { 895,13}, { 1855,12}, { 3711,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \
+ { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \
+ { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \
+ { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \
+ { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \
+ { 511,15}, { 1023,14}, { 2047,13}, { 4095,14}, \
+ { 2175,13}, { 4479,12}, { 8959,14}, { 2431,13}, \
+ { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \
+ { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \
+ { 1791,14}, { 3839,13}, { 7679,14}, { 3967,16}, \
+ { 1023,15}, { 2047,14}, { 4479,15}, { 2303,14}, \
+ { 4991,15}, { 2559,14}, { 5247,15}, { 2815,14}, \
+ { 5887,13}, { 11775,16}, { 1535,15}, { 3071,14}, \
+ { 6143,15}, { 3327,14}, { 6911,15}, { 3839,14}, \
+ { 7679,17}, { 1023,16}, { 2047,15}, { 4095,14}, \
+ { 8191,15}, { 4351,14}, { 8959,15}, { 4863,14}, \
+ { 9727,16}, { 2559,15}, { 5887,14}, { 11775,16}, \
+ { 3071,15}, { 6911,16}, { 3583,15}, { 7167,14}, \
+ { 14335,15}, { 7679,14}, { 15359,15}, { 7935,14}, \
+ { 15871,17}, { 2047,16}, { 4095,15}, { 8959,16}, \
+ { 4607,15}, { 9215,14}, { 18431,15}, { 9727,14}, \
+ { 19455,15}, { 9983,14}, { 19967,16}, { 5119,15}, \
+ { 10239,16}, { 5631,15}, { 11775,17}, { 3071,16}, \
+ { 6655,15}, { 13311,16}, { 65536,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 302
+#define SQR_FFT_THRESHOLD 4224
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 69
+#define MULLO_MUL_N_THRESHOLD 11278
+#define SQRLO_BASECASE_THRESHOLD 12
+#define SQRLO_DC_THRESHOLD 82
+#define SQRLO_SQR_THRESHOLD 8207
+
+#define DC_DIV_QR_THRESHOLD 76
+#define DC_DIVAPPR_Q_THRESHOLD 232
+#define DC_BDIV_QR_THRESHOLD 76
+#define DC_BDIV_Q_THRESHOLD 104
+
+#define INV_MULMOD_BNM1_THRESHOLD 37
+#define INV_NEWTON_THRESHOLD 274
+#define INV_APPR_THRESHOLD 230
+
+#define BINV_NEWTON_THRESHOLD 372
+#define REDC_1_TO_REDC_N_THRESHOLD 68
+
+#define MU_DIV_QR_THRESHOLD 1499
+#define MU_DIVAPPR_Q_THRESHOLD 1718
+#define MUPI_DIV_QR_THRESHOLD 108
+#define MU_BDIV_QR_THRESHOLD 1470
+#define MU_BDIV_Q_THRESHOLD 1787
+
+#define POWM_SEC_TABLE 3,22,81,494
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 486
+#define SET_STR_PRECOMPUTE_THRESHOLD 1264
+
+#define FAC_DSC_THRESHOLD 187
+#define FAC_ODD_THRESHOLD 0 /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD 23
+#define HGCD2_DIV1_METHOD 1 /* 9.20% faster than 3 */
+#define HGCD_THRESHOLD 109
+#define HGCD_APPR_THRESHOLD 104
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 566
+#define GCDEXT_DC_THRESHOLD 382
+#define JACOBI_BASE_METHOD 1 /* 15.55% faster than 3 */
+
+/* Tuneup completed successfully, took 281243 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm b/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm
new file mode 100644
index 0000000..48dcf61
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm
@@ -0,0 +1,38 @@
+dnl AMD64 mpn_hamdist -- hamming distance.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86_64/coreinhm/hamdist.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/lshift.asm b/gmp-6.3.0/mpn/x86_64/zen/lshift.asm
new file mode 100644
index 0000000..4dce319
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/lshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshift optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm
new file mode 100644
index 0000000..d52b194
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshiftc optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm b/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm
new file mode 100644
index 0000000..6a083ac
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm
@@ -0,0 +1,161 @@
+dnl AMD64 mpn_mul_1 for CPUs with mulx.
+
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 -
+C AMD K10 -
+C AMD bd1 -
+C AMD bd2 -
+C AMD bd3 -
+C AMD bd4 4.4
+C AMD zen 2
+C AMD bobcat -
+C AMD jaguar -
+C Intel P4 -
+C Intel PNR -
+C Intel NHM -
+C Intel SBR -
+C Intel IBR -
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom -
+C Intel SLM -
+C VIA nano -
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`v0_param',`%rcx') C r9
+
+define(`n', `%rcx')
+define(`v0', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ jmp L(ent)
+EPILOGUE()
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+ FUNC_ENTRY(4)
+ xor R32(%r8), R32(%r8) C carry-in limb
+L(ent): mov (up), %r9
+
+ push %rbx
+ push %r12
+ push %r13
+
+ lea (up,n_param,8), up
+ lea -32(rp,n_param,8), rp
+ mov R32(n_param), R32(%rax)
+ xchg v0_param, v0 C FIXME: is this insn fast?
+
+ neg n
+
+ and $3, R8(%rax)
+ jz L(b0)
+ cmp $2, R8(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mov %r8, %r12
+ mulx( %r9, %rbx, %rax)
+ sub $-1, n
+ jz L(wd1)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ add %r12, %rbx
+ jmp L(lo1)
+
+L(b3): mulx( %r9, %r11, %r10)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax
+ sub $-3, n
+ jz L(wd3)
+ add %r8, %r11
+ jmp L(lo3)
+
+L(b2): mov %r8, %r10 C carry-in limb
+ mulx( %r9, %r13, %r12)
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax
+ sub $-2, n
+ jz L(wd2)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ add %r10, %r13
+ jmp L(lo2)
+
+L(b0): mov %r8, %rax C carry-in limb
+ mulx( %r9, %r9, %r8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ add %rax, %r9
+ jmp L(lo0)
+
+L(top): jrcxz L(end)
+ adc %r8, %r11
+ mov %r9, (rp,n,8)
+L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r10, %r13
+ mov %r11, 8(rp,n,8)
+L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r12, %rbx
+ mov %r13, 16(rp,n,8)
+L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rax, %r9
+ mov %rbx, 24(rp,n,8)
+L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ lea 4(n), n
+ jmp L(top)
+
+L(end): mov %r9, (rp)
+L(wd3): adc %r8, %r11
+ mov %r11, 8(rp)
+L(wd2): adc %r10, %r13
+ mov %r13, 16(rp)
+L(wd1): adc %r12, %rbx
+ adc $0, %rax
+ mov %rbx, 24(rp)
+
+ pop %r13
+ pop %r12
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm
new file mode 100644
index 0000000..affa3b6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm
@@ -0,0 +1,455 @@
+dnl AMD64 mpn_mul_basecase optimised for AMD Zen.
+
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Try 2x unrolling instead of current 4x, at least for mul_1. Else consider
+C shallower sw pipelining of mul_1/addmul_1 loops, allowing 4 or 6 instead
+C of 8 product registers.
+C * Split up mul_1 into 4 loops in order to fall into the addmul_1 loops
+C without branch tree.
+C * Improve the overlapped software pipelining. The mulx in the osp block now
+C suffers from write/read conflicts, in particular the 1 mod 4 case. Also,
+C mul_1 could osp into addmul_1.
+C * Let vn_param be vn to save a copy.
+C * Re-allocate to benefit more from 32-bit encoding.
+C * Poor performance for e.g. n = 12,16.
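+C   For the first item, a 2x-unrolled mul_1 inner loop might look like the
+C   following hedged sketch (an assumption, not committed code; it presumes
+C   this file's negative-n indexing, an even limb count, CF clear and the
+C   incoming carry limb in %rax on entry):
+C
+C	L(t2):	mulx	(up,n,8), %r9, %r8	C lo in %r9, hi in %r8
+C		adc	%rax, %r9		C fold in previous high limb
+C		mov	%r9, (rp,n,8)
+C		mulx	8(up,n,8), %rbx, %rax
+C		adc	%r8, %rbx
+C		mov	%rbx, 8(rp,n,8)
+C		lea	2(n), n			C lea leaves CF alone
+C		jrcxz	L(t2e)
+C		jmp	L(t2)
+C	L(t2e):	adc	$0, %rax		C final carry limb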
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp_param', `%rcx')
+define(`vn_param', `%r8')
+
+define(`un', `%r14')
+define(`vp', `%rbp')
+define(`v0', `%rdx')
+define(`n', `%rcx')
+define(`vn', `%r15')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+ cmp $2, un_param
+ ja L(gen)
+ mov (vp_param), %rdx
+ mulx( (up), %rax, %r9) C 0 1
+ je L(s2x)
+
+L(s11): mov %rax, (rp)
+ mov %r9, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(s2x): cmp $2, vn_param
+ mulx( 8,(up), %r8, %r10) C 1 2
+ je L(s22)
+
+L(s21): add %r8, %r9
+ adc $0, %r10
+ mov %rax, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ FUNC_EXIT()
+ ret
+
+L(s22): add %r8, %r9 C 1
+ adc $0, %r10 C 2
+ mov 8(vp_param), %rdx
+ mov %rax, (rp)
+ mulx( (up), %r8, %r11) C 1 2
+ mulx( 8,(up), %rax, %rdx) C 2 3
+ add %r11, %rax C 2
+ adc $0, %rdx C 3
+ add %r8, %r9 C 1
+ adc %rax, %r10 C 2
+ adc $0, %rdx C 3
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
+
+
+L(gen): push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov un_param, un
+ mov vp_param, vp
+ mov vn_param, vn
+
+ mov (up), %r9
+ mov (vp), v0
+
+ lea (up,un,8), up
+ lea -32(rp,un,8), rp
+
+ neg un
+ mov un, n
+ test $1, R8(un)
+ jz L(mx0)
+L(mx1): test $2, R8(un)
+ jz L(mb3)
+
+L(mb1): mulx( %r9, %rbx, %rax)
+ inc n
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10
+ jmp L(mlo1)
+
+L(mb3): mulx( %r9, %r11, %r10)
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax
+ sub $-3, n
+ jz L(mwd3)
+ test R32(%rdx), R32(%rdx)
+ jmp L(mlo3)
+
+L(mx0): test $2, R8(un)
+ jz L(mb0)
+
+L(mb2): mulx( %r9, %r13, %r12)
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax
+ lea 2(n), n
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8
+ jmp L(mlo2)
+
+L(mb0): mulx( %r9, %r9, %r8)
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(up,un,8), %r11, %r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(up,un,8), %r13, %r12
+ jmp L(mlo0)
+
+L(mtop):jrcxz L(mend)
+ adc %r8, %r11
+ mov %r9, (rp,n,8)
+L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r10, %r13
+ mov %r11, 8(rp,n,8)
+L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r12, %rbx
+ mov %r13, 16(rp,n,8)
+L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rax, %r9
+ mov %rbx, 24(rp,n,8)
+L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ lea 4(n), n
+ jmp L(mtop)
+
+L(mend):mov %r9, (rp)
+ adc %r8, %r11
+L(mwd3):mov %r11, 8(rp)
+ adc %r10, %r13
+ mov %r13, 16(rp)
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %rbx, 24(rp)
+ mov %rax, 32(rp)
+ add $8, vp
+ dec vn
+ jz L(end)
+
+C The rest of the file consists of 4 osp loops around addmul_1
+
+ test $1, R8(un)
+ jnz L(0x1)
+
+L(0x0): test $2, R8(un)
+ jnz L(oloop2_entry)
+
+L(oloop0_entry):
+ C initial feed-in block
+ mov (vp), %rdx
+ add $8, vp
+ mov un, n
+ add $8, rp
+ .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (up,un,8), %r9, %r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(up,un,8), %r11, %r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(up,un,8), %r13, %r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18 C mulx 24(up,un,8), %rbx, %rax
+ add %r8, %r11
+ jmp L(lo0)
+
+L(oloop0):
+ C overlapped software pipelining block
+ mov (vp), %rdx C new
+ add $8, vp
+ add %r9, (rp) C prev
+ .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%r9,%r8
+ adc %r11, 8(rp) C prev
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 0x8(%rsi,%r14,8),%r11,%r10
+ adc %r13, 16(rp) C prev
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 0x10(%rsi,%r14,8),%r13,%r12
+ adc %rbx, 24(rp) C prev
+ mov un, n
+ adc $0, %rax C prev
+ mov %rax, 32(rp) C prev
+ add $8, rp
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%rbx,%rax
+ add %r8, %r11 C new
+ jmp L(lo0)
+
+ ALIGN(16)
+L(tp0): add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+L(lo0): adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(tp0)
+
+ dec vn
+ jne L(oloop0)
+
+ jmp L(final_wind_down)
+
+L(oloop2_entry):
+ mov (vp), %rdx
+ add $8, vp
+ lea 2(un), n
+ add $8, rp
+ .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (up,un,8), %r13, %r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax
+ add %r12, %rbx
+ adc $0, %rax
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ add %r13, 16(rp,n,8)
+ jmp L(lo2)
+
+L(oloop2):
+ mov (vp), %rdx
+ add $8, vp
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (up,un,8), %r13, %r12
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax
+ lea 2(un), n
+ add $8, rp
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8
+ add %r12, %rbx
+ adc $0, %rax
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%r11,%r10
+ add %r13, 16(rp,n,8)
+ jmp L(lo2)
+
+ ALIGN(16)
+L(tp2): add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+L(lo2): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(tp2)
+
+ dec vn
+ jne L(oloop2)
+
+ jmp L(final_wind_down)
+
+L(0x1): test $2, R8(un)
+ jz L(oloop3_entry)
+
+L(oloop1_entry):
+ mov (vp), %rdx
+ add $8, vp
+ lea 1(un), n
+ add $8, rp
+ .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (up,un,8), %rbx, %rax
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ add %rbx, 24(rp,n,8)
+ jmp L(lo1)
+
+L(oloop1):
+ mov (vp), %rdx
+ add $8, vp
+ add %r9, (rp)
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8
+ adc %r11, 8(rp)
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10
+ adc %r13, 16(rp)
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%r13,%r12
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (up,un,8), %rbx, %rax
+ lea 1(un), n
+ add $8, rp
+ add %rbx, 24(rp,n,8)
+ jmp L(lo1)
+
+ ALIGN(16)
+L(tp1): add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+L(lo1): adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(tp1)
+
+ dec vn
+ jne L(oloop1)
+
+ jmp L(final_wind_down)
+
+L(oloop3_entry):
+ mov (vp), %rdx
+ add $8, vp
+ lea 3(un), n
+ add $8, rp
+ .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ test n, n
+ jz L(wd3)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ add %r11, 8(rp,n,8)
+ jmp L(lo3)
+
+L(oloop3):
+ mov (vp), %rdx
+ add $8, vp
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10
+ adc %r13, 16(rp)
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax
+ lea 3(un), n
+ add $8, rp
+ add %r10, %r13
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r12, %rbx
+ adc $0, %rax
+ add %r11, 8(rp,n,8)
+ jmp L(lo3)
+
+ ALIGN(16)
+L(tp3): add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+L(lo3): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(tp3)
+
+ dec vn
+ jne L(oloop3)
+
+L(final_wind_down):
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+
+L(end): pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ FUNC_EXIT()
+ ret
+
+L(3): mov (vp), %rdx
+ add $8, vp
+ add $8, rp
+ .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+L(wd3): adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ dec vn
+ jne L(3)
+ jmp L(end)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm
new file mode 100644
index 0000000..2ae729a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm
@@ -0,0 +1,299 @@
+dnl X86-64 mpn_mullo_basecase optimised for AMD Zen.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r11')
+define(`nn', `%rbp')
+
+C TODO
+C * Rearrange feed-in jumps for short branch forms.
+C * Roll out the heavy artillery and 4-way unroll outer loop. Since feed-in
+C code implodes, the blow-up will not be more than perhaps 2.5x.
+C * Micro-optimise critical lead-in code blocks.
+C * Clean up register use, e.g. r15 vs vp, disuse of nn, etc.
+C * Write n < 4 code specifically for Zen (current code is for Haswell).
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+ cmp $4, R32(n)
+ jae L(big)
+
+ mov vp_param, vp
+ mov (up), %rdx
+
+ cmp $2, R32(n)
+ jae L(gt1)
+L(n1): imul (vp), %rdx
+ mov %rdx, (rp)
+ FUNC_EXIT()
+ ret
+L(gt1): ja L(gt2)
+L(n2): mov (vp), %r9
+ mulx( %r9, %rax, %rdx)
+ mov %rax, (rp)
+ mov 8(up), %rax
+ imul %r9, %rax
+ add %rax, %rdx
+ mov 8(vp), %r9
+ mov (up), %rcx
+ imul %r9, %rcx
+ add %rcx, %rdx
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+L(gt2):
+L(n3): mov (vp), %r9
+ mulx( %r9, %rax, %r10) C u0 x v0
+ mov %rax, (rp)
+ mov 8(up), %rdx
+ mulx( %r9, %rax, %rdx) C u1 x v0
+ imul 16(up), %r9 C u2 x v0
+ add %rax, %r10
+ adc %rdx, %r9
+ mov 8(vp), %r8
+ mov (up), %rdx
+ mulx( %r8, %rax, %rdx) C u0 x v1
+ add %rax, %r10
+ adc %rdx, %r9
+ imul 8(up), %r8 C u1 x v1
+ add %r8, %r9
+ mov %r10, 8(rp)
+ mov 16(vp), %r10
+ mov (up), %rax
+ imul %rax, %r10 C u0 x v2
+ add %r10, %r9
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(big): push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov (up), %r9
+ lea -8(up,n,8), up
+ lea -40(rp,n,8), rp
+
+ mov $4, R32(%r14)
+ sub n, %r14
+ mov -8(vp_param,n,8), %rbp
+ imul %r9, %rbp
+ lea 8(vp_param), %r15
+ mov (vp_param), %rdx
+
+ test $1, R8(%r14)
+ jnz L(mx0)
+L(mx1): test $2, R8(%r14)
+ jz L(mb3)
+
+L(mb1): mulx( %r9, %rbx, %rax)
+ lea -2(%r14), n
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r9,%r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r11,%r10
+ jmp L(mlo1)
+
+L(mb3): mulx( %r9, %r11, %r10)
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r13,%r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%rbx,%rax
+ lea (%r14), n
+ jrcxz L(x)
+ jmp L(mlo3)
+L(x): jmp L(mcor)
+
+L(mb2): mulx( %r9, %r13, %r12)
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%rbx,%rax
+ lea -1(%r14), n
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r9,%r8
+ jmp L(mlo2)
+
+L(mx0): test $2, R8(%r14)
+ jz L(mb2)
+
+L(mb0): mulx( %r9, %r9, %r8)
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r11,%r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r13,%r12
+ lea -3(%r14), n
+ jmp L(mlo0)
+
+ ALIGN(16)
+L(mtop):jrcxz L(mend)
+ adc %r8, %r11
+ mov %r9, (rp,n,8)
+L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r10, %r13
+ mov %r11, 8(rp,n,8)
+L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r12, %rbx
+ mov %r13, 16(rp,n,8)
+L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rax, %r9
+ mov %rbx, 24(rp,n,8)
+L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ lea 4(n), n
+ jmp L(mtop)
+
+L(mend):mov %r9, (rp)
+ adc %r8, %r11
+ mov %r11, 8(rp)
+ adc %r10, %r13
+ mov %r13, 16(rp)
+ adc %r12, %rbx
+ mov %rbx, 24(rp)
+
+L(outer):
+ mulx( (up), %r10, %r8) C FIXME r8 unused (use imul?)
+ adc %rax, %rbp
+ add %r10, %rbp
+ mov (%r15), %rdx
+ add $8, %r15
+ mov -24(up,%r14,8), %r8
+ lea -8(up), up
+
+ test $1, R8(%r14)
+ jz L(x0)
+L(x1): test $2, R8(%r14)
+ jnz L(b3)
+
+L(b1): mulx( %r8, %rbx, %rax)
+ lea -1(%r14), n
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+ jmp L(lo1)
+
+L(x0): test $2, R8(%r14)
+ jz L(b2)
+
+L(b0): mulx( %r8, %r9, %r8)
+ lea -2(%r14), n
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r11,%r10
+ .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (%rsi,%r14,8),%r13,%r12
+ jmp L(lo0)
+
+L(b3): mulx( %r8, %r11, %r10)
+ lea 1(%r14), n
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r13,%r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%rbx,%rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ jrcxz L(cor)
+ jmp L(lo3)
+
+L(cor): add 8(rp), %r11
+ mov 16(rp), %r10
+ mov 24(rp), %r12
+L(mcor):mov %r11, 8(rp)
+ adc %r10, %r13
+ adc %r12, %rbx
+ mulx( (up), %r10, %r8) C FIXME r8 unused (use imul?)
+ adc %rax, %rbp
+ add %r10, %rbp
+ mov (%r15), %rdx
+ mov -24(up), %r8
+ mulx( %r8, %r9, %r12)
+ mulx( -16,(up), %r14, %rax)
+ add %r12, %r14
+ adc $0, %rax
+ adc %r9, %r13
+ mov %r13, 16(rp)
+ adc %r14, %rbx
+ mulx( -8,(up), %r10, %r8) C FIXME r8 unused (use imul?)
+ adc %rax, %rbp
+ add %r10, %rbp
+ mov 8(%r15), %rdx
+ mulx( -24,(up), %r14, %rax)
+ add %r14, %rbx
+ mov %rbx, 24(rp)
+ mulx( -16,(up), %r10, %r8) C FIXME r8 unused (use imul?)
+ adc %rax, %rbp
+ add %r10, %rbp
+ mov %rbp, 32(rp)
+ pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ FUNC_EXIT()
+ ret
+
+L(b2): mulx( %r8, %r13, %r12)
+ lea (%r14), n
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%rbx,%rax
+ add %r12, %rbx
+ adc $0, %rax
+ .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%r9,%r8
+ jmp L(lo2)
+
+ ALIGN(16)
+L(top): add %r9, (rp,n,8)
+L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ js L(top)
+
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ inc %r14
+ jmp L(outer)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/popcount.asm b/gmp-6.3.0/mpn/x86_64/zen/popcount.asm
new file mode 100644
index 0000000..be1613b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/popcount.asm
@@ -0,0 +1,38 @@
+dnl AMD64 mpn_popcount -- population count.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/coreinhm/popcount.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/rshift.asm b/gmp-6.3.0/mpn/x86_64/zen/rshift.asm
new file mode 100644
index 0000000..0196870
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/rshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_rshift optimised for AMD Zen.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm
new file mode 100644
index 0000000..0c24de5
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm
@@ -0,0 +1,507 @@
+dnl AMD64 mpn_sbpi1_bdiv_r optimised for AMD Zen
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(`up', `%rdi')
+define(`un_param', `%rsi')
+define(`dp_param', `%rdx')
+define(`dn_param', `%rcx')
+define(`dinv', `%r8')
+
+define(`i', `%rcx')
+define(`dn', `%r14')
+
+define(`dp', `%rsi')
+define(`un', `%r15')
+
+C TODO
+C * The o1...o8 loops for special dn counts were naively hand-optimised by
+C folding the generic loops. They can probably be tuned. The speculative
+C quotient limb generation might not be in the optimal spot.
+C * Perhaps avoid late-in-loop jumps, e.g., lo0.
+C * Improve regalloc wrt dn_param/dn and un_param/un to save some moves.
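+C
+C In C terms the function is roughly as follows (hypothetical
+C ref_sbpi1_bdiv_r; the add-based updates below imply dinv == -1/d mod B,
+C B = 2^64; mpn_addmul_1 is GMP's up[] += q * dp[] primitive, returning the
+C high limb).  The asm keeps the running carry in %rbp, returns it in %rax,
+C and generates each next quotient limb speculatively inside the loop:
+C
+C	mp_limb_t
+C	ref_sbpi1_bdiv_r (mp_limb_t *up, long un,
+C	                  const mp_limb_t *dp, long dn, mp_limb_t dinv)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (long i = 0; i < un - dn; i++)
+C	    {
+C	      mp_limb_t q = dinv * up[i];	/* quotient limb, mod B */
+C	      mp_limb_t hi = mpn_addmul_1 (up + i, dp, dn, q);	/* up[i] -> 0 */
+C	      mp_limb_t t = hi + cy;
+C	      cy = t < hi;			/* carry from hi + cy */
+C	      up[i + dn] += t;
+C	      cy += up[i + dn] < t;		/* carry from the final add */
+C	    }
+C	  return cy;
+C	}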
+
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), dinv ')
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+
+ sub dn_param, un_param C outer loop count
+ mov dn_param, dn C FIXME: Suppress by reg re-alloc
+ push dinv C keep dinv on stack
+ mov un_param, un C FIXME: Suppress by reg re-alloc
+ xor R32(%rbp), R32(%rbp)
+
+ lea (dp_param,dn_param,8), dp
+
+ mov (up), %rdx
+ imul dinv, %rdx C first quotient limb
+
+ neg dn
+ lea -32(up,dn_param,8), up
+
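+C Dispatch on dn mod 4.  Each residue class has a dedicated loop for its
+C smallest dn (o1/o2/o3/o4), one for the next size up (o5/o6/o7/o8), and a
+C generic 4-way unrolled loop (out0..out3) for anything larger.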
+ test $1, R8(dn_param)
+ jnz L(cx1)
+
+L(cx0): test $2, R8(dn_param)
+ jnz L(b2)
+
+
+C =============================================================================
+L(b0): cmp $-4, dn
+ jnz L(gt4)
+
+L(o4): mulx( -32,(dp), %r9, %r14)
+ mulx( -24,(dp), %r11, %r10)
+ mulx( -16,(dp), %r13, %r12)
+ mulx( -8,(dp), %rbx, %rax)
+ add %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add (up), %r9
+ adc 8(up), %r11
+ mov %r8, %rdx C dinv
+ mov %r11, 8(up)
+ mulx( %r11, %rdx, %r12) C next quotient
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o4)
+ jmp L(ret)
+
+L(gt4): cmp $-8, dn
+ jnz L(out0)
+
+L(o8): mulx( -64,(dp), %r9, %r14)
+ mulx( -56,(dp), %rcx, %r10)
+ mulx( -48,(dp), %r13, %r12)
+ mulx( -40,(dp), %rbx, %rax)
+ add %r14, %rcx
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add -32(up), %r9
+ mulx( -32,(dp), %r9, %r14)
+ adc -24(up), %rcx
+ mov %rcx, -24(up)
+ mulx( -24,(dp), %r11, %r10)
+ adc %r13, -16(up)
+ mulx( -16,(dp), %r13, %r12)
+ adc %rbx, -8(up)
+ adc %rax, %r9
+ mulx( -8,(dp), %rbx, %rax)
+ adc %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %r8, %rdx C dinv
+ mulx( %rcx, %rdx, %r12) C next quotient
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o8)
+ jmp L(ret)
+
+L(out0):mov dn, i
+ .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (dp,dn,8),%r9,%r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(dp,dn,8),%r11,%r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(dp,dn,8),%r13,%r12
+	clc				C enter the loop with CF clear
+ jmp L(lo0)
+
+ ALIGN(16)
+L(top0):add %r9, (up,i,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8
+ adc %r11, 8(up,i,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10
+ adc %r13, 16(up,i,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12
+ adc %rbx, 24(up,i,8)
+ adc %rax, %r9
+L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, i
+ js L(top0)
+
+ mov (%rsp), %rdx C dinv
+	.byte	0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(out0)
+ jmp L(ret)
+
+L(cx1): test $2, R8(dn_param)
+ jnz L(b3)
+
+C =============================================================================
+L(b1): cmp $-1, dn
+ jnz L(gt1)
+
+ mov 24(up), %r9
+L(o1): mulx( -8,(dp), %rbx, %rdx)
+ add %r9, %rbx
+ adc %rbp, %rdx
+ add 32(up), %rdx
+ setc R8(%rbp)
+ mov %rdx, %r9
+ mulx( %r8, %rdx, %r12) C next quotient
+ lea 8(up), up
+ dec un
+ jne L(o1)
+ mov %r9, 24(up)
+ jmp L(ret)
+
+L(gt1): cmp $-5, dn
+ jnz L(out1)
+
+L(o5): mulx( -40,(dp), %rbx, %rax)
+ mulx( -32,(dp), %r9, %r14)
+ mulx( -24,(dp), %r11, %r10)
+ mulx( -16,(dp), %r13, %r12)
+ add -8(up), %rbx
+ adc %rax, %r9
+ mulx( -8,(dp), %rbx, %rax)
+ adc %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add (up), %r9
+ mov %r9, (up)
+ mov %r8, %rdx C dinv
+ mulx( %r9, %rdx, %r12) C next quotient
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o5)
+ jmp L(ret)
+
+L(out1):lea 1(dn), i
+ .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (dp,dn,8),%rbx,%rax
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%r9,%r8
+ .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(dp,dn,8),%r11,%r10
+ clc
+ jmp L(lo1)
+
+ ALIGN(16)
+L(top1):add %r9, (up,i,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8
+ adc %r11, 8(up,i,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10
+ adc %r13, 16(up,i,8)
+L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12
+ adc %rbx, 24(up,i,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, i
+ js L(top1)
+
+ mov (%rsp), %rdx C dinv
+ .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(out1)
+ jmp L(ret)
+
+C =============================================================================
+L(b2): cmp $-2, dn
+ jnz L(gt2)
+
+ mov 16(up), %r10
+ mov 24(up), %r9
+L(o2): mulx( -16,(dp), %r13, %r12)
+ mulx( -8,(dp), %rbx, %rax)
+ add %r12, %rbx
+ adc $0, %rax
+ add %r10, %r13 C 0 add just to produce carry
+ mov %r9, %r10 C 1
+ adc %rbx, %r10 C 1
+ mov %r8, %rdx
+ mulx( %r10, %rdx, %r12) C next quotient
+ adc %rbp, %rax C 2
+ setc R8(%rbp) C 3
+ mov 32(up), %r9 C 2
+ add %rax, %r9 C 2
+ adc $0, R32(%rbp) C 3
+ lea 8(up), up
+ dec un
+ jne L(o2)
+ mov %r10, 16(up)
+ mov %r9, 24(up)
+ jmp L(ret)
+
+L(gt2): cmp $-6, dn
+ jnz L(out2)
+
+L(o6): mulx( -48,(dp), %r13, %r12)
+ mulx( -40,(dp), %rcx, %rax)
+ add %r12, %rcx
+ adc $0, %rax
+ mulx( -32,(dp), %r9, %r14)
+ mulx( -24,(dp), %r11, %r10)
+ add -16(up), %r13
+ mulx( -16,(dp), %r13, %r12)
+ adc -8(up), %rcx
+ mov %rcx, -8(up)
+ adc %rax, %r9
+ mulx( -8,(dp), %rbx, %rax)
+ adc %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %r8, %rdx C dinv
+ mulx( %rcx, %rdx, %r12) C next quotient
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o6)
+ jmp L(ret)
+
+L(out2):lea 2(dn), i
+ .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (dp,dn,8),%r13,%r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%rbx,%rax
+ add %r12, %rbx
+ adc $0, %rax
+ .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%r9,%r8
+ jmp L(lo2)
+
+ ALIGN(16)
+L(top2):add %r9, (up,i,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8
+ adc %r11, 8(up,i,8)
+L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10
+ adc %r13, 16(up,i,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12
+ adc %rbx, 24(up,i,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, i
+ js L(top2)
+
+ mov (%rsp), %rdx C dinv
+ .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(out2)
+ jmp L(ret)
+
+C =============================================================================
+L(b3): cmp $-3, dn
+ jnz L(gt3)
+
+ mov 8(up), %r14
+ mov 16(up), %r9
+ mov 24(up), %rcx
+L(o3): mulx( -24,(dp), %r11, %r10)
+ mulx( -16,(dp), %r13, %r12)
+ mulx( -8,(dp), %rbx, %rax)
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add %r14, %r11
+ mov %r9, %r14
+ adc %r13, %r14
+ mov %rcx, %r9
+ mov %r8, %rdx C dinv
+ mulx( %r14, %rdx, %r12) C next quotient
+ adc %rbx, %r9
+ adc %rbp, %rax
+ setc R8(%rbp)
+ mov 32(up), %rcx
+ add %rax, %rcx
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o3)
+ mov %r14, 8(up)
+ mov %r9, 16(up)
+ mov %rcx, 24(up)
+ jmp L(ret)
+
+L(gt3): cmp $-7, dn
+ jnz L(out3)
+
+L(o7): mulx( -56,(dp), %r11, %r10)
+ mulx( -48,(dp), %rcx, %r12)
+ mulx( -40,(dp), %rbx, %rax)
+ add %r10, %rcx
+ adc %r12, %rbx
+ adc $0, %rax
+ mulx( -32,(dp), %r9, %r14)
+ add -24(up), %r11
+ mulx( -24,(dp), %r11, %r10)
+ adc -16(up), %rcx
+ mov %rcx, -16(up)
+ mulx( -16,(dp), %r13, %r12)
+ adc %rbx, -8(up)
+ adc %rax, %r9
+ mulx( -8,(dp), %rbx, %rax)
+ adc %r14, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %r8, %rdx C dinv
+ mulx( %rcx, %rdx, %r12) C next quotient
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(o7)
+ jmp L(ret)
+
+L(out3):lea 3(dn), i
+ .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (dp,dn,8),%r11,%r10
+ .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(dp,dn,8),%r13,%r12
+ .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%rbx,%rax
+ add %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top3):add %r9, (up,i,8)
+L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8
+ adc %r11, 8(up,i,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10
+ adc %r13, 16(up,i,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12
+ adc %rbx, 24(up,i,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, i
+ js L(top3)
+
+ mov (%rsp), %rdx C dinv
+ .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12
+ add %r9, (up)
+ adc %r11, 8(up)
+ adc %r13, 16(up)
+ adc %rbx, 24(up)
+ adc %rbp, %rax
+ setc R8(%rbp)
+ add %rax, 32(up)
+ adc $0, R32(%rbp)
+ lea 8(up), up
+ dec un
+ jne L(out3)
+
+L(ret): mov %rbp, %rax
+ pop %rsi C dummy dealloc
+ pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm
new file mode 100644
index 0000000..a7c6127
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm
@@ -0,0 +1,482 @@
+dnl AMD64 mpn_sqr_basecase optimised for AMD Zen.
+
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Do overlapped software pipelining. This should close the remaining gap to
+C mul_basecase.
+C
+C * Update un just once in the outer loop.
+C
+C * Perhaps keep un and n pre-multiplied by 8, thus suppressing ",8" from
+C loads and stores. At least in some cases, the non-scaled form is faster.
+C
+C * Optimise xit3 code, e.g., using shrx and sarx like in the main loop.
+C
+C * The mul_1 feed-in code has received little attention and could probably
+C   be improved.  Perhaps even expand it into 4 separate loops to allow
+C   straight fall-through into the 4 addmul_1 loops.
+C
+C * Clean up ad-hoc scratch register usage in the addmul_1 feed-in code blocks.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+
+define(`un', `%rbp')
+define(`n', `%rcx')
+
+C these are used only by the small-operand code
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+
+ cmp $2, R32(un_param)
+ jae L(gt1)
+
+ mov (up), %rdx
+ mulx( %rdx, %rax, %rdx)
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): jne L(gt2)
+
+ mov (up), %rdx
+ mov 8(up), %rcx
+ mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
+ mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
+ mov %rcx, %rdx
+ mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
+ add %r9, %r9 C W 1
+ adc %r10, %r10 C W 2
+ adc $0, %rdx C W 3
+ add %r9, %r8 C W 1
+ adc %r11, %r10 C W 2
+ adc $0, %rdx C W 3
+ mov %rax, (rp)
+ mov %r8, 8(rp)
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+ FUNC_EXIT()
+ ret
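+
+C The W-column comments above track this word layout (sketch assuming 64-bit
+C limbs and unsigned __int128; hypothetical ref_sqr2):
+C
+C	/* (u1*B + u0)^2 = u1^2*B^2 + 2*u0*u1*B + u0^2 */
+C	void
+C	ref_sqr2 (mp_limb_t rp[4], const mp_limb_t up[2])
+C	{
+C	  unsigned __int128 d0 = (unsigned __int128) up[0] * up[0]; /* W 0 1 */
+C	  unsigned __int128 cr = (unsigned __int128) up[0] * up[1]; /* W 1 2 */
+C	  unsigned __int128 d1 = (unsigned __int128) up[1] * up[1]; /* W 2 3 */
+C	  unsigned __int128 t;
+C	  rp[0] = (mp_limb_t) d0;
+C	  t = (d0 >> 64) + (mp_limb_t) cr + (mp_limb_t) cr;  /* cross twice */
+C	  rp[1] = (mp_limb_t) t;
+C	  t = (t >> 64) + (mp_limb_t) d1
+C	      + (mp_limb_t) (cr >> 64) + (mp_limb_t) (cr >> 64);
+C	  rp[2] = (mp_limb_t) t;
+C	  rp[3] = (mp_limb_t) ((t >> 64) + (mp_limb_t) (d1 >> 64));
+C	}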
+
+L(gt2): cmp $4, R32(un_param)
+ jae L(gt3)
+
+ push %rbx
+ mov (up), %rdx
+ mulx( 8,(up), w2, w3)
+ mulx( 16,(up), w0, w1)
+ add w3, w0
+ mov 8(up), %rdx
+ mulx( 16,(up), %rax, w3)
+ adc %rax, w1
+ adc $0, w3
+	test	R32(%rbx), R32(%rbx)	C clear CF and OF for the adcx/adox chains
+ mov (up), %rdx
+ mulx( %rdx, %rbx, %rcx)
+ mov %rbx, (rp)
+ mov 8(up), %rdx
+ mulx( %rdx, %rax, %rbx)
+ mov 16(up), %rdx
+ mulx( %rdx, %rsi, %rdx)
+ adcx( w2, w2)
+ adcx( w0, w0)
+ adcx( w1, w1)
+ adcx( w3, w3)
+ adox( w2, %rcx)
+ adox( w0, %rax)
+ adox( w1, %rbx)
+ adox( w3, %rsi)
+ mov $0, R32(%r8)
+ adox( %r8, %rdx)
+ adcx( %r8, %rdx)
+ mov %rcx, 8(rp)
+ mov %rax, 16(rp)
+ mov %rbx, 24(rp)
+ mov %rsi, 32(rp)
+ mov %rdx, 40(rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
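+
+C The un = 3 path above runs two independent carry chains: an adcx (CF)
+C chain doubles the off-diagonal products while an adox (OF) chain folds
+C them into the diagonal squares.  mulx writes no flags, so once the test
+C of %rbx has cleared CF and OF the two chains never interfere.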
+
+L(gt3): push %r15
+C push %r14
+ push %r13
+ push %r12
+ push %rbp
+ push %rbx
+ mov R32(un_param), R32(un)
+
+ mov (up), %rdx C up[0]
+ mov 8(up), %r9 C up[1]
+
+ mulx( %rdx, %rax, %r15) C up[0]^2
+ mov %rax, (rp)
+ shl %rdx
+
+ lea (up,un,8), up
+ lea -32(rp,un,8), rp
+
+ neg un
+ lea 4(un), n
+ and $-4, n
+
+ test $1, R8(un)
+ jnz L(mx0)
+L(mx1): test $2, R8(un)
+ jz L(mb3)
+
+L(mb1): mulx( %r9, %rbx, %rax)
+ .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x10 C mulx 16(up,un,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x18 C mulx 24(up,un,8), %r11, %r10
+ add %r15, %rbx
+ jmp L(mlo1)
+
+L(mb3): mulx( %r9, %r11, %r10)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x10 C mulx 16(up,un,8), %r13, %r12
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x18 C mulx 24(up,un,8), %rbx, %rax
+ add %r15, %r11
+ jrcxz L(n4)
+ jmp L(mlo3)
+L(n4): mov %r11, 8(rp)
+ adc %r10, %r13
+ adc %r12, %rbx
+ jmp L(m)
+
+L(mx0): test $2, R8(un)
+ jnz L(mb0)
+
+L(mb2): mulx( %r9, %r13, %r12)
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10 C mulx 16(up,un,8), %rbx, %rax
+ .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x18 C mulx 24(up,un,8), %r9, %r8
+ add %r15, %r13
+ jmp L(mlo2)
+
+L(mb0): mulx( %r9, %r9, %r8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x10 C mulx 16(up,un,8), %r11, %r10
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x18 C mulx 24(up,un,8), %r13, %r12
+ add %r15, %r9
+ jmp L(mlo0)
+
+ ALIGN(16)
+L(mtop):jrcxz L(mend)
+ adc %r8, %r11
+ mov %r9, (rp,n,8)
+L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r10, %r13
+ mov %r11, 8(rp,n,8)
+L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r12, %rbx
+ mov %r13, 16(rp,n,8)
+L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rax, %r9
+ mov %rbx, 24(rp,n,8)
+L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ lea 4(n), n
+ jmp L(mtop)
+
+L(mend):mov %r9, (rp)
+ adc %r8, %r11
+ mov %r11, 8(rp)
+ adc %r10, %r13
+ mov %r13, 16(rp)
+ adc %r12, %rbx
+ adc $0, %rax
+ mov %rbx, 24(rp)
+ mov %rax, 32(rp)
+
+ lea 2(un), un
+
+	mov	$63, R32(%r15)		C shift count for shrx/sarx below; keep at 63.
+ test $1, R8(un)
+ jz L(x0)
+L(x1): test $2, R8(un)
+ jz L(f3)
+ jmp L(f1)
+L(x0): test $2, R8(un)
+ jz L(f0)
+C jmp L(f2)
+
+L(f2): mov -8(up,un,8), %rdx C up[0]
+ lea 2(un), n
+ lea 8(rp), rp
+ .byte 0xc4,0x62,0x82,0xf7,0x5c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r11
+ .byte 0xc4,0x62,0x83,0xf7,0x6c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r13
+ and %rdx, %r11 C "ci" in C code
+ mulx( %rdx, %rax, %r10) C up[0]^2
+ lea (%r13,%rdx,2), %rdx C "u0" arg in C code
+ add %rax, %r11
+
+ .byte 0xc4,0x62,0x93,0xf6,0x24,0xee C mulx (up,un,8), %r13, %r12
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x08 C mulx 8(up,un,8), %rbx, %rax
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ jmp L(b2)
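+
+C Roughly, in C terms the sarx/shrx lead-in above computes (k indexing the
+C current diagonal limb; cf. the "ci" and "u0" notes, which refer to GMP's
+C reference C code):
+C
+C	ci = (up[k-1] >> 63) ? up[k] : 0;	/* sarx mask + and */
+C	u0 = 2*up[k] + (up[k-1] >> 63);		/* shrx + lea, mod B */
+C
+C i.e., the multiplier is the limb shifted left by one with the bit shifted
+C out of the previous limb shifted in, and ci corrects for that previous top
+C bit having been set; the accumulator is seeded with ci plus the low half
+C of up[k]^2.  The f1/f0/f3 blocks below do the same with rotated registers.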
+
+ ALIGN(16)
+L(top2):add %r9, (rp,n,8)
+L(b2): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(top2)
+
+ inc un
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+
+L(f1): mov -8(up,un,8), %rdx C up[0]
+ lea 1(un), n
+ lea 8(rp), rp
+ .byte 0xc4,0x62,0x82,0xf7,0x6c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r13
+ .byte 0xc4,0xe2,0x83,0xf7,0x5c,0xee,0xf0 C shrx %r15, -16(up,un,8), %rbx
+ and %rdx, %r13 C "ci" in C code
+ mulx( %rdx, %rax, %r12) C up[0]^2
+ lea (%rbx,%rdx,2), %rdx C "u0" arg in C code
+ add %rax, %r13
+
+ .byte 0xc4,0xe2,0xe3,0xf6,0x04,0xee C mulx (up,un,8), %rbx, %rax
+ adc %r12, %rbx
+ adc $0, %rax
+ .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x08 C mulx 8(up,un,8), %r9, %r8
+ jmp L(b1)
+
+ ALIGN(16)
+L(top1):add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+L(b1): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(top1)
+
+ inc un
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+
+L(f0): mov -8(up,un,8), %rdx C up[0]
+ lea (un), n
+ lea 8(rp), rp
+ .byte 0xc4,0xe2,0x82,0xf7,0x5c,0xee,0xf0 C sarx %r15, -16(up,un,8), %rbx
+ .byte 0xc4,0x62,0x83,0xf7,0x4c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r9
+ and %rdx, %rbx C "ci" in C code
+ mulx( %rdx, %r10, %rax) C up[0]^2
+ lea (%r9,%rdx,2), %rdx C "u0" arg in C code
+ add %r10, %rbx
+ adc $0, %rax C "cin" in C code
+
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,un,8), %r9, %r8
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x08 C mulx 8(up,un,8), %r11, %r10
+ jmp L(b0)
+
+ ALIGN(16)
+L(top0):add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+L(b0): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(top0)
+
+ inc un
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+
+L(f3): mov -8(up,un,8), %rdx C up[0]
+ lea 3(un), n
+ lea 8(rp), rp
+ .byte 0xc4,0x62,0x82,0xf7,0x4c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r9
+ .byte 0xc4,0x62,0x83,0xf7,0x5c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r11
+ and %rdx, %r9 C "ci" in C code
+ mulx( %rdx, %rax, %r8) C up[0]^2
+ lea (%r11,%rdx,2), %rdx C "u0" arg in C code
+ add %rax, %r9
+
+	.byte	0xc4,0x62,0xa3,0xf6,0x14,0xee		C mulx (up,un,8), %r11, %r10
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xee,0x08	C mulx 8(up,un,8), %r13, %r12
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10	C mulx 16(up,un,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ jrcxz L(xit3)
+ jmp L(top3) C FIXME perhaps fall through
+
+ ALIGN(16)
+L(top3):add %r9, (rp,n,8)
+ .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8
+ adc %r11, 8(rp,n,8)
+ .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10
+ adc %r13, 16(rp,n,8)
+ .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12
+ adc %rbx, 24(rp,n,8)
+ adc %rax, %r9
+ .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax
+ adc %r8, %r11
+ adc %r10, %r13
+ adc %r12, %rbx
+ adc $0, %rax
+ add $4, n
+ jnz L(top3)
+
+ inc un
+ add %r9, (rp)
+ adc %r11, 8(rp)
+ adc %r13, 16(rp)
+ adc %rbx, 24(rp)
+ adc $0, %rax
+ mov %rax, 32(rp)
+ jmp L(f2)
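+
+C The four addmul_1 entry points chain into one another: f3 jumps back to
+C f2 above, and each of f2, f1 and f0 falls straight through to the next,
+C every pass advancing one diagonal limb, until f3's jrcxz takes the
+C L(xit3) exit into the final-corner code at L(m).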
+
+
+L(xit3):add %r9, (rp)
+ adc %r11, 8(rp)
+ adc 16(rp), %r13
+ adc 24(rp), %rbx
+L(m): adc $0, %rax
+ mov %rax, 32(rp)
+ mov -24(up), %rdx C FIXME: CSE
+ mov -32(up), %r9 C FIXME: CSE
+ sar $63, %r9
+ and %rdx, %r9
+ add %r13, %r9
+ mulx( %rdx, %rax, %r10)
+ mov -16(up), %r8 C FIXME: CSE
+ adc $0, %r10
+ add %rax, %r9
+ adc $0, %r10
+ mov %r9, 16(rp)
+ mov -32(up), %rax
+ shl %rax
+ adc %rdx, %rdx
+ mulx( %r8, %r13, %r12)
+ mulx( -8,(up), %r11, %rax) C FIXME: CSE
+ add %r10, %r13
+ adc %r12, %r11
+ adc $0, %rax
+ add %rbx, %r13
+ mov %r13, 24(rp)
+ adc 32(rp), %r11
+ adc $0, %rax
+ mov -16(up), %rdx C FIXME: CSE
+ mov -8(up), %r8 C FIXME: CSE
+ mov -24(up), %r9
+ sar $63, %r9
+ and %rdx, %r9
+ add %r11, %r9
+ mulx( %rdx, %rbp, %r10)
+ adc $0, %r10
+ add %rbp, %r9
+ adc $0, %r10
+ mov %r9, 32(rp)
+ mov -24(up), %rbp
+ shl %rbp
+ adc %rdx, %rdx
+ mulx( %r8, %rbx, %rbp)
+ add %r10, %rbx
+ adc $0, %rbp
+ adc %rbx, %rax
+ mov %rax, 40(rp)
+ adc $0, %rbp
+ mov -8(up), %rdx C FIXME: CSE
+ mov -16(up), %r9 C FIXME: CSE
+ sar $63, %r9
+ and %rdx, %r9
+ add %rbp, %r9
+ mulx( %rdx, %rbp, %r10)
+ adc $0, %r10
+ add %rbp, %r9
+ adc $0, %r10
+ mov %r9, 48(rp)
+ mov %r10, 56(rp)
+
+ pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+C pop %r14
+ pop %r15
+
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm
new file mode 100644
index 0000000..00f6dc9
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sublsh1_n, mpn_sublsh1_nc.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc)
+include_mpn(`x86_64/atom/sublsh1_n.asm')