path: root/gmp-6.3.0/mpn/x86_64/k8
author     Duncan Wilkie <antigravityd@gmail.com>    2023-11-18 06:11:09 -0600
committer  Duncan Wilkie <antigravityd@gmail.com>    2023-11-18 06:11:09 -0600
commit     11da511c784eca003deb90c23570f0873954e0de (patch)
tree       e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/x86_64/k8
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/k8')
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm    153
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm           195
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm          217
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm           179
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm      249
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h           237
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm       469
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm     436
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm    559
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/redc_1.asm             591
-rw-r--r--  gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm       807
11 files changed, 4092 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm b/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm
new file mode 100644
index 0000000..3e1898b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm
@@ -0,0 +1,153 @@
+dnl AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.
+
+dnl Copyright 2008, 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.167
+C AMD K10 2.167
+C Intel P4 12.0
+C Intel core2 4.0
+C Intel corei ?
+C Intel atom ?
+C VIA nano ?
+
+C TODO
+C * Perhaps handle various n mod 3 sizes better. The code now is too large.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`bp_param', `%rdx')
+define(`n', `%rcx')
+define(`u0', `%r8')
+define(`v0', `%r9')
+
+
+define(`bp', `%rbp')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addaddmul_1msb0)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ push %rbp
+
+ lea (ap,n,8), ap
+ lea (bp_param,n,8), bp
+ lea (rp,n,8), rp
+ neg n
+
+ mov (ap,n,8), %rax
+ mul %r8
+ mov %rax, %r11
+ mov (bp,n,8), %rax
+ mov %rdx, %r10
+ add $3, n
+ jns L(end)
+
+ push %r13
+
+ ALIGN(16)
+L(top): mul %r9
+ add %rax, %r11
+ mov -16(ap,n,8), %rax
+ adc %rdx, %r10
+ mov %r11, -24(rp,n,8)
+ mul %r8
+ add %rax, %r10
+ mov -16(bp,n,8), %rax
+ mov $0, R32(%r13)
+ adc %rdx, %r13
+ mul %r9
+ add %rax, %r10
+ mov -8(ap,n,8), %rax
+ adc %rdx, %r13
+ mov %r10, -16(rp,n,8)
+ mul %r8
+ add %rax, %r13
+ mov -8(bp,n,8), %rax
+ mov $0, R32(%r11)
+ adc %rdx, %r11
+ mul %r9
+ add %rax, %r13
+ adc %rdx, %r11
+ mov (ap,n,8), %rax
+ mul %r8
+ add %rax, %r11
+ mov %r13, -8(rp,n,8)
+ mov (bp,n,8), %rax
+ mov $0, R32(%r10)
+ adc %rdx, %r10
+ add $3, n
+ js L(top)
+
+ pop %r13
+
+L(end): mul %r9
+ add %rax, %r11
+ adc %rdx, %r10
+ cmp $1, R32(n)
+ ja L(two)
+ mov -16(ap,n,8), %rax
+ mov %r11, -24(rp,n,8)
+ mov %r10, %r11
+ jz L(one)
+
+L(nul): mul %r8
+ add %rax, %r10
+ mov -16(bp), %rax
+ mov $0, R32(%r11)
+ adc %rdx, %r11
+ mul %r9
+ add %rax, %r10
+ mov -8(ap), %rax
+ adc %rdx, %r11
+ mov %r10, -16(rp)
+L(one): mul %r8
+ add %rax, %r11
+ mov -8(bp), %rax
+ mov $0, R32(%r10)
+ adc %rdx, %r10
+ mul %r9
+ add %rax, %r11
+ adc %rdx, %r10
+
+L(two): mov %r11, -8(rp)
+ mov %r10, %rax
+L(ret): pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
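
For reference, the operation implemented by mpn_addaddmul_1msb0 above is R = A*u + B*v over n limbs, where the restriction u, v < 2^63 guarantees that the running carry always fits in a single limb. A minimal C sketch of those semantics (not GMP's code; it assumes 64-bit limbs and a compiler providing unsigned __int128):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* Store the low n limbs of A*u + B*v at rp and return the high (n-th) limb. */
limb ref_addaddmul_1msb0(limb *rp, const limb *ap, const limb *bp,
                         size_t n, limb u, limb v)      /* u, v < 2^63 */
{
    limb carry = 0;                 /* stays below 2^64 because u + v < 2^64 */
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128)ap[i] * u
                            + (unsigned __int128)bp[i] * v
                            + carry;
        rp[i]  = (limb)t;
        carry  = (limb)(t >> 64);
    }
    return carry;
}

The assembly reaches 2.167 cycles/limb by unrolling this loop three limbs per iteration and interleaving the two multiply chains.
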
diff --git a/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm
new file mode 100644
index 0000000..78bcba1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm
@@ -0,0 +1,195 @@
+dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
+dnl add the result to a third limb vector.
+
+dnl Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cfg cycles/limb am1+am1
+C AMD K8,K9 2.375
+C AMD K10 2.375
+C AMD bull 5.2 <- 4.6-4.75 bad
+C AMD pile 4.96 <- 4.6-4.75 bad
+C AMD steam ?
+C AMD excavator ?
+C AMD bobcat 5.75 5.0 bad
+C AMD jaguar 5.9 5.2-5.4 bad
+C Intel P4 15-16
+C Intel core2 4.5 4.25-4.5 bad
+C Intel NHM 4.33 4.55 bad
+C Intel SBR 3.4 2.93 3.24 bad
+C Intel IBR 3.35 2.6 2.95 bad
+C Intel HWL 3.3 2.15 2.3 bad
+C Intel BWL 2.33 2.33 1.65 bad
+C Intel SKL 2.37 2.21 1.64 bad
+C Intel atom 20 18.7
+C Intel SLM 8 8.5
+C VIA nano 4.4
+
+C This code is the result of running a code generation and optimization tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Tune feed-in and wind-down code.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param',`%rdx')
+define(`vp', `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addmul_2)
+ FUNC_ENTRY(4)
+ mov n_param, n
+ push %rbx
+ push %rbp
+
+ mov 0(vp), v0
+ mov 8(vp), v1
+
+ mov R32(n_param), R32(%rbx)
+ mov (up), %rax
+ lea -8(up,n_param,8), up
+ lea -8(rp,n_param,8), rp
+ mul v0
+ neg n
+ and $3, R32(%rbx)
+ jz L(b0)
+ cmp $2, R32(%rbx)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): mov %rax, w1
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ mov 8(up,n,8), %rax
+ dec n
+ jmp L(lo3)
+
+L(b2): mov %rax, w2
+ mov 8(up,n,8), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ add $-2, n
+ jmp L(lo2)
+
+L(b1): mov %rax, w3
+ mov 8(up,n,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ inc n
+ jmp L(lo1)
+
+L(b0): mov $0, R32(w3)
+ mov %rax, w0
+ mov 8(up,n,8), %rax
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ jmp L(lo0)
+
+ ALIGN(32)
+L(top): mov $0, R32(w1)
+ mul v0
+ add %rax, w3
+ mov (up,n,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(lo1): mul v1
+ add w3, (rp,n,8)
+ mov $0, R32(w3)
+ adc %rax, w0
+ mov $0, R32(w2)
+ mov 8(up,n,8), %rax
+ adc %rdx, w1
+ mul v0
+ add %rax, w0
+ mov 8(up,n,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(lo0): mul v1
+ add w0, 8(rp,n,8)
+ adc %rax, w1
+ adc %rdx, w2
+ mov 16(up,n,8), %rax
+ mul v0
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 16(up,n,8), %rax
+L(lo3): mul v1
+ add w1, 16(rp,n,8)
+ adc %rax, w2
+ adc %rdx, w3
+ xor R32(w0), R32(w0)
+ mov 24(up,n,8), %rax
+ mul v0
+ add %rax, w2
+ mov 24(up,n,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+L(lo2): mul v1
+ add w2, 24(rp,n,8)
+ adc %rax, w3
+ adc %rdx, w0
+ mov 32(up,n,8), %rax
+ add $4, n
+ js L(top)
+
+L(end): xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w3
+ mov (up), %rax
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w3, (rp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(rp)
+ mov w1, %rax
+
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
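
A rough C model of mpn_addmul_2 above: it adds {up,n} * {vp,2} to {rp,n}, stores the low n+1 limbs of the sum at rp[0..n] (the wind-down code writes rp[n] rather than accumulating into it) and returns the top limb. Sketch only, not GMP's code; 64-bit limbs and unsigned __int128 assumed:

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

limb ref_addmul_2(limb *rp, const limb *up, size_t n, const limb *vp)
{
    limb c0 = 0, c1 = 0;                  /* two-limb carry between iterations */
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 lo = (unsigned __int128)up[i] * vp[0] + rp[i] + c0;
        unsigned __int128 hi = (unsigned __int128)up[i] * vp[1]
                             + (limb)(lo >> 64) + c1;
        rp[i] = (limb)lo;
        c0 = (limb)hi;                    /* value belonging at position i+1 */
        c1 = (limb)(hi >> 64);            /* value belonging at position i+2 */
    }
    rp[n] = c0;
    return c1;
}

Handling two limbs of vp per pass is what lets the assembly amortize the read-modify-write of rp and reach 2.375 cycles/limb on K8.
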
diff --git a/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
new file mode 100644
index 0000000..ff3a184
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
@@ -0,0 +1,217 @@
+dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
+
+dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.87 < 3.85 for lshift + add_n
+C AMD K10 2.75 < 3.85 for lshift + add_n
+C Intel P4 22 > 7.33 for lshift + add_n
+C Intel core2 4.1 > 3.27 for lshift + add_n
+C Intel NHM 4.4 > 3.75 for lshift + add_n
+C Intel SBR 3.17 < 3.46 for lshift + add_n
+C Intel atom ? ? 8.75 for lshift + add_n
+C VIA nano 4.7 < 6.25 for lshift + add_n
+
+C TODO
+C * Can we propagate carry into rdx instead of using a special carry register?
+C That could save enough insns to get to 10 cycles/iteration.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n_param', `%rcx')
+define(`cnt', `%r8')
+
+define(`vp', `%r12')
+define(`n', `%rbp')
+
+ifdef(`OPERATION_addlsh_n',`
+ define(ADDSUB, `add')
+ define(ADCSBB, `adc')
+ define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+ define(ADDSUB, `sub')
+ define(ADCSBB, `sbb')
+ define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov (vp_param), %rax C load first V limb early
+
+ mov $0, R32(n)
+ sub n_param, n
+
+ lea -16(up,n_param,8), up
+ lea -16(rp,n_param,8), rp
+ lea 16(vp_param,n_param,8), vp
+
+ mov n_param, %r9
+
+ mov %r8, %rcx
+ mov $1, R32(%r8)
+ shl R8(%rcx), %r8
+
+ mul %r8 C initial multiply
+
+ and $3, R32(%r9)
+ jz L(b0)
+ cmp $2, R32(%r9)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): mov %rax, %r11
+ ADDSUB 16(up,n,8), %r11
+ mov -8(vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ mov %rdx, %rbx
+ mul %r8
+ or %rax, %rbx
+ mov (vp,n,8), %rax
+ mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ add $3, n
+ jnz L(lo3)
+ jmp L(cj3)
+
+L(b2): mov %rax, %rbx
+ mov -8(vp,n,8), %rax
+ mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ add $2, n
+ jz L(cj2)
+ mov %rdx, %r10
+ mov -16(vp,n,8), %rax
+ mul %r8
+ or %rax, %r10
+ xor R32(%rcx), R32(%rcx) C clear carry register
+ jmp L(lo2)
+
+L(b1): mov %rax, %r9
+ mov %rdx, %r10
+ add $1, n
+ jnz L(gt1)
+ ADDSUB 8(up,n,8), %r9
+ jmp L(cj1)
+L(gt1): mov -16(vp,n,8), %rax
+ mul %r8
+ or %rax, %r10
+ mov %rdx, %r11
+ mov -8(vp,n,8), %rax
+ mul %r8
+ or %rax, %r11
+ ADDSUB 8(up,n,8), %r9
+ ADCSBB 16(up,n,8), %r10
+ ADCSBB 24(up,n,8), %r11
+ mov (vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ jmp L(lo1)
+
+L(b0): mov %rax, %r10
+ mov %rdx, %r11
+ mov -8(vp,n,8), %rax
+ mul %r8
+ or %rax, %r11
+ ADDSUB 16(up,n,8), %r10
+ ADCSBB 24(up,n,8), %r11
+ mov (vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ mov %rdx, %rbx
+ mul %r8
+ or %rax, %rbx
+ mov 8(vp,n,8), %rax
+ add $4, n
+ jz L(end)
+
+ ALIGN(8)
+L(top): mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ mov %r10, -16(rp,n,8)
+L(lo3): mov %rdx, %r10
+ mov -16(vp,n,8), %rax
+ mul %r8
+ or %rax, %r10
+ mov %r11, -8(rp,n,8)
+L(lo2): mov %rdx, %r11
+ mov -8(vp,n,8), %rax
+ mul %r8
+ or %rax, %r11
+ add R32(%rcx), R32(%rcx)
+ ADCSBB (up,n,8), %rbx
+ ADCSBB 8(up,n,8), %r9
+ ADCSBB 16(up,n,8), %r10
+ ADCSBB 24(up,n,8), %r11
+ mov (vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ mov %rbx, (rp,n,8)
+L(lo1): mov %rdx, %rbx
+ mul %r8
+ or %rax, %rbx
+ mov %r9, 8(rp,n,8)
+L(lo0): mov 8(vp,n,8), %rax
+ add $4, n
+ jnz L(top)
+
+L(end): mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ mov %r10, -16(rp,n,8)
+L(cj3): mov %r11, -8(rp,n,8)
+L(cj2): add R32(%rcx), R32(%rcx)
+ ADCSBB (up,n,8), %rbx
+ ADCSBB 8(up,n,8), %r9
+ mov %rbx, (rp,n,8)
+L(cj1): mov %r9, 8(rp,n,8)
+ mov %rdx, %rax
+ ADCSBB $0, %rax
+ pop %rbx
+ pop %rbp
+ pop %r12
+ FUNC_EXIT()
+ ret
+EPILOGUE()
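
The addlsh_n flavour generated above computes {rp,n} = {up,n} + ({vp,n} << cnt) and returns the carry limb; rsblsh_n is the same skeleton with the ADDSUB/ADCSBB macros expanding to sub/sbb instead of add/adc, giving ({vp,n} << cnt) - {up,n}. A minimal C sketch of the addlsh_n case (not GMP's code; 64-bit limbs and 0 < cnt < 64 assumed):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

limb ref_addlsh_n(limb *rp, const limb *up, const limb *vp,
                  size_t n, unsigned cnt)
{
    limb shift_in = 0, carry = 0;
    for (size_t i = 0; i < n; i++) {
        limb shifted = (vp[i] << cnt) | shift_in;   /* limb i of vp << cnt */
        shift_in = vp[i] >> (64 - cnt);             /* bits moving up one limb */
        unsigned __int128 t = (unsigned __int128)up[i] + shifted + carry;
        rp[i]  = (limb)t;
        carry  = (limb)(t >> 64);
    }
    return shift_in + carry;   /* bits shifted out of vp[n-1] plus the add carry */
}

Note that the assembly never shifts: it sets %r8 = 2^cnt once and multiplies each vp limb by it, stitching the 128-bit products together with OR, which is why the fused routine beats lshift followed by add_n on K8/K10.
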
diff --git a/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm
new file mode 100644
index 0000000..1172b0d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm
@@ -0,0 +1,179 @@
+dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor,
+dnl returning quotient only.
+
+dnl Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm/unorm
+C AMD K8,K9 10 +
+C AMD K10 10 +
+C AMD bull 13.7 -
+C AMD pile 13.7 +
+C AMD steam
+C AMD excavator
+C AMD bobcat 15 -
+C AMD jaguar 16 -
+C Intel P4 33 =
+C Intel core2 13.25 =
+C Intel NHM 14 =
+C Intel SBR 8.5 -
+C Intel IBR 8.5 -
+C Intel HWL 8 =
+C Intel BWL 8 =
+C Intel SKL 8 =
+C Intel atom 42 --
+C Intel SLM 20.4 --
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`d', `%rcx')
+define(`di', `%r8') C just mpn_pi1_bdiv_q_1
+define(`ncnt', `%r9') C just mpn_pi1_bdiv_q_1
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+ FUNC_ENTRY(4)
+ push %rbx
+
+ mov %rcx, %rax
+ xor R32(%rcx), R32(%rcx) C ncnt count
+ mov %rdx, %r10
+
+ bt $0, R32(%rax)
+ jnc L(evn) C skip bsf unless divisor is even
+
+L(odd): mov %rax, %rbx
+ shr R32(%rax)
+ and $127, R32(%rax) C d/2, 7 bits
+
+ LEA( binvert_limb_table, %rdx)
+
+ movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
+
+ mov %rbx, %r11 C d without twos
+
+ lea (%rax,%rax), R32(%rdx) C 2*inv
+ imul R32(%rax), R32(%rax) C inv*inv
+ imul R32(%rbx), R32(%rax) C inv*inv*d
+ sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
+
+ lea (%rdx,%rdx), R32(%rax) C 2*inv
+ imul R32(%rdx), R32(%rdx) C inv*inv
+ imul R32(%rbx), R32(%rdx) C inv*inv*d
+ sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
+
+ lea (%rax,%rax), %r8 C 2*inv
+ imul %rax, %rax C inv*inv
+ imul %rbx, %rax C inv*inv*d
+ sub %rax, %r8 C inv = 2*inv - inv*inv*d, 64 bits
+
+ jmp L(pi1)
+
+L(evn): bsf %rax, %rcx
+ shr R8(%rcx), %rax
+ jmp L(odd)
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ push %rbx
+
+ mov %rcx, %r11 C d
+ mov %rdx, %r10 C n
+ mov %r9, %rcx C ncnt
+
+L(pi1): mov (up), %rax C up[0]
+
+ dec %r10
+ jz L(one)
+
+ mov 8(up), %rdx C up[1]
+ lea (up,%r10,8), up C up end
+ lea (rp,%r10,8), rp C rp end
+ neg %r10 C -n
+
+ shrd R8(%rcx), %rdx, %rax
+
+ xor R32(%rbx), R32(%rbx)
+ jmp L(ent)
+
+ ALIGN(8)
+L(top):
+ C rax q
+ C rbx carry bit, 0 or 1
+ C rcx ncnt
+ C rdx
+ C r10 counter, limbs, negative
+ C r11 d
+
+ mul %r11 C carry limb in rdx
+ mov (up,%r10,8), %rax
+ mov 8(up,%r10,8), %r9
+ shrd R8(%rcx), %r9, %rax
+ nop
+ sub %rbx, %rax C apply carry bit
+ setc R8(%rbx)
+ sub %rdx, %rax C apply carry limb
+ adc $0, R32(%rbx)
+L(ent): imul %r8, %rax
+ mov %rax, (rp,%r10,8)
+ inc %r10
+ jnz L(top)
+
+ mul %r11 C carry limb in rdx
+ mov (up), %rax C up high limb
+ shr R8(%rcx), %rax
+ sub %rbx, %rax C apply carry bit
+ sub %rdx, %rax C apply carry limb
+ imul %r8, %rax
+ mov %rax, (rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+L(one): shr R8(%rcx), %rax
+ imul %r8, %rax
+ mov %rax, (rp)
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
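
mpn_bdiv_q_1 above performs Hensel (2-adic) division: it produces q with q*d == {up,n} (mod B^n), which is the exact quotient whenever d divides the dividend. The limb inverse is bootstrapped from an 8-bit entry of binvert_limb_table and then doubled in precision three times with the step inv = 2*inv - inv*inv*d. Below is a C sketch of the odd-divisor case (the code above additionally strips an even divisor's trailing zero bits with bsf/shrd); the table lookup is replaced here by a 3-bit self-starting seed, so five doublings are needed instead of three. Sketch only, not GMP's code; 64-bit limbs and unsigned __int128 assumed:

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* q such that q*d == {up,n} (mod 2^(64*n)), for odd d. */
void ref_bdiv_q_1_odd(limb *qp, const limb *up, size_t n, limb d)
{
    limb x = d;                    /* d*d == 1 (mod 8), so 3 low bits are correct */
    for (int i = 0; i < 5; i++)    /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
        x = 2*x - x*x*d;           /* Newton step, wraps mod 2^64 as intended */

    limb borrow = 0;
    for (size_t i = 0; i < n; i++) {
        limb u     = up[i] - borrow;        /* current low limb of the dividend */
        limb extra = up[i] < borrow;        /* record the wrap, if any */
        limb q     = u * x;                 /* q*d == u (mod 2^64) */
        qp[i]      = q;
        unsigned __int128 t = (unsigned __int128)q * d;
        borrow = (limb)(t >> 64) + extra;   /* limb to subtract at position i+1 */
    }
}
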
diff --git a/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm b/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm
new file mode 100644
index 0000000..86de08c
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm
@@ -0,0 +1,249 @@
+dnl x86-64 mpn_div_qr_1n_pi1
+dnl -- Divide an mpn number by a normalized single-limb number,
+dnl using a single-limb inverse.
+
+dnl Contributed to the GNU project by Niels Möller
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C c/l
+C AMD K8,K9 11
+C AMD K10 11
+C AMD bull 16
+C AMD pile 14.25
+C AMD steam ?
+C AMD bobcat 16
+C AMD jaguar ?
+C Intel P4 47.5 poor
+C Intel core 28.5 very poor
+C Intel NHM 29 very poor
+C Intel SBR 16 poor
+C Intel IBR 13.5
+C Intel HWL 12
+C Intel BWL ?
+C Intel atom 53 very poor
+C VIA nano 19
+
+
+C INPUT Parameters
+define(`QP', `%rdi')
+define(`UP', `%rsi')
+define(`UN_INPUT', `%rdx')
+define(`U1', `%rcx') C Also in %rax
+define(`D', `%r8')
+define(`DINV', `%r9')
+
+C Invariants
+define(`B2', `%rbp')
+define(`B2md', `%rbx')
+
+C Variables
+define(`UN', `%r8') C Overlaps D input
+define(`T', `%r10')
+define(`U0', `%r11')
+define(`U2', `%r12')
+define(`Q0', `%r13')
+define(`Q1', `%r14')
+define(`Q2', `%r15')
+
+ABI_SUPPORT(STD64)
+
+ ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_div_qr_1n_pi1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+ dec UN_INPUT
+ jnz L(first)
+
+ C Just a single 2/1 division.
+ C T, U0 are allocated in scratch registers
+ lea 1(U1), T
+ mov U1, %rax
+ mul DINV
+ mov (UP), U0
+ add U0, %rax
+ adc T, %rdx
+ mov %rdx, T
+ imul D, %rdx
+ sub %rdx, U0
+ cmp U0, %rax
+ lea (U0, D), %rax
+ cmovnc U0, %rax
+ sbb $0, T
+ cmp D, %rax
+ jc L(single_div_done)
+ sub D, %rax
+ add $1, T
+L(single_div_done):
+ mov T, (QP)
+ FUNC_EXIT()
+ ret
+L(first):
+ C FIXME: Could delay some of these until we enter the loop.
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %rbx
+ push %rbp
+
+ mov D, B2
+ imul DINV, B2
+ neg B2
+ mov B2, B2md
+ sub D, B2md
+
+ C D not needed until final reduction
+ push D
+ mov UN_INPUT, UN C Clobbers D
+
+ mov DINV, %rax
+ mul U1
+ mov %rax, Q0
+ add U1, %rdx
+ mov %rdx, T
+
+ mov B2, %rax
+ mul U1
+ mov -8(UP, UN, 8), U0
+ mov (UP, UN, 8), U1
+ mov T, (QP, UN, 8)
+ add %rax, U0
+ adc %rdx, U1
+ sbb U2, U2
+ dec UN
+ mov U1, %rax
+ jz L(final)
+ mov $0, R32(Q1)
+
+ ALIGN(16)
+
+ C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
+ C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
+ C is zero, and carry holds an extra copy of U2.
+L(loop):
+ C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
+ C Remains to add in B (U1 + c)
+ cmovc DINV, Q1
+ mov U2, Q2
+ neg Q2
+ mul DINV
+ add %rdx, Q1
+ adc $0, Q2
+ add Q0, Q1
+ mov %rax, Q0
+ mov B2, %rax
+ lea (B2md, U0), T
+ adc $0, Q2
+
+ C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
+ mul U1
+ and B2, U2
+ add U2, U0
+ cmovnc U0, T
+
+ C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
+ adc U1, Q1
+ mov -8(UP, UN, 8), U0
+ adc Q2, 8(QP, UN, 8)
+ jc L(q_incr)
+L(q_incr_done):
+ add %rax, U0
+ mov T, %rax
+ adc %rdx, %rax
+ mov Q1, (QP, UN, 8)
+ mov $0, R32(Q1)
+ sbb U2, U2
+ dec UN
+ mov %rax, U1
+ jnz L(loop)
+
+L(final):
+ pop D
+
+ mov U2, Q1
+ and D, U2
+ sub U2, %rax
+ neg Q1
+
+ mov %rax, U1
+ sub D, %rax
+ cmovc U1, %rax
+ sbb $-1, Q1
+
+ lea 1(%rax), T
+ mul DINV
+ add U0, %rax
+ adc T, %rdx
+ mov %rdx, T
+ imul D, %rdx
+ sub %rdx, U0
+ cmp U0, %rax
+ lea (U0, D), %rax
+ cmovnc U0, %rax
+ sbb $0, T
+ cmp D, %rax
+ jc L(div_done)
+ sub D, %rax
+ add $1, T
+L(div_done):
+ add T, Q0
+ mov Q0, (QP)
+ adc Q1, 8(QP)
+ jnc L(done)
+L(final_q_incr):
+ addq $1, 16(QP)
+ lea 8(QP), QP
+ jc L(final_q_incr)
+
+L(done):
+ pop %rbp
+ pop %rbx
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+ FUNC_EXIT()
+ ret
+
+L(q_incr):
+ C U1 is not live, so use it for indexing
+ lea 16(QP, UN, 8), U1
+L(q_incr_loop):
+ addq $1, (U1)
+ jnc L(q_incr_done)
+ lea 8(U1), U1
+ jmp L(q_incr_loop)
+EPILOGUE()
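
Both the single-limb case at the top of the routine and its final reduction use the classic 2/1 division with a precomputed reciprocal (Möller and Granlund, "Improved division by invariant integers"): d is normalized (top bit set) and DINV = floor((B^2-1)/d) - B. A C sketch of that one step, assuming u1 < d, 64-bit limbs and unsigned __int128:

#include <stdint.h>

typedef uint64_t limb;

/* Divide u1*B + u0 by normalized d, given dinv = floor((B^2-1)/d) - B.
   Returns the quotient and stores the remainder. */
limb ref_div_2by1_pi1(limb *rem, limb u1, limb u0, limb d, limb dinv)
{
    unsigned __int128 p = (unsigned __int128)u1 * dinv;
    limb q0 = (limb)p + u0;
    limb q1 = (limb)(p >> 64) + u1 + 1 + (q0 < u0);   /* the +1 is folded in */
    limb r  = u0 - q1 * d;                            /* wraps mod 2^64 */
    if (r > q0) {              /* speculative quotient was one too large */
        q1 -= 1;
        r  += d;
    }
    if (r >= d) {              /* rare: one too small */
        q1 += 1;
        r  -= d;
    }
    *rem = r;
    return q1;
}

The main loop above is a more heavily pipelined variant of the same idea, targeting about 10 cycles per limb on K8 as the comment before L(loop) notes.
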
diff --git a/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h
new file mode 100644
index 0000000..d87cc3b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h
@@ -0,0 +1,237 @@
+/* AMD K8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#if 0
+#undef mpn_sublsh_n
+#define mpn_sublsh_n(rp,up,vp,n,c) \
+ (((rp) == (up)) ? mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c)) \
+ : MPN(mpn_sublsh_n)(rp,up,vp,n,c))
+#endif
+
+/* 2500 MHz K8 Brisbane */
+/* FFT tuning limit = 115,768,433 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 35
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1_NORM_THRESHOLD 1
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 16
+
+#define DIV_1_VS_MUL_1_PERCENT 309
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 81
+#define MUL_TOOM44_THRESHOLD 232
+#define MUL_TOOM6H_THRESHOLD 324
+#define MUL_TOOM8H_THRESHOLD 478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 160
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 34
+#define SQR_TOOM3_THRESHOLD 114
+#define SQR_TOOM4_THRESHOLD 336
+#define SQR_TOOM6_THRESHOLD 430
+#define SQR_TOOM8_THRESHOLD 0 /* always */
+
+#define MULMID_TOOM42_THRESHOLD 36
+
+#define MULMOD_BNM1_THRESHOLD 17
+#define SQRMOD_BNM1_THRESHOLD 19
+
+#define MUL_FFT_MODF_THRESHOLD 654 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 654, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 27, 7}, { 14, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \
+ { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 7}, { 44, 8}, { 23, 7}, { 47, 8}, \
+ { 25, 7}, { 51, 8}, { 31, 7}, { 63, 8}, \
+ { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
+ { 53, 9}, { 27, 8}, { 57, 9}, { 31, 8}, \
+ { 67, 9}, { 35, 8}, { 71, 9}, { 39, 8}, \
+ { 81, 9}, { 43,10}, { 23, 9}, { 55, 8}, \
+ { 111,10}, { 31, 9}, { 71,10}, { 39, 9}, \
+ { 87,10}, { 47, 9}, { 99,10}, { 55, 9}, \
+ { 111,11}, { 31,10}, { 63, 9}, { 131,10}, \
+ { 71, 9}, { 147,10}, { 87,11}, { 47,10}, \
+ { 111,11}, { 63,10}, { 143,11}, { 79,10}, \
+ { 167,11}, { 95,10}, { 199,11}, { 111,12}, \
+ { 63,11}, { 143,10}, { 287,11}, { 159,12}, \
+ { 95,11}, { 191,10}, { 383,11}, { 207,10}, \
+ { 415,13}, { 63,12}, { 127,11}, { 255,10}, \
+ { 511,11}, { 271,10}, { 543,11}, { 287,12}, \
+ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \
+ { 671,11}, { 351,12}, { 191,11}, { 415,12}, \
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \
+ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \
+ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \
+ { 415,11}, { 831,12}, { 447,11}, { 895,12}, \
+ { 479,14}, { 127,13}, { 255,12}, { 543,11}, \
+ { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \
+ { 319,12}, { 735,13}, { 383,12}, { 831,13}, \
+ { 447,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \
+ { 1279,13}, { 703,12}, { 1407,14}, { 383,13}, \
+ { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \
+ { 959,15}, { 255,14}, { 511,13}, { 1215,14}, \
+ { 639,13}, { 1471,14}, { 767,13}, { 1663,14}, \
+ { 895,13}, { 1855,15}, { 511,14}, { 1023,13}, \
+ { 2047,14}, { 1151,13}, { 2367,14}, { 1407,15}, \
+ { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \
+ { 2303,15}, { 1279,14}, { 2687,15}, { 1535,14}, \
+ { 3199,15}, { 1791,16}, { 1023,15}, { 2047,14}, \
+ { 4223,15}, { 2303,14}, { 4735,15}, { 2559,16}, \
+ { 1535,15}, { 3071,14}, { 6271,15}, { 3327,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 183
+#define MUL_FFT_THRESHOLD 11520
+
+#define SQR_FFT_MODF_THRESHOLD 540 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 540, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \
+ { 31, 7}, { 16, 6}, { 33, 7}, { 33, 8}, \
+ { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \
+ { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \
+ { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \
+ { 51, 9}, { 27, 8}, { 55, 9}, { 31, 8}, \
+ { 65, 9}, { 35, 8}, { 71, 9}, { 43,10}, \
+ { 23, 9}, { 55,10}, { 31, 9}, { 71,10}, \
+ { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \
+ { 55, 9}, { 111,11}, { 31,10}, { 63, 9}, \
+ { 127,10}, { 87,11}, { 47,10}, { 111,12}, \
+ { 31,11}, { 63,10}, { 143,11}, { 79,10}, \
+ { 167,11}, { 95,10}, { 191,11}, { 111,12}, \
+ { 63,11}, { 127, 9}, { 511,11}, { 143,10}, \
+ { 287, 9}, { 575,11}, { 159,12}, { 95,11}, \
+ { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \
+ { 415,13}, { 63,12}, { 127,10}, { 511, 9}, \
+ { 1023,11}, { 271,10}, { 543, 9}, { 1087,11}, \
+ { 287,10}, { 575,12}, { 159,11}, { 319,10}, \
+ { 639,11}, { 335,10}, { 671,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,12}, { 223,11}, { 447,13}, \
+ { 127,11}, { 511,10}, { 1023,11}, { 543,10}, \
+ { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \
+ { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \
+ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \
+ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \
+ { 447,11}, { 895,14}, { 127,12}, { 511,11}, \
+ { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \
+ { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \
+ { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \
+ { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \
+ { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \
+ { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \
+ { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \
+ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \
+ { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \
+ { 959,14}, { 511,13}, { 1215,14}, { 639,13}, \
+ { 1471,14}, { 767,13}, { 1663,14}, { 895,13}, \
+ { 1791,15}, { 511,14}, { 1023,13}, { 2111,14}, \
+ { 1151,13}, { 2303,14}, { 1407,15}, { 767,14}, \
+ { 1791,16}, { 511,15}, { 1023,14}, { 2303,15}, \
+ { 1279,14}, { 2687,15}, { 1535,14}, { 3199,15}, \
+ { 1791,16}, { 1023,15}, { 2047,14}, { 4223,15}, \
+ { 2303,14}, { 4863,15}, { 2559,16}, { 1535,15}, \
+ { 3071,14}, { 6271,15}, { 3327,17}, { 131072,18}, \
+ { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+ {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 202
+#define SQR_FFT_THRESHOLD 7296
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 61
+#define MULLO_MUL_N_THRESHOLD 22239
+#define SQRLO_BASECASE_THRESHOLD 8
+#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD 14281
+
+#define DC_DIV_QR_THRESHOLD 47
+#define DC_DIVAPPR_Q_THRESHOLD 266
+#define DC_BDIV_QR_THRESHOLD 38
+#define DC_BDIV_Q_THRESHOLD 104
+
+#define INV_MULMOD_BNM1_THRESHOLD 54
+#define INV_NEWTON_THRESHOLD 252
+#define INV_APPR_THRESHOLD 250
+
+#define BINV_NEWTON_THRESHOLD 258
+#define REDC_1_TO_REDC_2_THRESHOLD 35
+#define REDC_2_TO_REDC_N_THRESHOLD 79
+
+#define MU_DIV_QR_THRESHOLD 2089
+#define MU_DIVAPPR_Q_THRESHOLD 1895
+#define MUPI_DIV_QR_THRESHOLD 99
+#define MU_BDIV_QR_THRESHOLD 1787
+#define MU_BDIV_Q_THRESHOLD 1895
+
+#define POWM_SEC_TABLE 1,16,194,960,2825
+
+#define GET_STR_DC_THRESHOLD 16
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 248
+#define SET_STR_PRECOMPUTE_THRESHOLD 1747
+
+#define FAC_DSC_THRESHOLD 1240
+#define FAC_ODD_THRESHOLD 27
+
+#define MATRIX22_STRASSEN_THRESHOLD 21
+#define HGCD2_DIV1_METHOD 3 /* 4.10% faster than 5 */
+#define HGCD_THRESHOLD 141
+#define HGCD_APPR_THRESHOLD 181
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 622
+#define GCDEXT_DC_THRESHOLD 496
+#define JACOBI_BASE_METHOD 1 /* 0.97% faster than 3 */
+
+/* Tuneup completed successfully, took 131832 seconds */
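
The #if 0 block near the top of this header notes that mpn_sublsh_n(rp,up,vp,n,c) can fall back on mpn_submul_1(rp,vp,n,2^c) when rp == up, since subtracting {vp,n} shifted left by c bits in place is the same as subtracting {vp,n}*2^c. A minimal C sketch of that submul_1 operation (semantics only, not GMP's code; 64-bit limbs and unsigned __int128 assumed):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* {rp,n} -= {vp,n} * m; returns the borrow limb (high limb of the product
   plus the borrow out of the subtraction). */
limb ref_submul_1(limb *rp, const limb *vp, size_t n, limb m)
{
    limb borrow = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128)vp[i] * m + borrow;
        limb lo = (limb)t;
        borrow  = (limb)(t >> 64) + (rp[i] < lo);   /* borrow out of this limb */
        rp[i]  -= lo;
    }
    return borrow;
}
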
diff --git a/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm
new file mode 100644
index 0000000..ca2efb9
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm
@@ -0,0 +1,469 @@
+dnl AMD64 mpn_mul_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey.
+
+dnl Copyright 2008, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.375
+C AMD K10 2.375
+C Intel P4 15-16
+C Intel core2 4.45
+C Intel corei 4.35
+C Intel atom ?
+C VIA nano 4.5
+
+C The inner loops of this code are the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Use fewer registers. (how??? I can't see it -- david)
+C * Avoid some "mov $0,r" and instead use "xor r,r".
+C * Can the top of each L(addmul_outer_n) prologue be folded into the
+C mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
+C case where vn = 1 or 2; is it worth it?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp', `%rcx')
+define(`vn', `%r8')
+
+define(`v0', `%r12')
+define(`v1', `%r9')
+
+define(`w0', `%rbx')
+define(`w1', `%r15')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+define(`n', `%r11')
+define(`outer_addr', `%r14')
+define(`un', `%r13')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ xor R32(un), R32(un)
+ mov (up), %rax
+ mov (vp), v0
+
+ sub un_param, un C rdx used by mul
+ mov un, n
+ mov R32(un_param), R32(w0)
+
+ lea (rp,un_param,8), rp
+ lea (up,un_param,8), up
+
+ mul v0
+
+ test $1, R8(vn)
+ jz L(mul_2)
+
+C ===========================================================
+C mul_1 for vp[0] if vn is odd
+
+L(mul_1):
+ and $3, R32(w0)
+ jz L(mul_1_prologue_0)
+ cmp $2, R32(w0)
+ jc L(mul_1_prologue_1)
+ jz L(mul_1_prologue_2)
+
+L(mul_1_prologue_3):
+ add $-1, n
+ lea L(addmul_outer_3)(%rip), outer_addr
+ mov %rax, w3
+ mov %rdx, w0
+ jmp L(mul_1_entry_3)
+
+L(mul_1_prologue_0):
+ mov %rax, w2
+ mov %rdx, w3 C note: already w0 == 0
+ lea L(addmul_outer_0)(%rip), outer_addr
+ jmp L(mul_1_entry_0)
+
+L(mul_1_prologue_1):
+ cmp $-1, un
+ jne 2f
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ jmp L(ret)
+2: add $1, n
+ lea L(addmul_outer_1)(%rip), outer_addr
+ mov %rax, w1
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ mov (up,n,8), %rax
+ jmp L(mul_1_entry_1)
+
+L(mul_1_prologue_2):
+ add $-2, n
+ lea L(addmul_outer_2)(%rip), outer_addr
+ mov %rax, w0
+ mov %rdx, w1
+ mov 24(up,n,8), %rax
+ xor R32(w2), R32(w2)
+ xor R32(w3), R32(w3)
+ jmp L(mul_1_entry_2)
+
+
+ C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments
+
+ ALIGN(16)
+L(mul_1_top):
+ mov w0, -16(rp,n,8)
+ add %rax, w1
+ mov (up,n,8), %rax
+ adc %rdx, w2
+L(mul_1_entry_1):
+ xor R32(w0), R32(w0)
+ mul v0
+ mov w1, -8(rp,n,8)
+ add %rax, w2
+ adc %rdx, w3
+L(mul_1_entry_0):
+ mov 8(up,n,8), %rax
+ mul v0
+ mov w2, (rp,n,8)
+ add %rax, w3
+ adc %rdx, w0
+L(mul_1_entry_3):
+ mov 16(up,n,8), %rax
+ mul v0
+ mov w3, 8(rp,n,8)
+ xor R32(w2), R32(w2) C zero
+ mov w2, w3 C zero
+ add %rax, w0
+ mov 24(up,n,8), %rax
+ mov w2, w1 C zero
+ adc %rdx, w1
+L(mul_1_entry_2):
+ mul v0
+ add $4, n
+ js L(mul_1_top)
+
+ mov w0, -16(rp)
+ add %rax, w1
+ mov w1, -8(rp)
+ adc %rdx, w2
+ mov w2, (rp)
+
+ add $-1, vn C vn -= 1
+ jz L(ret)
+
+ mov 8(vp), v0
+ mov 16(vp), v1
+
+ lea 8(vp), vp C vp += 1
+ lea 8(rp), rp C rp += 1
+
+ jmp *outer_addr
+
+C ===========================================================
+C mul_2 for vp[0], vp[1] if vn is even
+
+ ALIGN(16)
+L(mul_2):
+ mov 8(vp), v1
+
+ and $3, R32(w0)
+ jz L(mul_2_prologue_0)
+ cmp $2, R32(w0)
+ jz L(mul_2_prologue_2)
+ jc L(mul_2_prologue_1)
+
+L(mul_2_prologue_3):
+ lea L(addmul_outer_3)(%rip), outer_addr
+ add $2, n
+ mov %rax, -16(rp,n,8)
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ xor R32(w0), R32(w0)
+ mov -16(up,n,8), %rax
+ jmp L(mul_2_entry_3)
+
+ ALIGN(16)
+L(mul_2_prologue_0):
+ add $3, n
+ mov %rax, w0
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ mov -24(up,n,8), %rax
+ lea L(addmul_outer_0)(%rip), outer_addr
+ jmp L(mul_2_entry_0)
+
+ ALIGN(16)
+L(mul_2_prologue_1):
+ mov %rax, w3
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ lea L(addmul_outer_1)(%rip), outer_addr
+ jmp L(mul_2_entry_1)
+
+ ALIGN(16)
+L(mul_2_prologue_2):
+ add $1, n
+ lea L(addmul_outer_2)(%rip), outer_addr
+ mov $0, R32(w0)
+ mov $0, R32(w1)
+ mov %rax, w2
+ mov -8(up,n,8), %rax
+ mov %rdx, w3
+ jmp L(mul_2_entry_2)
+
+ C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments
+
+ ALIGN(16)
+L(mul_2_top):
+ mov -32(up,n,8), %rax
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov -24(up,n,8), %rax
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,n,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(mul_2_entry_0):
+ mul v1
+ add %rax, w1
+ mov w0, -24(rp,n,8)
+ adc %rdx, w2
+ mov -16(up,n,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov -16(up,n,8), %rax
+ adc $0, R32(w3)
+ mov $0, R32(w0)
+ mov w1, -16(rp,n,8)
+L(mul_2_entry_3):
+ mul v1
+ add %rax, w2
+ mov -8(up,n,8), %rax
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov -8(up,n,8), %rax
+ adc %rdx, w3
+ adc R32(w1), R32(w0) C adc $0, w0
+L(mul_2_entry_2):
+ mul v1
+ add %rax, w3
+ mov w2, -8(rp,n,8)
+ adc %rdx, w0
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(mul_2_entry_1):
+ add $4, n
+ mov w3, -32(rp,n,8)
+ js L(mul_2_top)
+
+ mov -32(up,n,8), %rax C FIXME: n is constant
+ mul v1
+ add %rax, w0
+ mov w0, (rp)
+ adc %rdx, w1
+ mov w1, 8(rp)
+
+ add $-2, vn C vn -= 2
+ jz L(ret)
+
+ mov 16(vp), v0
+ mov 24(vp), v1
+
+ lea 16(vp), vp C vp += 2
+ lea 16(rp), rp C rp += 2
+
+ jmp *outer_addr
+
+
+C ===========================================================
+C addmul_2 for remaining vp's
+
+ C in the following prologues, we reuse un to store the
+ C adjusted value of n that is reloaded on each iteration
+
+L(addmul_outer_0):
+ add $3, un
+ lea 0(%rip), outer_addr
+
+ mov un, n
+ mov -24(up,un,8), %rax
+ mul v0
+ mov %rax, w0
+ mov -24(up,un,8), %rax
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ jmp L(addmul_entry_0)
+
+L(addmul_outer_1):
+ mov un, n
+ mov (up,un,8), %rax
+ mul v0
+ mov %rax, w3
+ mov (up,un,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ jmp L(addmul_entry_1)
+
+L(addmul_outer_2):
+ add $1, un
+ lea 0(%rip), outer_addr
+
+ mov un, n
+ mov -8(up,un,8), %rax
+ mul v0
+ xor R32(w0), R32(w0)
+ mov %rax, w2
+ xor R32(w1), R32(w1)
+ mov %rdx, w3
+ mov -8(up,un,8), %rax
+ jmp L(addmul_entry_2)
+
+L(addmul_outer_3):
+ add $2, un
+ lea 0(%rip), outer_addr
+
+ mov un, n
+ mov -16(up,un,8), %rax
+ xor R32(w3), R32(w3)
+ mul v0
+ mov %rax, w1
+ mov -16(up,un,8), %rax
+ mov %rdx, w2
+ jmp L(addmul_entry_3)
+
+ C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments
+
+ ALIGN(16)
+L(addmul_top):
+ add w3, -32(rp,n,8)
+ adc %rax, w0
+ mov -24(up,n,8), %rax
+ adc %rdx, w1
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,n,8), %rax
+ adc %rdx, w1
+ adc R32(w2), R32(w2) C adc $0, w2
+L(addmul_entry_0):
+ mul v1
+ xor R32(w3), R32(w3)
+ add w0, -24(rp,n,8)
+ adc %rax, w1
+ mov -16(up,n,8), %rax
+ adc %rdx, w2
+ mul v0
+ add %rax, w1
+ mov -16(up,n,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+L(addmul_entry_3):
+ mul v1
+ add w1, -16(rp,n,8)
+ adc %rax, w2
+ mov -8(up,n,8), %rax
+ adc %rdx, w3
+ mul v0
+ xor R32(w0), R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mov -8(up,n,8), %rax
+ adc R32(w1), R32(w0) C adc $0, w0
+L(addmul_entry_2):
+ mul v1
+ add w2, -8(rp,n,8)
+ adc %rax, w3
+ adc %rdx, w0
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w3
+ mov (up,n,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(addmul_entry_1):
+ mul v1
+ add $4, n
+ js L(addmul_top)
+
+ add w3, -8(rp)
+ adc %rax, w0
+ mov w0, (rp)
+ adc %rdx, w1
+ mov w1, 8(rp)
+
+ add $-2, vn C vn -= 2
+ jz L(ret)
+
+ lea 16(rp), rp C rp += 2
+ lea 16(vp), vp C vp += 2
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ jmp *outer_addr
+
+ ALIGN(16)
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+EPILOGUE()
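
mpn_mul_basecase above is schoolbook multiplication: {rp, un+vn} = {up,un} * {vp,vn} with un >= vn >= 1. The assembly processes one or two limbs of vp per outer pass (a mul_1/mul_2 feed-in followed by addmul_2 rows) to reach 2.375 cycles/limb; the semantics, as a plain C sketch (64-bit limbs, unsigned __int128 assumed, rp not overlapping the inputs):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

void ref_mul_basecase(limb *rp, const limb *up, size_t un,
                      const limb *vp, size_t vn)
{
    limb carry = 0;
    for (size_t i = 0; i < un; i++) {        /* first row: rp = up * vp[0] */
        unsigned __int128 t = (unsigned __int128)up[i] * vp[0] + carry;
        rp[i] = (limb)t;
        carry = (limb)(t >> 64);
    }
    rp[un] = carry;

    for (size_t j = 1; j < vn; j++) {        /* remaining rows: add up * vp[j] at offset j */
        carry = 0;
        for (size_t i = 0; i < un; i++) {
            unsigned __int128 t = (unsigned __int128)up[i] * vp[j]
                                + rp[i + j] + carry;
            rp[i + j] = (limb)t;
            carry = (limb)(t >> 64);
        }
        rp[un + j] = carry;
    }
}
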
diff --git a/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm
new file mode 100644
index 0000000..fa00f42
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm
@@ -0,0 +1,436 @@
+dnl AMD64 mpn_mullo_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C NOTES
+C * There is a major stupidity in that we call mpn_mul_1 initially, for a
+C large trip count. Instead, we should start with mul_2 for any operand
+C size congruence class.
+C * Stop iterating addmul_2 earlier, falling into straight-line triangle code
+C for the last 2-3 iterations.
+C * Perhaps implement n=4 special code.
+C * The reload of the outer loop jump address hurts branch prediction.
+C * The addmul_2 loop ends with an MUL whose high part is not used upon loop
+C exit.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`vp', `%r11')
+define(`outer_addr', `%r8')
+define(`j', `%r9')
+define(`v0', `%r13')
+define(`v1', `%r14')
+define(`w0', `%rbx')
+define(`w1', `%r15')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mullo_basecase)
+ FUNC_ENTRY(4)
+ cmp $4, n
+ jge L(gen)
+ mov (up), %rax C u0
+ mov (vp_param), %r8 C v0
+
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',
+` movslq (%r9,%rcx,4), %r10
+ add %r10, %r9
+ jmp *%r9
+',`
+ jmp *(%r9,n,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(tab), L(tab)) C not allowed
+ JMPENT( L(1), L(tab)) C 1
+ JMPENT( L(2), L(tab)) C 2
+ JMPENT( L(3), L(tab)) C 3
+dnl JMPENT( L(0m4), L(tab)) C 4
+dnl JMPENT( L(1m4), L(tab)) C 5
+dnl JMPENT( L(2m4), L(tab)) C 6
+dnl JMPENT( L(3m4), L(tab)) C 7
+dnl JMPENT( L(0m4), L(tab)) C 8
+dnl JMPENT( L(1m4), L(tab)) C 9
+dnl JMPENT( L(2m4), L(tab)) C 10
+dnl JMPENT( L(3m4), L(tab)) C 11
+ TEXT
+
+L(1): imul %r8, %rax
+ mov %rax, (rp)
+ FUNC_EXIT()
+ ret
+
+L(2): mov 8(vp_param), %r11
+ imul %rax, %r11 C u0 x v1
+ mul %r8 C u0 x v0
+ mov %rax, (rp)
+ imul 8(up), %r8 C u1 x v0
+ lea (%r11, %rdx), %rax
+ add %r8, %rax
+ mov %rax, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(3): mov 8(vp_param), %r9 C v1
+ mov 16(vp_param), %r11
+ mul %r8 C u0 x v0 -> <r1,r0>
+ mov %rax, (rp) C r0
+ mov (up), %rax C u0
+ mov %rdx, %rcx C r1
+ mul %r9 C u0 x v1 -> <r2,r1>
+ imul 8(up), %r9 C u1 x v1 -> r2
+ mov 16(up), %r10
+ imul %r8, %r10 C u2 x v0 -> r2
+ add %rax, %rcx
+ adc %rdx, %r9
+ add %r10, %r9
+ mov 8(up), %rax C u1
+ mul %r8 C u1 x v0 -> <r2,r1>
+ add %rax, %rcx
+ adc %rdx, %r9
+ mov %r11, %rax
+ imul (up), %rax C u0 x v2 -> r2
+ add %rax, %r9
+ mov %rcx, 8(rp)
+ mov %r9, 16(rp)
+ FUNC_EXIT()
+ ret
+
+L(0m4):
+L(1m4):
+L(2m4):
+L(3m4):
+L(gen): push %rbx
+ push %rbp
+ push %r13
+ push %r14
+ push %r15
+
+ mov (up), %rax
+ mov (vp_param), v0
+ mov vp_param, vp
+
+ lea (rp,n,8), rp
+ lea (up,n,8), up
+ neg n
+
+ mul v0
+
+ test $1, R8(n)
+ jz L(mul_2)
+
+L(mul_1):
+ lea -8(rp), rp
+ lea -8(up), up
+ test $2, R8(n)
+ jnz L(mul_1_prologue_3)
+
+L(mul_1_prologue_2): C n = 7, 11, 15, ...
+ lea -1(n), j
+ lea L(addmul_outer_1)(%rip), outer_addr
+ mov %rax, w0
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ xor R32(w3), R32(w3)
+ mov 16(up,n,8), %rax
+ jmp L(mul_1_entry_2)
+
+L(mul_1_prologue_3): C n = 5, 9, 13, ...
+ lea 1(n), j
+ lea L(addmul_outer_3)(%rip), outer_addr
+ mov %rax, w2
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ jmp L(mul_1_entry_0)
+
+ ALIGN(16)
+L(mul_1_top):
+ mov w0, -16(rp,j,8)
+ add %rax, w1
+ mov (up,j,8), %rax
+ adc %rdx, w2
+ xor R32(w0), R32(w0)
+ mul v0
+ mov w1, -8(rp,j,8)
+ add %rax, w2
+ adc %rdx, w3
+L(mul_1_entry_0):
+ mov 8(up,j,8), %rax
+ mul v0
+ mov w2, (rp,j,8)
+ add %rax, w3
+ adc %rdx, w0
+ mov 16(up,j,8), %rax
+ mul v0
+ mov w3, 8(rp,j,8)
+ xor R32(w2), R32(w2) C zero
+ mov w2, w3 C zero
+ add %rax, w0
+ mov 24(up,j,8), %rax
+ mov w2, w1 C zero
+ adc %rdx, w1
+L(mul_1_entry_2):
+ mul v0
+ add $4, j
+ js L(mul_1_top)
+
+ mov w0, -16(rp)
+ add %rax, w1
+ mov w1, -8(rp)
+ adc %rdx, w2
+
+ imul (up), v0
+ add v0, w2
+ mov w2, (rp)
+
+ add $1, n
+ jz L(ret)
+
+ mov 8(vp), v0
+ mov 16(vp), v1
+
+ lea 16(up), up
+ lea 8(vp), vp
+ lea 24(rp), rp
+
+ jmp *outer_addr
+
+
+L(mul_2):
+ mov 8(vp), v1
+ test $2, R8(n)
+ jz L(mul_2_prologue_3)
+
+ ALIGN(16)
+L(mul_2_prologue_1):
+ lea 0(n), j
+ mov %rax, w3
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ mov (up,n,8), %rax
+ lea L(addmul_outer_3)(%rip), outer_addr
+ jmp L(mul_2_entry_1)
+
+ ALIGN(16)
+L(mul_2_prologue_3):
+ lea 2(n), j
+ mov $0, R32(w3)
+ mov %rax, w1
+ mov (up,n,8), %rax
+ mov %rdx, w2
+ lea L(addmul_outer_1)(%rip), outer_addr
+ jmp L(mul_2_entry_3)
+
+ ALIGN(16)
+L(mul_2_top):
+ mov -32(up,j,8), %rax
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov -24(up,j,8), %rax
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ mov w0, -24(rp,j,8)
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ adc $0, R32(w3)
+L(mul_2_entry_3):
+ mov $0, R32(w0)
+ mov w1, -16(rp,j,8)
+ mul v1
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ adc R32(w1), R32(w0)
+ mul v1
+ add %rax, w3
+ mov w2, -8(rp,j,8)
+ adc %rdx, w0
+ mov (up,j,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(mul_2_entry_1):
+ add $4, j
+ mov w3, -32(rp,j,8)
+ js L(mul_2_top)
+
+ imul -16(up), v1
+ add v1, w0
+ imul -8(up), v0
+ add v0, w0
+ mov w0, -8(rp)
+
+ add $2, n
+ jz L(ret)
+
+ mov 16(vp), v0
+ mov 24(vp), v1
+
+ lea 16(vp), vp
+ lea 16(rp), rp
+
+ jmp *outer_addr
+
+
+L(addmul_outer_1):
+ lea -2(n), j
+ mov -16(up,n,8), %rax
+ mul v0
+ mov %rax, w3
+ mov -16(up,n,8), %rax
+ mov %rdx, w0
+ xor R32(w1), R32(w1)
+ lea L(addmul_outer_3)(%rip), outer_addr
+ jmp L(addmul_entry_1)
+
+L(addmul_outer_3):
+ lea 0(n), j
+ mov -16(up,n,8), %rax
+ xor R32(w3), R32(w3)
+ mul v0
+ mov %rax, w1
+ mov -16(up,n,8), %rax
+ mov %rdx, w2
+ lea L(addmul_outer_1)(%rip), outer_addr
+ jmp L(addmul_entry_3)
+
+ ALIGN(16)
+L(addmul_top):
+ add w3, -32(rp,j,8)
+ adc %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ adc R32(w2), R32(w2)
+ mul v1
+ xor R32(w3), R32(w3)
+ add w0, -24(rp,j,8)
+ adc %rax, w1
+ mov -16(up,j,8), %rax
+ adc %rdx, w2
+ mul v0
+ add %rax, w1
+ mov -16(up,j,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+L(addmul_entry_3):
+ mul v1
+ add w1, -16(rp,j,8)
+ adc %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ mul v0
+ xor R32(w0), R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mov -8(up,j,8), %rax
+ adc R32(w1), R32(w0)
+ mul v1
+ add w2, -8(rp,j,8)
+ adc %rax, w3
+ adc %rdx, w0
+ mov (up,j,8), %rax
+ mul v0
+ add %rax, w3
+ mov (up,j,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+L(addmul_entry_1):
+ mul v1
+ add $4, j
+ js L(addmul_top)
+
+ add w3, -32(rp)
+ adc %rax, w0
+
+ imul -24(up), v0
+ add v0, w0
+ add w0, -24(rp)
+
+ add $2, n
+ jns L(ret)
+
+ lea 16(vp), vp
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ lea -16(up), up
+
+ jmp *outer_addr
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
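
mpn_mullo_basecase above returns only the low half of the product, {rp,n} = {up,n} * {vp,n} mod B^n, which is why the inner loops can be cut short and why n = 1, 2, 3 are dispatched through the small jump table. A C sketch of the semantics (not GMP's code; 64-bit limbs, unsigned __int128 assumed, rp not overlapping the inputs):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

void ref_mullo_basecase(limb *rp, const limb *up, const limb *vp, size_t n)
{
    for (size_t i = 0; i < n; i++)
        rp[i] = 0;
    for (size_t j = 0; j < n; j++) {
        limb carry = 0;
        for (size_t i = 0; i + j < n; i++) {   /* only columns below n matter */
            unsigned __int128 t = (unsigned __int128)up[i] * vp[j]
                                + rp[i + j] + carry;
            rp[i + j] = (limb)t;
            carry = (limb)(t >> 64);
        }
        /* carries out of column n-1 are simply discarded */
    }
}
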
diff --git a/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm
new file mode 100644
index 0000000..86f1414
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm
@@ -0,0 +1,559 @@
+dnl AMD64 mpn_mulmid_basecase
+
+dnl Contributed by David Harvey.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C K8,K9: 2.375 (2.5 when un - vn is "small")
+C K10: ?
+C P4: ?
+C P6-15: ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param',`%rdx')
+define(`vp_param',`%rcx')
+define(`vn', `%r8')
+
+define(`v0', `%r12')
+define(`v1', `%r9')
+
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+define(`n', `%r11')
+define(`outer_addr', `%r14')
+define(`un', `%r13')
+define(`vp', `%r15')
+
+define(`vp_inner', `%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mulmid_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov vp_param, vp
+
+ C use un for row length (= un_param - vn + 1)
+ lea 1(un_param), un
+ sub vn, un
+
+ lea (rp,un,8), rp
+
+ cmp $4, un C TODO: needs tuning
+ jc L(diagonal)
+
+ lea (up,un_param,8), up
+
+ test $1, vn
+ jz L(mul_2)
+
+C ===========================================================
+C mul_1 for vp[0] if vn is odd
+
+L(mul_1):
+ mov R32(un), R32(w0)
+
+ neg un
+ mov (up,un,8), %rax
+ mov (vp), v0
+ mul v0
+
+ and $-4, un C round down to multiple of 4
+ mov un, n
+
+ and $3, R32(w0)
+ jz L(mul_1_prologue_0)
+ cmp $2, R32(w0)
+ jc L(mul_1_prologue_1)
+ jz L(mul_1_prologue_2)
+
+L(mul_1_prologue_3):
+ mov %rax, w3
+ mov %rdx, w0
+ lea L(addmul_prologue_3)(%rip), outer_addr
+ jmp L(mul_1_entry_3)
+
+ ALIGN(16)
+L(mul_1_prologue_0):
+ mov %rax, w2
+ mov %rdx, w3 C note already w0 == 0
+ lea L(addmul_prologue_0)(%rip), outer_addr
+ jmp L(mul_1_entry_0)
+
+ ALIGN(16)
+L(mul_1_prologue_1):
+ add $4, n
+ mov %rax, w1
+ mov %rdx, w2
+ mov $0, R32(w3)
+ mov (up,n,8), %rax
+ lea L(addmul_prologue_1)(%rip), outer_addr
+ jmp L(mul_1_entry_1)
+
+ ALIGN(16)
+L(mul_1_prologue_2):
+ mov %rax, w0
+ mov %rdx, w1
+ mov 24(up,n,8), %rax
+ mov $0, R32(w2)
+ mov $0, R32(w3)
+ lea L(addmul_prologue_2)(%rip), outer_addr
+ jmp L(mul_1_entry_2)
+
+
+ C this loop is 10 c/loop = 2.5 c/l on K8
+
+ ALIGN(16)
+L(mul_1_top):
+ mov w0, -16(rp,n,8)
+ add %rax, w1
+ mov (up,n,8), %rax
+ adc %rdx, w2
+L(mul_1_entry_1):
+ mov $0, R32(w0)
+ mul v0
+ mov w1, -8(rp,n,8)
+ add %rax, w2
+ adc %rdx, w3
+L(mul_1_entry_0):
+ mov 8(up,n,8), %rax
+ mul v0
+ mov w2, (rp,n,8)
+ add %rax, w3
+ adc %rdx, w0
+L(mul_1_entry_3):
+ mov 16(up,n,8), %rax
+ mul v0
+ mov w3, 8(rp,n,8)
+ mov $0, R32(w2) C zero
+ mov w2, w3 C zero
+ add %rax, w0
+ mov 24(up,n,8), %rax
+ mov w2, w1 C zero
+ adc %rdx, w1
+L(mul_1_entry_2):
+ mul v0
+ add $4, n
+ js L(mul_1_top)
+
+ mov w0, -16(rp)
+ add %rax, w1
+ mov w1, -8(rp)
+ mov w2, 8(rp) C zero last limb of output
+ adc %rdx, w2
+ mov w2, (rp)
+
+ dec vn
+ jz L(ret)
+
+ lea -8(up), up
+ lea 8(vp), vp
+
+ mov un, n
+ mov (vp), v0
+ mov 8(vp), v1
+
+ jmp *outer_addr
+
+C ===========================================================
+C mul_2 for vp[0], vp[1] if vn is even
+
+ ALIGN(16)
+L(mul_2):
+ mov R32(un), R32(w0)
+
+ neg un
+ mov -8(up,un,8), %rax
+ mov (vp), v0
+ mov 8(vp), v1
+ mul v1
+
+ and $-4, un C round down to multiple of 4
+ mov un, n
+
+ and $3, R32(w0)
+ jz L(mul_2_prologue_0)
+ cmp $2, R32(w0)
+ jc L(mul_2_prologue_1)
+ jz L(mul_2_prologue_2)
+
+L(mul_2_prologue_3):
+ mov %rax, w1
+ mov %rdx, w2
+ lea L(addmul_prologue_3)(%rip), outer_addr
+ jmp L(mul_2_entry_3)
+
+ ALIGN(16)
+L(mul_2_prologue_0):
+ mov %rax, w0
+ mov %rdx, w1
+ lea L(addmul_prologue_0)(%rip), outer_addr
+ jmp L(mul_2_entry_0)
+
+ ALIGN(16)
+L(mul_2_prologue_1):
+ mov %rax, w3
+ mov %rdx, w0
+ mov $0, R32(w1)
+ lea L(addmul_prologue_1)(%rip), outer_addr
+ jmp L(mul_2_entry_1)
+
+ ALIGN(16)
+L(mul_2_prologue_2):
+ mov %rax, w2
+ mov %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,n,8), %rax
+ lea L(addmul_prologue_2)(%rip), outer_addr
+ jmp L(mul_2_entry_2)
+
+
+ C this loop is 18 c/loop = 2.25 c/l on K8
+
+ ALIGN(16)
+L(mul_2_top):
+ mov -8(up,n,8), %rax
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+L(mul_2_entry_0):
+ mov $0, R32(w2)
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w0
+ mov (up,n,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ mov w0, (rp,n,8)
+ adc %rdx, w2
+L(mul_2_entry_3):
+ mov 8(up,n,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov $0, R32(w0)
+ adc $0, R32(w3)
+ mov 8(up,n,8), %rax
+ mov w1, 8(rp,n,8)
+ mul v1
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ adc %rdx, w3
+L(mul_2_entry_2):
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov 16(up,n,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ add %rax, w3
+ mov w2, 16(rp,n,8)
+ adc %rdx, w0
+L(mul_2_entry_1):
+ mov 24(up,n,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+ add $4, n
+ mov w3, -8(rp,n,8)
+ jnz L(mul_2_top)
+
+ mov w0, (rp)
+ mov w1, 8(rp)
+
+ sub $2, vn
+ jz L(ret)
+
+ lea 16(vp), vp
+ lea -16(up), up
+
+ mov un, n
+ mov (vp), v0
+ mov 8(vp), v1
+
+ jmp *outer_addr
+
+C ===========================================================
+C addmul_2 for remaining vp's
+
+ ALIGN(16)
+L(addmul_prologue_0):
+ mov -8(up,n,8), %rax
+ mul v1
+ mov %rax, w1
+ mov %rdx, w2
+ mov $0, R32(w3)
+ jmp L(addmul_entry_0)
+
+ ALIGN(16)
+L(addmul_prologue_1):
+ mov 16(up,n,8), %rax
+ mul v1
+ mov %rax, w0
+ mov %rdx, w1
+ mov $0, R32(w2)
+ mov 24(up,n,8), %rax
+ jmp L(addmul_entry_1)
+
+ ALIGN(16)
+L(addmul_prologue_2):
+ mov 8(up,n,8), %rax
+ mul v1
+ mov %rax, w3
+ mov %rdx, w0
+ mov $0, R32(w1)
+ jmp L(addmul_entry_2)
+
+ ALIGN(16)
+L(addmul_prologue_3):
+ mov (up,n,8), %rax
+ mul v1
+ mov %rax, w2
+ mov %rdx, w3
+ mov $0, R32(w0)
+ mov $0, R32(w1)
+ jmp L(addmul_entry_3)
+
+ C this loop is 19 c/loop = 2.375 c/l on K8
+
+ ALIGN(16)
+L(addmul_top):
+ mov $0, R32(w3)
+ add %rax, w0
+ mov -8(up,n,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1
+ add w0, -8(rp,n,8)
+ adc %rax, w1
+ adc %rdx, w2
+L(addmul_entry_0):
+ mov (up,n,8), %rax
+ mul v0
+ add %rax, w1
+ mov (up,n,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mul v1
+ add w1, (rp,n,8)
+ mov $0, R32(w1)
+ adc %rax, w2
+ mov $0, R32(w0)
+ adc %rdx, w3
+L(addmul_entry_3):
+ mov 8(up,n,8), %rax
+ mul v0
+ add %rax, w2
+ mov 8(up,n,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ add w2, 8(rp,n,8)
+ adc %rax, w3
+ adc %rdx, w0
+L(addmul_entry_2):
+ mov 16(up,n,8), %rax
+ mul v0
+ add %rax, w3
+ mov 16(up,n,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add w3, 16(rp,n,8)
+ nop C don't ask...
+ adc %rax, w0
+ mov $0, R32(w2)
+ mov 24(up,n,8), %rax
+ adc %rdx, w1
+L(addmul_entry_1):
+ mul v0
+ add $4, n
+ jnz L(addmul_top)
+
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, R32(w2)
+
+ add w0, -8(rp)
+ adc w1, (rp)
+ adc w2, 8(rp)
+
+ sub $2, vn
+ jz L(ret)
+
+ lea 16(vp), vp
+ lea -16(up), up
+
+ mov un, n
+ mov (vp), v0
+ mov 8(vp), v1
+
+ jmp *outer_addr
+
+C ===========================================================
+C accumulate along diagonals if un - vn is small
+
+ ALIGN(16)
+L(diagonal):
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ xor R32(w2), R32(w2)
+
+ neg un
+
+ mov R32(vn), %eax
+ and $3, %eax
+ jz L(diag_prologue_0)
+ cmp $2, %eax
+ jc L(diag_prologue_1)
+ jz L(diag_prologue_2)
+
+L(diag_prologue_3):
+ lea -8(vp), vp
+ mov vp, vp_inner
+ add $1, vn
+ mov vn, n
+ lea L(diag_entry_3)(%rip), outer_addr
+ jmp L(diag_entry_3)
+
+L(diag_prologue_0):
+ mov vp, vp_inner
+ mov vn, n
+ lea 0(%rip), outer_addr
+ mov -8(up,n,8), %rax
+ jmp L(diag_entry_0)
+
+L(diag_prologue_1):
+ lea 8(vp), vp
+ mov vp, vp_inner
+ add $3, vn
+ mov vn, n
+ lea 0(%rip), outer_addr
+ mov -8(vp_inner), %rax
+ jmp L(diag_entry_1)
+
+L(diag_prologue_2):
+ lea -16(vp), vp
+ mov vp, vp_inner
+ add $2, vn
+ mov vn, n
+ lea 0(%rip), outer_addr
+ mov 16(vp_inner), %rax
+ jmp L(diag_entry_2)
+
+
+ C this loop is 10 c/loop = 2.5 c/l on K8
+
+ ALIGN(16)
+L(diag_top):
+ add %rax, w0
+ adc %rdx, w1
+ mov -8(up,n,8), %rax
+ adc $0, w2
+L(diag_entry_0):
+ mulq (vp_inner)
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, w2
+L(diag_entry_3):
+ mov -16(up,n,8), %rax
+ mulq 8(vp_inner)
+ add %rax, w0
+ mov 16(vp_inner), %rax
+ adc %rdx, w1
+ adc $0, w2
+L(diag_entry_2):
+ mulq -24(up,n,8)
+ add %rax, w0
+ mov 24(vp_inner), %rax
+ adc %rdx, w1
+ lea 32(vp_inner), vp_inner
+ adc $0, w2
+L(diag_entry_1):
+ mulq -32(up,n,8)
+ sub $4, n
+ jnz L(diag_top)
+
+ add %rax, w0
+ adc %rdx, w1
+ adc $0, w2
+
+ mov w0, (rp,un,8)
+
+ inc un
+ jz L(diag_end)
+
+ mov vn, n
+ mov vp, vp_inner
+
+ lea 8(up), up
+ mov w1, w0
+ mov w2, w1
+ xor R32(w2), R32(w2)
+
+ jmp *outer_addr
+
+L(diag_end):
+ mov w1, (rp)
+ mov w2, 8(rp)
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
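The routine above runs one mul_1 or mul_2 pass, then addmul_2 passes re-entered through the computed goto held in outer_addr, and switches to L(diagonal) when un - vn is small. That last path sums the limb products of one anti-diagonal (up index decreasing while vp_inner advances) into the three-limb accumulator w0:w1:w2 via the add %rax,w0 / adc %rdx,w1 / adc $0,w2 chain. A rough portable sketch of that accumulation, assuming 64-bit limbs and a compiler with unsigned __int128; the names diag_sum and limb_t are ours, not GMP's:

#include <stdint.h>

typedef uint64_t limb_t;

/* Sum the k products up[k-1-i] * vp[i] of one anti-diagonal into the
   3-limb accumulator w[0..2], mirroring the carry chain in L(diag_top). */
static void
diag_sum (limb_t w[3], const limb_t *up, const limb_t *vp, long k)
{
  for (long i = 0; i < k; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[k - 1 - i] * vp[i];
      unsigned __int128 s = (unsigned __int128) w[0] + (limb_t) p;
      w[0] = (limb_t) s;                              /* add %rax, w0 */
      s = (unsigned __int128) w[1] + (limb_t) (p >> 64) + (limb_t) (s >> 64);
      w[1] = (limb_t) s;                              /* adc %rdx, w1 */
      w[2] += (limb_t) (s >> 64);                     /* adc $0,   w2 */
    }
}

After each diagonal the asm stores the finished low limb at (rp,un,8), shifts the accumulator down one limb (w1 into w0, w2 into w1, clears w2), and re-enters through outer_addr for the next diagonal.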
diff --git a/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm b/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm
new file mode 100644
index 0000000..9327b21
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm
@@ -0,0 +1,591 @@
+dnl X86-64 mpn_redc_1 optimised for AMD K8-K10.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2004, 2008, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bull ?
+C AMD pile ?
+C AMD steam ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 ?
+C Intel core ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel atom ?
+C VIA nano ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C * Micro-optimise, none performed thus far.
+C * This looks different from other current redc_1.asm variants. Consider
+C adapting this to the mainstream style.
+C  * Is this code really faster than more straightforward approaches which
+C    compute q0 later? Is the use of a jump table faster? Or is the edge of
+C    this due to the inlined add_n code?
+C * Put initial m[0] x q0 computation in header.
+C * Put basecases at the file's end, single them out before the pushes.
+
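What this file computes is one-limb Montgomery reduction: for each of the n limbs, q0 = up[0] * u0inv mod B is chosen so that adding q0 * m clears the low limb, and the inlined add_n mentioned in the TODO folds the parked carry limbs back into the high half at the end. A portable sketch, roughly following GMP's generic mpn/generic/redc_1.c and using the public mpn primitives; the function name redc_1_sketch is ours, 64-bit limbs without nails are assumed, and the assembly below instead inlines the addmul and the add_n and dispatches on n mod 4 through the jump table at L(tab):

#include <gmp.h>

/* up holds 2n limbs on entry; rp receives the n-limb reduced value. */
mp_limb_t
redc_1_sketch (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t u0inv)
{
  mp_size_t j;
  mp_limb_t cy;

  for (j = 0; j < n; j++)
    {
      mp_limb_t q0 = up[0] * u0inv;       /* makes up[0] + q0*mp[0] == 0 mod B */
      cy = mpn_addmul_1 (up, mp, n, q0);  /* up[0] is now zero                 */
      up[0] = cy;                         /* park the carry limb in its place  */
      up++;
    }
  /* Fold the parked carries into the upper half; the return value is the
     final carry, which the asm materialises in %rax before L(ret). */
  return mpn_add_n (rp, up, up - n, n);
}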
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`mp_param', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`u0inv', `%r8') C stack
+
+define(`i', `%r11')
+define(`nneg', `%r12')
+define(`mp', `%r13')
+define(`q0', `%rbp')
+define(`vp', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbp
+ mov (up), q0 C up[0]
+ push %rbx
+ imul u0inv, q0 C first q0, for all execution paths
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov n, nneg
+ neg nneg
+ lea (mp_param,n,8), mp C mp += n
+ lea -16(up,n,8), up C up += n
+
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ lea 4(%rax), %r9
+ cmp $4, R32(n)
+ cmovg %r9, %rax
+ lea L(tab)(%rip), %r9
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ add %r9, %rax
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(0), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
+ JMPENT( L(0m4), L(tab))
+ JMPENT( L(1m4), L(tab))
+ JMPENT( L(2m4), L(tab))
+ JMPENT( L(3m4), L(tab))
+ TEXT
+
+ ALIGN(16)
+L(1): mov (mp_param), %rax
+ mul q0
+ add 8(up), %rax
+ adc 16(up), %rdx
+ mov %rdx, (rp)
+ mov $0, R32(%rax)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+
+ ALIGN(16)
+L(2): mov (mp_param), %rax
+ mul q0
+ xor R32(%r14), R32(%r14)
+ mov %rax, %r10
+ mov -8(mp), %rax
+ mov %rdx, %r9
+ mul q0
+ add (up), %r10
+ adc %rax, %r9
+ adc %rdx, %r14
+ add 8(up), %r9
+ adc $0, %r14
+ mov %r9, q0
+ imul u0inv, q0
+ mov -16(mp), %rax
+ mul q0
+ xor R32(%rbx), R32(%rbx)
+ mov %rax, %r10
+ mov -8(mp), %rax
+ mov %rdx, %r11
+ mul q0
+ add %r9, %r10
+ adc %rax, %r11
+ adc %rdx, %rbx
+ add 16(up), %r11
+ adc $0, %rbx
+ xor R32(%rax), R32(%rax)
+ add %r11, %r14
+ adc 24(up), %rbx
+ mov %r14, (rp)
+ mov %rbx, 8(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+
+L(3): mov (mp_param), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, %r10
+ mov -16(mp), %rax
+ mul q0
+ xor R32(%r9), R32(%r9)
+ xor R32(%r14), R32(%r14)
+ add -8(up), %rbx
+ adc %rax, %r10
+ mov -8(mp), %rax
+ adc %rdx, %r9
+ mul q0
+ add (up), %r10
+ mov %r10, (up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ mov %r10, q0
+ imul u0inv, q0
+ add %r9, 8(up)
+ adc $0, %r14
+ mov %r14, -8(up)
+
+ mov -24(mp), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, %r10
+ mov -16(mp), %rax
+ mul q0
+ xor R32(%r9), R32(%r9)
+ xor R32(%r14), R32(%r14)
+ add (up), %rbx
+ adc %rax, %r10
+ mov -8(mp), %rax
+ adc %rdx, %r9
+ mul q0
+ add 8(up), %r10
+ mov %r10, 8(up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ mov %r10, q0
+ imul u0inv, q0
+ add %r9, 16(up)
+ adc $0, %r14
+ mov %r14, (up)
+
+ mov -24(mp), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, %r10
+ mov -16(mp), %rax
+ mul q0
+ xor R32(%r9), R32(%r9)
+ xor R32(%r14), R32(%r14)
+ add 8(up), %rbx
+ adc %rax, %r10
+ mov -8(mp), %rax
+ adc %rdx, %r9
+ mul q0
+ add 16(up), %r10
+ adc %rax, %r9
+ adc %rdx, %r14
+ add 24(up), %r9
+ adc $0, %r14
+
+ xor R32(%rax), R32(%rax)
+ add -8(up), %r10
+ adc (up), %r9
+ adc 32(up), %r14
+ mov %r10, (rp)
+ mov %r9, 8(rp)
+ mov %r14, 16(rp)
+ adc R32(%rax), R32(%rax)
+ jmp L(ret)
+
+
+ ALIGN(16)
+L(2m4):
+L(lo2): mov (mp,nneg,8), %rax
+ mul q0
+ xor R32(%r14), R32(%r14)
+ xor R32(%rbx), R32(%rbx)
+ mov %rax, %r10
+ mov 8(mp,nneg,8), %rax
+ mov 24(up,nneg,8), %r15
+ mov %rdx, %r9
+ mul q0
+ add 16(up,nneg,8), %r10
+ adc %rax, %r9
+ mov 16(mp,nneg,8), %rax
+ adc %rdx, %r14
+ mul q0
+ mov $0, R32(%r10) C xor?
+ lea 2(nneg), i
+ add %r9, %r15
+ imul u0inv, %r15
+ jmp L(e2)
+
+ ALIGN(16)
+L(li2): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor R32(%r10), R32(%r10)
+ mul q0
+L(e2): add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul q0
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul q0
+ add %rbx, 24(up,i,8)
+ mov $0, R32(%r14) C zero
+ mov %r14, %rbx C zero
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ mov %r14, %r9 C zero
+ adc %rdx, %r9
+ mul q0
+ add $4, i
+ js L(li2)
+
+L(le2): add %r10, (up)
+ adc %rax, %r9
+ adc %r14, %rdx
+ add %r9, 8(up)
+ adc $0, %rdx
+ mov %rdx, 16(up,nneg,8) C up[0]
+ add $8, up
+ mov %r15, q0
+ dec n
+ jnz L(lo2)
+
+ mov nneg, n
+ sar $2, n
+ lea 32(up,nneg,8), up
+ lea (up,nneg,8), vp
+
+ mov -16(up), %r8
+ mov -8(up), %r9
+ add -16(vp), %r8
+ adc -8(vp), %r9
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ lea 16(rp), rp
+ jmp L(addx)
+
+
+ ALIGN(16)
+L(1m4):
+L(lo1): mov (mp,nneg,8), %rax
+ xor %r9, %r9
+ xor R32(%rbx), R32(%rbx)
+ mul q0
+ mov %rax, %r9
+ mov 8(mp,nneg,8), %rax
+ mov 24(up,nneg,8), %r15
+ mov %rdx, %r14
+ mov $0, R32(%r10) C xor?
+ mul q0
+ add 16(up,nneg,8), %r9
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 16(mp,nneg,8), %rax
+ mul q0
+ lea 1(nneg), i
+ add %r14, %r15
+ imul u0inv, %r15
+ jmp L(e1)
+
+ ALIGN(16)
+L(li1): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor R32(%r10), R32(%r10)
+ mul q0
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul q0
+L(e1): add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul q0
+ add %rbx, 24(up,i,8)
+ mov $0, R32(%r14) C zero
+ mov %r14, %rbx C zero
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ mov %r14, %r9 C zero
+ adc %rdx, %r9
+ mul q0
+ add $4, i
+ js L(li1)
+
+L(le1): add %r10, (up)
+ adc %rax, %r9
+ adc %r14, %rdx
+ add %r9, 8(up)
+ adc $0, %rdx
+ mov %rdx, 16(up,nneg,8) C up[0]
+ add $8, up
+ mov %r15, q0
+ dec n
+ jnz L(lo1)
+
+ mov nneg, n
+ sar $2, n
+ lea 24(up,nneg,8), up
+ lea (up,nneg,8), vp
+
+ mov -8(up), %r8
+ add -8(vp), %r8
+ mov %r8, (rp)
+ lea 8(rp), rp
+ jmp L(addx)
+
+
+ ALIGN(16)
+L(0):
+L(0m4):
+L(lo0): mov (mp,nneg,8), %rax
+ mov nneg, i
+ mul q0
+ xor R32(%r10), R32(%r10)
+ mov %rax, %r14
+ mov %rdx, %rbx
+ mov 8(mp,nneg,8), %rax
+ mov 24(up,nneg,8), %r15
+ mul q0
+ add 16(up,nneg,8), %r14
+ adc %rax, %rbx
+ adc %rdx, %r10
+ add %rbx, %r15
+ imul u0inv, %r15
+ jmp L(e0)
+
+ ALIGN(16)
+L(li0): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor R32(%r10), R32(%r10)
+ mul q0
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul q0
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+L(e0): mov 16(mp,i,8), %rax
+ mul q0
+ add %rbx, 24(up,i,8)
+ mov $0, R32(%r14) C zero
+ mov %r14, %rbx C zero
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ mov %r14, %r9 C zero
+ adc %rdx, %r9
+ mul q0
+ add $4, i
+ js L(li0)
+
+L(le0): add %r10, (up)
+ adc %rax, %r9
+ adc %r14, %rdx
+ add %r9, 8(up)
+ adc $0, %rdx
+ mov %rdx, 16(up,nneg,8) C up[0]
+ add $8, up
+ mov %r15, q0
+ dec n
+ jnz L(lo0)
+
+ mov nneg, n
+ sar $2, n
+ clc
+ lea 16(up,nneg,8), up
+ lea (up,nneg,8), vp
+ jmp L(addy)
+
+
+ ALIGN(16)
+L(3m4):
+L(lo3): mov (mp,nneg,8), %rax
+ mul q0
+ mov %rax, %rbx
+ mov %rdx, %r10
+ mov 8(mp,nneg,8), %rax
+ mov 24(up,nneg,8), %r15
+ mul q0
+ add 16(up,nneg,8), %rbx C result is zero, might carry
+ mov $0, R32(%rbx) C zero
+ mov %rbx, %r14 C zero
+ adc %rax, %r10
+ mov 16(mp,nneg,8), %rax
+ mov %r14, %r9 C zero
+ adc %rdx, %r9
+ add %r10, %r15
+ mul q0
+ lea 3(nneg), i
+ imul u0inv, %r15
+C jmp L(li3)
+
+ ALIGN(16)
+L(li3): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor R32(%r10), R32(%r10)
+ mul q0
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul q0
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul q0
+ add %rbx, 24(up,i,8)
+ mov $0, R32(%r14) C zero
+ mov %r14, %rbx C zero
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ mov %r14, %r9 C zero
+ adc %rdx, %r9
+ mul q0
+ add $4, i
+ js L(li3)
+
+L(le3): add %r10, (up)
+ adc %rax, %r9
+ adc %r14, %rdx
+ add %r9, 8(up)
+ adc $0, %rdx
+ mov %rdx, 16(up,nneg,8) C up[0]
+ mov %r15, q0
+ lea 8(up), up
+ dec n
+ jnz L(lo3)
+
+
+C ==== Addition code ====
+ mov nneg, n
+ sar $2, n
+ lea 40(up,nneg,8), up
+ lea (up,nneg,8), vp
+
+ mov -24(up), %r8
+ mov -16(up), %r9
+ mov -8(up), %r10
+ add -24(vp), %r8
+ adc -16(vp), %r9
+ adc -8(vp), %r10
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ lea 24(rp), rp
+
+L(addx):inc n
+ jz L(ad3)
+
+L(addy):mov (up), %r8
+ mov 8(up), %r9
+ inc n
+ jmp L(mid)
+
+C ALIGN(16)
+L(al3): adc (vp), %r8
+ adc 8(vp), %r9
+ adc 16(vp), %r10
+ adc 24(vp), %r11
+ mov %r8, (rp)
+ lea 32(up), up
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ inc n
+ mov %r11, 24(rp)
+ lea 32(vp), vp
+ mov (up), %r8
+ mov 8(up), %r9
+ lea 32(rp), rp
+L(mid): mov 16(up), %r10
+ mov 24(up), %r11
+ jnz L(al3)
+
+L(ae3): adc (vp), %r8
+ adc 8(vp), %r9
+ adc 16(vp), %r10
+ adc 24(vp), %r11
+ mov %r8, (rp)
+ mov %r9, 8(rp)
+ mov %r10, 16(rp)
+ mov %r11, 24(rp)
+
+L(ad3): mov R32(n), R32(%rax) C zero
+ adc R32(%rax), R32(%rax)
+
+L(ret): pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm
new file mode 100644
index 0000000..60cf945
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm
@@ -0,0 +1,807 @@
+dnl AMD64 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C NOTES
+C * There is a major stupidity in that we call mpn_mul_1 initially, for a
+C large trip count. Instead, we should follow the generic/sqr_basecase.c
+C code which uses addmul_2s from the start, conditionally leaving a 1x1
+C    multiply to the end.  (In assembly code, one would stop invoking the
+C    addmul_2s loops when perhaps a 3x2 or a 2x2 remains.)
+C * Another stupidity is in the sqr_diag_addlsh1 code. It does not need to
+C save/restore carry, instead it can propagate into the high product word.
+C * Align more labels, should shave off a few cycles.
+C * We can safely use 32-bit size operations, since operands with (2^32)
+C limbs will lead to non-termination in practice.
+C * The jump table could probably be optimized, at least for non-pic.
+C * The special code for n <= 4 was quickly written. It is probably too
+C large and unnecessarily slow.
+C * Consider combining small cases code so that the n=k-1 code jumps into the
+C middle of the n=k code.
+C * Avoid saving registers for small cases code.
+C * Needed variables:
+C n r11 input size
+C i r8 work left, initially n
+C j r9 inner loop count
+C r15 unused
+C v0 r13
+C v1 r14
+C rp rdi
+C up rsi
+C w0 rbx
+C w1 rcx
+C w2 rbp
+C w3 r10
+C tp r12
+C lo rax
+C hi rdx
+C rsp
+
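The scheme the NOTES describe is: build the strictly sub-diagonal cross products u[i]*u[j], i < j, with one mul_1/mul_2 pass followed by addmul_2 passes, then form the square as twice that triangle plus the diagonal squares u[i]^2 (the asm fuses the last step into the sqr_diag_addlsh1 code near the end of the file). A simplified portable restatement using public mpn calls; the name sqr_basecase_sketch and the flat 4n-limb scratch layout are ours, while the asm accumulates in place, pointing tp into the middle of rp:

#include <gmp.h>

/* rp: 2n limbs, up: n limbs, scratch: 4n limbs.  Illustrative only. */
void
sqr_basecase_sketch (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch)
{
  mp_ptr tp = scratch;            /* 2n limbs: sub-diagonal cross products */
  mp_ptr dp = scratch + 2 * n;    /* 2n limbs: diagonal squares u[i]^2     */
  mp_size_t i;

  tp[0] = 0;
  tp[2 * n - 1] = 0;
  if (n > 1)
    {
      tp[n] = mpn_mul_1 (tp + 1, up + 1, n - 1, up[0]);
      for (i = 1; i < n - 1; i++)
        tp[n + i] = mpn_addmul_1 (tp + 2 * i + 1, up + i + 1, n - i - 1, up[i]);
    }

  for (i = 0; i < n; i++)         /* u[i]^2 lands at limb position 2i */
    dp[2 * i + 1] = mpn_mul_1 (dp + 2 * i, up + i, 1, up[i]);

  mpn_lshift (tp, tp, 2 * n, 1);  /* double the triangle; shifts out 0 */
  mpn_add_n (rp, tp, dp, 2 * n);  /* u^2 fits in 2n limbs, no carry out */
}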
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+
+define(`n', `%r11')
+define(`tp', `%r12')
+define(`i', `%r8')
+define(`j', `%r9')
+define(`v0', `%r13')
+define(`v1', `%r14')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ FUNC_ENTRY(3)
+ mov R32(n_param), R32(%rcx)
+ mov R32(n_param), R32(n) C free original n register (rdx)
+
+ add $-40, %rsp
+
+ and $3, R32(%rcx)
+ cmp $4, R32(n_param)
+ lea 4(%rcx), %r8
+
+ mov %rbx, 32(%rsp)
+ mov %rbp, 24(%rsp)
+ mov %r12, 16(%rsp)
+ mov %r13, 8(%rsp)
+ mov %r14, (%rsp)
+
+ cmovg %r8, %rcx
+
+ lea L(tab)(%rip), %rax
+ifdef(`PIC',
+` movslq (%rax,%rcx,4), %r10
+ add %r10, %rax
+ jmp *%rax
+',`
+ jmp *(%rax,%rcx,8)
+')
+ JUMPTABSECT
+ ALIGN(8)
+L(tab): JMPENT( L(4), L(tab))
+ JMPENT( L(1), L(tab))
+ JMPENT( L(2), L(tab))
+ JMPENT( L(3), L(tab))
+ JMPENT( L(0m4), L(tab))
+ JMPENT( L(1m4), L(tab))
+ JMPENT( L(2m4), L(tab))
+ JMPENT( L(3m4), L(tab))
+ TEXT
+
+L(1): mov (up), %rax
+ mul %rax
+ add $40, %rsp
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(2): mov (up), %rax
+ mov %rax, %r8
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, %r9
+ mul %rax
+ add $40, %rsp
+ mov %rax, %r10
+ mov %r11, %rax
+ mov %rdx, %r11
+ mul %r8
+ xor %r8, %r8
+ add %rax, %r9
+ adc %rdx, %r10
+ adc %r8, %r11
+ add %rax, %r9
+ mov %r9, 8(rp)
+ adc %rdx, %r10
+ mov %r10, 16(rp)
+ adc %r8, %r11
+ mov %r11, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(3): mov (up), %rax
+ mov %rax, %r10
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, 8(rp)
+ mul %rax
+ mov 16(up), %rcx
+ mov %rax, 16(rp)
+ mov %rcx, %rax
+ mov %rdx, 24(rp)
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov %r11, %rax
+ mul %r10
+ mov %rax, %r8
+ mov %rcx, %rax
+ mov %rdx, %r9
+ mul %r10
+ xor %r10, %r10
+ add %rax, %r9
+ mov %r11, %rax
+ mov %r10, %r11
+ adc %rdx, %r10
+
+ mul %rcx
+ add $40, %rsp
+ add %rax, %r10
+ adc %r11, %rdx
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %rdx, %rdx
+ adc %r11, %r11
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %rdx, 32(rp)
+ adc %r11, 40(rp)
+ FUNC_EXIT()
+ ret
+
+L(4): mov (up), %rax
+ mov %rax, %r11
+ mul %rax
+ mov 8(up), %rbx
+ mov %rax, (rp)
+ mov %rbx, %rax
+ mov %rdx, 8(rp)
+ mul %rax
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ mov 16(up), %rax
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+ mov 24(up), %rax
+ mul %rax
+ mov %rax, 48(rp)
+ mov %rbx, %rax
+ mov %rdx, 56(rp)
+
+ mul %r11
+ add $32, %rsp
+ mov %rax, %r8
+ mov %rdx, %r9
+ mov 16(up), %rax
+ mul %r11
+ xor %r10, %r10
+ add %rax, %r9
+ adc %rdx, %r10
+ mov 24(up), %rax
+ mul %r11
+ xor %r11, %r11
+ add %rax, %r10
+ adc %rdx, %r11
+ mov 16(up), %rax
+ mul %rbx
+ xor %rcx, %rcx
+ add %rax, %r10
+ adc %rdx, %r11
+ adc $0, %rcx
+ mov 24(up), %rax
+ mul %rbx
+ pop %rbx
+ add %rax, %r11
+ adc %rdx, %rcx
+ mov 16(up), %rdx
+ mov 24(up), %rax
+ mul %rdx
+ add %rax, %rcx
+ adc $0, %rdx
+
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %r11, %r11
+ adc %rcx, %rcx
+ mov $0, R32(%rax)
+ adc %rdx, %rdx
+
+ adc %rax, %rax
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %r11, 32(rp)
+ adc %rcx, 40(rp)
+ adc %rdx, 48(rp)
+ adc %rax, 56(rp)
+ FUNC_EXIT()
+ ret
+
+
+L(0m4):
+ lea -16(rp,n,8), tp C point tp in middle of result operand
+ mov (up), v0
+ mov 8(up), %rax
+ lea (up,n,8), up C point up at end of input operand
+
+ lea -4(n), i
+C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
+ xor R32(j), R32(j)
+ sub n, j
+
+ mul v0
+ xor R32(w2), R32(w2)
+ mov %rax, w0
+ mov 16(up,j,8), %rax
+ mov %rdx, w3
+ jmp L(L3)
+
+ ALIGN(16)
+L(mul_1_m3_top):
+ add %rax, w2
+ mov w3, (tp,j,8)
+ mov (up,j,8), %rax
+ adc %rdx, w1
+ xor R32(w0), R32(w0)
+ mul v0
+ xor R32(w3), R32(w3)
+ mov w2, 8(tp,j,8)
+ add %rax, w1
+ adc %rdx, w0
+ mov 8(up,j,8), %rax
+ mov w1, 16(tp,j,8)
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov 16(up,j,8), %rax
+ adc %rdx, w3
+L(L3): xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w3
+ mov 24(up,j,8), %rax
+ adc %rdx, w2
+ mov w0, 24(tp,j,8)
+ mul v0
+ add $4, j
+ js L(mul_1_m3_top)
+
+ add %rax, w2
+ mov w3, (tp)
+ adc %rdx, w1
+ mov w2, 8(tp)
+ mov w1, 16(tp)
+
+ lea eval(2*8)(tp), tp C tp += 2
+ lea -8(up), up
+ jmp L(dowhile)
+
+
+L(1m4):
+ lea 8(rp,n,8), tp C point tp in middle of result operand
+ mov (up), v0 C u0
+ mov 8(up), %rax C u1
+ lea 8(up,n,8), up C point up at end of input operand
+
+ lea -3(n), i
+C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
+ lea -3(n), j
+ neg j
+
+ mov %rax, v1 C u1
+ mul v0 C u0 * u1
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ mov %rax, 8(rp)
+ jmp L(m0)
+
+ ALIGN(16)
+L(mul_2_m0_top):
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov -24(up,j,8), %rax
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add %rax, w1
+ mov w0, -24(tp,j,8)
+ adc %rdx, w2
+L(m0): mov -16(up,j,8), %rax C u2, u6 ...
+ mul v0 C u0 * u2
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ adc $0, R32(w3)
+ mov $0, R32(w0)
+ mov w1, -16(tp,j,8)
+ mul v1
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ add %rax, w3
+ mov w2, -8(tp,j,8)
+ adc %rdx, w0
+L(m2x): mov (up,j,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+ add $4, j
+ mov -32(up,j,8), %rax
+ mov w3, -32(tp,j,8)
+ js L(mul_2_m0_top)
+
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, -8(tp)
+ mov w1, (tp)
+
+ lea -16(up), up
+ lea eval(3*8-24)(tp), tp C tp += 3
+ jmp L(dowhile_end)
+
+
+L(2m4):
+ lea -16(rp,n,8), tp C point tp in middle of result operand
+ mov (up), v0
+ mov 8(up), %rax
+ lea (up,n,8), up C point up at end of input operand
+
+ lea -4(n), i
+C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
+ lea -2(n), j
+ neg j
+
+ mul v0
+ mov %rax, w2
+ mov (up,j,8), %rax
+ mov %rdx, w1
+ jmp L(L1)
+
+ ALIGN(16)
+L(mul_1_m1_top):
+ add %rax, w2
+ mov w3, (tp,j,8)
+ mov (up,j,8), %rax
+ adc %rdx, w1
+L(L1): xor R32(w0), R32(w0)
+ mul v0
+ xor R32(w3), R32(w3)
+ mov w2, 8(tp,j,8)
+ add %rax, w1
+ adc %rdx, w0
+ mov 8(up,j,8), %rax
+ mov w1, 16(tp,j,8)
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov 16(up,j,8), %rax
+ adc %rdx, w3
+ xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w3
+ mov 24(up,j,8), %rax
+ adc %rdx, w2
+ mov w0, 24(tp,j,8)
+ mul v0
+ add $4, j
+ js L(mul_1_m1_top)
+
+ add %rax, w2
+ mov w3, (tp)
+ adc %rdx, w1
+ mov w2, 8(tp)
+ mov w1, 16(tp)
+
+ lea eval(2*8)(tp), tp C tp += 2
+ lea -8(up), up
+ jmp L(dowhile_mid)
+
+
+L(3m4):
+ lea 8(rp,n,8), tp C point tp in middle of result operand
+ mov (up), v0 C u0
+ mov 8(up), %rax C u1
+ lea 8(up,n,8), up C point up at end of input operand
+
+ lea -5(n), i
+C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
+ lea -1(n), j
+ neg j
+
+ mov %rax, v1 C u1
+ mul v0 C u0 * u1
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ mov %rax, 8(rp)
+ jmp L(m2)
+
+ ALIGN(16)
+L(mul_2_m2_top):
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov -24(up,j,8), %rax
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add %rax, w1
+ mov w0, -24(tp,j,8)
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ adc $0, R32(w3)
+ mov $0, R32(w0)
+ mov w1, -16(tp,j,8)
+ mul v1
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ add %rax, w3
+ mov w2, -8(tp,j,8)
+ adc %rdx, w0
+L(m2): mov (up,j,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+ add $4, j
+ mov -32(up,j,8), %rax
+ mov w3, -32(tp,j,8)
+ js L(mul_2_m2_top)
+
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, -8(tp)
+ mov w1, (tp)
+
+ lea -16(up), up
+ jmp L(dowhile_mid)
+
+L(dowhile):
+C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
+ lea 4(i), j
+ neg j
+
+ mov 16(up,j,8), v0
+ mov 24(up,j,8), v1
+ mov 24(up,j,8), %rax
+ mul v0
+ xor R32(w3), R32(w3)
+ add %rax, 24(tp,j,8)
+ adc %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ jmp L(am2)
+
+ ALIGN(16)
+L(addmul_2_m2_top):
+ add w3, (tp,j,8)
+ adc %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add w0, 8(tp,j,8)
+ adc %rax, w1
+ adc %rdx, w2
+ mov 16(up,j,8), %rax
+ mov $0, R32(w3)
+ mul v0 C v0 * u1
+ add %rax, w1
+ mov 16(up,j,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mul v1 C v1 * u1
+ add w1, 16(tp,j,8)
+ adc %rax, w2
+ mov 24(up,j,8), %rax
+ adc %rdx, w3
+ mul v0
+ mov $0, R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mov 24(up,j,8), %rax
+ adc $0, R32(w0)
+ mul v1
+ add w2, 24(tp,j,8)
+ adc %rax, w3
+ adc %rdx, w0
+L(am2): mov 32(up,j,8), %rax
+ mul v0
+ add %rax, w3
+ mov 32(up,j,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add $4, j
+ js L(addmul_2_m2_top)
+
+ add w3, (tp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(tp)
+ mov w1, 16(tp)
+
+ lea eval(2*8)(tp), tp C tp += 2
+
+ add $-2, R32(i) C i -= 2
+
+L(dowhile_mid):
+C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
+ lea 2(i), j
+ neg j
+
+ mov (up,j,8), v0
+ mov 8(up,j,8), v1
+ mov 8(up,j,8), %rax
+ mul v0
+ xor R32(w1), R32(w1)
+ add %rax, 8(tp,j,8)
+ adc %rdx, w1
+ xor R32(w2), R32(w2)
+ jmp L(20)
+
+ ALIGN(16)
+L(addmul_2_m0_top):
+ add w3, (tp,j,8)
+ adc %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add w0, 8(tp,j,8)
+ adc %rax, w1
+ adc %rdx, w2
+L(20): mov 16(up,j,8), %rax
+ mov $0, R32(w3)
+ mul v0 C v0 * u1
+ add %rax, w1
+ mov 16(up,j,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mul v1 C v1 * u1
+ add w1, 16(tp,j,8)
+ adc %rax, w2
+ mov 24(up,j,8), %rax
+ adc %rdx, w3
+ mul v0
+ mov $0, R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mov 24(up,j,8), %rax
+ adc $0, R32(w0)
+ mul v1
+ add w2, 24(tp,j,8)
+ adc %rax, w3
+ adc %rdx, w0
+ mov 32(up,j,8), %rax
+ mul v0
+ add %rax, w3
+ mov 32(up,j,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add $4, j
+ js L(addmul_2_m0_top)
+
+ add w3, (tp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(tp)
+ mov w1, 16(tp)
+
+ lea eval(2*8)(tp), tp C tp += 2
+L(dowhile_end):
+
+ add $-2, R32(i) C i -= 2
+ jne L(dowhile)
+
+C Function mpn_addmul_2s_2
+ mov -16(up), v0
+ mov -8(up), v1
+ mov -8(up), %rax
+ mul v0
+ xor R32(w3), R32(w3)
+ add %rax, -8(tp)
+ adc %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ mov (up), %rax
+ mul v0
+ add %rax, w3
+ mov (up), %rax
+ adc %rdx, w0
+ mul v1
+ add w3, (tp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(tp)
+ mov w1, 16(tp)
+
+C Function mpn_sqr_diag_addlsh1
+ lea -4(n,n), j
+
+ mov 8(rp), %r11
+ lea -8(up), up
+ lea (rp,j,8), rp
+ neg j
+ mov (up,j,4), %rax
+ mul %rax
+ test $2, R8(j)
+ jnz L(odd)
+
+L(evn): add %r11, %r11
+ sbb R32(%rbx), R32(%rbx) C save CF
+ add %rdx, %r11
+ mov %rax, (rp,j,8)
+ jmp L(d0)
+
+L(odd): add %r11, %r11
+ sbb R32(%rbp), R32(%rbp) C save CF
+ add %rdx, %r11
+ mov %rax, (rp,j,8)
+ lea -2(j), j
+ jmp L(d1)
+
+ ALIGN(16)
+L(top): mov (up,j,4), %rax
+ mul %rax
+ add R32(%rbp), R32(%rbp) C restore carry
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, (rp,j,8)
+L(d0): mov %r11, 8(rp,j,8)
+ mov 16(rp,j,8), %r10
+ adc %r10, %r10
+ mov 24(rp,j,8), %r11
+ adc %r11, %r11
+ nop
+ sbb R32(%rbp), R32(%rbp) C save CF
+ mov 8(up,j,4), %rax
+ mul %rax
+ add R32(%rbx), R32(%rbx) C restore carry
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, 16(rp,j,8)
+L(d1): mov %r11, 24(rp,j,8)
+ mov 32(rp,j,8), %r10
+ adc %r10, %r10
+ mov 40(rp,j,8), %r11
+ adc %r11, %r11
+ sbb R32(%rbx), R32(%rbx) C save CF
+ add $4, j
+ js L(top)
+
+ mov (up), %rax
+ mul %rax
+ add R32(%rbp), R32(%rbp) C restore carry
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, (rp)
+ mov %r11, 8(rp)
+ mov 16(rp), %r10
+ adc %r10, %r10
+ sbb R32(%rbp), R32(%rbp) C save CF
+ neg R32(%rbp)
+ mov 8(up), %rax
+ mul %rax
+ add R32(%rbx), R32(%rbx) C restore carry
+ adc %rax, %r10
+ adc %rbp, %rdx
+ mov %r10, 16(rp)
+ mov %rdx, 24(rp)
+
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
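The mpn_sqr_diag_addlsh1 part (the L(top)/L(d0)/L(d1) loop above) performs the doubling and the diagonal addition in a single pass, which forces it to interleave two carry chains; the sbb into %rbx/%rbp parks one chain's carry flag while the other runs, the save/restore the NOTES call unnecessary. A scalar sketch of the per-limb-pair combination it performs, assuming 64-bit limbs and unsigned __int128; the names are ours:

#include <stdint.h>

typedef uint64_t limb_t;

/* rp[0..2n-1] = 2*tp[0..2n-1] + sum_i up[i]^2 * 2^(128*i), in one pass.
   When tp holds the sub-diagonal triangle of up, the final carry is 0. */
static void
sqr_diag_addlsh1_sketch (limb_t *rp, const limb_t *tp,
                         const limb_t *up, long n)
{
  unsigned __int128 acc = 0;                 /* limb in flight plus carries */
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
      acc += 2 * (unsigned __int128) tp[2 * i] + (limb_t) sq;
      rp[2 * i] = (limb_t) acc;
      acc >>= 64;
      acc += 2 * (unsigned __int128) tp[2 * i + 1] + (limb_t) (sq >> 64);
      rp[2 * i + 1] = (limb_t) acc;
      acc >>= 64;
    }
}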