author     Duncan Wilkie <antigravityd@gmail.com>    2023-11-18 06:11:09 -0600
committer  Duncan Wilkie <antigravityd@gmail.com>    2023-11-18 06:11:09 -0600
commit     11da511c784eca003deb90c23570f0873954e0de (patch)
tree       e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/x86_64/core2
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/core2')
28 files changed, 5914 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm new file mode 100644 index 0000000..7066bb4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm @@ -0,0 +1,53 @@ +dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) +dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh1_n)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm new file mode 100644 index 0000000..5065120 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm @@ -0,0 +1,53 @@ +dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) +dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
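[Editorial sketch, not GMP source] For reference, a plain C rendering of what the aorrlsh1_n entry point mpn_addlsh1_n computes, assuming 64-bit limbs; the _ref name is illustrative. The LSH/RSH constant pair (1/63 here, 2/62 in the lsh2 variant below) splices each limb of vp with the bit shifted out of its lower neighbour to form one limb of vp << 1:

  #include <stdint.h>
  #include <stddef.h>

  /* Sketch: rp[] = up[] + (vp[] << 1), returning the carry out (0..2). */
  uint64_t
  addlsh1_n_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n)
  {
    uint64_t cy = 0, out = 0;              /* out = bit carried across limbs */
    for (size_t i = 0; i < n; i++)
      {
        uint64_t s = (vp[i] << 1) | out;   /* one limb of vp << 1 */
        out = vp[i] >> 63;
        uint64_t t = up[i] + s;
        uint64_t c1 = t < s;
        rp[i] = t + cy;
        cy = c1 + (rp[i] < t);
      }
    return cy + out;                       /* weight 2^(64n) */
  }

The rsblsh1 entry point is the same skeleton with add/adc swapped for sub/sbb, which is exactly what the ADDSUB/ADCSBB m4 macros select.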
+ +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh2_n)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh2_n)') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm new file mode 100644 index 0000000..57abf31 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm @@ -0,0 +1,38 @@ +dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/coreinhm/aorrlsh_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm new file mode 100644 index 0000000..3f875ae --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm @@ -0,0 +1,225 @@ +dnl Core 2 mpn_add_err1_n, mpn_sub_err1_n + +dnl Contributed by David Harvey. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 4.14 +C Intel corei ? +C Intel atom ? +C VIA nano ? 
+ + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`ep', `%rcx') +define(`yp', `%r8') +define(`n', `%r9') +define(`cy_param', `8(%rsp)') + +define(`el', `%rbx') +define(`eh', `%rbp') +define(`t0', `%r10') +define(`t1', `%r11') +define(`t2', `%r12') +define(`t3', `%r13') +define(`w0', `%r14') +define(`w1', `%r15') + +ifdef(`OPERATION_add_err1_n', ` + define(ADCSBB, adc) + define(func, mpn_add_err1_n)') +ifdef(`OPERATION_sub_err1_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_err1_n)') + +MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n) + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + mov cy_param, %rax + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + + mov R32(n), R32(%r10) + and $3, R32(%r10) + jz L(0mod4) + cmp $2, R32(%r10) + jc L(1mod4) + jz L(2mod4) +L(3mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + xor R32(t0), R32(t0) + xor R32(t1), R32(t1) + lea -24(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc 16(yp), el + ADCSBB 8(vp,n,8), w1 + mov w1, 8(rp,n,8) + cmovc 8(yp), t0 + mov 16(up,n,8), w0 + ADCSBB 16(vp,n,8), w0 + mov w0, 16(rp,n,8) + cmovc (yp), t1 + setc %al C save carry + add t0, el + adc $0, eh + add t1, el + adc $0, eh + + add $3, n + jnz L(loop) + jmp L(end) + + ALIGN(16) +L(0mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + lea (yp,n,8), yp + neg n + jmp L(loop) + + ALIGN(16) +L(1mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + lea -8(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc (yp), el + setc %al C save carry + + add $1, n + jnz L(loop) + jmp L(end) + + ALIGN(16) +L(2mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + xor R32(t0), R32(t0) + lea -16(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc 8(yp), el + ADCSBB 8(vp,n,8), w1 + mov w1, 8(rp,n,8) + cmovc (yp), t0 + setc %al C save carry + add t0, el + adc $0, eh + + add $2, n + jnz L(loop) + jmp L(end) + + ALIGN(32) +L(loop): + mov (up,n,8), w0 + shr $1, %al C restore carry + mov -8(yp), t0 + mov $0, R32(t3) + ADCSBB (vp,n,8), w0 + cmovnc t3, t0 + mov w0, (rp,n,8) + mov 8(up,n,8), w1 + mov 16(up,n,8), w0 + ADCSBB 8(vp,n,8), w1 + mov -16(yp), t1 + cmovnc t3, t1 + mov -24(yp), t2 + mov w1, 8(rp,n,8) + ADCSBB 16(vp,n,8), w0 + cmovnc t3, t2 + mov 24(up,n,8), w1 + ADCSBB 24(vp,n,8), w1 + cmovc -32(yp), t3 + setc %al C save carry + add t0, el + adc $0, eh + add t1, el + adc $0, eh + add t2, el + adc $0, eh + lea -32(yp), yp + mov w0, 16(rp,n,8) + add t3, el + adc $0, eh + add $4, n + mov w1, -8(rp,n,8) + jnz L(loop) + +L(end): + mov el, (ep) + mov eh, 8(ep) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm new file mode 100644 index 0000000..f9e0039 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm @@ -0,0 +1,150 @@ +dnl Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem. + +dnl Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
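[Editorial sketch, not GMP source] The add_err1_n/sub_err1_n code above computes a sum with an extra two-limb "error term": whenever limb i of the addition produces a carry, yp[n-1-i] is accumulated into ep[0..1]. The reversed yp indexing below mirrors the cmovc/yp addressing in the loop above; mpn/generic/add_err1_n.c is the authoritative reference.

  #include <stdint.h>
  #include <stddef.h>

  uint64_t
  add_err1_n_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
                  uint64_t ep[2], const uint64_t *yp, size_t n, uint64_t cy)
  {
    uint64_t el = 0, eh = 0;
    for (size_t i = 0; i < n; i++)
      {
        uint64_t t = up[i] + cy;
        uint64_t c1 = t < cy;
        rp[i] = t + vp[i];
        cy = c1 + (rp[i] < t);
        if (cy)                        /* carry out of limb i */
          {
            el += yp[n - 1 - i];
            eh += el < yp[n - 1 - i];  /* two-limb accumulate */
          }
      }
    ep[0] = el;
    ep[1] = eh;
    return cy;
  }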
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2 +C AMD K10 1.93\2 +C AMD bull 1.62\2.1 +C AMD pile 1.6\1.7 +C AMD steam +C AMD excavator +C AMD bobcat 2.79 +C AMD jaguar 2.54 +C Intel P4 10 +C Intel core2 2 +C Intel NHM 2 +C Intel SBR 2 +C Intel IBR 1.95 +C Intel HWL 1.72 +C Intel BWL 1.54 +C Intel SKL 1.52 +C Intel atom 9 +C Intel SLM 6.5 +C VIA nano 3 + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 +L(start): + mov (up), %r10 + mov (vp), %r11 + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + mov R32(n), R32(%rax) + neg n + and $3, R32(%rax) + je L(b00) + add %rax, n C clear low rcx bits for jrcxz + cmp $2, R32(%rax) + jl L(b01) + je L(b10) + +L(b11): neg %r8 C set cy + jmp L(e11) + +L(b00): neg %r8 C set cy + mov %r10, %r8 + mov %r11, %r9 + lea 4(n), n + jmp L(e00) + + nop + nop + nop +L(b01): neg %r8 C set cy + jmp L(top) + +L(b10): neg %r8 C set cy + mov %r10, %r8 + mov %r11, %r9 + jmp L(e10) + +L(end): ADCSBB %r11, %r10 + mov %r10, -8(rp) + mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0 + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(top): jrcxz L(end) + mov (up,n,8), %r8 + mov (vp,n,8), %r9 + lea 4(n), n + ADCSBB %r11, %r10 + mov %r10, -40(rp,n,8) +L(e00): mov -24(up,n,8), %r10 + mov -24(vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, -32(rp,n,8) +L(e11): mov -16(up,n,8), %r8 + mov -16(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -24(rp,n,8) +L(e10): mov -8(up,n,8), %r10 + mov -8(vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, -16(rp,n,8) + jmp L(top) +EPILOGUE() + +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(start) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm new file mode 100644 index 0000000..a7a5d6e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm @@ -0,0 +1,188 @@ +dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2". + +dnl Copyright 2003-2005, 2007-2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
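[Editorial sketch, not GMP source] The aors_n loop above is a 4-way unrolled carry chain. It uses lea for pointer/index updates and jrcxz for the loop exit because neither instruction touches the carry flag, so a single adc chain spans the whole vector. The scalar contract it implements:

  #include <stdint.h>
  #include <stddef.h>

  /* Sketch: rp[] = up[] + vp[], returning the final carry (0 or 1). */
  uint64_t
  add_n_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n)
  {
    uint64_t cy = 0;
    for (size_t i = 0; i < n; i++)
      {
        uint64_t t = up[i] + cy;
        uint64_t c1 = t < cy;
        rp[i] = t + vp[i];
        cy = c1 + (rp[i] < t);
      }
    return cy;
  }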
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.52 +C AMD K10 4.01 +C AMD bull 4.98 +C AMD pile 4.83 +C AMD steam +C AMD excavator +C AMD bobcat 5.56 +C AMD jaguar 5.54 +C Intel P4 16.3 17.3 +C Intel core2 4.32 4.61 +C Intel NHM 5.08 +C Intel SBR 4.04 +C Intel IBR 3.95 +C Intel HWL 3.66 +C Intel BWL 2.87 +C Intel SKL 2.79 +C Intel atom 20.6 +C Intel SLM 7.6 +C VIA nano 5.25 + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`v0', `%rcx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') + define(`func_1c', `mpn_addmul_1c') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') + define(`func_1c', `mpn_submul_1c') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + + C For DOS, on the stack we have four saved registers, return address, + C space for four register arguments, and finally the carry input. 
+ +IFDOS(` define(`carry_in', `72(%rsp)')') dnl +IFSTD(` define(`carry_in', `%r8')') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_1c) + FUNC_ENTRY(4) + push %rbx + push %rbp + lea (%rdx), %rbx + neg %rbx + + mov (up), %rax + mov (rp), %r10 + + lea -16(rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + add carry_in, %rax + adc $0, %rdx + jmp L(start_nc) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + push %rbp + lea (%rdx), %rbx + neg %rbx + + mov (up), %rax + mov (rp), %r10 + + lea -16(rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + +L(start_nc): + test $1, R8(%rbx) + jnz L(odd) + + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + mul %rcx + add $2, %rbx + jz L(n2) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + jmp L(mid) + + ALIGN(8) +L(odd): inc %rbx + jz L(n1) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + mul %rcx + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + jmp L(e) + + ALIGN(16) +L(top): mul %rcx + ADDSUB %r8, %r10 + lea (%rax), %r8 + mov (up,%rbx,8), %rax + adc %r9, %r11 + mov %r10, -8(rp,%rbx,8) + mov (rp,%rbx,8), %r10 + lea (%rdx), %r9 + adc $0, %rbp +L(mid): mul %rcx + ADDSUB %r11, %r10 + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + adc %rbp, %r8 + mov %r10, (rp,%rbx,8) + mov 8(rp,%rbx,8), %r10 + lea (%rdx), %rbp + adc $0, %r9 +L(e): add $2, %rbx + js L(top) + + mul %rcx + ADDSUB %r8, %r10 + adc %r9, %r11 + mov %r10, -8(rp) + adc %rbx, %rbp C rbx = 0 +L(n2): mov (rp), %r10 + ADDSUB %r11, %r10 + adc %rbp, %rax + mov %r10, (rp) + adc %rbx, %rdx C rbx = 0 +L(n1): mov 8(rp), %r10 + ADDSUB %rax, %r10 + mov %r10, 8(rp) + mov R32(%rbx), R32(%rax) C rbx = 0 + adc %rdx, %rax + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/com.asm b/gmp-6.3.0/mpn/x86_64/core2/com.asm new file mode 100644 index 0000000..d7d9f79 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/copyd.asm b/gmp-6.3.0/mpn/x86_64/core2/copyd.asm new file mode 100644 index 0000000..57ea0e5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
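[Editorial sketch, not GMP source] The aorsmul_1 entry points compute rp[] += up[] * v0 (or -=), returning the limb that falls out of the top; the _1c variants take a caller-supplied initial carry (carry_in above). unsigned __int128 stands in for the 64x64->128 mul instruction the asm schedules around; the sum up[i]*v0 + rp[i] + cy cannot overflow 128 bits.

  #include <stdint.h>
  #include <stddef.h>

  uint64_t
  addmul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
  {
    uint64_t cy = 0;
    for (size_t i = 0; i < n; i++)
      {
        unsigned __int128 p = (unsigned __int128) up[i] * v0 + rp[i] + cy;
        rp[i] = (uint64_t) p;
        cy = (uint64_t) (p >> 64);
      }
    return cy;
  }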
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/copyi.asm b/gmp-6.3.0/mpn/x86_64/core2/copyi.asm new file mode 100644 index 0000000..f0c7607 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm new file mode 100644 index 0000000..1b3f139 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm @@ -0,0 +1,243 @@ +dnl x86-64 mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 2004, 2005, 2007-2010, 2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
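[Editorial sketch, not GMP source] The two copy wrappers above differ only in direction, which matters for overlapping operands: mpn_copyi copies ascending (safe when rp <= up), mpn_copyd descending (safe when rp >= up). Both core2 files just select SSE loops from x86_64/fastsse; mpn_com in com.asm is the same idea with rp[i] = ~up[i].

  #include <stdint.h>
  #include <stddef.h>

  void
  copyi_ref (uint64_t *rp, const uint64_t *up, size_t n)
  {
    for (size_t i = 0; i < n; i++)   /* low to high */
      rp[i] = up[i];
  }

  void
  copyd_ref (uint64_t *rp, const uint64_t *up, size_t n)
  {
    while (n-- > 0)                  /* high to low */
      rp[n] = up[n];
  }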
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C norm unorm frac +C AMD K8,K9 15 15 12 +C AMD K10 15 15 12 +C Intel P4 44 44 43 +C Intel core2 24 24 19.5 +C Intel corei 19 19 18 +C Intel atom 51 51 36 +C VIA nano 46 44 22.5 + +C mp_limb_t +C mpn_divrem_1 (mp_ptr qp, mp_size_t fn, +C mp_srcptr np, mp_size_t nn, mp_limb_t d) + +C mp_limb_t +C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn, +C mp_srcptr np, mp_size_t nn, mp_limb_t d, +C mp_limb_t dinv, int cnt) + +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`fn_param', `%rsi') +define(`up_param', `%rdx') +define(`un_param', `%rcx') +define(`d', `%r8') +define(`dinv', `%r9') C only for mpn_preinv_divrem_1 +C shift passed on stack C only for mpn_preinv_divrem_1 + +define(`cnt', `%rcx') +define(`up', `%rsi') +define(`fn', `%r12') +define(`un', `%rbx') + + +C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C cnt qp d dinv + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFSTD(`define(`CNTOFF', `40($1)')') +IFDOS(`define(`CNTOFF', `104($1)')') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_preinv_divrem_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + xor R32(%rax), R32(%rax) + push %r13 + push %r12 + push %rbp + push %rbx + + mov fn_param, fn + mov un_param, un + add fn_param, un_param + mov up_param, up + + lea -8(qp,un_param,8), qp + + mov CNTOFF(%rsp), R8(cnt) + shl R8(cnt), d + jmp L(ent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_divrem_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + xor R32(%rax), R32(%rax) + push %r13 + push %r12 + push %rbp + push %rbx + + mov fn_param, fn + mov un_param, un + add fn_param, un_param + mov up_param, up + je L(ret) + + lea -8(qp,un_param,8), qp + xor R32(%rbp), R32(%rbp) + +L(unnormalized): + test un, un + je L(44) + mov -8(up,un,8), %rax + cmp d, %rax + jae L(44) + mov %rbp, (qp) + mov %rax, %rbp + lea -8(qp), qp + je L(ret) + dec un +L(44): + bsr d, %rcx + not R32(%rcx) + sal R8(%rcx), d + sal R8(%rcx), %rbp + + push %rcx +IFSTD(` push %rdi ') +IFSTD(` push %rsi ') + push %r8 +IFSTD(` sub $8, %rsp ') +IFSTD(` mov d, %rdi ') +IFDOS(` sub $40, %rsp ') +IFDOS(` mov d, %rcx ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + pop %r8 +IFSTD(` pop %rsi ') +IFSTD(` pop %rdi ') + pop %rcx + + mov %rax, dinv + mov %rbp, %rax + test un, un + je L(frac) + +L(ent): mov -8(up,un,8), %rbp + shr R8(%rcx), %rax + shld R8(%rcx), %rbp, %rax + sub $2, un + js L(end) + + ALIGN(16) +L(top): lea 1(%rax), %r11 + mul dinv + mov (up,un,8), %r10 + shld R8(%rcx), %r10, %rbp + mov %rbp, %r13 + add %rax, %r13 + adc %r11, %rdx + mov %rdx, %r11 + imul d, %rdx + sub %rdx, %rbp + lea (d,%rbp), %rax + sub $8, qp + cmp %r13, %rbp + cmovc %rbp, %rax + adc $-1, %r11 + cmp d, %rax + jae L(ufx) +L(uok): dec un + mov %r11, 8(qp) + mov %r10, %rbp + jns L(top) + +L(end): lea 1(%rax), %r11 + sal R8(%rcx), %rbp + mul dinv + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul d, %rdx + sub %rdx, %rbp + mov d, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + 
cmp d, %rax + jae L(efx) +L(eok): mov %r13, (qp) + sub $8, qp + jmp L(frac) + +L(ufx): sub d, %rax + inc %r11 + jmp L(uok) +L(efx): sub d, %rax + inc %r13 + jmp L(eok) + +L(frac):mov d, %rbp + neg %rbp + jmp L(fent) + + ALIGN(16) C K8-K10 P6-CNR P6-NHM P4 +L(ftop):mul dinv C 0,12 0,17 0,17 + add %r11, %rdx C 5 8 10 + mov %rax, %r11 C 4 8 3 + mov %rdx, %r13 C 6 9 11 + imul %rbp, %rdx C 6 9 11 + mov d, %rax C + add %rdx, %rax C 10 14 14 + cmp %r11, %rdx C 10 14 14 + cmovc %rdx, %rax C 11 15 15 + adc $-1, %r13 C + mov %r13, (qp) C + sub $8, qp C +L(fent):lea 1(%rax), %r11 C + dec fn C + jns L(ftop) C + + shr R8(%rcx), %rax +L(ret): pop %rbx + pop %rbp + pop %r12 + pop %r13 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm new file mode 100644 index 0000000..b00451f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm @@ -0,0 +1,93 @@ +dnl AMD64 mpn_gcd_11 optimised for Intel CNR, PNR, SBR, IBR. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR 4.22 * +C Intel PNR 4.22 * +C Intel NHM 4.97 +C Intel WSM 5.17 +C Intel SBR 4.83 * +C Intel IBR 4.16 * +C Intel HWL 3.84 +C Intel BWL 3.76 +C Intel SKL 3.83 +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + jmp L(odd) + + ALIGN(16) +L(top): cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shr R8(%rcx), u0 +L(odd): mov v0, %rdx + sub u0, %rdx C v - u + bsf %rdx, %rcx + mov u0, %rax + sub v0, u0 C u - v + jnz L(top) + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm new file mode 100644 index 0000000..b5aa73b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm @@ -0,0 +1,137 @@ +dnl AMD64 mpn_gcd_22. Assumes useful bsf, useful shrd, no tzcnt, no shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
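[Editorial sketch, not GMP source] The quotient step at the heart of the divrem_1 loops above is the Möller-Granlund 2/1 division with a precomputed reciprocal, mirroring udiv_qrnnd_preinv from gmp-impl.h: d is normalized (top bit set; the bsr/not/sal prologue shifts an unnormalized d and dividend up by clz(d)), nh < d, and dinv = mpn_invert_limb(d) = floor((2^128 - 1)/d) - 2^64.

  #include <stdint.h>

  static inline uint64_t
  udiv_qrnnd_preinv_ref (uint64_t *r, uint64_t nh, uint64_t nl,
                         uint64_t d, uint64_t dinv)
  {
    unsigned __int128 q = (unsigned __int128) nh * dinv;
    q += ((unsigned __int128) (nh + 1) << 64) + nl;   /* candidate q + 1 */
    uint64_t qh = (uint64_t) (q >> 64);
    uint64_t ql = (uint64_t) q;
    uint64_t rr = nl - qh * d;                        /* wraps mod 2^64 */
    if (rr > ql)                                      /* adjust down */
      {
        qh--;
        rr += d;
      }
    if (rr >= d)                                      /* rare final fixup */
      {
        qh++;
        rr -= d;
      }
    *r = rr;
    return qh;
  }

The fraction loop (L(ftop) above) is this same step with nl = 0, developing fn further quotient limbs from the remainder.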
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR 8.7 +C Intel PNR 8.7 +C Intel NHM 9.2 +C Intel WSM 9.2 +C Intel SBR 9.1 +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + mov u0, s0 + mov u1, s1 + + bsf t0, cnt + + sub v0, u0 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + shrd R8(cnt), u1, u0 + shr R8(cnt), u1 + + mov v1, t1 + or u1, t1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + mov u0, s0 + mov u1, s1 + bsf t0, cnt + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h new file mode 100644 index 0000000..44f1494 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h @@ -0,0 +1,222 @@ +/* Core 2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
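[Editorial sketch, not GMP source] mpn_gcd_11 above is a subtract-and-strip binary gcd on two odd nonzero limbs: each round replaces (u, v) by (|u - v| with trailing zeros removed, min(u, v)), which is exactly the cmovc/bsf/shr sequence in its loop.

  #include <stdint.h>

  uint64_t
  gcd_11_ref (uint64_t u, uint64_t v)   /* u, v odd and nonzero */
  {
    while (u != v)
      {
        uint64_t t = u > v ? u - v : v - u;   /* |u - v|, even, nonzero */
        if (v > u)
          v = u;                              /* v = min(u, v) */
        u = t >> __builtin_ctzll (t);         /* strip factors of 2 */
      }
    return u;
  }

mpn_gcd_22 runs the same loop on two-limb values, using sbb for the borrow, shrd for the double-limb shift, and dropping to gcd_11 (the tail call and the L(lowz) path) once the high limbs become zero.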
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3000 MHz Penryn */ +/* FFT tuning limit = 116,220,984 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 16 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 26 + +#define DIV_1_VS_MUL_1_PERCENT 284 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 184 +#define MUL_TOOM6H_THRESHOLD 256 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 79 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 102 +#define SQR_TOOM4_THRESHOLD 160 +#define SQR_TOOM6_THRESHOLD 366 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 368 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 368, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 79,11}, { 47,10}, { 95,12}, { 31, 9}, \ + { 255,10}, { 135,11}, { 79,10}, { 159, 9}, \ + { 319,11}, { 95,10}, { 191, 9}, { 383,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,10}, { 415,13}, { 63,12}, \ + { 127,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 415,12}, { 223,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 575,13}, { 319,12}, { 703,13}, \ + { 383,12}, { 799,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,13}, { 703,14}, { 383,13}, { 831,12}, \ + { 1663,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 
4863,14}, { 1279,13}, { 2559,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,16}, { 511,15}, { 1023,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4991,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 176 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 308, 5}, { 17, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 79,11}, { 47,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,11}, { 79,10}, \ + { 159, 6}, { 2559, 7}, { 1343, 6}, { 2687, 7}, \ + { 1407, 9}, { 383,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319,11}, { 175,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,10}, { 415,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,12}, { 223,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 575,12}, { 319,11}, { 639,12}, \ + { 351,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 575,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 799,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 639,12}, \ + { 1279,13}, { 703,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1663,14}, { 895,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,16}, { 511,15}, { 1023,14}, { 2303,13}, \ + { 4607,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,12}, { 11775,15}, { 1535,14}, \ + { 3455,15}, { 1791,14}, { 3583,13}, { 7167,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,13}, \ + { 11775,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 183 +#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 67 +#define MULLO_MUL_N_THRESHOLD 9174 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 11 +#define SQRLO_SQR_THRESHOLD 7035 + +#define DC_DIV_QR_THRESHOLD 53 +#define DC_DIVAPPR_Q_THRESHOLD 163 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 76 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 158 +#define INV_APPR_THRESHOLD 167 + +#define BINV_NEWTON_THRESHOLD 248 +#define 
REDC_1_TO_REDC_N_THRESHOLD 44 + +#define MU_DIV_QR_THRESHOLD 1187 +#define MU_DIVAPPR_Q_THRESHOLD 1210 +#define MUPI_DIV_QR_THRESHOLD 73 +#define MU_BDIV_QR_THRESHOLD 1017 +#define MU_BDIV_Q_THRESHOLD 1187 + +#define POWM_SEC_TABLE 1,64,105,579,1486 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 134 +#define SET_STR_PRECOMPUTE_THRESHOLD 1752 + +#define FAC_DSC_THRESHOLD 351 +#define FAC_ODD_THRESHOLD 27 + +#define MATRIX22_STRASSEN_THRESHOLD 18 +#define HGCD2_DIV1_METHOD 3 /* 2.14% faster than 5 */ +#define HGCD_THRESHOLD 118 +#define HGCD_APPR_THRESHOLD 161 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 416 +#define GCDEXT_DC_THRESHOLD 351 +#define JACOBI_BASE_METHOD 4 /* 3.56% faster than 1 */ + +/* Tuneup completed successfully, took 132491 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm new file mode 100644 index 0000000..ded7b67 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm @@ -0,0 +1,210 @@ +dnl AMD64 SSSE3 mpn_hamdist -- hamming distance. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 n/a +C Intel CNR 4.50 y +C Intel PNR 3.28 y +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C VIA nano ? + +C TODO +C * This was hand-written without too much thought about optimal insn +C selection; check to see of it can be improved. +C * Consider doing some instruction scheduling. 
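[Editorial sketch] The gmp-mparam.h thresholds above are tuning constants consumed by GMP's algorithm-selection code. A hypothetical, much-simplified dispatcher shows the pattern; the real logic lives in mpn/generic/mul.c, and BELOW_THRESHOLD, mpn_toom22_mul and mpn_toom22_mul_itch are internal gmp-impl.h entry points, shown here only to illustrate how a cutoff steers dispatch.

  #include "gmp-impl.h"   /* internal header: thresholds, toom entry points */

  /* Below the cutoff the O(n^2) basecase wins; above it the
     evaluation/interpolation overhead of Toom-22 (Karatsuba) pays off. */
  void
  mul_nxn_sketch (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
  {
    if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
      mpn_mul_basecase (rp, ap, n, bp, n);
    else
      {
        mp_limb_t scratch[mpn_toom22_mul_itch (n, n)];
        mpn_toom22_mul (rp, ap, n, bp, n, scratch);
      }
  }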
+ +define(`up', `%rdi') +define(`vp', `%rsi') +define(`n', `%rdx') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + lea L(cnsts)(%rip), %r9 + +ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)', + `define(`OFF1',64) define(`OFF2',80)') + movdqa OFF1`'(%r9), %xmm7 + movdqa OFF2`'(%r9), %xmm6 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm8, %xmm8 + + mov R32(n), R32(%rax) + and $7, R32(%rax) +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(1): movq (up), %xmm1 + add $8, up + movq (vp), %xmm10 + add $8, vp + pxor %xmm10, %xmm1 + jmp L(e1) + +L(2): add $-48, up + add $-48, vp + jmp L(e2) + +L(3): movq (up), %xmm1 + add $-40, up + movq (vp), %xmm10 + add $-40, vp + pxor %xmm10, %xmm1 + jmp L(e3) + +L(4): add $-32, up + add $-32, vp + jmp L(e4) + +L(5): movq (up), %xmm1 + add $-24, up + movq (vp), %xmm10 + add $-24, vp + pxor %xmm10, %xmm1 + jmp L(e5) + +L(6): add $-16, up + add $-16, vp + jmp L(e6) + +L(7): movq (up), %xmm1 + add $-8, up + movq (vp), %xmm10 + add $-8, vp + pxor %xmm10, %xmm1 + jmp L(e7) + + ALIGN(32) +L(top): lddqu (up), %xmm1 + lddqu (vp), %xmm10 + pxor %xmm10, %xmm1 +L(e7): movdqa %xmm6, %xmm0 C copy mask register + movdqa %xmm7, %xmm2 C copy count register + movdqa %xmm7, %xmm3 C copy count register + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e6): lddqu 16(up), %xmm1 + lddqu 16(vp), %xmm10 + pxor %xmm10, %xmm1 +L(e5): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e4): lddqu 32(up), %xmm1 + lddqu 32(vp), %xmm10 + pxor %xmm10, %xmm1 +L(e3): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e2): lddqu 48(up), %xmm1 + add $64, up + lddqu 48(vp), %xmm10 + add $64, vp + pxor %xmm10, %xmm1 +L(e1): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts + paddb %xmm2, %xmm3 + paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts + movdqa %xmm3, %xmm4 + sub $8, n + jg L(top) + + psadbw %xmm5, %xmm4 + paddq %xmm4, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(top), L(cnsts)) + JMPENT( L(1), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(3), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(5), L(cnsts)) + JMPENT( L(6), L(cnsts)) + JMPENT( L(7), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) diff --git a/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm b/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm new file mode 100644 index 0000000..5ff174c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm @@ -0,0 +1,285 @@ +dnl AMD64 logops. + +dnl Copyright 2004-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
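[Editorial sketch, not GMP source] The hamming distance computed by the SSSE3 code above is simply the population count of up XOR vp. The vector loop splits each byte into two nibbles and looks up their popcounts with pshufb on the 16-entry table in L(cnsts) (the first two .byte rows), masking with the 0x0f rows and accumulating via psadbw. The scalar equivalent:

  #include <stdint.h>
  #include <stddef.h>

  unsigned long
  hamdist_ref (const uint64_t *up, const uint64_t *vp, size_t n)
  {
    unsigned long count = 0;
    for (size_t i = 0; i < n; i++)
      count += (unsigned long) __builtin_popcountll (up[i] ^ vp[i]);
    return count;
  }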
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l c/l c/l good +C var-1 var-2 var-3 for cpu? +C AMD K8,K9 +C AMD K10 1.52 1.75 1.75 n +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD bt1 2.67 ~2.79 ~2.79 = +C AMD bt2 2.15 2.65 2.65 n +C AMD zen 1.5 1.5 1.5 = +C Intel P4 +C Intel PNR 2.0 2.0 2.0 = +C Intel NHM 2.0 2.0 2.0 = +C Intel SBR 1.5 1.5 1.5 y +C Intel IBR 1.47 1.48 1.48 y +C Intel HWL 1.11 1.35 1.35 y +C Intel BWL 1.09 1.30 1.30 y +C Intel SKL 1.21 1.27 1.27 y +C Intel atom 3.31 3.57 3.57 y +C Intel SLM 3.0 3.0 3.0 = +C VIA nano + +ifdef(`OPERATION_and_n',` + define(`func',`mpn_and_n') + define(`VARIANT_1') + define(`LOGOP',`and')') +ifdef(`OPERATION_andn_n',` + define(`func',`mpn_andn_n') + define(`VARIANT_2') + define(`LOGOP',`and')') +ifdef(`OPERATION_nand_n',` + define(`func',`mpn_nand_n') + define(`VARIANT_3') + define(`LOGOP',`and')') +ifdef(`OPERATION_ior_n',` + define(`func',`mpn_ior_n') + define(`VARIANT_1') + define(`LOGOP',`or')') +ifdef(`OPERATION_iorn_n',` + define(`func',`mpn_iorn_n') + define(`VARIANT_2') + define(`LOGOP',`or')') +ifdef(`OPERATION_nior_n',` + define(`func',`mpn_nior_n') + define(`VARIANT_3') + define(`LOGOP',`or')') +ifdef(`OPERATION_xor_n',` + define(`func',`mpn_xor_n') + define(`VARIANT_1') + define(`LOGOP',`xor')') +ifdef(`OPERATION_xnor_n',` + define(`func',`mpn_xnor_n') + define(`VARIANT_2') + define(`LOGOP',`xor')') + +define(`addptr', `lea $1($2), $2') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n',`%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + +ifdef(`VARIANT_1',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + mov %r8, (rp) + dec n + jz L(ret) + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 +L(b00): mov 8(vp), %r9 + LOGOP (up), %r8 + LOGOP 8(up), %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 +L(e10): mov 24(vp), %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n + jnz L(top) + +L(ret): FUNC_EXIT() + ret 
+EPILOGUE() +') + +ifdef(`VARIANT_2',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + not %r8 + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + mov %r8, (rp) + dec n + jz L(ret) + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 + not %r8 +L(b00): mov 8(vp), %r9 + not %r9 + LOGOP (up), %r8 + LOGOP 8(up), %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 + not %r8 +L(e10): mov 24(vp), %r9 + not %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n + jnz L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') + +ifdef(`VARIANT_3',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + not %r8 + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + not %r8 + mov %r8, (rp) + dec n + jz L(ret) + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 +L(b00): mov 8(vp), %r9 + LOGOP (up), %r8 + not %r8 + LOGOP 8(up), %r9 + not %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 +L(e10): mov 24(vp), %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + not %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + not %r9 + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n + jnz L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') diff --git a/gmp-6.3.0/mpn/x86_64/core2/lshift.asm b/gmp-6.3.0/mpn/x86_64/core2/lshift.asm new file mode 100644 index 0000000..9016a71 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/lshift.asm @@ -0,0 +1,145 @@ +dnl x86-64 mpn_lshift optimised for Conroe/Penryn and Nehalem. + +dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
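[Editorial sketch, not GMP source] The logops_n source above generates eight functions from one skeleton: VARIANT_1 applies LOGOP directly (and/ior/xor), VARIANT_2 complements the second operand first (andn/iorn/xnor), and VARIANT_3 complements the result (nand/nior). Two representative contracts:

  #include <stdint.h>
  #include <stddef.h>

  void
  andn_n_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n)
  {
    for (size_t i = 0; i < n; i++)
      rp[i] = up[i] & ~vp[i];        /* VARIANT_2 with LOGOP = and */
  }

  void
  nand_n_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n)
  {
    for (size_t i = 0; i < n; i++)
      rp[i] = ~(up[i] & vp[i]);      /* VARIANT_3 with LOGOP = and */
  }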
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 1.32 +C Intel NHM 1.30 (drops to 2.5 for n > 256) +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + + xor R32(%rax), R32(%rax) + + test $1, R8(n) + jnz L(bx1) +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): lea -8(up,n,8), up + lea 16(rp,n,8), rp + mov (up), %r10 + mov -8(up), %r11 + shld R8(cnt), %r10, %rax + mov -16(up), %r8 + shr $2, n + jmp L(00) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): lea -16(up,n,8), up + lea 8(rp,n,8), rp + mov 8(up), %r9 + shld R8(cnt), %r9, %rax + shr $2, n + jz L(1) + mov (up), %r10 + mov -8(up), %r11 + jmp L(01) + +L(b10): lea -24(up,n,8), up + lea (rp,n,8), rp + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r8, %rax + shr $2, n + jz L(2) + mov (up), %r10 + jmp L(10) + + ALIGN(16) +L(b11): lea -32(up,n,8), up + lea -8(rp,n,8), rp + mov 24(up), %r11 + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r11, %rax + shr $2, n + jz L(end) + + ALIGN(16) +L(top): shld R8(cnt), %r8, %r11 + mov (up), %r10 + mov %r11, (rp) +L(10): shld R8(cnt), %r9, %r8 + mov -8(up), %r11 + mov %r8, -8(rp) +L(01): shld R8(cnt), %r10, %r9 + mov -16(up), %r8 + mov %r9, -16(rp) +L(00): shld R8(cnt), %r11, %r10 + mov -24(up), %r9 + add $-32, up + mov %r10, -24(rp) + add $-32, rp + dec n + jnz L(top) + +L(end): shld R8(cnt), %r8, %r11 + mov %r11, (rp) +L(2): shld R8(cnt), %r9, %r8 + mov %r8, -8(rp) +L(1): shl R8(cnt), %r9 + mov %r9, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm new file mode 100644 index 0000000..c428f13 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm @@ -0,0 +1,159 @@ +dnl x86-64 mpn_lshiftc optimised for Conroe/Penryn and Nehalem. + +dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
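[Editorial sketch, not GMP source] mpn_lshift above shifts an n-limb operand left by cnt bits (0 < cnt < 64), working from the top limb down so rp may overlap up when rp >= up, and returns the bits shifted out of the high limb; the asm gets the two-limb funnel shift from shld. The lshiftc variant that follows produces the one's complement of the same result.

  #include <stdint.h>
  #include <stddef.h>

  uint64_t
  lshift_ref (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
  {
    uint64_t retval = up[n - 1] >> (64 - cnt);   /* bits shifted out */
    for (size_t i = n - 1; i > 0; i--)
      rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
    rp[0] = up[0] << cnt;
    return retval;
  }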
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 1.52 +C Intel NHM 1.78 (just 2.15 for n < 256) +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +C TODO +C * This runs poorly on Nehalem compared to plain lshift, in particular for +C n < 256. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + + xor R32(%rax), R32(%rax) + + test $1, R8(n) + jnz L(bx1) +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): lea -8(up,n,8), up + lea 16(rp,n,8), rp + mov (up), %r10 + mov -8(up), %r11 + shld R8(cnt), %r10, %rax + mov -16(up), %r8 + shr $2, n + shld R8(cnt), %r11, %r10 + jmp L(00) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): lea -16(up,n,8), up + lea 8(rp,n,8), rp + mov 8(up), %r9 + shld R8(cnt), %r9, %rax + shr $2, n + jz L(1) + mov (up), %r10 + mov -8(up), %r11 + shld R8(cnt), %r10, %r9 + jmp L(01) + +L(b10): lea -24(up,n,8), up + lea (rp,n,8), rp + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r8, %rax + shr $2, n + jz L(2) + mov (up), %r10 + shld R8(cnt), %r9, %r8 + jmp L(10) + + ALIGN(16) +L(b11): lea -32(up,n,8), up + lea -8(rp,n,8), rp + mov 24(up), %r11 + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r11, %rax + shr $2, n + jz L(end) + + ALIGN(16) +L(top): shld R8(cnt), %r8, %r11 + mov (up), %r10 + not %r11 + shld R8(cnt), %r9, %r8 + mov %r11, (rp) +L(10): mov -8(up), %r11 + not %r8 + shld R8(cnt), %r10, %r9 + mov %r8, -8(rp) +L(01): mov -16(up), %r8 + not %r9 + shld R8(cnt), %r11, %r10 + mov %r9, -16(rp) +L(00): mov -24(up), %r9 + not %r10 + add $-32, up + mov %r10, -24(rp) + add $-32, rp + dec n + jnz L(top) + +L(end): shld R8(cnt), %r8, %r11 + not %r11 + mov %r11, (rp) +L(2): shld R8(cnt), %r9, %r8 + not %r8 + mov %r8, -8(rp) +L(1): shl R8(cnt), %r9 + not %r9 + mov %r9, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm new file mode 100644 index 0000000..d16be85 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm @@ -0,0 +1,975 @@ +dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere. +dnl It also seems good for Conroe/Wolfdale. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core 4.0 4.0 - 4.18-4.25 +C Intel NHM 3.75 3.8 - 4.06-4.2 +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C Code structure: +C +C +C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4) +C | | | | +C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) | +C | / | / | / | / +C | / | / | / | / +C | / | / | / | / +C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_ +C _____ _____ _____ _____ +C / \ / \ / \ / \ +C \|/ | \|/ | \|/ | \|/ | +C am_2(0m4) | am_2(1m4) | am_2(2m4) | am_2(3m4) | +C \ /|\ \ /|\ \ /|\ \ /|\ +C \_____/ \_____/ \_____/ \_____/ + +C TODO +C * Tune. None done so far. +C * Currently 2687 bytes, making it smaller would be nice. +C * Implement some basecases, say for un < 4. +C * Try zeroing with xor in m2 loops. +C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication +C between loop header and wind-down code. +C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This save a byte. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +C Define this to $1 to use late loop index variable as zero, $2 to use an +C explicit $0. +define(`Z',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +define(`vp_param', `%rcx') C FIXME reallocate vp to rcx but watch performance! +define(`vn_param', `%r8') + +define(`un', `%r9') +define(`vn', `(%rsp)') + +define(`v0', `%r10') +define(`v1', `%r11') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r12') +define(`i', `%r13') +define(`vp', `%r14') + +define(`X0', `%r8') +define(`X1', `%r15') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +define(`N', 85) +ifdef(`N',,`define(`N',0)') +define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + mov (up), %rax C shared for mul_1 and mul_2 + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov (vp_param), v0 C shared for mul_1 and mul_2 + + xor un, un + sub un_param, un C un = -un_param + + lea (up,un_param,8), up + lea (rp,un_param,8), rp + + mul v0 C shared for mul_1 and mul_2 + + test $1, R8(vn_param) + jz L(m2) + + lea 8(vp_param), vp C FIXME: delay until known needed + + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):test $2, R8(un) + jnz L(m1s2) + +L(m1s0): + lea (un), i + mov %rax, (rp,un,8) + mov 8(up,un,8), %rax + mov %rdx, w0 C FIXME: Use lea? + lea L(do_am0)(%rip), %rbp + jmp L(m1e0) + +L(m1s2): + lea 2(un), i + mov %rax, (rp,un,8) + mov 8(up,un,8), %rax + mov %rdx, w0 C FIXME: Use lea? 
+ mul v0 + lea L(do_am2)(%rip), %rbp + test i, i + jnz L(m1e2) + add %rax, w0 + adc $0, %rdx + mov w0, I(-8(rp),8(rp,un,8)) + mov %rdx, I((rp),16(rp,un,8)) + jmp L(ret2) + +L(m1x1):test $2, R8(un) + jz L(m1s3) + +L(m1s1): + lea 1(un), i + mov %rax, (rp,un,8) + test i, i + jz L(1) + mov 8(up,un,8), %rax + mov %rdx, w1 C FIXME: Use lea? + lea L(do_am1)(%rip), %rbp + jmp L(m1e1) +L(1): mov %rdx, I((rp),8(rp,un,8)) + jmp L(ret2) + +L(m1s3): + lea -1(un), i + mov %rax, (rp,un,8) + mov 8(up,un,8), %rax + mov %rdx, w1 C FIXME: Use lea? + lea L(do_am3)(%rip), %rbp + jmp L(m1e3) + + ALIGNx +L(m1top): + mul v0 + mov w1, -16(rp,i,8) +L(m1e2):xor R32(w1), R32(w1) + add %rax, w0 + mov (up,i,8), %rax + adc %rdx, w1 + mov w0, -8(rp,i,8) +L(m1e1):xor R32(w0), R32(w0) + mul v0 + add %rax, w1 + mov 8(up,i,8), %rax + adc %rdx, w0 + mov w1, (rp,i,8) +L(m1e0):xor R32(w1), R32(w1) + mul v0 + add %rax, w0 + mov 16(up,i,8), %rax + adc %rdx, w1 + mov w0, 8(rp,i,8) +L(m1e3):xor R32(w0), R32(w0) + mul v0 + add %rax, w1 + mov 24(up,i,8), %rax + adc %rdx, w0 + add $4, i + js L(m1top) + + mul v0 + mov w1, I(-16(rp),-16(rp,i,8)) + add %rax, w0 + adc $0, %rdx + mov w0, I(-8(rp),-8(rp,i,8)) + mov %rdx, I((rp),(rp,i,8)) + + dec vn_param + jz L(ret2) + lea -8(rp), rp + jmp *%rbp + +L(m2): + mov 8(vp_param), v1 + lea 16(vp_param), vp C FIXME: delay until known needed + + test $1, R8(un) + jnz L(bx1) + +L(bx0): test $2, R8(un) + jnz L(b10) + +L(b00): lea (un), i + mov %rax, (rp,un,8) + mov %rdx, w1 C FIXME: Use lea? + mov (up,un,8), %rax + mov $0, R32(w2) + jmp L(m2e0) + +L(b10): lea -2(un), i + mov %rax, w2 C FIXME: Use lea? + mov (up,un,8), %rax + mov %rdx, w3 C FIXME: Use lea? + mov $0, R32(w0) + jmp L(m2e2) + +L(bx1): test $2, R8(un) + jz L(b11) + +L(b01): lea 1(un), i + mov %rax, (rp,un,8) + mov (up,un,8), %rax + mov %rdx, w0 C FIXME: Use lea? + mov $0, R32(w1) + jmp L(m2e1) + +L(b11): lea -1(un), i + mov %rax, w1 C FIXME: Use lea? + mov (up,un,8), %rax + mov %rdx, w2 C FIXME: Use lea? 
+ mov $0, R32(w3) + jmp L(m2e3) + + ALIGNx +L(m2top0): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) +L(m2e0):mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top0) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am0): + push %r15 + push vn_param + +L(olo0): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax +C lea 0(un), i + mov un, i + mul v0 + mov %rax, X0 + mov (up,un,8), %rax + MOV( %rdx, X1, 2) + mul v1 + MOV( %rdx, w0, 4) + mov (rp,un,8), w2 + mov %rax, w3 + jmp L(lo0) + + ALIGNx +L(am2top0): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo0): mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top0) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo0) + +L(ret): pop %rax + pop %r15 +L(ret2):pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + + ALIGNx +L(m2top1): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) +L(m2e1):mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top1) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, 
I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am1): + push %r15 + push vn_param + +L(olo1): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax + lea 1(un), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 128) + mov (up,un,8), %rax + mov (rp,un,8), w1 + mul v1 + mov %rax, w2 + mov 8(up,un,8), %rax + MOV( %rdx, w3, 1) + jmp L(lo1) + + ALIGNx +L(am2top1): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 +L(lo1): mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top1) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo1) + + pop %rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + + ALIGNx +L(m2top2): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(m2e2):mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top2) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am2): + push %r15 + push vn_param + +L(olo2): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax + lea -2(un), i + mul v0 + mov %rax, X0 + MOV( %rdx, X1, 32) + mov (up,un,8), %rax + mov (rp,un,8), w0 + mul v1 + mov %rax, w1 + lea (%rdx), w2 + mov 8(up,un,8), %rax + jmp L(lo2) + + ALIGNx +L(am2top2): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov 
X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 +L(lo2): mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top2) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo2) + + pop %rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + + ALIGNx +L(m2top3): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax +L(m2e3):mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top3) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am3): + push %r15 + push vn_param + +L(olo3): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax + lea -1(un), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 8) + mov (up,un,8), %rax + mov (rp,un,8), w3 + mul v1 + mov %rax, w0 + MOV( %rdx, w1, 16) + mov 8(up,un,8), %rax + jmp L(lo3) + + ALIGNx +L(am2top3): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax +L(lo3): mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top3) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo3) + + pop %rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + 
pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm new file mode 100644 index 0000000..0f03d86 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm @@ -0,0 +1,427 @@ +dnl AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core 4.0 4.18-4.25 +C Intel NHM 3.75 4.06-4.2 +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Implement proper cor2, replacing current cor0. +C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?) +C * Micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
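[Both mpn_mul_basecase above and this mpn_mullo_basecase are schoolbook multiplications. The asm merely processes two v-limbs per outer pass (a mul_1 or mul_2 feed-in, then addmul_2 passes), and the mullo variant stops once the low n limbs are done, finishing each pass's top column with imul since the high halves of those products are dead. For reference only, a one-v-limb-at-a-time C model of the full-product case, assuming a compiler with unsigned __int128; the name is mine:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t limb;

    /* Schoolbook model of mpn_mul_basecase:
       {rp, un+vn} = {up,un} * {vp,vn}.  The asm does the same work two
       v-limbs per pass (mul_2/addmul_2) instead of one. */
    void mul_basecase_model(limb *rp, const limb *up, size_t un,
                            const limb *vp, size_t vn)
    {
        for (size_t i = 0; i < un + vn; i++)
            rp[i] = 0;
        for (size_t j = 0; j < vn; j++) {
            limb cy = 0;
            for (size_t i = 0; i < un; i++) {
                unsigned __int128 t =
                    (unsigned __int128)up[i] * vp[j] + rp[i + j] + cy;
                rp[i + j] = (limb)t;
                cy = (limb)(t >> 64);
            }
            rp[un + j] = cy;       /* carry limb of this pass */
        }
    }

The mullo version simply truncates the inner loop at column n - 1 and keeps only the low 64 bits of the final column's products, which is what the imul instructions in the wind-down code below compute.]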
+define(`I',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n_param', `%rcx') + +define(`v0', `%r10') +define(`v1', `%r11') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r12') +define(`n', `%r9') +define(`i', `%r13') +define(`vp', `%r8') + +define(`X0', `%r14') +define(`X1', `%r15') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +define(`N', 85) +ifdef(`N',,`define(`N',0)') +define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + + mov (up), %rax + mov vp_param, vp + + cmp $4, n_param + jb L(small) + + mov (vp_param), v0 + push %rbx + lea (rp,n_param,8), rp C point rp at R[un] + push %rbp + lea (up,n_param,8), up C point up right after U's end + push %r12 + mov $0, R32(n) C FIXME + sub n_param, n + push %r13 + mul v0 + mov 8(vp), v1 + + test $1, R8(n_param) + jnz L(m2x1) + +L(m2x0):test $2, R8(n_param) + jnz L(m2b2) + +L(m2b0):lea (n), i + mov %rax, (rp,n,8) + mov %rdx, w1 + mov (up,n,8), %rax + xor R32(w2), R32(w2) + jmp L(m2e0) + +L(m2b2):lea -2(n), i + mov %rax, w2 + mov (up,n,8), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + jmp L(m2e2) + +L(m2x1):test $2, R8(n_param) + jnz L(m2b3) + +L(m2b1):lea 1(n), i + mov %rax, (rp,n,8) + mov (up,n,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + jmp L(m2e1) + +L(m2b3):lea -1(n), i + xor R32(w3), R32(w3) + mov %rax, w1 + mov %rdx, w2 + mov (up,n,8), %rax + jmp L(m2e3) + + ALIGNx +L(m2tp):mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) +L(m2e1):mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) +L(m2e0):mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax +L(m2e3):mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(m2e2):mul v1 + mov $0, R32(w1) C FIXME: dead in last iteration + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 C FIXME: dead in last iteration + add $4, i + js L(m2tp) + +L(m2ed):imul v0, %rax + add w3, %rax + mov %rax, I(-8(rp),-8(rp,i,8)) + + add $2, n + lea 16(vp), vp + lea -16(up), up + cmp $-2, n + jge L(cor1) + + push %r14 + push %r15 + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + mov (up,n,8), %rax + mul v0 + test $1, R8(n) + jnz L(a1x1) + +L(a1x0):mov %rax, X1 + MOV( %rdx, X0, 8) + mov (up,n,8), %rax + mul v1 + test $2, R8(n) + jnz L(a110) + +L(a100):lea (n), i + mov (rp,n,8), w3 + mov %rax, w0 + MOV( %rdx, w1, 16) + jmp L(lo0) + +L(a110):lea 2(n), i + mov (rp,n,8), w1 + mov %rax, w2 + mov 8(up,n,8), %rax + MOV( %rdx, w3, 1) + jmp L(lo2) + +L(a1x1):mov %rax, X0 + MOV( %rdx, X1, 2) + mov (up,n,8), %rax + mul v1 + test $2, R8(n) + jz L(a111) + +L(a101):lea 1(n), i + MOV( %rdx, w0, 4) + mov (rp,n,8), w2 + mov %rax, w3 + jmp L(lo1) + +L(a111):lea -1(n), i + MOV( %rdx, w2, 64) + mov %rax, w1 + mov (rp,n,8), w0 + mov 8(up,n,8), %rax + jmp L(lo3) + + ALIGNx +L(top): mul v1 + add w0, w1 + adc %rax, w2 + mov -8(up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + 
mov -8(up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov -8(rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, -8(rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov (up,i,8), %rax + mov (rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 +L(lo0): mov 8(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, (rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 8(rp,i,8), w3 + adc $0, X1 + mov 8(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 16(up,i,8), %rax + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 16(up,i,8), %rax + mov 16(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(top) + +L(end): imul v1, %rax + add w0, w1 + adc %rax, w2 + mov I(-8(up),-8(up,i,8)), %rax + imul v0, %rax + add w1, X1 + mov X1, I(-16(rp),-16(rp,i,8)) + adc X0, %rax + mov I(-8(rp),-8(rp,i,8)), w1 + add w1, w2 + add w2, %rax + mov %rax, I(-8(rp),-8(rp,i,8)) + + add $2, n + lea 16(vp), vp + lea -16(up), up + cmp $-2, n + jl L(outer) + + pop %r15 + pop %r14 + + jnz L(cor0) + +L(cor1):mov (vp), v0 + mov 8(vp), v1 + mov -16(up), %rax + mul v0 C u0 x v2 + add -16(rp), %rax C FIXME: rp[0] still available in reg? + adc -8(rp), %rdx C FIXME: rp[1] still available in reg? + mov -8(up), %rbx + imul v0, %rbx + mov -16(up), %rcx + imul v1, %rcx + mov %rax, -16(rp) + add %rbx, %rcx + add %rdx, %rcx + mov %rcx, -8(rp) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(cor0):mov (vp), %r11 + imul -8(up), %r11 + add %rax, %r11 + mov %r11, -8(rp) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + ALIGN(16) +L(small): + cmp $2, n_param + jae L(gt1) +L(n1): imul (vp_param), %rax + mov %rax, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp_param), %r9 + mul %r9 + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp_param), %r9 + mul %r9 C u0 x v0 + mov %rax, (rp) + mov %rdx, %r10 + mov 8(up), %rax + mul %r9 C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r11 + mov (up), %rax + mul %r11 C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r11 C u1 x v1 + add %r11, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/popcount.asm b/gmp-6.3.0/mpn/x86_64/core2/popcount.asm new file mode 100644 index 0000000..3de69d8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/popcount.asm @@ -0,0 +1,185 @@ +dnl AMD64 SSSE3 mpn_popcount -- population count. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 1.79-1.91 n +C AMD bd2 1.73-1.85 n +C AMD bd3 ? +C AMD bd4 1.73-1.85 n +C AMD zen 1.47 n +C AMD bobcat 8.0 n +C AMD jaguar 4.78 n +C Intel P4 n/a +C Intel CNR 3.75 +C Intel PNR 2.61 y +C Intel NHM 2.03 n +C Intel SBR 1.87 n +C Intel IBR 1.52-1.58 n +C Intel HWL 1.52-1.58 n +C Intel BWL 1.52-1.58 n +C Intel SKL 1.51 n +C Intel atom 12.3 n +C Intel SLM 9.1 n +C VIA nano ? + +C TODO +C * This was hand-written without too much thought about optimal insn +C selection; check to see of it can be improved. +C * Consider doing some instruction scheduling. + +define(`up', `%rdi') +define(`n', `%rsi') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + lea L(cnsts)(%rip), %r9 + +ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)', + `define(`OFF1',64) define(`OFF2',80)') + movdqa OFF1`'(%r9), %xmm7 + movdqa OFF2`'(%r9), %xmm6 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm8, %xmm8 + + mov R32(n), R32(%rax) + and $7, R32(%rax) +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(1): movq (up), %xmm1 + add $8, up + jmp L(e1) + +L(2): add $-48, up + jmp L(e2) + +L(3): movq (up), %xmm1 + add $-40, up + jmp L(e3) + +L(4): add $-32, up + jmp L(e4) + +L(5): movq (up), %xmm1 + add $-24, up + jmp L(e5) + +L(6): add $-16, up + jmp L(e6) + +L(7): movq (up), %xmm1 + add $-8, up + jmp L(e7) + + ALIGN(32) +L(top): lddqu (up), %xmm1 +L(e7): movdqa %xmm6, %xmm0 C copy mask register + movdqa %xmm7, %xmm2 C copy count register + movdqa %xmm7, %xmm3 C copy count register + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e6): lddqu 16(up), %xmm1 +L(e5): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e4): lddqu 32(up), %xmm1 +L(e3): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e2): lddqu 48(up), %xmm1 + add $64, up +L(e1): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts + paddb %xmm2, %xmm3 + paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts + movdqa %xmm3, %xmm4 + sub $8, n + jg L(top) + + psadbw %xmm5, %xmm4 + paddq %xmm4, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(top), L(cnsts)) + JMPENT( L(1), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(3), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(5), L(cnsts)) + JMPENT( L(6), L(cnsts)) + JMPENT( L(7), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 
0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) diff --git a/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm b/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm new file mode 100644 index 0000000..8c296fd --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm @@ -0,0 +1,430 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core 4.5 (fluctuating) +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. +C * Keep up[i] in registers for basecases (might require pushes). + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
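[mpn_redc_1 below performs one-limb-at-a-time Montgomery reduction: each pass multiplies the modulus by q = (current low limb of U) * u0inv mod B, which forces that limb of U to zero, and the pass's carry limb is parked in the slot just freed so that a single mpn_add_n at the end (the L(add_n) tail) folds all the carries into the result. A C model that instead propagates each carry eagerly, which gives the same result and is simpler to read; it assumes unsigned __int128, M odd, u0inv = -1/mp[0] mod 2^64, and the REDC precondition U < B^n * M, and the function name is mine:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t limb;

    /* Model of mpn_redc_1: given {up,2n} and odd {mp,n}, store
       U * 2^(-64n) mod M (possibly plus one extra M, signalled by the
       returned carry) at {rp,n}.  up is clobbered, as in the real
       routine. */
    limb redc_1_model(limb *rp, limb *up, const limb *mp,
                      size_t n, limb u0inv)
    {
        limb top_cy = 0;                  /* carry off the top of U */
        for (size_t j = 0; j < n; j++) {
            limb q = up[j] * u0inv;       /* makes limb j of U zero */
            limb cy = 0;
            for (size_t i = 0; i < n; i++) {
                unsigned __int128 t =
                    (unsigned __int128)q * mp[i] + up[j + i] + cy;
                up[j + i] = (limb)t;      /* up[j] becomes 0 here */
                cy = (limb)(t >> 64);
            }
            for (size_t i = j + n; cy != 0 && i < 2 * n; i++) {
                unsigned __int128 t = (unsigned __int128)up[i] + cy;
                up[i] = (limb)t;
                cy = (limb)(t >> 64);
            }
            top_cy += cy;                 /* at most 1 in total */
        }
        for (size_t i = 0; i < n; i++)
            rp[i] = up[n + i];
        return top_cy;                    /* caller subtracts M if set */
    }

Parking the carries and adding them in one batch, as the asm does, avoids a data-dependent carry-propagation loop inside each pass.]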
+define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C X q0' n X rp up u0i mp q0 i j + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea -16(up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(b0) + +L(b1): cmp $-1, R32(n) + jz L(n1) + cmp $-3, R32(n) + jz L(n3) + + push rp + +L(otp1):lea 3(n), i + mov (mp,n,8), %rax + mul q0 + lea (%rax), %rbp + mov 8(mp,n,8), %rax + lea (%rdx), %r9 + mul q0 + lea (%rax), %r11 + mov 16(mp,n,8), %rax + mov 16(up,n,8), %r10 + lea (%rdx), %rdi + mul q0 + add %rbp, %r10 + lea (%rax), %rbp + mov 24(mp,n,8), %rax + adc %r9, %r11 + mov 24(up,n,8), %rbx + lea (%rdx), %r9 + adc $0, %rdi + mul q0 + add %r11, %rbx + lea (%rax), %r11 + mov 32(mp,n,8), %rax + adc %rdi, %rbp + mov %rbx, 24(up,n,8) + mov 32(up,n,8), %r10 + lea (%rdx), %rdi + adc $0, %r9 + imul u0inv, %rbx C next q limb + add $2, i + jns L(ed1) + + ALIGNx +L(tp1): mul q0 + add %rbp, %r10 + lea (%rax), %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %r10, -8(up,i,8) + mov (up,i,8), %r10 + lea (%rdx), %r9 + adc $0, %rdi + mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov 8(mp,i,8), %rax + adc %rdi, %rbp + mov %r10, (up,i,8) + mov 8(up,i,8), %r10 + lea (%rdx), %rdi + adc $0, %r9 + add $2, i + js L(tp1) + +L(ed1): mul q0 + add %rbp, %r10 + adc %r9, %r11 + mov %r10, I(-8(up),-8(up,i,8)) + mov I((up),(up,i,8)), %r10 + adc $0, %rdi + add %r11, %r10 + adc %rdi, %rax + mov %r10, I((up),(up,i,8)) + mov I(8(up),8(up,i,8)), %r10 + adc $0, %rdx + add %rax, %r10 + mov %r10, I(8(up),8(up,i,8)) + adc $0, %rdx + mov %rdx, 16(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b0): cmp $-2, R32(n) + jz L(n2) + cmp $-4, R32(n) + jz L(n4) + + push rp + +L(otp0):lea 4(n), i + mov (mp,n,8), %rax + mul q0 + lea (%rax), %r11 + mov 8(mp,n,8), %rax + lea (%rdx), %rdi + mul q0 + lea (%rax), %rbp + mov 16(mp,n,8), %rax + mov 16(up,n,8), %r10 + lea (%rdx), %r9 + mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov 24(mp,n,8), %rax + adc %rdi, %rbp + mov 24(up,n,8), %rbx + lea (%rdx), %rdi + adc $0, %r9 + mul q0 + add %rbp, %rbx + lea (%rax), %rbp + mov 32(mp,n,8), %rax + adc %r9, %r11 + mov %rbx, 24(up,n,8) + mov 32(up,n,8), %r10 + lea (%rdx), %r9 + adc $0, %rdi + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): mul q0 + add %rbp, %r10 + lea (%rax), %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %r10, -8(up,i,8) + mov (up,i,8), %r10 + lea (%rdx), %r9 + adc $0, %rdi +L(e0): mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov 8(mp,i,8), %rax + adc %rdi, %rbp + mov %r10, (up,i,8) + mov 8(up,i,8), %r10 + lea (%rdx), %rdi + adc $0, %r9 + add $2, i + js L(tp0) + +L(ed0): mul q0 + add %rbp, %r10 + adc %r9, %r11 + mov %r10, I(-8(up),-8(up,i,8)) + mov I((up),(up,i,8)), %r10 + adc $0, %rdi + add %r11, %r10 + adc %rdi, %rax + mov %r10, I((up),(up,i,8)) + mov I(8(up),8(up,i,8)), %r10 + adc $0, %rdx + add %rax, %r10 + mov %r10, I(8(up),8(up,i,8)) + adc $0, %rdx + mov %rdx, 16(up,n,8) C up[0] + 
mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + +L(cj): lea 16(up), up C FIXME + pop rp +L(add_n): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(n1): mov (mp_param), %rax + mul q0 + add 8(up), %rax + adc 16(up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov (up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov 8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov 16(up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 24(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov (up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov 8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, (up) + add %r9, %r10 + adc $0, %r11 + mov %r10, 8(up) + mov %r11, -8(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -32(up), %rdx + mov -24(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc 8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n4): mov -32(mp), %rax + mul q0 + lea (%rax), %r11 + mov -24(mp), %rax + lea (%rdx), %r14 + mul q0 + lea (%rax), %rbp + mov -16(mp), %rax + mov -16(up), %r10 + lea (%rdx), %r9 + mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov -8(mp), %rax + adc %r14, %rbp + mov -8(up), %rbx + lea (%rdx), %r14 + adc $0, %r9 + mul q0 + add %rbp, %rbx + adc %r9, %r11 + mov %rbx, -8(up) + mov (up), %r10 + adc $0, %r14 + imul u0inv, %rbx C next q limb + add %r11, %r10 + adc %r14, %rax + mov %r10, (up) + mov 8(up), %r10 + adc $0, %rdx + add %rax, %r10 + mov %r10, 8(up) + adc $0, %rdx + mov %rdx, -16(up) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(n4) + lea 16(up), up + jmp L(add_n) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm new file mode 100644 index 0000000..27eed37 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm @@ -0,0 +1,169 @@ +dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Conroe/Penryn. + +dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 3.05 +C Intel NHM 3.3 +C Intel SBR 2.5 +C Intel atom ? +C VIA nano ? + +C TODO +C * Loopmix to approach 2.5 c/l on NHM. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + + neg %r8 C set C flag from parameter + mov (up), %r8 + ADCSBB (vp), %r8 + jmp L(ent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %r8 + ADDSUB (vp), %r8 +L(ent): sbb R32(%rbx), R32(%rbx) C save cy + mov %r8, %rax + and $1, R32(%rax) C return value + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + mov R32(n), R32(%rbp) + neg n + and $3, R32(%rbp) + jz L(b0) + cmp $2, R32(%rbp) + jae L(n1) + +L(b1): mov %r8, %rbp + inc n + js L(top) + jmp L(end) + +L(n1): jnz L(b3) + add R32(%rbx), R32(%rbx) C restore cy + mov 8(up,n,8), %r11 + ADCSBB 8(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + mov %r8, %r10 + add $-2, n + jmp L(2) + +L(b3): add R32(%rbx), R32(%rbx) C restore cy + mov 8(up,n,8), %r10 + mov 16(up,n,8), %r11 + ADCSBB 8(vp,n,8), %r10 + ADCSBB 16(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + mov %r8, %r9 + dec n + jmp L(3) + +L(b0): add R32(%rbx), R32(%rbx) C restore cy + mov 8(up,n,8), %r9 + mov 16(up,n,8), %r10 + mov 24(up,n,8), %r11 + ADCSBB 8(vp,n,8), %r9 + ADCSBB 16(vp,n,8), %r10 + ADCSBB 24(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + jmp L(4) + + ALIGN(16) + +L(top): add R32(%rbx), R32(%rbx) C restore cy + mov (up,n,8), %r8 + mov 8(up,n,8), %r9 + mov 16(up,n,8), %r10 + mov 24(up,n,8), %r11 + ADCSBB (vp,n,8), %r8 + ADCSBB 8(vp,n,8), %r9 + ADCSBB 16(vp,n,8), %r10 + ADCSBB 24(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + shrd $1, %r8, %rbp + mov %rbp, -8(rp,n,8) +L(4): shrd $1, %r9, %r8 + mov %r8, (rp,n,8) +L(3): shrd $1, %r10, %r9 + mov %r9, 8(rp,n,8) +L(2): shrd $1, %r11, %r10 + mov %r10, 16(rp,n,8) +L(1): add $4, n + mov %r11, %rbp + js L(top) + +L(end): shrd $1, %rbx, %rbp 
+ mov %rbp, -8(rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/rshift.asm b/gmp-6.3.0/mpn/x86_64/core2/rshift.asm new file mode 100644 index 0000000..7578a53 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/rshift.asm @@ -0,0 +1,143 @@ +dnl x86-64 mpn_rshift optimised for Conroe/Penryn and Nehalem. + +dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 1.32 +C Intel NHM 1.30 (drops to 2.5 for n > 256) +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + + xor R32(%rax), R32(%rax) + + test $1, R8(n) + jnz L(bx1) +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): lea 8(up), up + lea -24(rp), rp + mov -8(up), %r10 + mov (up), %r11 + shrd R8(cnt), %r10, %rax + mov 8(up), %r8 + shr $2, n + jmp L(00) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): lea 16(up), up + lea -16(rp), rp + mov -16(up), %r9 + shrd R8(cnt), %r9, %rax + shr $2, n + jz L(1) + mov -8(up), %r10 + mov (up), %r11 + jmp L(01) + +L(b10): lea 24(up), up + lea -8(rp), rp + mov -24(up), %r8 + mov -16(up), %r9 + shrd R8(cnt), %r8, %rax + shr $2, n + jz L(2) + mov -8(up), %r10 + jmp L(10) + +L(b11): lea 32(up), up + mov -32(up), %r11 + mov -24(up), %r8 + mov -16(up), %r9 + shrd R8(cnt), %r11, %rax + shr $2, n + jz L(end) + + ALIGN(16) +L(top): shrd R8(cnt), %r8, %r11 + mov -8(up), %r10 + mov %r11, (rp) +L(10): shrd R8(cnt), %r9, %r8 + mov (up), %r11 + mov %r8, 8(rp) +L(01): shrd R8(cnt), %r10, %r9 + mov 8(up), %r8 + mov %r9, 16(rp) +L(00): shrd R8(cnt), %r11, %r10 + mov 16(up), %r9 + add $32, up + mov %r10, 24(rp) + add $32, rp + dec n + jnz L(top) + +L(end): shrd R8(cnt), %r8, %r11 + mov %r11, (rp) +L(2): shrd R8(cnt), %r9, %r8 + mov %r8, 8(rp) +L(1): shr R8(cnt), %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. 
+ +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm new file mode 100644 index 0000000..a112c1b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm @@ -0,0 +1,984 @@ +dnl X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere. +dnl It also seems good for Conroe/Wolfdale. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core 4.9 4.18-4.25 3.87 +C Intel NHM 3.8 4.06-4.2 3.5 +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. 
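[The squaring below follows the classic basecase plan: the mul_2/addmul_2 passes (diagrammed next) accumulate only the off-diagonal products up[i]*up[j] with i < j, and the final sqr_diag_addlsh1 step doubles that triangle while adding in the diagonal squares up[i]^2. A one-limb-at-a-time C model of the plan, not GMP's code, again assuming unsigned __int128 and with a name of my own choosing:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t limb;

    /* Model of mpn_sqr_basecase: {rp,2n} = {up,n}^2 via the
       triangle / double / add-diagonal plan used by the asm. */
    void sqr_basecase_model(limb *rp, const limb *up, size_t n)
    {
        for (size_t i = 0; i < 2 * n; i++)
            rp[i] = 0;
        /* off-diagonal triangle: sum of up[i]*up[j]*B^(i+j), i < j */
        for (size_t i = 0; i + 1 < n; i++) {
            limb cy = 0;
            for (size_t j = i + 1; j < n; j++) {
                unsigned __int128 t =
                    (unsigned __int128)up[i] * up[j] + rp[i + j] + cy;
                rp[i + j] = (limb)t;
                cy = (limb)(t >> 64);
            }
            rp[i + n] = cy;
        }
        /* sqr_diag_addlsh1 step: rp = 2*rp + sum of up[i]^2 * B^(2i) */
        limb sh = 0;                   /* bit shifted out by doubling */
        limb acy = 0;                  /* carry from adding squares */
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 sq = (unsigned __int128)up[i] * up[i];
            limb dlo = (rp[2*i] << 1) | sh;
            limb dhi = (rp[2*i + 1] << 1) | (rp[2*i] >> 63);
            sh = rp[2*i + 1] >> 63;
            unsigned __int128 t = (unsigned __int128)dlo + (limb)sq + acy;
            rp[2*i] = (limb)t;
            t = (unsigned __int128)dhi + (limb)(sq >> 64) + (limb)(t >> 64);
            rp[2*i + 1] = (limb)t;
            acy = (limb)(t >> 64);
        }
        /* sh and acy are both zero here: u^2 fits exactly in 2n limbs */
    }

Doing only the i < j half of the product matrix roughly halves the multiply count versus mul_basecase, which is the whole point of a dedicated squaring basecase.]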
+ +C Code structure: +C +C +C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4) +C | | | | +C | | | | +C | | | | +C \|/ \|/ \|/ \|/ +C ____________ ____________ +C / \ / \ +C \|/ \ \|/ \ +C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4) +C \ /|\ \ /|\ +C \____________/ \____________/ +C \ / +C \ / +C \ / +C tail(0m2) tail(1m2) +C \ / +C \ / +C sqr_diag_addlsh1 + +C TODO +C * Tune. None done so far. +C * Currently 2761 bytes, making it smaller would be nice. +C * Consider using a jumptab-based entry sequence. One might even use a mask- +C less sequence, if the table is large enough to support tuneup's needs. +C The code would be, using non-PIC code, +C lea tab(%rip),%rax; jmp *(n,%rax) +C or, +C lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx +C using PIC code. The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,.. +C with the last four entries repeated a safe number of times. +C * Consider expanding feed-in code in order to avoid zeroing registers. +C * Zero consistently with xor. +C * Check if using "lea (reg),reg" should be done in more places; we have some +C explicit "mov %rax,reg" now. +C * Try zeroing with xor in m2 loops. +C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication +C between loop header and wind-down code. +C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This save a byte. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +C Define this to $1 to use late loop index variable as zero, $2 to use an +C explicit $0. +define(`Z',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') + +define(`n', `%r8') + +define(`v0', `%r10') +define(`v1', `%r11') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r9') +define(`i', `%r13') + +define(`X0', `%r12') +define(`X1', `%r14') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +define(`N', 85) +ifdef(`N',,`define(`N',0)') +define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $4, n_param + jl L(small) + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov (up), v0 + mov 8(up), %rax + mov %rax, v1 + + mov $1, R32(n) + sub n_param, n C n = -n_param+1 + push n + + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + mul v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + mov %rax, (rp,n,8) + jnz L(b10) + +L(b00): lea (n), i C n = 5, 9, ... + mov %rdx, w1 C FIXME: Use lea? + xor R32(w2), R32(w2) + jmp L(m2e0) + +L(b10): lea 2(n), i C n = 7, 11, ... + mov 8(up,n,8), %rax + mov %rdx, w3 C FIXME: Use lea? + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + jmp L(m2e2) + +L(bx1): test $2, R8(n) + mov %rax, (rp,n,8) + jz L(b11) + +L(b01): lea 1(n), i C n = 6, 10, ... + mov %rdx, w0 C FIXME: Use lea? + xor R32(w1), R32(w1) + jmp L(m2e1) + +L(b11): lea -1(n), i C n = 4, 8, 12, ... + mov %rdx, w2 C FIXME: Use lea? 
+ xor R32(w3), R32(w3) + jmp L(m2e3) + + + ALIGNx +L(m2top1): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 +L(m2e1):mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top1) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w0, %rax + adc w1, %rdx + mov %rax, I((rp),(rp,i,8)) + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n C decrease |n| + jmp L(am2o3) + + ALIGNx +L(m2top3): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 +L(m2e3):mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top3) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w0, %rax + adc w1, %rdx + mov %rax, I((rp),(rp,i,8)) + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n C decrease |n| + cmp $-1, n + jz L(cor1) C jumps iff entry n = 4 + +L(am2o1): + mov -8(up,n,8), v0 + mov (up,n,8), %rax + mov %rax, v1 + lea 1(n), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 128) + mov (rp,n,8), w1 + xor R32(w2), R32(w2) + mov 8(up,n,8), %rax + xor R32(w3), R32(w3) + jmp L(lo1) + + ALIGNx +L(am2top1): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 +L(lo1): mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top1) + + mul v1 + add w0, w1 + adc w2, %rax + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add X0, %rax + mov %rax, I((rp),(rp,i,8)) 
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+
+L(am2o3):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea -1(n), i
+ mul v0
+ mov %rax, X1
+ MOV( %rdx, X0, 8)
+ mov (rp,n,8), w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ mov 8(up,n,8), %rax
+ jmp L(lo3)
+
+ ALIGNx
+L(am2top3):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+L(lo3): mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top3)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+ cmp $-1, n
+ jnz L(am2o1)
+
+L(cor1):pop n
+ mov %rdx, w3
+ mov -16(up), v0
+ mov -8(up), %rax
+ mul v0
+ add w3, %rax
+ adc $0, %rdx
+ mov %rax, -8(rp)
+ mov %rdx, (rp)
+ jmp L(sqr_diag_addlsh1)
+
+ ALIGNx
+L(m2top2):
+L(m2e2):mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+ mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top2)
+
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w0, %rax
+ adc w1, %rdx
+ mov %rax, I((rp),(rp,i,8))
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n C decrease |n|
+ jmp L(am2o0)
+
+ ALIGNx
+L(m2top0):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mov (up,i,8), %rax
+ mul v0
+ add %rax, w0
+ mov w0, (rp,i,8)
+ adc %rdx, w1
+ mov (up,i,8), %rax
+ adc $0, R32(w2)
+ mul v1
+ add %rax, w1
+ adc %rdx, w2
+L(m2e0):mov 8(up,i,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mov 8(up,i,8), %rax
+ mul v1
+ add %rax, w2
+ mov w1, 8(rp,i,8)
+ adc %rdx, w3
+ mov $0, R32(w0)
+ mov 16(up,i,8), %rax
+ mul v0
+ add %rax, w2
+ mov 16(up,i,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ mov $0, R32(w1)
+ add %rax, w3
+ mov 24(up,i,8), %rax
+ mov w2, 16(rp,i,8)
+ adc %rdx, w0
+ add $4, i
+ js L(m2top0)
+
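+C The entry points L(m2e0)..L(m2e3) above select an unrolled mul_2 loop
+C according to n mod 4; each loop multiplies a block of up[] by the limb
+C pair {v0,v1}.  The straight-line code that follows is the shared
+C wind-down: it forms the final v0 and v1 products and stores the two
+C high limbs of the row.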
+ mul v0
+ add %rax, w3
+ mov I(-8(up),-8(up,i,8)), %rax
+ mov w3, I(-8(rp),-8(rp,i,8))
+ adc %rdx, w0
+ adc R32(w1), R32(w1)
+ mul v1
+ add w0, %rax
+ adc w1, %rdx
+ mov %rax, I((rp),(rp,i,8))
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n C decrease |n|
+ cmp $-2, n
+ jz L(cor2) C jumps iff entry n = 5
+
+L(am2o2):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea -2(n), i
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 32)
+ mov (rp,n,8), w0
+ xor R32(w1), R32(w1)
+ xor R32(w2), R32(w2)
+ mov 8(up,n,8), %rax
+ jmp L(lo2)
+
+ ALIGNx
+L(am2top2):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+ mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+L(lo2): mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top2)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+
+L(am2o0):
+ mov -8(up,n,8), v0
+ mov (up,n,8), %rax
+ mov %rax, v1
+ lea 0(n), i
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 2)
+ xor R32(w0), R32(w0)
+ mov (rp,n,8), w2
+ xor R32(w3), R32(w3)
+ jmp L(lo0)
+
+ ALIGNx
+L(am2top0):
+ mul v1
+ add w0, w1
+ adc %rax, w2
+ mov (up,i,8), %rax
+ MOV( %rdx, w3, 1)
+ adc $0, w3
+ mul v0
+ add w1, X1
+ mov X1, -8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 2)
+ adc $0, X1
+ mov (up,i,8), %rax
+ mul v1
+ MOV( %rdx, w0, 4)
+ mov (rp,i,8), w1
+ add w1, w2
+ adc %rax, w3
+ adc $0, w0
+L(lo0): mov 8(up,i,8), %rax
+ mul v0
+ add w2, X0
+ adc %rax, X1
+ mov X0, (rp,i,8)
+ MOV( %rdx, X0, 8)
+ adc $0, X0
+ mov 8(up,i,8), %rax
+ mov 8(rp,i,8), w2
+ mul v1
+ add w2, w3
+ adc %rax, w0
+ MOV( %rdx, w1, 16)
+ adc $0, w1
+ mov 16(up,i,8), %rax
+ mul v0
+ add w3, X1
+ mov X1, 8(rp,i,8)
+ adc %rax, X0
+ MOV( %rdx, X1, 32)
+ mov 16(rp,i,8), w3
+ adc $0, X1
+ mov 16(up,i,8), %rax
+ mul v1
+ add w3, w0
+ MOV( %rdx, w2, 64)
+ adc %rax, w1
+ mov 24(up,i,8), %rax
+ adc $0, w2
+ mul v0
+ add w0, X0
+ mov X0, 16(rp,i,8)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov 24(up,i,8), %rax
+ mov 24(rp,i,8), w0
+ adc $0, X0
+ add $4, i
+ jnc L(am2top0)
+
+ mul v1
+ add w0, w1
+ adc w2, %rax
+ adc Z(i,$0), %rdx
+ add w1, X1
+ adc Z(i,$0), X0
+ mov X1, I(-8(rp),-8(rp,i,8))
+ add X0, %rax
+ mov %rax, I((rp),(rp,i,8))
+ adc Z(i,$0), %rdx
+ mov %rdx, I(8(rp),8(rp,i,8))
+
+ lea 16(rp), rp
+ add $2, n
+ cmp $-2, n
+ jnz L(am2o2)
+
+L(cor2):pop n
+ mov -24(up), v0
+ mov %rax, w2
+ mov %rdx, w0
+ mov -16(up), %rax
+ mov %rax, v1
+ mul v0
+ mov %rax, X0
+ MOV( %rdx, X1, 32)
+ mov -8(up), %rax
+ mul v0
+ add w2, X0
+ mov X0, -16(rp)
+ MOV( %rdx, X0, 128)
+ adc %rax, X1
+ mov -8(up), %rax
+ adc $0, X0
+ mul v1
+ add w0, X1
+ adc $0, X0
+ mov X1, -8(rp)
+ add X0, %rax
+ mov %rax, (rp)
+ adc $0, %rdx
+ mov %rdx, 8(rp)
+ lea 8(rp), rp
+
+L(sqr_diag_addlsh1):
+ mov -8(up,n,8), %rax
+ shl n
+ xor R32(%rbx), R32(%rbx)
+ mul %rax
+ mov 8(rp,n,8), %r11
+ lea (%rdx), %r10
+ mov 16(rp,n,8), %r9
+ add %r11, %r11
+ jmp L(dm)
+
+ ALIGNx
+L(dtop):mul %rax
+ add %r11, %r10
+ mov 8(rp,n,8), %r11
+ mov %r10, -8(rp,n,8)
+ adc %r9, %rax
+ lea (%rdx,%rbx), %r10
+ mov 16(rp,n,8), %r9
+ adc %r11, %r11
+L(dm): mov %rax, (rp,n,8)
+ mov (up,n,4), %rax
+ adc %r9, %r9
+ setc R8(%rbx)
+ add $2, n
+ js L(dtop)
+
+ mul %rax
+ add %r11, %r10
+ mov %r10, -8(rp)
+ adc %r9, %rax
+ lea (%rdx,%rbx), %r10
+ mov %rax, (rp)
+ adc $0, %r10
+ mov %r10, 8(rp)
+
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(small):
+ mov (up), %rax
+ cmp $2, n_param
+ jae L(gt1)
+L(n1):
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt1): jne L(gt2)
+L(n2): mov %rax, %r8
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, %r9
+ mul %rax
+ mov %rax, %r10
+ mov %r11, %rax
+ mov %rdx, %r11
+ mul %r8
+ xor %r8, %r8
+ add %rax, %r9
+ adc %rdx, %r10
+ adc %r8, %r11
+ add %rax, %r9
+ mov %r9, 8(rp)
+ adc %rdx, %r10
+ mov %r10, 16(rp)
+ adc %r8, %r11
+ mov %r11, 24(rp)
+ FUNC_EXIT()
+ ret
+
+L(gt2):
+L(n3): mov %rax, %r10
+ mul %rax
+ mov 8(up), %r11
+ mov %rax, (rp)
+ mov %r11, %rax
+ mov %rdx, 8(rp)
+ mul %rax
+ mov 16(up), %rcx
+ mov %rax, 16(rp)
+ mov %rcx, %rax
+ mov %rdx, 24(rp)
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov %r11, %rax
+ mul %r10
+ mov %rax, %r8
+ mov %rcx, %rax
+ mov %rdx, %r9
+ mul %r10
+ xor %r10, %r10
+ add %rax, %r9
+ mov %r11, %rax
+ mov %r10, %r11
+ adc %rdx, %r10
+
+ mul %rcx
+ add %rax, %r10
+ adc %r11, %rdx
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %rdx, %rdx
+ adc %r11, %r11
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %rdx, 32(rp)
+ adc %r11, 40(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
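The function above finishes its square in L(sqr_diag_addlsh1): the mul_2/addmul_2 passes leave the sum of the cross products u_i*u_j (i < j) in rp[], and this last pass doubles each cross-product limb while adding in the two limbs of each square u_i^2, reviving the bit shifted out of each doubling through setc/%rbx. A rough C model of that pass (a sketch only; the function name and the use of unsigned __int128 are ours, and 64-bit limbs are assumed):

    #include <gmp.h>   /* mp_limb_t, mp_size_t */

    /* Hypothetical reference for the L(sqr_diag_addlsh1) pass:
       rp[0 .. 2n-1] <- 2 * (cross products already in rp[1 .. 2n-2])
                        + sum of up[i]^2 * B^(2i), with B = 2^64.  */
    static void
    ref_sqr_diag_addlsh1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      mp_limb_t shifted_out = 0;   /* top bit lost when doubling a limb */
      mp_limb_t cy = 0;            /* carry from adding the squares */

      rp[0] = 0;                   /* cross sum occupies rp[1..2n-2] only */
      rp[2 * n - 1] = 0;

      for (mp_size_t i = 0; i < n; i++)
        {
          unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
          for (int k = 0; k < 2; k++)
            {
              mp_limb_t cross = rp[2 * i + k];
              mp_limb_t twice = (cross << 1) | shifted_out;  /* 2 * cross */
              shifted_out = cross >> 63;
              unsigned __int128 s =
                (unsigned __int128) twice + (mp_limb_t) (sq >> (64 * k)) + cy;
              rp[2 * i + k] = (mp_limb_t) s;
              cy = (mp_limb_t) (s >> 64);
            }
        }
      /* cy and shifted_out end at zero: up^2 fits exactly in 2n limbs */
    }

The assembly does the same work two result limbs per iteration, doubling the cross limbs in %r11/%r9 with add/adc and folding the shifted-out bit back in via lea (%rdx,%rbx).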
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm
new file mode 100644
index 0000000..46488fc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm
@@ -0,0 +1,47 @@
+dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+define(ADDSUB, sub)
+define(ADCSBB, sbb)
+define(func, mpn_sublsh1_n)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/core2/sublshC_n.asm')
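sublsh1_n.asm and its sibling sublsh2_n.asm below are thin parameter files: they pin LSH/RSH and the subtract-with-borrow operations, then pull in the shared sublshC_n.asm body. Functionally, mpn_sublsh1_n computes rp[] = up[] - (vp[] << 1) and returns the outgoing borrow, counting the bit shifted out of the top limb, so the return value lies in [0, 2]. A minimal C model, assuming 64-bit limbs (the ref_ name is ours, not a GMP entry point):

    #include <gmp.h>   /* mp_limb_t, mp_size_t */

    /* Sketch of mpn_sublsh1_n semantics: rp[] = up[] - 2*vp[], n >= 1.
       Returns the outgoing borrow, 0 <= borrow <= 2.  64-bit limbs assumed. */
    static mp_limb_t
    ref_sublsh1_n (mp_limb_t *rp, const mp_limb_t *up,
                   const mp_limb_t *vp, mp_size_t n)
    {
      mp_limb_t shift_in = 0;   /* bit moved up from the previous vp limb */
      mp_limb_t borrow = 0;

      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t s = (vp[i] << 1) | shift_in;   /* limb i of vp << 1 */
          shift_in = vp[i] >> 63;
          mp_limb_t t = up[i] - s;
          mp_limb_t b = up[i] < s;                 /* borrow from the subtract */
          rp[i] = t - borrow;
          borrow = b | (t < borrow);               /* the two cannot both be 1 */
        }
      return borrow + shift_in;   /* cy plus the shifted-out bit, as in L(end) below */
    }

sublsh2_n differs only in LSH = 2 / RSH = 62, which widens the possible return value to [0, 4].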
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm
new file mode 100644
index 0000000..f3b1e28
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm
@@ -0,0 +1,47 @@
+dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+define(ADDSUB, sub)
+define(ADCSBB, sbb)
+define(func, mpn_sublsh2_n)
+
+MULFUNC_PROLOGUE(mpn_sublsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm
new file mode 100644
index 0000000..272700d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm
@@ -0,0 +1,158 @@
+dnl AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << C), optimised for Core 2 and
+dnl Core iN.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+C cycles/limb
+C AMD K8,K9 4.25
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 3
+C Intel NHM 3.1
+C Intel SBR 2.47
+C Intel atom ?
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(8)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %r12
+
+ mov R32(%rcx), R32(%rax)
+ lea 24(up,n,8), up
+ lea 24(vp,n,8), vp
+ lea 24(rp,n,8), rp
+ neg n
+
+ xor R32(%r11), R32(%r11)
+
+ mov -24(vp,n,8), %r8 C do first limb early
+ shrd $RSH, %r8, %r11
+
+ and $3, R32(%rax)
+ je L(b0)
+ cmp $2, R32(%rax)
+ jc L(b1)
+ je L(b2)
+
+L(b3): mov -16(vp,n,8), %r9
+ shrd $RSH, %r9, %r8
+ mov -8(vp,n,8), %r10
+ shrd $RSH, %r10, %r9
+ mov -24(up,n,8), %r12
+ ADDSUB %r11, %r12
+ mov %r12, -24(rp,n,8)
+ mov -16(up,n,8), %r12
+ ADCSBB %r8, %r12
+ mov %r12, -16(rp,n,8)
+ mov -8(up,n,8), %r12
+ ADCSBB %r9, %r12
+ mov %r12, -8(rp,n,8)
+ mov %r10, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+ add $3, n
+ js L(top)
+ jmp L(end)
+
+L(b1): mov -24(up,n,8), %r12
+ ADDSUB %r11, %r12
+ mov %r12, -24(rp,n,8)
+ mov %r8, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+ inc n
+ js L(top)
+ jmp L(end)
+
+L(b2): mov -16(vp,n,8), %r9
+ shrd $RSH, %r9, %r8
+ mov -24(up,n,8), %r12
+ ADDSUB %r11, %r12
+ mov %r12, -24(rp,n,8)
+ mov -16(up,n,8), %r12
+ ADCSBB %r8, %r12
+ mov %r12, -16(rp,n,8)
+ mov %r9, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+ add $2, n
+ js L(top)
+ jmp L(end)
+
+ ALIGN(16)
+L(top): mov -24(vp,n,8), %r8
+ shrd $RSH, %r8, %r11
+L(b0): mov -16(vp,n,8), %r9
+ shrd $RSH, %r9, %r8
+ mov -8(vp,n,8), %r10
+ shrd $RSH, %r10, %r9
+ mov (vp,n,8), %rbx
+ shrd $RSH, %rbx, %r10
+
+ add R32(%rax), R32(%rax) C restore cy
+
+ mov -24(up,n,8), %r12
+ ADCSBB %r11, %r12
+ mov %r12, -24(rp,n,8)
+
+ mov -16(up,n,8), %r12
+ ADCSBB %r8, %r12
+ mov %r12, -16(rp,n,8)
+
+ mov -8(up,n,8), %r12
+ ADCSBB %r9, %r12
+ mov %r12, -8(rp,n,8)
+
+ mov (up,n,8), %r12
+ ADCSBB %r10, %r12
+ mov %r12, (rp,n,8)
+
+ mov %rbx, %r11
+ sbb R32(%rax), R32(%rax) C save cy
+
+ add $4, n
+ js L(top)
+
+L(end): shr $RSH, %r11
+ pop %r12
+ pop %rbx
+ sub R32(%r11), R32(%rax)
+ neg R32(%rax)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
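In all of these lshC variants the shifted operand is produced on the fly: with 0 < LSH < 64 and RSH = 64 - LSH, limb i of (vp << LSH) is built from two adjacent source limbs, which is what each chained shrd $RSH instruction computes. A one-line C equivalent (hypothetical helper, not GMP API; 64-bit limbs assumed):

    #include <gmp.h>   /* mp_limb_t */

    /* limb i of (vp << lsh): v_hi = vp[i], v_lo = vp[i-1] (0 when i == 0) */
    static inline mp_limb_t
    shifted_limb (mp_limb_t v_hi, mp_limb_t v_lo, int lsh)
    {
      return (v_hi << lsh) | (v_lo >> (64 - lsh));
    }

Since shrd itself writes the carry flag, the borrow from the sbb chain cannot survive the shift block; the loop therefore parks it with sbb R32(%rax), R32(%rax) (%eax = -borrow) and revives it with add R32(%rax), R32(%rax) before the next group. At L(end) the return value is assembled as borrow plus the RSH bits shifted out of the top vp limb (shr, sub, neg), matching the [0, 2^LSH] range noted above.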