diff options
author | Duncan Wilkie <antigravityd@gmail.com> | 2023-11-18 06:11:09 -0600 |
---|---|---|
committer | Duncan Wilkie <antigravityd@gmail.com> | 2023-11-18 06:11:09 -0600 |
commit | 11da511c784eca003deb90c23570f0873954e0de (patch) | |
tree | e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/x86_64/coreisbr |
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/coreisbr')
23 files changed, 4354 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm new file mode 100644 index 0000000..21f0bf4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm @@ -0,0 +1,224 @@ +dnl AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR 2.93 this +C Intel IBR 2.66 this +C Intel HWL 2.5 2.15 +C Intel BWL +C Intel atom +C VIA nano + +C This code is the result of running a code generation and optimisation tool +C suite written by David Harvey and Torbjorn Granlund. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
+define(`I',`$1') + + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rbx') +define(`v1', `%rbp') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') +define(`X0', `%r12') +define(`X1', `%r13') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + + mov n_param, n + neg n + + lea (up,n_param,8), up + lea 8(rp,n_param,8), rp + mul v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): mov -8(rp,n,8), X0 + mov %rdx, w1 + add %rax, X0 + adc $0, w1 + mov (up,n,8), %rax + xor w0, w0 + xor w3, w3 + test $2, R8(n) + jnz L(b10) + +L(b00): nop C this nop make loop go faster on SBR! + mul v1 + mov (rp,n,8), X1 + jmp L(lo0) + +L(b10): lea -2(n), n + jmp L(lo2) + +L(bx1): mov -8(rp,n,8), X1 + mov %rdx, w3 + add %rax, X1 + adc $0, w3 + mov (up,n,8), %rax + xor w1, w1 + xor w2, w2 + test $2, R8(n) + jz L(b11) + +L(b01): mov (rp,n,8), X0 + inc n + jmp L(lo1) + +L(b11): dec n + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo1): mul v1 + mov %rdx, w0 C 1 + add %rax, X0 C 0 + adc $0, w0 C 1 + add w1, X1 C 3 + adc $0, w3 C 0 + add w2, X0 C 0 + adc $0, w0 C 1 + mov (up,n,8), %rax + mul v0 + add %rax, X0 C 0 + mov %rdx, w1 C 1 + adc $0, w1 C 1 + mov (up,n,8), %rax + mul v1 + mov X1, -16(rp,n,8) C 3 + mov (rp,n,8), X1 C 1 + add w3, X0 C 0 + adc $0, w1 C 1 +L(lo0): mov %rdx, w2 C 2 + mov X0, -8(rp,n,8) C 0 + add %rax, X1 C 1 + adc $0, w2 C 2 + mov 8(up,n,8), %rax + add w0, X1 C 1 + adc $0, w2 C 2 + mul v0 + add %rax, X1 C 1 + mov %rdx, w3 C 2 + adc $0, w3 C 2 + mov 8(up,n,8), %rax +L(lo3): mul v1 + add w1, X1 C 1 + mov 8(rp,n,8), X0 C 2 + adc $0, w3 C 2 + mov %rdx, w0 C 3 + add %rax, X0 C 2 + adc $0, w0 C 3 + mov 16(up,n,8), %rax + mul v0 + add w2, X0 C 2 + mov X1, (rp,n,8) C 1 + mov %rdx, w1 C 3 + adc $0, 
w0 C 3 + add %rax, X0 C 2 + adc $0, w1 C 3 + mov 16(up,n,8), %rax + add w3, X0 C 2 + adc $0, w1 C 3 +L(lo2): mul v1 + mov 16(rp,n,8), X1 C 3 + add %rax, X1 C 3 + mov %rdx, w2 C 4 + adc $0, w2 C 4 + mov 24(up,n,8), %rax + mov X0, 8(rp,n,8) C 2 + mul v0 + add w0, X1 C 3 + mov %rdx, w3 C 4 + adc $0, w2 C 4 + add %rax, X1 C 3 + mov 24(up,n,8), %rax + mov 24(rp,n,8), X0 C 0 useless but harmless final read + adc $0, w3 C 4 + add $4, n + jnc L(top) + +L(end): mul v1 + add w1, X1 + adc $0, w3 + add w2, %rax + adc $0, %rdx + mov X1, I(-16(rp),-16(rp,n,8)) + add w3, %rax + adc $0, %rdx + mov %rax, I(-8(rp),-8(rp,n,8)) + mov %rdx, %rax + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm new file mode 100644 index 0000000..2319a80 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm @@ -0,0 +1,54 @@ +dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) +dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh1_n) + define(func_nc, mpn_addlsh1_nc)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh1_n) + define(func_nc, mpn_rsblsh1_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) +include_mpn(`x86_64/coreisbr/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm new file mode 100644 index 0000000..3b7bb22 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm @@ -0,0 +1,56 @@ +dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) +dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh2_n) + define(func_nc, mpn_addlsh2_nc)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh2_n) + define(func_nc, mpn_rsblsh2_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C mpn_rsblsh2_nc removed below, its idea of carry-in is inconsistent with +C refmpn_rsblsh2_nc +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n) +include_mpn(`x86_64/coreisbr/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm new file mode 100644 index 0000000..23ace41 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm @@ -0,0 +1,173 @@ +dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C) +dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[] + +dnl Copyright 2009-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 3.25 +C Intel NHM 4 +C Intel SBR 2 C (or 1.95 when L(top)'s alignment = 16 (mod 32)) +C Intel atom ? +C VIA nano ? + +C This code probably runs close to optimally on Sandy Bridge (using 4-way +C unrolling). It also runs reasonably well on Core 2, but it runs poorly on +C all other processors, including Nehalem. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbp + mov cy, %rax + neg %rax C set msb on carry + xor R32(%rbp), R32(%rbp) C limb carry + mov (vp), %r8 + shrd $RSH, %r8, %rbp + mov R32(n), R32(%r9) + and $3, R32(%r9) + je L(b00) + cmp $2, R32(%r9) + jc L(b01) + je L(b10) + jmp L(b11) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbp + xor R32(%rbp), R32(%rbp) C limb carry + mov (vp), %r8 + shrd $RSH, %r8, %rbp + mov R32(n), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): mov 8(vp), %r9 + shrd $RSH, %r9, %r8 + mov 16(vp), %r10 + shrd $RSH, %r10, %r9 + add R32(%rax), R32(%rax) C init carry flag + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + mov %rbp, (rp) + mov %r8, 8(rp) + mov %r9, 16(rp) + mov %r10, %rbp + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sbb R32(%rax), 
R32(%rax) C save carry flag + sub $3, n + ja L(top) + jmp L(end) + +L(b01): add R32(%rax), R32(%rax) C init carry flag + ADCSBB (up), %rbp + mov %rbp, (rp) + mov %r8, %rbp + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $1, n + ja L(top) + jmp L(end) + +L(b10): mov 8(vp), %r9 + shrd $RSH, %r9, %r8 + add R32(%rax), R32(%rax) C init carry flag + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + mov %rbp, (rp) + mov %r8, 8(rp) + mov %r9, %rbp + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $2, n + ja L(top) + jmp L(end) + + ALIGN(16) +L(top): mov (vp), %r8 + shrd $RSH, %r8, %rbp +L(b00): mov 8(vp), %r9 + shrd $RSH, %r9, %r8 + mov 16(vp), %r10 + shrd $RSH, %r10, %r9 + mov 24(vp), %r11 + shrd $RSH, %r11, %r10 + lea 32(vp), vp + add R32(%rax), R32(%rax) C restore carry flag + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + lea 32(up), up + mov %rbp, (rp) + mov %r8, 8(rp) + mov %r9, 16(rp) + mov %r10, 24(rp) + mov %r11, %rbp + lea 32(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $4, n + jnz L(top) + +L(end): shr $RSH, %rbp + add R32(%rax), R32(%rax) C restore carry flag + ADCSBB $0, %rbp + mov %rbp, %rax + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm new file mode 100644 index 0000000..db8ee68 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm @@ -0,0 +1,215 @@ +dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k) +dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[] +dnl Optimised for Sandy Bridge. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 5.25 +C Intel P4 ? +C Intel core2 3.1 +C Intel NHM 3.95 +C Intel SBR 2.75 +C Intel atom ? +C VIA nano ? + +C The inner-loop probably runs close to optimally on Sandy Bridge (using 4-way +C unrolling). The rest of the code is quite crude, and could perhaps be made +C both smaller and faster. 
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') +define(`cy', `%r9') C for _nc variant + +ifdef(`OPERATION_addlsh_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(IFRSB, ) + define(func_n, mpn_addlsh_n) + define(func_nc, mpn_addlsh_nc)') +ifdef(`OPERATION_rsblsh_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(IFRSB, `$1') + define(func_n, mpn_rsblsh_n) + define(func_nc, mpn_rsblsh_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with +C refmpn_rsblsh_nc +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt + push %rbx + xor R32(%rbx), R32(%rbx) C clear CF save register +L(ent): push %rbp + mov R32(n), R32(%rbp) + mov n, %rax + mov R32(cnt), R32(%rcx) + neg R32(%rcx) + and $3, R32(%rbp) + jz L(b0) + lea -32(vp,%rbp,8), vp + lea -32(up,%rbp,8), up + lea -32(rp,%rbp,8), rp + cmp $2, R32(%rbp) + jc L(b1) + jz L(b2) + +L(b3): xor %r8, %r8 + mov 8(vp), %r9 + mov 16(vp), %r10 + shrd R8(%rcx), %r9, %r8 + shrd R8(%rcx), %r10, %r9 + mov 24(vp), %r11 + shrd R8(%rcx), %r11, %r10 + sub $3, %rax + jz L(3) + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + lea 32(up), up + jmp L(lo3) +L(3): add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + jmp L(wd3) + +L(b0): mov (vp), %r8 + mov 8(vp), %r9 + xor R32(%rbp), R32(%rbp) + jmp L(lo0) + +L(b1): xor %r10, %r10 + mov 24(vp), %r11 + shrd R8(%rcx), %r11, %r10 + sub $1, %rax + jz L(1) + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 24(up), %r10 + lea 32(up), up + mov (vp), %r8 + jmp L(lo1) +L(1): add R32(%rbx), R32(%rbx) + ADCSBB 24(up), %r10 + jmp L(wd1) + +L(b2): xor %r9, %r9 + mov 16(vp), %r10 + shrd R8(%rcx), %r10, %r9 + mov 24(vp), %r11 + 
shrd R8(%rcx), %r11, %r10 + sub $2, %rax + jz L(2) + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + lea 32(up), up + jmp L(lo2) +L(2): add R32(%rbx), R32(%rbx) + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + jmp L(wd2) + + ALIGN(32) C 16-byte alignment is not enough! +L(top): shrd R8(%rcx), %r11, %r10 + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + mov %rbp, (rp) + lea 32(up), up +L(lo3): mov %r8, 8(rp) +L(lo2): mov %r9, 16(rp) + mov (vp), %r8 +L(lo1): mov %r10, 24(rp) + mov 8(vp), %r9 + mov %r11, %rbp + lea 32(rp), rp + sbb R32(%rbx), R32(%rbx) +L(lo0): shrd R8(%rcx), %r8, %rbp + mov 16(vp), %r10 + shrd R8(%rcx), %r9, %r8 + shrd R8(%rcx), %r10, %r9 + mov 24(vp), %r11 + sub $4, %rax + jg L(top) + + shrd R8(%rcx), %r11, %r10 + add R32(%rbx), R32(%rbx) + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + mov %rbp, (rp) +L(wd3): mov %r8, 8(rp) +L(wd2): mov %r9, 16(rp) +L(wd1): mov %r10, 24(rp) + adc R32(%rax), R32(%rax) C rax is zero after loop + shr R8(%rcx), %r11 + ADDSUB %r11, %rax +IFRSB( neg %rax) + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt +IFDOS(` mov 64(%rsp), %r9 ') C cy + push %rbx + neg cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm new file mode 100644 index 0000000..61fee3e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm @@ -0,0 +1,203 @@ +dnl AMD64 mpn_add_n, mpn_sub_n optimised for Sandy bridge, Ivy bridge, and +dnl Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1.75\2.52 +C AMD K10 1.5 +C AMD bd1 1.69\2.25 +C AMD bd2 1.65 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 1.5 +C AMD bt1 2.67 +C AMD bt2 2.16 +C Intel P4 11.54 +C Intel PNR 5 +C Intel NHM 5.5 +C Intel SBR 1.54 +C Intel IBR 1.5 +C Intel HWL 1.32 +C Intel BWL 1.07 +C Intel SKL 1.21 +C Intel atom 4.3 +C Intel SLM 3 +C VIA nano ? + +C The loop of this code was manually written. It runs close to optimally on +C Intel SBR, IBR, and HWL far as we know, except for the fluctuation problems. +C It also runs slightly faster on average on AMD bd1 and bd2. +C +C No micro-optimisation has been done. +C +C N.B.! The loop alignment padding insns are executed. If editing the code, +C make sure the padding does not become excessive. It is now a 4-byte nop. 
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 + +L(ent): mov R32(n), R32(%rax) + shr $2, n + + test $1, R8(%rax) + jnz L(bx1) + +L(bx0): test $2, R8(%rax) + jnz L(b10) + +L(b00): neg %r8 + mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + ADCSBB 16(vp), %r10 + ADCSBB 24(vp), %r11 + lea 32(vp), vp + lea -16(rp), rp + jmp L(lo0) + +L(b10): neg %r8 + mov (up), %r10 + mov 8(up), %r11 + ADCSBB 0(vp), %r10 + ADCSBB 8(vp), %r11 + jrcxz L(e2) + mov 16(up), %r8 + mov 24(up), %r9 + lea 16(up), up + ADCSBB 16(vp), %r8 + ADCSBB 24(vp), %r9 + lea 16(vp), vp +C lea (rp), rp + jmp L(lo2) + +L(e2): mov %r10, (rp) + mov %r11, 8(rp) + setc R8(%rax) + FUNC_EXIT() + ret + +L(bx1): test $2, R8(%rax) + jnz L(b11) + +L(b01): neg %r8 + mov (up), %r11 + ADCSBB (vp), %r11 + jrcxz L(e1) + mov 8(up), %r8 + mov 16(up), %r9 + lea 8(up), up + lea -8(rp), rp + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + lea 8(vp), vp + jmp L(lo1) + +L(e1): mov %r11, (rp) + setc R8(%rax) + FUNC_EXIT() + ret + +L(b11): neg %r8 + mov (up), %r9 + ADCSBB (vp), %r9 + mov 8(up), %r10 + mov 16(up), %r11 + lea 24(up), up + ADCSBB 8(vp), %r10 + ADCSBB 16(vp), %r11 + lea 24(vp), vp + mov %r9, (rp) + lea 8(rp), rp + jrcxz L(end) + + ALIGN(32) +L(top): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 +L(lo2): mov %r10, (rp) +L(lo1): mov %r11, 8(rp) + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + ADCSBB 
16(vp), %r10 + ADCSBB 24(vp), %r11 + lea 32(vp), vp +L(lo0): mov %r8, 16(rp) +L(lo3): mov %r9, 24(rp) + lea 32(rp), rp + dec n + jnz L(top) + +L(end): mov R32(n), R32(%rax) C zero rax + mov %r10, (rp) + mov %r11, 8(rp) + setc R8(%rax) + FUNC_EXIT() + ret +EPILOGUE() + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm new file mode 100644 index 0000000..b4c1572 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm @@ -0,0 +1,212 @@ +dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.27 +C AMD K10 4.27 4.54 +C AMD bull 4.76 +C AMD pile 4.55 +C AMD steam +C AMD excavator +C AMD bobcat 5.30 +C AMD jaguar 5.28 +C Intel P4 16.2 17.1 +C Intel core2 5.26 +C Intel NHM 5.09 +C Intel SBR 3.21 +C Intel IBR 2.96 +C Intel HWL 2.81 +C Intel BWL 2.76 +C Intel SKL 2.76 +C Intel atom 21.5 +C Intel SLM 9.5 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjörn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +define(`I',`$1') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +IFDOS(` define(`up', ``%rsi'')') dnl +IFDOS(` define(`rp', ``%rcx'')') dnl +IFDOS(` define(`v0', ``%r9'')') dnl +IFDOS(` define(`r9', ``rdi'')') dnl +IFDOS(` define(`n_param',``%r8'')') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax + push %rbx + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(b13) + +L(b02): xor R32(%r11), R32(%r11) + test $2, R8(n_param) + jnz L(b2) + +L(b0): mov $1, R32(n) + sub n_param, n + mul v0 + mov %rdx, %r9 + mov -8(rp,n,8), %r8 + jmp L(e0) + + ALIGN(16) +L(b2): mov $-1, n + sub n_param, n + mul v0 + mov 8(rp,n,8), %r8 + mov %rdx, %r9 + jmp L(e2) + + ALIGN(16) +L(b13): xor R32(%r9), R32(%r9) + test $2, R8(n_param) + jnz L(b3) + +L(b1): mov $2, R32(n) + sub n_param, n + jns L(1) + mul v0 + mov -16(rp,n,8), %r10 + mov %rdx, %r11 + jmp L(e1) + + ALIGN(16) +L(b3): xor R32(n), R32(n) + sub n_param, n + mul v0 + mov (rp,n,8), %r10 + jmp L(e3) + + ALIGN(32) 
+L(top): mul v0 + mov -16(rp,n,8), %r10 + ADDSUB %r11, %r8 + mov %rdx, %r11 + adc $0, %r9 + mov %r8, -24(rp,n,8) +L(e1): ADDSUB %rax, %r10 + mov -8(up,n,8), %rax + adc $0, %r11 + mul v0 + ADDSUB %r9, %r10 + mov %rdx, %r9 + mov -8(rp,n,8), %r8 + adc $0, %r11 + mov %r10, -16(rp,n,8) +L(e0): ADDSUB %rax, %r8 + adc $0, %r9 + mov (up,n,8), %rax + mul v0 + mov (rp,n,8), %r10 + ADDSUB %r11, %r8 + mov %r8, -8(rp,n,8) + adc $0, %r9 +L(e3): mov %rdx, %r11 + ADDSUB %rax, %r10 + mov 8(up,n,8), %rax + adc $0, %r11 + mul v0 + mov 8(rp,n,8), %r8 + ADDSUB %r9, %r10 + mov %rdx, %r9 + mov %r10, (rp,n,8) + adc $0, %r11 +L(e2): ADDSUB %rax, %r8 + adc $0, %r9 + mov 16(up,n,8), %rax + add $4, n + jnc L(top) + +L(end): mul v0 + mov I(-8(rp),-16(rp,n,8)), %r10 + ADDSUB %r11, %r8 + mov %rdx, %r11 + adc $0, %r9 + mov %r8, I(-16(rp),-24(rp,n,8)) + ADDSUB %rax, %r10 + adc $0, %r11 + ADDSUB %r9, %r10 + adc $0, %r11 + mov %r10, I(-8(rp),-16(rp,n,8)) + mov %r11, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret + + ALIGN(16) +L(1): mul v0 + ADDSUB %rax, -8(rp) + mov %rdx, %rax + adc $0, %rax + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm new file mode 100644 index 0000000..43abcc8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm @@ -0,0 +1,174 @@ +dnl AMD64 mpn_cnd_add_n. + +dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel PNR 3.0 +C Intel NHM 3.75 +C Intel SBR 1.93 +C Intel IBR 1.89 +C Intel HWL 1.78 +C Intel BWL 1.50 +C Intel SKL 1.50 +C Intel atom +C Intel SLM 4.0 +C VIA nano + +C NOTES +C * It might seem natural to use the cmov insn here, but since this function +C is supposed to have the exact same execution pattern for cnd true and +C false, and since cmov's documentation is not clear about whether it +C actually reads both source operands and writes the register for a false +C condition, we cannot use it. 
+ +C INPUT PARAMETERS +define(`cnd_arg', `%rdi') dnl rcx +define(`rp', `%rsi') dnl rdx +define(`up', `%rdx') dnl r8 +define(`vp', `%rcx') dnl r9 +define(`n', `%r8') dnl rsp+40 + +define(`cnd', `%rbx') + +define(ADDSUB, add) +define(ADCSBB, adc) +define(func, mpn_cnd_add_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_cnd_add_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), R32(%r8)') + push %rbx + + neg cnd_arg + sbb cnd, cnd C make cnd mask + + test $1, R8(n) + jz L(x0) +L(x1): test $2, R8(n) + jz L(b1) + +L(b3): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + and cnd, %rdi + and cnd, %r9 + and cnd, %r10 + ADDSUB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sub $3, n + jnz L(top) + jmp L(end) + +L(x0): xor R32(%rax), R32(%rax) + test $2, R8(n) + jz L(top) + +L(b2): mov (vp), %rdi + mov 8(vp), %r9 + and cnd, %rdi + and cnd, %r9 + ADDSUB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sub $2, n + jnz L(top) + jmp L(end) + +L(b1): mov (vp), %rdi + and cnd, %rdi + ADDSUB (up), %rdi + mov %rdi, (rp) + sbb R32(%rax), R32(%rax) C save carry + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + dec n + jz L(end) + + ALIGN(16) +L(top): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + mov 24(vp), %r11 + lea 32(vp), vp + and cnd, %rdi + and cnd, %r9 + and cnd, %r10 + and cnd, %r11 + add R32(%rax), R32(%rax) C restore carry + ADCSBB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + ADCSBB 24(up), %r11 + lea 32(up), up + mov %r11, 24(rp) + lea 32(rp), rp + sbb R32(%rax), R32(%rax) C save carry + sub $4, n + jnz L(top) + +L(end): neg R32(%rax) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git 
a/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm new file mode 100644 index 0000000..f55492b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm @@ -0,0 +1,200 @@ +dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR 3.0
+C Intel NHM 2.75
+C Intel SBR 2.15
+C Intel IBR 1.96
+C Intel HWL 2.0
+C Intel BWL 1.65
+C Intel SKL 1.65
+C Intel atom
+C Intel SLM 4.5
+C VIA nano
+
+C NOTES
+C * It might seem natural to use the cmov insn here, but since this function
+C is supposed to have the exact same execution pattern for cnd true and
+C false, and since cmov's documentation is not clear about whether it
+C actually reads both source operands and writes the register for a false
+C condition, we cannot use it.
+C * Given that we have a dedicated cnd_add_n, it might look strange that this
+C file provides cnd_add_n and not just cnd_sub_n. But that's harmless, and
+C this file's generality might come in handy for some pipeline.
+
+C INPUT PARAMETERS
+C SysV AMD64 argument registers; the dnl annotations give the Win64 (DOS64)
+C homes of the same arguments.
+define(`cnd_arg', `%rdi') dnl rcx
+define(`rp', `%rsi') dnl rdx
+define(`up', `%rdx') dnl r8
+define(`vp', `%rcx') dnl r9
+define(`n', `%r8') dnl rsp+40
+
+C Mask register: all ones when cnd_arg is nonzero, otherwise zero.
+define(`cnd', `%rbx')
+
+dnl The build defines exactly one OPERATION_* symbol; it selects whether this
+dnl file assembles as mpn_cnd_add_n or mpn_cnd_sub_n.
+ifdef(`OPERATION_cnd_add_n',`
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func, mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n',`
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func, mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C func(cnd, rp, up, vp, n):
+C   {rp,n} = {up,n} ADDSUB ({vp,n} AND -(cnd != 0)),
+C returning the carry/borrow limb in %rax.  Masking every vp limb keeps the
+C instruction and memory-access pattern identical for both cnd values (see
+C NOTES above).
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), R32(%r8)')
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+
+ neg cnd_arg
+ sbb cnd, cnd C make cnd mask
+
+C Dispatch on n mod 4; L(b1)/L(b2)/L(b3) peel 1-3 limbs so the main loop
+C can run 4-way unrolled.  %rbp/%r12/%r13 hold up[] limbs so the ADDSUB
+C chain runs register-to-register (callee-saved, hence the pushes above).
+ test $1, R8(n)
+ jz L(x0)
+L(x1): test $2, R8(n)
+ jz L(b1)
+
+L(b3): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ and cnd, %rdi
+ mov (up), %r12
+ and cnd, %r9
+ mov 8(up), %r13
+ and cnd, %r10
+ mov 16(up), %rbp
+ ADDSUB %rdi, %r12
+ mov %r12, (rp)
+ ADCSBB %r9, %r13
+ mov %r13, 8(rp)
+ ADCSBB %r10, %rbp
+ mov %rbp, 16(rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 24(up), up
+ lea 24(vp), vp
+ lea 24(rp), rp
+ sub $3, n
+ jnz L(top)
+ jmp L(end)
+
+L(x0): xor R32(%rax), R32(%rax)
+ test $2, R8(n)
+ jz L(top)
+
+L(b2): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov (up), %r12
+ and cnd, %rdi
+ mov 8(up), %r13
+ and cnd, %r9
+ ADDSUB %rdi, %r12
+ mov %r12, (rp)
+ ADCSBB %r9, %r13
+ mov %r13, 8(rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 16(up), up
+ lea 16(vp), vp
+ lea 16(rp), rp
+ sub $2, n
+ jnz L(top)
+ jmp L(end)
+
+L(b1): mov (vp), %rdi
+ mov (up), %r12
+ and cnd, %rdi
+ ADDSUB %rdi, %r12
+ mov %r12, (rp)
+ sbb R32(%rax), R32(%rax) C save carry
+ lea 8(up), up
+ lea 8(vp), vp
+ lea 8(rp), rp
+ dec n
+ jz L(end)
+
+ ALIGN(16)
+C Main loop, 4 limbs/iteration.  The carry lives between iterations as a
+C 0/-1 mask in %rax: sbb saves it, add restores it into CF.
+L(top): mov (vp), %rdi
+ mov 8(vp), %r9
+ mov 16(vp), %r10
+ mov 24(vp), %r11
+ lea 32(vp), vp
+ and cnd, %rdi
+ mov (up), %r12
+ and cnd, %r9
+ mov 8(up), %r13
+ and cnd, %r10
+ mov 16(up), %rbp
+ and cnd, %r11
+ add R32(%rax), R32(%rax) C restore carry
+ mov 24(up), %rax C %rax is free after the restore; reuse it for limb 4
+ lea 32(up), up
+ ADCSBB %rdi, %r12
+ mov %r12, (rp)
+ ADCSBB %r9, %r13
+ mov %r13, 8(rp)
+ ADCSBB %r10, %rbp
+ mov %rbp, 16(rp)
+ ADCSBB %r11, %rax
+ mov %rax, 24(rp)
+ lea 32(rp), rp
+ sbb R32(%rax), R32(%rax) C save carry
+ sub $4, n
+ jnz L(top)
+
+L(end): neg R32(%rax) C convert 0/-1 carry mask to 0/1 return value
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm
new file mode 100644
index 0000000..d9f371f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_divrem_1
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl No Sandy Bridge specific divrem_1: reuse the generic x86_64 code, which
+dnl provides both the mpn_divrem_1 and mpn_preinv_divrem_1 entry points.
+MULFUNC_PROLOGUE(mpn_divrem_1 mpn_preinv_divrem_1)
+include_mpn(`x86_64/divrem_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl AMD64 mpn_gcd_11.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl No Sandy Bridge specific gcd_11: this CPU directory reuses the core2
+dnl implementation.
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h
new file mode 100644
index 0000000..36f4512
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h
@@ -0,0 +1,241 @@
+/* Sandy Bridge gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3400-3800 MHz Intel Xeon E3-1270 Sandy Bridge */ +/* FFT tuning limit = 468,152,320 */ +/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 30 + +#define DIV_1_VS_MUL_1_PERCENT 298 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 154 +#define MUL_TOOM6H_THRESHOLD 254 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 148 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 93 +#define SQR_TOOM4_THRESHOLD 248 +#define SQR_TOOM6_THRESHOLD 342 +#define SQR_TOOM8_THRESHOLD 462 + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 396 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 396, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 
15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 95,12}, { 31,11}, { 63,10}, { 135,11}, \ + { 79,10}, { 159, 9}, { 319,10}, { 167,11}, \ + { 95, 7}, { 1535, 8}, { 831,10}, { 223, 9}, \ + { 447,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 607,12}, { 319,11}, { 639,12}, \ + { 351,11}, { 703,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1087,12}, { 607,13}, { 319,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3071,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1919,16}, { 511,15}, \ + { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, 
{ 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4095,15}, { 8191,16}, { 4607,15}, { 9983,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 219 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 336 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 336, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 6}, \ + { 4351, 7}, { 2303, 8}, { 1215,12}, { 95,11}, \ + { 191,10}, { 383,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,10}, { 607,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 607,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \ + { 1279,13}, { 703,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ 
+ { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,14}, { 767,13}, { 1599,12}, \ + { 3199,13}, { 1663,14}, { 895,13}, { 1791,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ + { 1407,13}, { 2815,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1791,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 210 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 66 +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 172 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 92 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 170 +#define INV_APPR_THRESHOLD 167 + +#define BINV_NEWTON_THRESHOLD 228 +#define REDC_1_TO_REDC_2_THRESHOLD 36 +#define REDC_2_TO_REDC_N_THRESHOLD 55 + +#define MU_DIV_QR_THRESHOLD 1387 +#define MU_DIVAPPR_Q_THRESHOLD 1387 +#define MUPI_DIV_QR_THRESHOLD 77 +#define MU_BDIV_QR_THRESHOLD 1187 +#define MU_BDIV_Q_THRESHOLD 1442 + +#define POWM_SEC_TABLE 1,16,191,452,1297 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 1160 
+#define SET_STR_PRECOMPUTE_THRESHOLD 2043 + +#define FAC_DSC_THRESHOLD 426 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 5 /* 0.74% faster than 3 */ +#define HGCD_THRESHOLD 96 +#define HGCD_APPR_THRESHOLD 60 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 345 +#define JACOBI_BASE_METHOD 1 /* 32.22% faster than 4 */ + +/* Tuneup completed successfully, took 276198 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm new file mode 100644 index 0000000..a1cbc31 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl Sandy Bridge mpn_lshift defers to the fast-SSE variant built on
+dnl unaligned movdqu loads/stores.
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm
new file mode 100644
index 0000000..ac90edb
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_lshiftc optimised for Intel Sandy Bridge.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl mpn_lshiftc (shift left with one's-complement of the result) likewise
+dnl defers to the fast-SSE movdqu variant.
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm
new file mode 100644
index 0000000..a43a117
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm
@@ -0,0 +1,199 @@
+dnl X86-64 mpn_mul_1 optimised for Intel Sandy Bridge.
+ +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013, 2017 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD excavator +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 +C Intel NHM +C Intel SBR 2.49 +C Intel IBR 2.32 +C Intel HWL 2.44 +C Intel BWL 2.43 +C Intel SKL 2.47 +C Intel atom +C Intel SLM +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. 
+ +define(`rp', `%rdi') C rcx +define(`up_param',`%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 +define(`cin', `%r8') C stack + +define(`up', `%rsi') C same as rp_param +define(`n', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFDOS(` define(`rp', `%rcx')') +IFDOS(` define(`up_param',`%rdx')') +IFDOS(` define(`n_param', `%r8')') +IFDOS(` define(`v0', `%r9')') +IFDOS(` define(`cin', `48(%rsp)')') + +IFDOS(` define(`up', `%rsi')') +IFDOS(` define(`n', `%r8')') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1) +IFDOS(` push %rsi ') + mov (up_param), %rax +IFSTD(` mov n_param, n ') + lea (up_param,n_param,8), up + lea -8(rp,n_param,8), rp + neg n + mul v0 + + test $1, R8(n) + jz L(x0) +L(x1): mov %rax, %r11 + mov %rdx, %r10 + test $2, R8(n) + jnz L(01) + +L(11): mov 8(up,n,8), %rax + dec n + jmp L(L3) + +L(01): inc n + jnz L(L1) + mov %rax, (rp) + mov %rdx, %rax +IFDOS(` pop %rsi ') + ret + +L(x0): mov %rax, %r10 + mov %rdx, %r11 + mov 8(up,n,8), %rax + test $2, R8(n) + jz L(L0) + +L(10): add $-2, n + jmp L(L2) + + ALIGN(8) +L(top): mov %rdx, %r10 + add %rax, %r11 +L(L1): mov 0(up,n,8), %rax + adc $0, %r10 + mul v0 + add %rax, %r10 + mov %r11, 0(rp,n,8) + mov 8(up,n,8), %rax + mov %rdx, %r11 +L(L0c): adc $0, %r11 +L(L0): mul v0 + mov %r10, 8(rp,n,8) + add %rax, %r11 + mov %rdx, %r10 +L(L3c): mov 16(up,n,8), %rax + adc $0, %r10 +L(L3): mul v0 + mov %r11, 16(rp,n,8) + mov %rdx, %r11 + add %rax, %r10 +L(L2c): mov 24(up,n,8), %rax + adc $0, %r11 +L(L2): mul v0 + mov %r10, 24(rp,n,8) + add $4, n + jnc L(top) + +L(end): add %rax, %r11 + mov %rdx, %rax + adc $0, %rax + mov %r11, (rp) + +IFDOS(` pop %rsi ') + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mul_1c) +IFDOS(` push %rsi ') + mov (up_param), %rax +IFSTD(` mov n_param, n ') + lea (up_param,n_param,8), up + lea -8(rp,n_param,8), rp + neg n + mul v0 + + test $1, R8(n) + jz L(x0c) +L(x1c): mov %rax, %r11 + mov %rdx, %r10 + test $2, R8(n) + jnz L(01c) + +L(11c): add cin, %r11 + dec n + 
jmp L(L3c) + +L(01c): add cin, %r11 + inc n + jnz L(L1) + mov %r11, (rp) + mov %rdx, %rax + adc $0, %rax +IFDOS(` pop %rsi ') + ret + +L(x0c): mov %rax, %r10 + mov %rdx, %r11 + test $2, R8(n) + jz L(00c) + +L(10c): add $-2, n + add cin, %r10 + jmp L(L2c) + +L(00c): add cin, %r10 + mov 8(up,n,8), %rax + jmp L(L0c) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm new file mode 100644 index 0000000..781534d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm @@ -0,0 +1,167 @@ +dnl AMD64 mpn_mul_2 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 8.03 +C AMD K10 8.03 +C AMD bull 9.19 +C AMD pile 9.16 +C AMD steam +C AMD excavator +C AMD bobcat 10.6 +C AMD jaguar 11.0 +C Intel P4 26.0 +C Intel core2 8.73 +C Intel NHM 8.55 +C Intel SBR 5.15 +C Intel IBR 4.57 +C Intel HWL 4.08 +C Intel BWL 4.10 +C Intel SKL 4.14 +C Intel atom 39.5 +C Intel SLM 26.3 +C VIA nano + +C This code is the result of running a code generation and optimisation tool +C suite written by David Harvey and Torbjorn Granlund. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rbx') +define(`v1', `%rbp') + +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(b1) + +L(b0): mov $0, R32(n) + sub n_param, n + xor w0, w0 + mul v0 + mov %rax, w2 + mov %rdx, w1 + mov (up,n,8), %rax + jmp L(lo0) + +L(b1): mov $1, R32(n) + sub n_param, n + xor w2, w2 + mul v0 + mov %rax, w0 + mov %rdx, w3 + mov -8(up,n,8), %rax + mul v1 + jmp L(lo1) + + ALIGN(32) +L(top): mul v0 + add %rax, w0 C 1 + mov %rdx, w3 C 2 + adc $0, w3 C 2 + mov -8(up,n,8), %rax + mul v1 + add w1, w0 C 1 + adc $0, w3 C 2 +L(lo1): add %rax, w2 C 2 + mov w0, -8(rp,n,8) C 1 + mov %rdx, w0 C 3 + adc $0, w0 C 3 + mov (up,n,8), %rax + mul v0 + add %rax, w2 C 2 + mov %rdx, w1 C 3 + adc $0, w1 C 3 + add w3, w2 C 2 + mov (up,n,8), %rax + adc $0, w1 C 1 +L(lo0): mul v1 + mov w2, (rp,n,8) C 2 + add %rax, w0 C 3 + mov %rdx, w2 C 4 + mov 8(up,n,8), %rax + adc $0, w2 C 4 + add $2, n + jnc L(top) + +L(end): mul v0 + add 
%rax, w0 + mov %rdx, w3 + adc $0, w3 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, I(-8(rp),-8(rp,n,8)) + adc $0, %rdx + add w3, w2 + mov w2, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm new file mode 100644 index 0000000..35fd1cc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm @@ -0,0 +1,407 @@ +dnl AMD64 mpn_mul_basecase optimised for Intel Sandy bridge and Ivy bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR 2.5 2.5 - 2.95 +C Intel IBR 2.4 2.3 - 2.68 +C Intel HWL 2.35 2.0 - 2.5 +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Fix the addmul_2 fluctuation affecting SBR. +C * Improve feed-in code, avoiding zeroing of many registers and dummy adds in +C the loops at the expense of code size. +C * Adjoin a mul_3, avoiding slow mul_1 for odd vn. +C * Consider replacing the 2-way mul_2 code with 4-way code, for a very slight +C speedup. +C * Further micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`un', `%rbx') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + mov un_param, un C free up rdx + neg un + + mov (up), %rax C shared for mul_1 and mul_2 + lea (up,un_param,8), up C point at operand end + lea (rp,un_param,8), rp C point at rp[un-1] + + mov (vp), v0 C shared for mul_1 and mul_2 + mul v0 C shared for mul_1 and mul_2 + + test $1, R8(vn) + jz L(do_mul_2) + +L(do_mul_1): + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ... + mov %rdx, w1 + mov 8(up,un,8), %rax + test $2, R8(un) + jnz L(m110) + +L(m100):lea 2(un), n C un = 4, 8, 12, ... + jmp L(m1l0) + +L(m110):lea (un), n C un = 2, 6, 10, ... 
+ jmp L(m1l2) + +L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ... + mov %rdx, w0 + test $2, R8(un) + jz L(m111) + +L(m101):lea 3(un), n C un = 1, 5, 9, ... + test n, n + js L(m1l1) + mov %rax, -8(rp) + mov %rdx, (rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(m111):lea 1(un), n C un = 3, 7, 11, ... + mov 8(up,un,8), %rax + jmp L(m1l3) + + ALIGN(16) C FIXME +L(m1tp):mov %rdx, w0 + add %rax, w1 +L(m1l1):mov -16(up,n,8), %rax + adc $0, w0 + mul v0 + add %rax, w0 + mov w1, -24(rp,n,8) + mov -8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(m1l0):mul v0 + mov w0, -16(rp,n,8) + add %rax, w1 + mov %rdx, w0 + mov (up,n,8), %rax + adc $0, w0 +L(m1l3):mul v0 + mov w1, -8(rp,n,8) + mov %rdx, w1 + add %rax, w0 + mov 8(up,n,8), %rax + adc $0, w1 +L(m1l2):mul v0 + mov w0, (rp,n,8) + add $4, n + jnc L(m1tp) + +L(m1ed):add %rax, w1 + adc $0, %rdx + mov w1, I(-8(rp),-24(rp,n,8)) + mov %rdx, I((rp),-16(rp,n,8)) + + dec R32(vn) + jz L(ret2) + + lea 8(vp), vp + lea 8(rp), rp + push %r12 + push %r13 + push %r14 + jmp L(do_addmul) + +L(do_mul_2): +define(`v1', `%r14') + push %r12 + push %r13 + push %r14 + + mov 8(vp), v1 + + test $1, R8(un) + jnz L(m2b1) + +L(m2b0):lea (un), n + xor w0, w0 + mov %rax, w2 + mov %rdx, w1 + jmp L(m2l0) + +L(m2b1):lea 1(un), n + xor w1, w1 + xor w2, w2 + mov %rax, w0 + mov %rdx, w3 + jmp L(m2l1) + + ALIGN(32) +L(m2tp):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 +L(m2l1):mov -8(up,n,8), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, -8(rp,n,8) + mov %rdx, w0 + adc $0, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w2 + mov %rdx, w1 + adc $0, w1 + add w3, w2 +L(m2l0):mov (up,n,8), %rax + adc $0, w1 + mul v1 + mov w2, (rp,n,8) + add %rax, w0 + mov %rdx, w2 + mov 8(up,n,8), %rax + adc $0, w2 + add $2, n + jnc L(m2tp) + +L(m2ed):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, I(-8(rp),-8(rp,n,8)) + adc $0, %rdx + add w3, w2 + mov w2, I((rp),(rp,n,8)) 
+ adc $0, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + add $-2, R32(vn) + jz L(ret5) + lea 16(vp), vp + lea 16(rp), rp + + +L(do_addmul): + push %r15 + push vn C save vn in new stack slot +define(`vn', `(%rsp)') +define(`X0', `%r14') +define(`X1', `%r15') +define(`v1', `%r8') + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + mov (up,un,8), %rax + mul v0 + test $1, R8(un) + jnz L(a1x1) + +L(a1x0):mov (rp,un,8), X0 + xor w0, w0 + mov %rdx, w1 + test $2, R8(un) + jnz L(a110) + +L(a100):lea 2(un), n C un = 4, 8, 12, ... + add %rax, X0 + adc $0, w1 + mov (up,un,8), %rax + mul v1 + mov 8(rp,un,8), X1 + jmp L(lo0) + +L(a110):lea (un), n C un = 2, 6, 10, ... + xor w3, w3 + jmp L(lo2) + +L(a1x1):mov (rp,un,8), X1 + xor w2, w2 + xor w1, w1 + test $2, R8(un) + jz L(a111) + +L(a101):lea 3(un), n C un = 1, 5, 9, ... + mov %rdx, w3 + add %rax, X1 + mov (up,un,8), %rax + mov 8(rp,un,8), X0 + adc $0, w3 + jmp L(top) + +L(a111):lea 1(un), n C un = 3, 7, 11, ... + jmp L(lo3) + + ALIGN(32) +L(top): mul v1 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + add w1, X1 + adc $0, w3 + add w2, X0 + adc $0, w0 + mov -16(up,n,8), %rax + mul v0 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + mov -16(up,n,8), %rax + mul v1 + mov X1, -24(rp,n,8) + mov -8(rp,n,8), X1 + add w3, X0 + adc $0, w1 +L(lo0): mov %rdx, w2 + mov X0, -16(rp,n,8) + add %rax, X1 + adc $0, w2 + mov -8(up,n,8), %rax + add w0, X1 + adc $0, w2 + mul v0 +L(lo3): add %rax, X1 + mov %rdx, w3 + adc $0, w3 + mov -8(up,n,8), %rax + mul v1 + add w1, X1 + mov (rp,n,8), X0 + adc $0, w3 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X1, -8(rp,n,8) + mov %rdx, w1 + adc $0, w0 +L(lo2): add %rax, X0 + adc $0, w1 + mov (up,n,8), %rax + add w3, X0 + adc $0, w1 + mul v1 + mov 8(rp,n,8), X1 + add %rax, X1 + mov %rdx, w2 + adc $0, w2 + mov 8(up,n,8), %rax + mov X0, (rp,n,8) + mul v0 + add w0, X1 + mov %rdx, w3 + adc $0, w2 + add %rax, X1 + mov 8(up,n,8), %rax + mov 16(rp,n,8), X0 C useless but harmless in final iter + 
adc $0, w3 + add $4, n + jnc L(top) + +L(end): mul v1 + add w1, X1 + adc $0, w3 + add w2, %rax + adc $0, %rdx + mov X1, I(-8(rp),-24(rp,n,8)) + add w3, %rax + adc $0, %rdx + mov %rax, I((rp),-16(rp,n,8)) + mov %rdx, I(8(rp),-8(rp,n,8)) + + addl $-2, vn + lea 16(vp), vp + lea 16(rp), rp + jnz L(outer) + + pop %rax C deallocate vn slot + pop %r15 +L(ret5):pop %r14 + pop %r13 + pop %r12 +L(ret2):pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm new file mode 100644 index 0000000..a41a8ac --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm @@ -0,0 +1,384 @@ +dnl AMD64 mpn_mullo_basecase optimised for Intel Sandy bridge and Ivy bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+
+include(`../config.m4')
+
+C	    cycles/limb	mul_2		addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR	 2.5		 2.95
+C Intel IBR	 2.3		 2.68
+C Intel HWL	 2.0		 2.5
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C  * Implement proper cor2, replacing current cor0.
+C  * Offset n by 2 in order to avoid the outer loop cmp.  (And sqr_basecase?)
+C  * Micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp',	   `%rdi')
+define(`up',	   `%rsi')
+define(`vp_param', `%rdx')
+define(`n',	   `%rcx')
+
+define(`vp',	   `%r8')
+define(`X0',	   `%r14')
+define(`X1',	   `%r15')
+
+define(`w0',	`%r10')
+define(`w1',	`%r11')
+define(`w2',	`%r12')
+define(`w3',	`%r13')
+define(`i',	`%rbp')
+define(`v0',	`%r9')
+define(`v1',	`%rbx')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C mpn_mullo_basecase(rp, up, vp, n): store the n least-significant limbs of
+C {up,n} * {vp,n} at {rp,n}.  NOTE(review): contract inferred from the
+C function name and the n<4 straight-line cases below — confirm against
+C gmp-impl.h.  n < 4 is handled by dedicated code at L(small); otherwise a
+C mul_2 pass (two v limbs at a time) is followed by addmul_2 passes, and the
+C top limbs are finished with imul (no high product needed) in the corner
+C code at L(cor1)/L(cor0).  rp and up are advanced past the operand ends and
+C n negated, so index registers count up toward zero (jnc loop exits).
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+	FUNC_ENTRY(4)
+
+	mov	(up), %rax
+	mov	vp_param, vp
+
+	cmp	$4, n
+	jb	L(small)
+
+	mov	(vp_param), v0
+	push	%rbx
+	lea	(rp,n,8), rp	C point rp at R[un]
+	push	%rbp
+	lea	(up,n,8), up	C point up right after U's end
+	push	%r12
+	neg	n
+	push	%r13
+	mul	v0
+	mov	8(vp), v1
+
+	test	$1, R8(n)
+	jnz	L(m2b1)
+
+L(m2b0):lea	(n), i
+	xor	w0, w0
+	mov	%rax, w2
+	mov	%rdx, w1
+	jmp	L(m2l0)
+
+L(m2b1):lea	1(n), i
+	xor	w1, w1
+	xor	w2, w2
+	mov	%rax, w0
+	mov	%rdx, w3
+	jmp	L(m2l1)
+
+C mul_2 loop: R <- U * {v0,v1}, two u limbs per iteration.
+	ALIGN(32)
+L(m2tp):mul	v0
+	add	%rax, w0
+	mov	%rdx, w3
+	adc	$0, w3
+L(m2l1):mov	-8(up,i,8), %rax
+	mul	v1
+	add	w1, w0
+	adc	$0, w3
+	add	%rax, w2
+	mov	w0, -8(rp,i,8)
+	mov	%rdx, w0
+	adc	$0, w0
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	%rdx, w1
+	adc	$0, w1
+	add	w3, w2
+L(m2l0):mov	(up,i,8), %rax
+	adc	$0, w1
+	mul	v1
+	mov	w2, (rp,i,8)
+	add	%rax, w0
+	mov	%rdx, w2	C FIXME: dead in last iteration
+	mov	8(up,i,8), %rax
+	adc	$0, w2		C FIXME: dead in last iteration
+	add	$2, i
+	jnc	L(m2tp)
+
+C mul_2 wind-down: only the low product limb is needed, hence imul.
+L(m2ed):imul	v0, %rax
+	add	w0, %rax
+	add	w1, %rax
+	mov	%rax, I(-8(rp),-8(rp,i,8))
+
+	add	$2, n
+	lea	16(vp), vp
+	lea	-16(up), up
+	cmp	$-2, n
+	jge	L(cor1)
+
+	push	%r14
+	push	%r15
+
+C Outer loop: each pass consumes two more v limbs (addmul_2 style) and one
+C fewer pair of result limbs (n moves toward zero by 2).
+L(outer):
+	mov	(vp), v0
+	mov	8(vp), v1
+	mov	(up,n,8), %rax
+	mul	v0
+	test	$1, R8(n)
+	jnz	L(a1x1)
+
+L(a1x0):mov	(rp,n,8), X1
+	xor	w2, w2
+	xor	w1, w1
+	test	$2, R8(n)
+	jnz	L(a110)
+
+L(a100):lea	1(n), i
+	jmp	L(lo0)
+
+L(a110):lea	3(n), i
+	mov	%rdx, w3
+	add	%rax, X1
+	mov	(up,n,8), %rax
+	mov	8(rp,n,8), X0
+	adc	$0, w3
+	jmp	L(lo2)
+
+L(a1x1):mov	(rp,n,8), X0
+	xor	w0, w0
+	mov	%rdx, w1
+	test	$2, R8(n)
+	jz	L(a111)
+
+L(a101):lea	2(n), i
+	add	%rax, X0
+	adc	$0, w1
+	mov	(up,n,8), %rax
+	mul	v1
+	mov	8(rp,n,8), X1
+	jmp	L(lo1)
+
+L(a111):lea	(n), i
+	xor	w3, w3
+	jmp	L(lo3)
+
+C addmul_2 loop: R += U * {v0,v1}, four u limbs per iteration, entered at
+C L(lo0)..L(lo3) according to n mod 4.
+	ALIGN(32)
+L(top):
+L(lo2):	mul	v1
+	mov	%rdx, w0
+	add	%rax, X0
+	adc	$0, w0
+	add	w1, X1
+	adc	$0, w3
+	add	w2, X0
+	adc	$0, w0
+	mov	-16(up,i,8), %rax
+	mul	v0
+	add	%rax, X0
+	mov	%rdx, w1
+	adc	$0, w1
+	mov	-16(up,i,8), %rax
+	mul	v1
+	mov	X1, -24(rp,i,8)
+	mov	-8(rp,i,8), X1
+	add	w3, X0
+	adc	$0, w1
+L(lo1):	mov	%rdx, w2
+	mov	X0, -16(rp,i,8)
+	add	%rax, X1
+	adc	$0, w2
+	mov	-8(up,i,8), %rax
+	add	w0, X1
+	adc	$0, w2
+	mul	v0
+L(lo0):	add	%rax, X1
+	mov	%rdx, w3
+	adc	$0, w3
+	mov	-8(up,i,8), %rax
+	mul	v1
+	add	w1, X1
+	mov	(rp,i,8), X0
+	adc	$0, w3
+	mov	%rdx, w0
+	add	%rax, X0
+	adc	$0, w0
+	mov	(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	mov	X1, -8(rp,i,8)
+	mov	%rdx, w1
+	adc	$0, w0
+L(lo3):	add	%rax, X0
+	adc	$0, w1
+	mov	(up,i,8), %rax
+	add	w3, X0
+	adc	$0, w1
+	mul	v1
+	mov	8(rp,i,8), X1
+	add	%rax, X1
+	mov	%rdx, w2
+	adc	$0, w2
+	mov	8(up,i,8), %rax
+	mov	X0, (rp,i,8)
+	mul	v0
+	add	w0, X1
+	mov	%rdx, w3
+	adc	$0, w2
+	add	%rax, X1
+	mov	8(up,i,8), %rax
+	mov	16(rp,i,8), X0
+	adc	$0, w3
+	add	$4, i
+	jnc	L(top)
+
+C addmul_2 wind-down: top two result limbs need only low products (imul).
+L(end):	imul	v1, %rax
+	add	%rax, X0
+	add	w1, X1
+	adc	$0, w3
+	add	w2, X0
+	mov	I(-8(up),-16(up,i,8)), %rax
+	imul	v0, %rax
+	add	X0, %rax
+	mov	X1, I(-16(rp),-24(rp,i,8))
+	add	w3, %rax
+	mov	%rax, I(-8(rp),-16(rp,i,8))
+
+	add	$2, n
+	lea	16(vp), vp
+	lea	-16(up), up
+	cmp	$-2, n
+	jl	L(outer)
+
+	pop	%r15
+	pop	%r14
+
+	jnz	L(cor0)
+
+C Corner for 2 remaining result limbs.
+L(cor1):mov	(vp), v0
+	mov	8(vp), v1
+	mov	-16(up), %rax
+	mul	v0			C u0 x v2
+	add	-16(rp), %rax		C FIXME: rp[0] still available in reg?
+	adc	-8(rp), %rdx		C FIXME: rp[1] still available in reg?
+	mov	-8(up), %r10
+	imul	v0, %r10
+	mov	-16(up), %r11
+	imul	v1, %r11
+	mov	%rax, -16(rp)
+	add	%r10, %r11
+	add	%rdx, %r11
+	mov	%r11, -8(rp)
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+C Corner for 1 remaining result limb.
+L(cor0):mov	(vp), %r11
+	imul	-8(up), %r11
+	add	%rax, %r11
+	mov	%r11, -8(rp)
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+C Straight-line code for n = 1, 2, 3.
+	ALIGN(16)
+L(small):
+	cmp	$2, n
+	jae	L(gt1)
+L(n1):	imul	(vp_param), %rax
+	mov	%rax, (rp)
+	FUNC_EXIT()
+	ret
+L(gt1):	ja	L(gt2)
+L(n2):	mov	(vp_param), %r9
+	mul	%r9
+	mov	%rax, (rp)
+	mov	8(up), %rax
+	imul	%r9, %rax
+	add	%rax, %rdx
+	mov	8(vp), %r9
+	mov	(up), %rcx
+	imul	%r9, %rcx
+	add	%rcx, %rdx
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+L(gt2):
+L(n3):	mov	(vp_param), %r9
+	mul	%r9		C u0 x v0
+	mov	%rax, (rp)
+	mov	%rdx, %r10
+	mov	8(up), %rax
+	mul	%r9		C u1 x v0
+	imul	16(up), %r9	C u2 x v0
+	add	%rax, %r10
+	adc	%rdx, %r9
+	mov	8(vp), %r11
+	mov	(up), %rax
+	mul	%r11		C u0 x v1
+	add	%rax, %r10
+	adc	%rdx, %r9
+	imul	8(up), %r11	C u1 x v1
+	add	%r11, %r9
+	mov	%r10, 8(rp)
+	mov	16(vp), %r10
+	mov	(up), %rax
+	imul	%rax, %r10	C u0 x v2
+	add	%r10, %r9
+	mov	%r9, 16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm
new file mode 100644
index 0000000..f0dbe07
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm
@@ -0,0 +1,546 @@
+dnl X86-64 mpn_redc_1
optimised for Intel Sandy Bridge and Ivy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR 3.24 +C Intel IBR 3.04 +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
+define(`I',`$1')
+
+define(`rp',	      `%rdi')	C rcx
+define(`up',	      `%rsi')	C rdx
+define(`mp_param',    `%rdx')	C r8
+define(`n',	      `%rcx')	C r9
+define(`u0inv',	      `%r8')	C stack
+
+define(`i',	      `%r14')
+define(`j',	      `%r15')
+define(`mp',	      `%r12')
+define(`q0',	      `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+C mpn_redc_1(rp, up, mp, n, u0inv): Montgomery reduction (REDC) with a
+C one-limb inverse.  NOTE(review): semantics inferred from the standard
+C mpn_redc_1 contract (U = {up,2n}, M = {mp,n}, u0inv = -1/M[0] mod B,
+C result U/B^n mod M written via the final mpn_add_n) — confirm against
+C gmp-impl.h.  The outer loop is specialised four ways on n mod 4
+C (L(otp0)..L(otp3)); each inner loop (L(tp0)..L(tp3)) processes four limbs
+C per iteration, and n = 1, 2, 3 get dedicated code at L(n1)/L(n2)/L(n3).
+C The q limb for the NEXT outer pass is computed early (imul u0inv) to hide
+C its latency behind the current pass.
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_redc_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')	C u0inv arrives on the stack on Win64
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(up), q0
+	mov	n, j		C outer loop induction var
+	lea	8(mp_param,n,8), mp
+	lea	8(up,n,8), up
+	neg	n
+	imul	u0inv, q0	C first iteration q0
+
+	test	$1, R8(n)
+	jz	L(bx0)
+
+L(bx1):	test	$2, R8(n)
+	jz	L(b3)
+
+L(b1):	cmp	$-1, R32(n)
+	jz	L(n1)
+
+L(otp1):lea	1(n), i
+	mov	-8(mp,n,8), %rax
+	mul	q0
+	mov	-8(up,n,8), %r10
+	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	(mp,n,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	%rdx, %r9
+	mov	(up,n,8), %rbx
+	add	%rax, %rbx
+	adc	$0, %r9
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	(up,i,8), %r10
+	add	%r11, %rbx
+	mov	%rbx, -8(up,i,8)	C next low remainder limb
+	adc	$0, %r9
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e1)
+
+	ALIGNx
+L(tp1):	mul	q0
+	mov	-16(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%rbp, -24(up,i,8)
+	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	-8(up,i,8), %rbp
+	adc	$0, %r11
+	mov	%r10, -16(up,i,8)
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rbp, -8(up,i,8)
+	adc	$0, %r9
+L(e1):	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	8(up,i,8), %rbp
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	%r10, (up,i,8)
+	adc	$0, %r11
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	16(mp,i,8), %rax
+	add	$4, i
+	jnc	L(tp1)
+
+L(ed1):	mul	q0
+	mov	I(-16(up),-16(up,i,8)), %r10
+	add	%r11, %rbp
+	adc	$0, %r9
+	mov	%rbp, I(-24(up),-24(up,i,8))
+	add	%rax, %r10
+	adc	$0, %rdx
+	add	%r9, %r10
+	adc	$0, %rdx
+	mov	%r10, I(-16(up),-16(up,i,8))
+	mov	%rdx, -8(up,n,8)	C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp1)
+	jmp	L(cj)
+
+L(b3):	cmp	$-3, R32(n)
+	jz	L(n3)
+
+L(otp3):lea	3(n), i
+	mov	-8(mp,n,8), %rax
+	mul	q0
+	mov	-8(up,n,8), %r10
+	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	(mp,n,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	(up,n,8), %rbx
+	mov	%rdx, %r9
+	add	%rax, %rbx
+	adc	$0, %r9
+	mov	8(mp,n,8), %rax
+	mul	q0
+	mov	8(up,n,8), %r10
+	add	%r11, %rbx
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%rbx, (up,n,8)
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e3)
+
+	ALIGNx
+L(tp3):	mul	q0
+	mov	-16(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%rbp, -24(up,i,8)
+L(e3):	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	-8(up,i,8), %rbp
+	adc	$0, %r11
+	mov	%r10, -16(up,i,8)
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rbp, -8(up,i,8)
+	adc	$0, %r9
+	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	8(up,i,8), %rbp
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	%r10, (up,i,8)
+	adc	$0, %r11
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	16(mp,i,8), %rax
+	add	$4, i
+	jnc	L(tp3)
+
+L(ed3):	mul	q0
+	mov	I(-16(up),-16(up,i,8)), %r10
+	add	%r11, %rbp
+	adc	$0, %r9
+	mov	%rbp, I(-24(up),-24(up,i,8))
+	add	%rax, %r10
+	adc	$0, %rdx
+	add	%r9, %r10
+	adc	$0, %rdx
+	mov	%r10, I(-16(up),-16(up,i,8))
+	mov	%rdx, -8(up,n,8)	C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp3)
+C	jmp	L(cj)
+
+C All outer passes done: fold the high half into rp with mpn_add_n.
+L(cj):
+IFSTD(`	lea	-8(up,n,8), up		C param 2: up
+	lea	(up,n,8), %rdx		C param 3: up - n
+	neg	R32(n)		')	C param 4: n
+
+IFDOS(`	lea	-8(up,n,8), %rdx	C param 2: up
+	lea	(%rdx,n,8), %r8		C param 3: up - n
+	neg	R32(n)
+	mov	n, %r9			C param 4: n
+	mov	rp, %rcx	')	C param 1: rp
+
+IFSTD(`	sub	$8, %rsp	')	C re-align stack to 16 for the call
+IFDOS(`	sub	$40, %rsp	')	C shadow space + alignment on Win64
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_add_n)
+IFSTD(`	add	$8, %rsp	')
+IFDOS(`	add	$40, %rsp	')
+
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(bx0):	test	$2, R8(n)
+	jnz	L(b2)
+
+L(b0):
+L(otp0):lea	(n), i
+	mov	-8(mp,n,8), %rax
+	mul	q0
+	mov	%rdx, %r9
+	mov	-8(up,n,8), %rbp
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	(up,n,8), %rbx
+	mov	%rdx, %r11
+	add	%rax, %rbx
+	mov	8(mp,n,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	8(up,n,8), %rbp
+	add	%r9, %rbx
+	mov	%rdx, %r9
+	mov	%rbx, (up,n,8)
+	adc	$0, %r11
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e0)
+
+	ALIGNx
+L(tp0):	mul	q0
+	mov	-16(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%rbp, -24(up,i,8)
+	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	-8(up,i,8), %rbp
+	adc	$0, %r11
+	mov	%r10, -16(up,i,8)
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rbp, -8(up,i,8)
+	adc	$0, %r9
+	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	8(up,i,8), %rbp
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	%r10, (up,i,8)
+	adc	$0, %r11
+L(e0):	add	%rax, %rbp
+	adc	$0, %r9
+	mov	16(mp,i,8), %rax
+	add	$4, i
+	jnc	L(tp0)
+
+L(ed0):	mul	q0
+	mov	I(-16(up),-16(up,i,8)), %r10
+	add	%r11, %rbp
+	adc	$0, %r9
+	mov	%rbp, I(-24(up),-24(up,i,8))
+	add	%rax, %r10
+	adc	$0, %rdx
+	add	%r9, %r10
+	adc	$0, %rdx
+	mov	%r10, I(-16(up),-16(up,i,8))
+	mov	%rdx, -8(up,n,8)	C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp0)
+	jmp	L(cj)
+
+L(b2):	cmp	$-2, R32(n)
+	jz	L(n2)
+
+L(otp2):lea	2(n), i
+	mov	-8(mp,n,8), %rax
+	mul	q0
+	mov	-8(up,n,8), %rbp
+	mov	%rdx, %r9
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	(up,n,8), %rbx
+	mov	%rdx, %r11
+	add	%rax, %rbx
+	mov	8(mp,n,8), %rax
+	adc	$0, %r11
+	mul	q0
+	add	%r9, %rbx
+	mov	%rdx, %r9
+	mov	8(up,n,8), %rbp
+	adc	$0, %r11
+	mov	%rbx, (up,n,8)
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e2)
+
+	ALIGNx
+L(tp2):	mul	q0
+	mov	-16(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%rbp, -24(up,i,8)
+	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	-8(up,i,8), %rbp
+	adc	$0, %r11
+	mov	%r10, -16(up,i,8)
+L(e2):	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rbp, -8(up,i,8)
+	adc	$0, %r9
+	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	8(up,i,8), %rbp
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	%r10, (up,i,8)
+	adc	$0, %r11
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	16(mp,i,8), %rax
+	add	$4, i
+	jnc	L(tp2)
+
+L(ed2):	mul	q0
+	mov	I(-16(up),-16(up,i,8)), %r10
+	add	%r11, %rbp
+	adc	$0, %r9
+	mov	%rbp, I(-24(up),-24(up,i,8))
+	add	%rax, %r10
+	adc	$0, %rdx
+	add	%r9, %r10
+	adc	$0, %rdx
+	mov	%r10, I(-16(up),-16(up,i,8))
+	mov	%rdx, -8(up,n,8)	C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp2)
+	jmp	L(cj)
+
+C Special cases for tiny n; %rax returns the final carry at L(ret).
+L(n1):	mov	(mp_param), %rax
+	mul	q0
+	add	-16(up), %rax
+	adc	-8(up), %rdx
+	mov	%rdx, (rp)
+	mov	$0, R32(%rax)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+L(n2):	mov	(mp_param), %rax
+	mov	-24(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-16(mp), %rax
+	mov	-16(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, q0
+	imul	u0inv, q0		C next q0
+	mov	-24(mp), %rax
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-16(mp), %rax
+	mov	-8(up), %r14
+	mul	q0
+	add	%rax, %r14
+	adc	$0, %rdx
+	add	%r9, %r14
+	adc	$0, %rdx
+	xor	R32(%rax), R32(%rax)
+	add	%r11, %r14
+	adc	(up), %rdx
+	mov	%r14, (rp)
+	mov	%rdx, 8(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+	ALIGNx
+L(n3):	mov	-32(mp), %rax
+	mov	-32(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	-24(mp), %rax
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	-24(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-16(mp), %rax
+	add	%r11, %rbp
+	mov	-16(up), %r10
+	adc	$0, %r9
+	mul	q0
+	mov	%rbp, q0
+	imul	u0inv, q0		C next q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	%rbp, -24(up)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, -16(up)
+	mov	%r11, -32(up)		C up[0]
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(n3)
+	jmp	L(cj)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm
new file mode 100644
index 0000000..fd2eaea
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm
@@ -0,0 +1,193 @@
+dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Sandy Bridge.
+
+dnl Copyright 2003, 2005, 2009-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 4.25
+C Intel P4	21.5
+C Intel core2	 3.2
+C Intel NHM	 3.87
+C Intel SBR	 2.05
+C Intel atom	 ?
+C VIA nano	44.9
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n',  `%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+	define(ADDSUB,	      add)
+	define(ADCSBB,	      adc)
+	define(func_n,	      mpn_rsh1add_n)
+	define(func_nc,	      mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+	define(ADDSUB,	      sub)
+	define(ADCSBB,	      sbb)
+	define(func_n,	      mpn_rsh1sub_n)
+	define(func_nc,	      mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C func_n(rp, up, vp, n): R[] = (U[] op V[]) >> 1, where op is add or sub as
+C selected above by m4 at build time; the bit shifted out (low bit of
+C U[0] op V[0]) is returned in %rax.  func_nc additionally takes a carry-in
+C in %r8 (Win64: 5th stack slot), injected into the flags via neg.
+C The carry of the add/sub chain is parked in %rbx between limb groups
+C (sbb %rbx,%rbx saves it as 0/-1; add %rbx,%rbx restores CF), because the
+C interleaved shrd instructions would otherwise clobber it.  The loop
+C handles four limbs per iteration; n mod 4 is peeled off up front.
+ASM_START()
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbx
+	push	%rbp
+
+	neg	%r8			C set C flag from parameter
+	mov	(up), %rbp
+	ADCSBB	(vp), %rbp
+
+	jmp	L(ent)
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	(up), %rbp
+	ADDSUB	(vp), %rbp
+L(ent):
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+	mov	R32(%rbp), R32(%rax)
+	and	$1, R32(%rax)		C return value
+
+	mov	R32(n), R32(%r11)
+	and	$3, R32(%r11)
+
+	cmp	$1, R32(%r11)
+	je	L(do)			C jump if n = 1 5 9 ...
+
+L(n1):	cmp	$2, R32(%r11)
+	jne	L(n2)			C jump unless n = 2 6 10 ...
+	add	R32(%rbx), R32(%rbx)	C restore cy
+	mov	8(up), %r10
+	ADCSBB	8(vp), %r10
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	8(rp), rp
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+
+	shrd	$1, %r10, %rbp
+	mov	%rbp, -8(rp)
+	jmp	L(cj1)
+
+L(n2):	cmp	$3, R32(%r11)
+	jne	L(n3)			C jump unless n = 3 7 11 ...
+	add	R32(%rbx), R32(%rbx)	C restore cy
+	mov	8(up), %r9
+	mov	16(up), %r10
+	ADCSBB	8(vp), %r9
+	ADCSBB	16(vp), %r10
+	lea	16(up), up
+	lea	16(vp), vp
+	lea	16(rp), rp
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+
+	shrd	$1, %r9, %rbp
+	mov	%rbp, -16(rp)
+	jmp	L(cj2)
+
+L(n3):	dec	n			C come here for n = 4 8 12 ...
+	add	R32(%rbx), R32(%rbx)	C restore cy
+	mov	8(up), %r8
+	mov	16(up), %r9
+	ADCSBB	8(vp), %r8
+	ADCSBB	16(vp), %r9
+	mov	24(up), %r10
+	ADCSBB	24(vp), %r10
+	lea	24(up), up
+	lea	24(vp), vp
+	lea	24(rp), rp
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+
+	shrd	$1, %r8, %rbp
+	mov	%rbp, -24(rp)
+	shrd	$1, %r9, %r8
+	mov	%r8, -16(rp)
+L(cj2):	shrd	$1, %r10, %r9
+	mov	%r9, -8(rp)
+L(cj1):	mov	%r10, %rbp
+
+L(do):
+	shr	$2, n			C 4
+	je	L(end)			C 2
+	ALIGN(16)
+C Main loop: four limbs per iteration; %rbp carries the limb awaiting its
+C top bit from the next group.
+L(top):	add	R32(%rbx), R32(%rbx)	C restore cy
+
+	mov	8(up), %r8
+	mov	16(up), %r9
+	ADCSBB	8(vp), %r8
+	ADCSBB	16(vp), %r9
+	mov	24(up), %r10
+	mov	32(up), %r11
+	ADCSBB	24(vp), %r10
+	ADCSBB	32(vp), %r11
+
+	lea	32(up), up
+	lea	32(vp), vp
+
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+
+	shrd	$1, %r8, %rbp
+	mov	%rbp, (rp)
+	shrd	$1, %r9, %r8
+	mov	%r8, 8(rp)
+	shrd	$1, %r10, %r9
+	mov	%r9, 16(rp)
+	shrd	$1, %r11, %r10
+	mov	%r10, 24(rp)
+
+	dec	n
+	mov	%r11, %rbp
+	lea	32(rp), rp
+	jne	L(top)
+
+C Final limb: shift in the saved carry (0 or -1 in %rbx -> its low bit).
+L(end):	shrd	$1, %rbx, %rbp
+	mov	%rbp, (rp)
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm
new file mode 100644
index 0000000..4c1c0d4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_rshift optimised for Intel Sandy Bridge.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C This file contains no code of its own: for Sandy Bridge, mpn_rshift is
+C provided by the shared fast-SSE movdqu-based implementation pulled in below.
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_sec_tabselect.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C This file contains no code of its own: mpn_sec_tabselect is provided by
+C the shared fast-SSE implementation pulled in below.
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm
new file mode 100644
index 0000000..46a3612
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm
@@ -0,0 +1,484 @@
+dnl AMD64 mpn_sqr_basecase optimised for Intel Sandy bridge and Ivy bridge.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	    cycles/limb	 mul_2		addmul_2	sqr_diag_addlsh1
+C AMD K8,K9	 ?		 ?		 ?
+C AMD K10	 ?		 ?		 ?
+C AMD bull	 ?		 ?		 ?
+C AMD pile	 ?		 ?		 ?
+C AMD steam	 ?		 ?		 ?
+C AMD bobcat	 ?		 ?		 ?
+C AMD jaguar	 ?		 ?		 ?
+C Intel P4	 ?		 ?		 ?
+C Intel core	 ?		 ?		 ?
+C Intel NHM	 ?		 ?		 ?
+C Intel SBR	 2.57		 2.93		 3.0
+C Intel IBR	 2.35		 2.66		 3.0
+C Intel HWL	 2.02		 2.5		 2.5
+C Intel BWL	 ?		 ?
? +C Intel atom ? ? ? +C VIA nano ? ? ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund, except +C that the sqr_diag_addlsh1 loop was manually written. + +C TODO +C * Replace current unoptimised sqr_diag_addlsh1 loop, 2.5 c/l should be easy. +C * Streamline pointer updates. +C * Perhaps suppress a few more xor insns in feed-in code. +C * Make sure we write no dead registers in feed-in code. +C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch +C out for negative sizes being zero-extended, though. +C * The straight-line code for n <= 3 comes from the K8 code, and might be +C quite sub-optimal here. Write specific code, and add code for n = 4. +C * The mul_2 loop has a 10 insn common sequence in the loop start and the +C wind-down code. Try re-rolling it. +C * This file has been the subject to just basic micro-optimisation. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
C The function below is mpn_sqr_basecase:
C
C   void mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
C
C Compute the 2n-limb square R = U^2 of the n-limb operand at up, storing
C it at rp.  Sizes n = 1, 2, 3 are handled by straight-line code.  For
C n >= 4 the off-diagonal products are accumulated with one mul_2 pass
C followed by addmul_2 passes over successively shorter operand tails,
C after which L(sqr_diag_addlsh1) doubles that triangle while adding in
C the diagonal squares u[i]^2.

define(`I',`$1')

define(`rp', `%rdi')		C 1st argument: result pointer
define(`up', `%rsi')		C 2nd argument: source pointer
define(`un_param',`%rdx')	C 3rd argument: limb count n


ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)		C DOS64 entry shim for 3 register args

	cmp $2, un_param
	jae L(gt1)

C ----- n = 1: rp[1:0] = u0^2 -----
	mov (up), %rax
	mul %rax		C rdx:rax = u0^2
	mov %rax, (rp)
	mov %rdx, 8(rp)
	FUNC_EXIT()
	ret

L(gt1): jne L(gt2)		C flags still from the cmp above; taken when n > 2

C ----- n = 2: R = u0^2 + 2*u0*u1*B + u1^2*B^2  (B = 2^64) -----
	mov (up), %rax
	mov %rax, %r8		C r8 = u0, kept for the cross product
	mul %rax		C u0^2
	mov 8(up), %r11		C r11 = u1
	mov %rax, (rp)
	mov %r11, %rax
	mov %rdx, %r9		C r9 = hi(u0^2)
	mul %rax		C u1^2
	mov %rax, %r10		C r10 = lo(u1^2)
	mov %r11, %rax
	mov %rdx, %r11		C r11 = hi(u1^2)
	mul %r8			C rdx:rax = u0*u1
	xor %r8, %r8		C r8 = 0, used to propagate carries
	add %rax, %r9		C add the cross product once...
	adc %rdx, %r10
	adc %r8, %r11
	add %rax, %r9		C ...and a second time (doubles 2*u0*u1)
	mov %r9, 8(rp)
	adc %rdx, %r10
	mov %r10, 16(rp)
	adc %r8, %r11
	mov %r11, 24(rp)
	FUNC_EXIT()
	ret

L(gt2): cmp $4, un_param
	jae L(gt3)
define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%r10')
define(`w2', `%r11')

C ----- n = 3: store u0^2,u1^2,u2^2 at rp[0..5], then compute the three
C cross products, double them, and fold them into rp[1..5]. -----
	mov (up), %rax
	mov %rax, %r10		C r10 = u0
	mul %rax		C u0^2
	mov 8(up), %r11		C r11 = u1
	mov %rax, (rp)
	mov %r11, %rax
	mov %rdx, 8(rp)
	mul %rax		C u1^2
	mov 16(up), %rcx	C rcx = u2
	mov %rax, 16(rp)
	mov %rcx, %rax
	mov %rdx, 24(rp)
	mul %rax		C u2^2
	mov %rax, 32(rp)
	mov %rdx, 40(rp)

	mov %r11, %rax
	mul %r10		C u1*u0
	mov %rax, %r8
	mov %rcx, %rax
	mov %rdx, %r9
	mul %r10		C u2*u0
	xor %r10, %r10		C clear accumulator (also leaves CF = 0)
	add %rax, %r9
	mov %r11, %rax
	mov %r10, %r11		C r11 = 0
	adc %rdx, %r10

	mul %rcx		C u2*u1
	add %rax, %r10
	adc %r11, %rdx
	add %r8, %r8		C double the cross-product triangle r8..rdx
	adc %r9, %r9
	adc %r10, %r10
	adc %rdx, %rdx
	adc %r11, %r11		C r11 catches the final shifted-out bit
	add %r8, 8(rp)		C fold doubled triangle into the stored squares
	adc %r9, 16(rp)
	adc %r10, 24(rp)
	adc %rdx, 32(rp)
	adc %r11, 40(rp)
	FUNC_EXIT()
	ret

L(gt3):

C ----- General path, n >= 4.  Register roles: -----
define(`v0', `%r8')		C low multiplier limb of the current pass
define(`v1', `%r9')		C high multiplier limb of the current pass
define(`w0', `%r10')		C w0..w3: rotating carry/partial-product words
define(`w1', `%r11')
define(`w2', `%rbx')
define(`w3', `%rbp')
define(`un', `%r12')		C negated outer count, increases towards 0
define(`n', `%rcx')		C negated inner-loop index, increases towards 0

define(`X0', `%r13')		C X0/X1: result limbs in flight through the
define(`X1', `%r14')		C   interleaved mul/add pipeline

C First pass: multiply U's tail by its two low limbs {u0,u1} (a mul_2),
C storing at rp[1...].  up and rp are biased past the operand ends so the
C negative indices un/n can run upward to zero.
L(do_mul_2):
	mov (up), v0		C v0 = u0
	push %rbx		C save callee-saved regs used as w2/w3/un/X0/X1
	lea (rp,un_param,8), rp C point rp at R[un]
	mov 8(up), %rax
	push %rbp
	lea (up,un_param,8), up C point up right after U's end
	mov %rax, v1		C v1 = u1
	push %r12
	mov $1, R32(un)	C free up rdx
	push %r13
	sub un_param, un	C un = 1 - n  (negative)
	push %r14
	push un			C saved for the L(corner) dispatch (popped into n)
	mul v0			C u1*u0
	mov %rax, (rp,un,8)	C store at rp[1]
	mov 8(up,un,8), %rax
	test $1, R8(un)		C enter the 2-way-unrolled loop per size parity
	jnz L(m2b1)

L(m2b0):lea 2(un), n
	xor R32(w1), R32(w1)	C FIXME
	xor R32(w2), R32(w2)	C FIXME
	mov %rdx, w0
	jmp L(m2l0)

L(m2b1):lea 1(un), n
	xor R32(w3), R32(w3)	C FIXME
	xor R32(w0), R32(w0)	C FIXME
	mov %rdx, w2
	jmp L(m2l1)

C Main mul_2 loop: two limbs per iteration; w0..w3 hand carries between
C the interleaved v0/v1 product chains.
	ALIGN(32)
L(m2tp):
L(m2l0):mul v0
	add %rax, w0
	mov %rdx, w3
	adc $0, w3
	mov -8(up,n,8), %rax
	mul v1
	add w1, w0
	adc $0, w3
	add %rax, w2
	mov w0, -8(rp,n,8)
	mov %rdx, w0
	adc $0, w0
	mov (up,n,8), %rax
L(m2l1):mul v0
	add %rax, w2
	mov %rdx, w1
	adc $0, w1
	add w3, w2
	mov (up,n,8), %rax
	adc $0, w1
	mul v1
	mov w2, (rp,n,8)
	add %rax, w0
	mov %rdx, w2
	mov 8(up,n,8), %rax
	adc $0, w2
	add $2, n
	jnc L(m2tp)		C loop until n wraps past zero

C Wind-down for the mul_2 pass.  I(a,b) selects plain vs indexed
C addressing per the file-head note (currently the plain form, $1).
L(m2ed):mul v0
	add %rax, w0
	mov %rdx, w3
	adc $0, w3
	mov I(-8(up),-8(up,n,8)), %rax
	mul v1
	add w1, w0
	adc $0, w3
	add %rax, w2
	mov w0, I(-8(rp),-8(rp,n,8))
	adc $0, %rdx
	add w3, w2
	mov w2, I((rp),(rp,n,8))
	adc $0, %rdx
	mov %rdx, I(8(rp),8(rp,n,8))

	add $2, un		C decrease |un|

C Outer loop: each pass is an addmul_2 — add U's remaining tail times the
C next two limbs {v0,v1} into the partial result, advancing rp by 2 limbs.
L(do_addmul_2):
L(outer):
	lea 16(rp), rp
	cmp $-2, R32(un)	C jump if un C {-1,0}  FIXME jump if un C {-2,1}
	jge L(corner)		C FIXME: move to before the lea above

	mov -8(up,un,8), v0	C next multiplier pair from U
	mov (up,un,8), %rax
	mov %rax, v1
	mul v0
	test $1, R8(un)		C 4-way entry dispatch on un mod 4, to enter
	jnz L(a1x1)		C   the 4x-unrolled loop at the right phase

L(a1x0):mov (rp,un,8), X0
	xor w0, w0
	mov 8(rp,un,8), X1
	add %rax, X0
	mov %rdx, w1
	adc $0, w1
	xor w2, w2
	mov X0, (rp,un,8)
	mov 8(up,un,8), %rax
	test $2, R8(un)
	jnz L(a110)

L(a100):lea 2(un), n	C un = 4, 8, 12, ...
	jmp L(lo0)

L(a110):lea (un), n	C un = 2, 6, 10, ...
	jmp L(lo2)

L(a1x1):mov (rp,un,8), X1
	xor w2, w2
	mov 8(rp,un,8), X0
	add %rax, X1
	mov %rdx, w3
	adc $0, w3
	xor w0, w0
	mov 8(up,un,8), %rax
	test $2, R8(un)
	jz L(a111)

L(a101):lea 3(un), n	C un = 1, 5, 9, ...
	jmp L(lo1)

L(a111):lea 1(un), n	C un = 3, 7, 11, ...
	jmp L(lo3)

C Main addmul_2 loop, unrolled 4 limbs per iteration; X0/X1 hold the two
C result limbs currently being updated, w0..w3 hand carries along.
	ALIGN(32)
L(top): mul v1
	mov %rdx, w0
	add %rax, X0
	adc $0, w0
	add w1, X1
	adc $0, w3
	add w2, X0
	adc $0, w0
	mov -16(up,n,8), %rax
L(lo1): mul v0
	add %rax, X0
	mov %rdx, w1
	adc $0, w1
	mov -16(up,n,8), %rax
	mul v1
	mov X1, -24(rp,n,8)
	mov -8(rp,n,8), X1
	add w3, X0
	adc $0, w1
	mov %rdx, w2
	mov X0, -16(rp,n,8)
	add %rax, X1
	adc $0, w2
	mov -8(up,n,8), %rax
	add w0, X1
	adc $0, w2
L(lo0): mul v0
	add %rax, X1
	mov %rdx, w3
	adc $0, w3
	mov -8(up,n,8), %rax
	mul v1
	add w1, X1
	mov (rp,n,8), X0
	adc $0, w3
	mov %rdx, w0
	add %rax, X0
	adc $0, w0
	mov (up,n,8), %rax
L(lo3): mul v0
	add w2, X0
	mov X1, -8(rp,n,8)
	mov %rdx, w1
	adc $0, w0
	add %rax, X0
	adc $0, w1
	mov (up,n,8), %rax
	add w3, X0
	adc $0, w1
	mul v1
	mov 8(rp,n,8), X1
	add %rax, X1
	mov %rdx, w2
	adc $0, w2
	mov 8(up,n,8), %rax
	mov X0, (rp,n,8)
L(lo2): mul v0
	add w0, X1
	mov %rdx, w3
	adc $0, w2
	add %rax, X1
	mov 8(up,n,8), %rax
	mov 16(rp,n,8), X0
	adc $0, w3
	add $4, n
	jnc L(top)		C loop until n wraps past zero

C Wind-down for one addmul_2 pass.
L(end): mul v1
	add w1, X1
	adc $0, w3
	add w2, %rax
	adc $0, %rdx
	mov X1, I(-8(rp),-24(rp,n,8))
	add w3, %rax
	adc $0, %rdx
	mov %rax, I((rp),-16(rp,n,8))
	mov %rdx, I(8(rp),-8(rp,n,8))

	add $2, un		C decrease |un|
	jmp L(outer)		C loop until a small corner remains

C Final corner.  Restores n = 1-n (pushed at entry); flags are still live
C from the cmp at L(outer), so jg picks the 1-limb corner while the
C fall-through handles a 2-limb-wide final addmul.
L(corner):
	pop n
	jg L(small_corner)

	lea 8(rp), rp
	mov -24(up), v0		C multiplier pair = two limbs below U's top
	mov -16(up), %rax
	mov %rax, v1
	mul v0
	mov -24(rp), X0
	mov -16(rp), X1
	add %rax, X0
	mov %rdx, w1
	adc $0, w1
	xor w2, w2
	mov X0, -24(rp)
	mov -8(up), %rax
	mul v0
	add $0, X1		C NOTE(review): with the following adc this
	mov %rdx, w3		C   appears only to force CF clear after the
	adc $0, w2		C   mul; kept verbatim from upstream
	add %rax, X1
	mov -8(up), %rax
	adc $0, w3
	mul v1
	add w1, X1
	adc $0, w3
	add w2, %rax
	adc $0, %rdx
	mov X1, -16(rp)
	jmp L(com)

L(small_corner):
	mov -8(rp), w3
	mov -16(up), v0
	mov -8(up), %rax
	mul v0			C top cross product u[n-1]*u[n-2]
L(com): add w3, %rax
	adc $0, %rdx
	mov %rax, -8(rp)
	mov %rdx, (rp)

C Final sweep: double the off-diagonal triangle in rp (shift left by one
C bit, two limbs at a time) while adding in the diagonal squares u[i]^2.
C n = 1-n_orig here; shl doubles it so the scale-4 operand below still
C steps one 8-byte limb of up per n += 2.
L(sqr_diag_addlsh1):
	mov -8(up,n,8), %rax	C = u[0]
	shl n
	mul %rax		C u[0]^2
	mov %rax, (rp,n,8)

	xor R32(%rbx), R32(%rbx) C rbx holds the doubling carry between iterations
	mov 8(rp,n,8), %r8
	mov 16(rp,n,8), %r9
	jmp L(dm)

	ALIGN(32)
L(dtop):add %r8, %r10		C add doubled limbs to square low/high parts
	adc %r9, %rax
	mov 8(rp,n,8), %r8	C fetch next two triangle limbs
	mov 16(rp,n,8), %r9
	mov %r10, -8(rp,n,8)
	mov %rax, (rp,n,8)
L(dm):	adc %r8, %r8		C double, shifting in the pending carry
	adc %r9, %r9
	mov (up,n,4), %rax	C next diagonal limb (scale 4: n was doubled)
	lea (%rdx,%rbx), %r10	C hi(prev square) + carry banked last round
	setc R8(%rbx)		C bank this round's shift-out (mov/lea keep flags)
	mul %rax		C u[i]^2
	add $2, n
	js L(dtop)		C loop while n < 0

L(dend):add %r8, %r10
	adc %r9, %rax
	mov %r10, I(-8(rp),-8(rp,n,8))
	mov %rax, I((rp),(rp,n,8))
	adc %rbx, %rdx		C fold final doubling carry into the top limb
	mov %rdx, I(8(rp),8(rp,n,8))

	pop %r14		C restore callee-saved registers
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	FUNC_EXIT()
	ret
EPILOGUE()