Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm')
-rw-r--r--	gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm	395
1 file changed, 395 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
new file mode 100644
index 0000000..5cdb209
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
@@ -0,0 +1,395 @@
+dnl  AMD64 mpn_mullo_basecase optimised for Intel Broadwell.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`vp_param', `%rdx')
+define(`n',        `%rcx')
+
+define(`vp',     `%r11')
+define(`jmpreg', `%rbx')
+define(`nn',     `%rbp')
+
+C TODO
+C  * Suppress more rp[] rewrites in corner.
+C  * Rearrange feed-in jumps for short branch forms.
+C  * Perhaps roll out the heavy artillery and 8-way unroll outer loop.  Since
+C    feed-in code implodes, the blow-up will not be more than perhaps 4x.
+C  * Micro-optimise critical lead-in code block around L(ent).
+C  * Write n < 4 code specifically for Broadwell (current code is for Haswell).
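+
+C What the routine computes, as a minimal C reference sketch.  This assumes
+C the usual GMP mullo contract, i.e. {rp,n} gets the low n limbs of the
+C product {up,n} * {vp,n}, and 64-bit limbs so unsigned __int128 holds a
+C full limb product; the helper name is illustrative only:
+C
+C   void ref_mullo (mp_limb_t *rp, const mp_limb_t *up,
+C                   const mp_limb_t *vp, mp_size_t n)
+C   {
+C     mp_size_t i, j;
+C     for (i = 0; i < n; i++)
+C       rp[i] = 0;
+C     for (i = 0; i < n; i++)           /* one pass per v limb */
+C       {
+C         mp_limb_t cy = 0;
+C         for (j = 0; i + j < n; j++)   /* only products below B^n matter */
+C           {
+C             unsigned __int128 t = (unsigned __int128) up[j] * vp[i]
+C                                   + rp[i+j] + cy;
+C             rp[i+j] = (mp_limb_t) t;
+C             cy = (mp_limb_t) (t >> 64);
+C           }
+C       }                               /* carry past limb n-1 is discarded */
+C   }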
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+	FUNC_ENTRY(4)
+	cmp	$4, R32(n)
+	jae	L(big)
+
+	mov	vp_param, vp
+	mov	(up), %rdx
+
+	cmp	$2, R32(n)
+	jae	L(gt1)
+L(n1):	imul	(vp), %rdx
+	mov	%rdx, (rp)
+	FUNC_EXIT()
+	ret
+L(gt1):	ja	L(gt2)
+L(n2):	mov	(vp), %r9
+	mulx(	%r9, %rax, %rdx)
+	mov	%rax, (rp)
+	mov	8(up), %rax
+	imul	%r9, %rax
+	add	%rax, %rdx
+	mov	8(vp), %r9
+	mov	(up), %rcx
+	imul	%r9, %rcx
+	add	%rcx, %rdx
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+L(gt2):
+L(n3):	mov	(vp), %r9
+	mulx(	%r9, %rax, %r10)	C u0 x v0
+	mov	%rax, (rp)
+	mov	8(up), %rdx
+	mulx(	%r9, %rax, %rdx)	C u1 x v0
+	imul	16(up), %r9		C u2 x v0
+	add	%rax, %r10
+	adc	%rdx, %r9
+	mov	8(vp), %r8
+	mov	(up), %rdx
+	mulx(	%r8, %rax, %rdx)	C u0 x v1
+	add	%rax, %r10
+	adc	%rdx, %r9
+	imul	8(up), %r8		C u1 x v1
+	add	%r8, %r9
+	mov	%r10, 8(rp)
+	mov	16(vp), %r10
+	mov	(up), %rax
+	imul	%rax, %r10		C u0 x v2
+	add	%r10, %r9
+	mov	%r9, 16(rp)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(big):	push	%r14
+	push	%r12
+	push	%rbx
+	push	%rbp
+	mov	-8(vp_param,n,8), %r14	C FIXME Put at absolute end
+	imul	(up), %r14		C FIXME Put at absolute end
+	lea	-3(n), R32(nn)
+	lea	8(vp_param), vp
+	mov	(vp_param), %rdx
+
+	mov	R32(n), R32(%rax)
+	shr	$3, R32(n)
+	and	$7, R32(%rax)		C clear OF, CF as side-effect
+	lea	L(mtab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %rax
+	lea	(%rax, %r10), %r10
+	jmp	*%r10
+',`
+	jmp	*(%r10,%rax,8)
+')
+
+L(mf0):	mulx(	(up), %r10, %r8)
+	lea	56(up), up
+	lea	-8(rp), rp
+	lea	L(f7)(%rip), jmpreg
+	jmp	L(mb0)
+
+L(mf3):	mulx(	(up), %r9, %rax)
+	lea	16(up), up
+	lea	16(rp), rp
+	jrcxz	L(mc)
+	inc	R32(n)
+	lea	L(f2)(%rip), jmpreg
+	jmp	L(mb3)
+
+L(mc):	mulx(	-8,(up), %r10, %r8)
+	add	%rax, %r10
+	mov	%r9, -16(rp)
+	mulx(	(up), %r9, %rax)
+	mov	%r10, -8(rp)
+	adc	%r8, %r9
+	mov	%r9, (rp)
+	jmp	L(c2)
+
+L(mf4):	mulx(	(up), %r10, %r8)
+	lea	24(up), up
+	lea	24(rp), rp
+	inc	R32(n)
+	lea	L(f3)(%rip), jmpreg
+	jmp	L(mb4)
+
+L(mf5):	mulx(	(up), %r9, %rax)
+	lea	32(up), up
+	lea	32(rp), rp
+	inc	R32(n)
+	lea	L(f4)(%rip), jmpreg
+	jmp	L(mb5)
+
+L(mf6):	mulx(	(up), %r10, %r8)
+	lea	40(up), up
+	lea	40(rp), rp
+	inc	R32(n)
+	lea	L(f5)(%rip), jmpreg
+	jmp	L(mb6)
+
+L(mf7):	mulx(	(up), %r9, %rax)
+	lea	48(up), up
+	lea	48(rp), rp
+	lea	L(f6)(%rip), jmpreg
+	jmp	L(mb7)
+
+L(mf1):	mulx(	(up), %r9, %rax)
+	lea	L(f0)(%rip), jmpreg
+	jmp	L(mb1)
+
+L(mf2):	mulx(	(up), %r10, %r8)
+	lea	8(up), up
+	lea	8(rp), rp
+	lea	L(f1)(%rip), jmpreg
+	mulx(	(up), %r9, %rax)
+
+C FIXME ugly fallthrough FIXME
+	ALIGN(32)
+L(mtop):mov	%r10, -8(rp)
+	adc	%r8, %r9
+L(mb1):	mulx(	8,(up), %r10, %r8)
+	adc	%rax, %r10
+	lea	64(up), up
+	mov	%r9, (rp)
+L(mb0):	mov	%r10, 8(rp)
+	mulx(	-48,(up), %r9, %rax)
+	lea	64(rp), rp
+	adc	%r8, %r9
+L(mb7):	mulx(	-40,(up), %r10, %r8)
+	mov	%r9, -48(rp)
+	adc	%rax, %r10
+L(mb6):	mov	%r10, -40(rp)
+	mulx(	-32,(up), %r9, %rax)
+	adc	%r8, %r9
+L(mb5):	mulx(	-24,(up), %r10, %r8)
+	mov	%r9, -32(rp)
+	adc	%rax, %r10
+L(mb4):	mulx(	-16,(up), %r9, %rax)
+	mov	%r10, -24(rp)
+	adc	%r8, %r9
+L(mb3):	mulx(	-8,(up), %r10, %r8)
+	adc	%rax, %r10
+	mov	%r9, -16(rp)
+	dec	R32(n)
+	mulx(	(up), %r9, %rax)
+	jnz	L(mtop)
+
+L(mend):mov	%r10, -8(rp)
+	adc	%r8, %r9
+	mov	%r9, (rp)
+	adc	%rcx, %rax
+
+	lea	8(,nn,8), %r12
+	neg	%r12
+	shr	$3, R32(nn)
+	jmp	L(ent)
+
+L(f0):	mulx(	(up), %r10, %r8)
+	lea	-8(up), up
+	lea	-8(rp), rp
+	lea	L(f7)(%rip), jmpreg
+	jmp	L(b0)
+
+L(f1):	mulx(	(up), %r9, %rax)
+	lea	-1(nn), R32(nn)
+	lea	L(f0)(%rip), jmpreg
+	jmp	L(b1)
+
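+C Outer-loop wind-down and feed-in (a descriptive note from a reading of
+C the code below, not an authoritative specification): L(end) folds the
+C final carries of one addmul pass into %rax, then falls into L(ent),
+C which accumulates the boundary products into %r14 (the eventual top
+C limb rp[n-1]), loads the next v limb into %rdx for mulx, resets up and
+C rp through the negative offset kept in %r12, restores the inner-loop
+C count from nn into n, and dispatches through jmpreg to the feed-in
+C block matching the remaining operand size mod 8.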
+L(end):	adox(	(rp), %r9)
+	mov	%r9, (rp)
+	adox(	%rcx, %rax)		C relies on rcx = 0
+	adc	%rcx, %rax		C FIXME suppress, use adc below; reqs ent path edits
+	lea	8(%r12), %r12
+L(ent):	mulx(	8,(up), %r10, %r8)	C r8 unused (use imul?)
+	add	%rax, %r14
+	add	%r10, %r14		C h
+	lea	(up,%r12), up		C reset up
+	lea	8(rp,%r12), rp		C reset rp
+	mov	(vp), %rdx
+	lea	8(vp), vp
+	or	R32(nn), R32(n)		C copy count, clear CF,OF (n = 0 prior)
+	jmp	*jmpreg
+
+L(f7):	mulx(	(up), %r9, %rax)
+	lea	-16(up), up
+	lea	-16(rp), rp
+	lea	L(f6)(%rip), jmpreg
+	jmp	L(b7)
+
+L(f2):	mulx(	(up), %r10, %r8)
+	lea	8(up), up
+	lea	8(rp), rp
+	mulx(	(up), %r9, %rax)
+	lea	L(f1)(%rip), jmpreg
+
+C FIXME ugly fallthrough FIXME
+	ALIGN(32)
+L(top):	adox(	-8,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, -8(rp)
+	jrcxz	L(end)
+L(b1):	mulx(	8,(up), %r10, %r8)
+	adox(	(rp), %r9)
+	lea	-1(n), R32(n)
+	mov	%r9, (rp)
+	adcx(	%rax, %r10)
+L(b0):	mulx(	16,(up), %r9, %rax)
+	adcx(	%r8, %r9)
+	adox(	8,(rp), %r10)
+	mov	%r10, 8(rp)
+L(b7):	mulx(	24,(up), %r10, %r8)
+	lea	64(up), up
+	adcx(	%rax, %r10)
+	adox(	16,(rp), %r9)
+	mov	%r9, 16(rp)
+L(b6):	mulx(	-32,(up), %r9, %rax)
+	adox(	24,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, 24(rp)
+L(b5):	mulx(	-24,(up), %r10, %r8)
+	adcx(	%rax, %r10)
+	adox(	32,(rp), %r9)
+	mov	%r9, 32(rp)
+L(b4):	mulx(	-16,(up), %r9, %rax)
+	adox(	40,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, 40(rp)
+L(b3):	adox(	48,(rp), %r9)
+	mulx(	-8,(up), %r10, %r8)
+	mov	%r9, 48(rp)
+	lea	64(rp), rp
+	adcx(	%rax, %r10)
+	mulx(	(up), %r9, %rax)
+	jmp	L(top)
+
+L(f6):	mulx(	(up), %r10, %r8)
+	lea	40(up), up
+	lea	-24(rp), rp
+	lea	L(f5)(%rip), jmpreg
+	jmp	L(b6)
+
+L(f5):	mulx(	(up), %r9, %rax)
+	lea	32(up), up
+	lea	-32(rp), rp
+	lea	L(f4)(%rip), jmpreg
+	jmp	L(b5)
+
+L(f4):	mulx(	(up), %r10, %r8)
+	lea	24(up), up
+	lea	-40(rp), rp
+	lea	L(f3)(%rip), jmpreg
+	jmp	L(b4)
+
+L(f3):	mulx(	(up), %r9, %rax)
+	lea	16(up), up
+	lea	-48(rp), rp
+	jrcxz	L(cor)
+	lea	L(f2)(%rip), jmpreg
+	jmp	L(b3)
+
+L(cor):	adox(	48,(rp), %r9)
+	mulx(	-8,(up), %r10, %r8)
+	mov	%r9, 48(rp)
+	lea	64(rp), rp
+	adcx(	%rax, %r10)
+	mulx(	(up), %r9, %rax)
+	adox(	-8,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, -8(rp)		C FIXME suppress
+	adox(	(rp), %r9)
+	mov	%r9, (rp)		C FIXME suppress
+	adox(	%rcx, %rax)
+L(c2):
+	mulx(	8,(up), %r10, %r8)
+	adc	%rax, %r14
+	add	%r10, %r14
+	mov	(vp), %rdx
+	test	R32(%rcx), R32(%rcx)
+	mulx(	-16,(up), %r10, %r8)
+	mulx(	-8,(up), %r9, %rax)
+	adox(	-8,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, -8(rp)
+	adox(	(rp), %r9)
+	adox(	%rcx, %rax)
+	adc	%rcx, %rax
+	mulx(	(up), %r10, %r8)
+	add	%rax, %r14
+	add	%r10, %r14
+	mov	8(vp), %rdx
+	mulx(	-16,(up), %rcx, %rax)
+	add	%r9, %rcx
+	mov	%rcx, (rp)
+	adc	$0, %rax
+	mulx(	-8,(up), %r10, %r8)
+	add	%rax, %r14
+	add	%r10, %r14
+	mov	%r14, 8(rp)
+	pop	%rbp
+	pop	%rbx
+	pop	%r12
+	pop	%r14
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+	JUMPTABSECT
+	ALIGN(8)
+L(mtab):JMPENT(	L(mf7), L(mtab))
+	JMPENT(	L(mf0), L(mtab))
+	JMPENT(	L(mf1), L(mtab))
+	JMPENT(	L(mf2), L(mtab))
+	JMPENT(	L(mf3), L(mtab))
+	JMPENT(	L(mf4), L(mtab))
+	JMPENT(	L(mf5), L(mtab))
+	JMPENT(	L(mf6), L(mtab))
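+
+C The mtab entries are rotated by one slot relative to the index n mod 8
+C (index k selects L(mf((k-1) mod 8))), which is consistent with vp[n-1]
+C being handled outside the mul/addmul passes: its only product below
+C B^n, vp[n-1]*up[0], is computed with imul into %r14 at L(big) and
+C stored as the top result limb at the very end.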