diff options
author | Duncan Wilkie <antigravityd@gmail.com> | 2023-11-18 06:11:09 -0600 |
---|---|---|
committer | Duncan Wilkie <antigravityd@gmail.com> | 2023-11-18 06:11:09 -0600 |
commit | 11da511c784eca003deb90c23570f0873954e0de (patch) | |
tree | e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm |
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm')
-rw-r--r-- | gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm | 187 |
1 file changed, 187 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm new file mode 100644 index 0000000..6158f54 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm @@ -0,0 +1,187 @@ +dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C POWER3/PPC630 1.83 (1.5 c/l should be possible) +C POWER4/PPC970 3 (2.0 c/l should be possible) +C POWER5 3 +C POWER6 3.5-47 +C POWER7 3 + +C STATUS +C * Try combining upx+up, and vpx+vp. +C * The worst case 47 c/l for POWER6 happens if the 3rd operand for ldx is +C greater than the 2nd operand. Yes, this addition is non-commutative wrt +C performance. 
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +ifdef(`DO_add', ` + define(`ADDSUBC', `addc $1, $2, $3') + define(`ADDSUBE', `adde $1, $2, $3') + define(INITCY, `addic $1, r1, 0') + define(RETVAL, `addze r3, $1') + define(`func', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADDSUBC', `subfc $1, $2, $3') + define(`ADDSUBE', `subfe $1, $2, $3') + define(INITCY, `addic $1, r1, -1') + define(RETVAL, `subfze r3, $1 + neg r3, r3') + define(`func', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADDSUBC', `subfc $1, $3, $2') + define(`ADDSUBE', `subfe $1, $3, $2') + define(INITCY, `addic $1, r1, -1') + define(RETVAL, `addme r3, $1') + define(`func', mpn_rsblsh`'LSH`'_n)') + +define(`rpx', `r6') +define(`upx', `r7') +define(`vpx', `r12') + +define(`s0', `r0') define(`s1', `r9') +define(`u0', `r8') +define(`v0', `r10') define(`v1', `r11') + + +ASM_START() +PROLOGUE(func) + cmpldi cr0, n, 13 + bgt L(big) + + mtctr n C copy n in ctr + INITCY( r0) C clear cy + + ld v0, 0(vp) C load v limb + ld u0, 0(up) C load u limb + addi up, up, -8 C update up + addi rp, rp, -8 C update rp + sldi s1, v0, LSH + bdz L(ex1) C If done, skip loop + + ALIGN(16) +L(lo0): ld v1, 8(vp) C load v limb + ADDSUBE(s1, s1, u0) C add limbs with cy, set cy + ldu u0, 16(up) C load u limb and update up + srdi s0, v0, RSH C shift down previous v limb + std s1, 8(rp) C store result limb + rldimi s0, v1, LSH, 0 C left shift v limb and merge with prev v limb + bdz L(ex0) C decrement ctr and exit if done + ldu v0, 16(vp) C load v limb and update vp + ADDSUBE(s0, s0, u0) C add limbs with cy, set cy + ld u0, 8(up) C load u limb + srdi s1, v1, RSH C shift down previous v limb + stdu s0, 16(rp) C store result limb and update rp + rldimi s1, v0, LSH, 0 C left shift v limb and merge with prev v limb + bdnz L(lo0) C decrement ctr and loop back + +L(ex1): ADDSUBE(r7, s1, u0) + std r7, 8(rp) C store last result limb + srdi r0, v0, RSH + RETVAL( r0) + blr 
+L(ex0): ADDSUBE(r7, s0, u0) + std r7, 16(rp) C store last result limb + srdi r0, v1, RSH + RETVAL( r0) + blr + + +L(big): rldicl. r0, n, 0,63 C r0 = n & 1, set cr0 + addi r6, n, -1 C ...for ctr + srdi r6, r6, 1 C ...for ctr + mtctr r6 C copy count into ctr + beq cr0, L(b0) + +L(b1): ld v1, 0(vp) + ld u0, 0(up) + sldi s1, v1, LSH + srdi s0, v1, RSH + ld v0, 8(vp) + ADDSUBC(s1, s1, u0) C add limbs without cy, set cy + addi rpx, rp, -16 + addi rp, rp, -8 + sub upx, up, rp + sub vpx, vp, rp + sub up, up, rpx + sub vp, vp, rpx + addi up, up, 8 + addi upx, upx, 16 + addi vp, vp, 16 + addi vpx, vpx, 24 + b L(mid) + +L(b0): ld v0, 0(vp) + ld u0, 0(up) + sldi s0, v0, LSH + srdi s1, v0, RSH + ld v1, 8(vp) + ADDSUBC(s0, s0, u0) C add limbs without cy, set cy + addi rpx, rp, -8 + addi rp, rp, -16 + sub upx, up, rpx + sub vpx, vp, rpx + sub up, up, rp + sub vp, vp, rp + addi up, up, 8 + addi upx, upx, 16 + addi vp, vp, 16 + addi vpx, vpx, 24 + + ALIGN(32) +L(top): ldx u0, rp, up + ldx v0, rp, vp + rldimi s1, v1, LSH, 0 + stdu s0, 16(rp) + srdi s0, v1, RSH + ADDSUBE(s1, s1, u0) C add limbs with cy, set cy +L(mid): ldx u0, rpx, upx + ldx v1, rpx, vpx + rldimi s0, v0, LSH, 0 + stdu s1, 16(rpx) + srdi s1, v0, RSH + ADDSUBE(s0, s0, u0) C add limbs with cy, set cy + bdnz L(top) C decrement CTR and loop back + + ldx u0, rp, up + rldimi s1, v1, LSH, 0 + std s0, 16(rp) + srdi s0, v1, RSH + ADDSUBE(s1, s1, u0) C add limbs with cy, set cy + std s1, 24(rp) + + RETVAL( s0) + blr +EPILOGUE() |