From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/powerpc64/rshift.asm | 207 +++++++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 gmp-6.3.0/mpn/powerpc64/rshift.asm (limited to 'gmp-6.3.0/mpn/powerpc64/rshift.asm') diff --git a/gmp-6.3.0/mpn/powerpc64/rshift.asm b/gmp-6.3.0/mpn/powerpc64/rshift.asm new file mode 100644 index 0000000..7654a16 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/rshift.asm @@ -0,0 +1,207 @@ +dnl PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt + +dnl Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? 
+C POWER5		 2.25
+C POWER6		 9.75
+C POWER7		 2.15
+
+C TODO
+C  * Try to reduce the number of needed live registers
+C  * Micro-optimise header code
+C  * Keep in synch with lshift.asm and lshiftc.asm
+
+C OVERVIEW
+C  Reads n 64-bit limbs starting at up, shifts the whole multi-limb value
+C  right by cnt bits and writes n limbs starting at rp.  Every result limb
+C  except the last is formed as (up[i] >> cnt) | (up[i+1] << tnc) with
+C  tnc = 64 - cnt; the last one is just up[n-1] >> cnt.
+C  The return value is up[0] << tnc, i.e. the bits shifted out of the
+C  lowest limb, placed at the top of the returned limb.
+C  NOTE(review): the code loads up[0] unconditionally, so it assumes
+C  n >= 1; presumably 1 <= cnt <= 63 as well -- confirm against the mpn
+C  interface documentation.
+C
+C  Structure: the main loop at L(top) handles 4 limbs per iteration and is
+C  software pipelined.  The header dispatches on n mod 4 to one of four
+C  feed-in paths (b00, b01, b10, b11) which preload the pipeline and then
+C  either enter the loop at the matching entry label (L00, top, L10, L11)
+C  or, for short operands, jump straight into the wind-down code
+C  (end, cj4, cj3, cj2).
+
+C INPUT PARAMETERS
+C  rp   destination limb pointer (stores go through it)
+C  up   source limb pointer (loads go through it)
+C  n    limb count; only read in the header
+C  cnt  shift count in bits
+define(`rp',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`cnt', `r6')
+
+C  tnc     complementary shift count, set to 64 - cnt in the header
+C  u0, u1  alternating source limb registers in the pipeline; they live in
+C          r30/r31, which are saved at entry and reloaded at L(ret)
+C  retval  shares r5 with n; it is written only after the last read of n
+define(`tnc',`r0')
+define(`u0',`r30')
+define(`u1',`r31')
+define(`retval',`r5')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+C Save r31/r30 at negative offsets from r1 without moving the stack pointer.
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	subfic	tnc, cnt, 64		C tnc = 64 - cnt
+C	sldi	r30, n, 3		C byte count corresponding to n
+C	add	rp, rp, r30		C rp = rp + n
+C	add	up, up, r30		C up = up + n
+	rldicl.	r30, n, 0,62		C r30 = n & 3, set cr0
+	cmpdi	cr6, r30, 2		C cr6 = (n & 3) compared with 2
+	addi	r31, n, 3		C compute count...
+	ld	r10, 0(up)		C load 1st limb for b00...b11
+	sld	retval, r10, tnc	C return value = up[0] << tnc; n is dead from here
+C ctr = (n + 3) >> 2.  The mode32 variant rotates right by 2 and keeps
+C only the low 30 bits, so it uses just the low 32 bits of n + 3.
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	r31, r31, 62,34',	C ...branch count
+`	srdi	r31, r31, 2')		C ...for ctr
+	mtctr	r31			C copy count into ctr
+	beq	cr0, L(b00)		C n mod 4 = 0
+	blt	cr6, L(b01)		C n mod 4 = 1 (nonzero and below 2)
+	ld	r11, 8(up)		C load 2nd limb for b10 and b11
+	beq	cr6, L(b10)		C n mod 4 = 2; fall through for 3
+
+C Feed-in for n mod 4 = 3.  rp is biased by -16 so that the fixed store
+C offsets 16, 24, 32 used from L(cj3) onwards address the first three
+C destination limbs.
+	ALIGN(16)
+L(b11):	srd	r8, r10, cnt
+	sld	r9, r11, tnc
+	ld	u1, 16(up)
+	addi	up, up, 24
+	srd	r12, r11, cnt
+	sld	r7, u1, tnc
+	addi	rp, rp, -16
+	bdnz	L(gt3)			C more than 3 limbs: go fill the pipeline
+
+C Exactly 3 limbs: finish the pending ORs and join the wind-down code.
+	or	r11, r8, r9
+	srd	r8, u1, cnt
+	b	L(cj3)
+
+	ALIGN(16)
+L(gt3):	ld	u0, 0(up)
+	or	r11, r8, r9
+	srd	r8, u1, cnt
+	sld	r9, u0, tnc
+	ld	u1, 8(up)
+	or	r10, r12, r7
+	b	L(L11)			C enter the loop at its third stage
+
+C Feed-in for n mod 4 = 2.  rp is biased by -24 so that the stores at
+C offsets 24 and 32 in L(cj2) address the first two destination limbs.
+	ALIGN(32)
+L(b10):	srd	r12, r10, cnt
+	addi	rp, rp, -24
+	sld	r7, r11, tnc
+	bdnz	L(gt2)			C more than 2 limbs: go fill the pipeline
+
+C Exactly 2 limbs.
+	srd	r8, r11, cnt
+	or	r10, r12, r7
+	b	L(cj2)
+
+L(gt2):	ld	u0, 16(up)
+	srd	r8, r11, cnt
+	sld	r9, u0, tnc
+	ld	u1, 24(up)
+	or	r10, r12, r7
+	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	ld	u0, 32(up)
+	or	r11, r8, r9
+	addi	up, up, 16
+	b	L(L10)			C enter the loop at its fourth stage
+
+C Feed-in for n mod 4 = 0.  rp is biased by -8 so that the stores at
+C offsets 8..32 from L(cj4) onwards address the first four destination
+C limbs.
+	ALIGN(16)
+L(b00):	ld	u1, 8(up)
+	srd	r12, r10, cnt
+	sld	r7, u1, tnc
+	ld	u0, 16(up)
+	srd	r8, u1, cnt
+	sld	r9, u0, tnc
+	ld	u1, 24(up)
+	or	r10, r12, r7
+	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	addi	rp, rp, -8
+	bdz	L(cj4)			C exactly 4 limbs: straight to the wind-down
+
+L(gt4):	addi	up, up, 32
+	ld	u0, 0(up)
+	or	r11, r8, r9
+	b	L(L00)			C enter the loop at its second stage
+
+C Feed-in for n mod 4 = 1.  No rp bias is needed on this path.
+	ALIGN(16)
+L(b01):	bdnz	L(gt1)			C more than 1 limb: go fill the pipeline
+C Exactly 1 limb: the result is up[0] >> cnt.
+	srd	r8, r10, cnt
+	std	r8, 0(rp)
+	b	L(ret)
+
+L(gt1):	ld	u0, 8(up)
+	srd	r8, r10, cnt
+	sld	r9, u0, tnc
+	ld	u1, 16(up)
+	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	ld	u0, 24(up)
+	or	r11, r8, r9
+	srd	r8, u1, cnt
+	sld	r9, u0, tnc
+	ld	u1, 32(up)
+	addi	up, up, 40
+	or	r10, r12, r7
+	bdz	L(end)			C exactly 5 limbs: skip the loop
+
+C Main loop: four stages per iteration, each doing one srd/sld pair, one
+C load, one store and one OR.  r11 and r10 alternate as the completed
+C result limb, r8/r9 and r12/r7 alternate as the two halves being combined,
+C and u0/u1 alternate as the most recently loaded source limb.  up and rp
+C both advance by 32 bytes per iteration.
+	ALIGN(32)
+L(top):	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	ld	u0, 0(up)
+	std	r11, 0(rp)
+	or	r11, r8, r9
+L(L00):	srd	r8, u1, cnt
+	sld	r9, u0, tnc
+	ld	u1, 8(up)
+	std	r10, 8(rp)
+	or	r10, r12, r7
+L(L11):	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	ld	u0, 16(up)
+	std	r11, 16(rp)
+	or	r11, r8, r9
+L(L10):	srd	r8, u1, cnt
+	sld	r9, u0, tnc
+	ld	u1, 24(up)
+	addi	up, up, 32
+	std	r10, 24(rp)
+	addi	rp, rp, 32
+	or	r10, r12, r7
+	bdnz	L(top)
+
+C Wind-down: drain the pipeline without loading any further source limbs.
+C The cj4/cj3/cj2 labels are join points for the short-operand paths above.
+C The final store writes r8, which holds the top source limb shifted right
+C with nothing ORed in.
+	ALIGN(32)
+L(end):	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	std	r11, 0(rp)
+L(cj4):	or	r11, r8, r9
+	srd	r8, u1, cnt
+	std	r10, 8(rp)
+L(cj3):	or	r10, r12, r7
+	std	r11, 16(rp)
+L(cj2):	std	r10, 24(rp)
+	std	r8, 32(rp)
+
+C Restore r31/r30 and hand back retval: in r3 normally, or split across
+C r3 (upper 32 bits) and r4 when HAVE_ABI_mode32 is defined.
+L(ret):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+ifdef(`HAVE_ABI_mode32',
+`	srdi	r3, retval, 32
+	mr	r4, retval
+',`	mr	r3, retval')
+	blr
+EPILOGUE()
-- 
cgit v1.2.3