From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm | 863 ++++++++++++++++++++++++
 1 file changed, 863 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm

diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm
new file mode 100644
index 0000000..e76bb88
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm
@@ -0,0 +1,863 @@
+dnl  PowerPC-64 mpn_sqr_basecase.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630      6-18
+C POWER4/PPC970      8
+C POWER5             8
+C POWER6            16.25
+C POWER7             3.77
+
+C NOTES
+C  * This is very crude, cleanup!
+C  * Try to reduce the number of needed live registers.
+C  * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4.  The
+C    cost will be more live registers.
+C  * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
+C    size a lot and speed things up perhaps 25%.
+C  * Use computed goto in order to compress the code.
+C  * Implement a larger final corner.
+C  * Schedule callee-saves register saves into other insns.  This could save
+C    about 5 cycles/call.  (We cannot analogously optimise the restores, since
+C    the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
+C  * Should the alternating std/adde sequences be split?  Some pipelines handle
+C    adde poorly, and might sequentialise all these instructions.
+C  * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
+C    adjacent integer multiply insns.  Except for the multiply insns, the code
+C    was not carefully optimised for POWER6 or any other CPU.
+C  * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
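+
+C The code runs in two phases: the outer loops accumulate the off-diagonal
+C products u[i]*u[j] (i < j) into the result area, then the sqr_diag_addlsh1
+C loop at L(top) doubles that triangle and adds in the diagonal squares
+C u[i]^2.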
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+
+define(`rp_outer', `r25')
+define(`up_outer', `r21')
+define(`rp_saved', `r22')
+define(`up_saved', `r23')
+define(`n_saved',  `r24')
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+	cmpdi	cr0, n, 2
+	bge	cr0, L(ge2)
+	ld	r5, 0(up)	C n = 1
+	nop
+	mulld	r8, r5, r5	C weight 0
+	mulhdu	r9, r5, r5	C weight 1
+	std	r8, 0(rp)
+	std	r9, 8(rp)
+	blr
+	ALIGN(16)
+L(ge2):	bgt	cr0, L(gt2)
+	ld	r0, 0(up)	C n = 2
+	nop
+	mulld	r8, r0, r0	C u0 * u0
+	mulhdu	r9, r0, r0	C u0 * u0
+	ld	r6, 8(up)
+	mulld	r10, r6, r6	C u1 * u1
+	mulhdu	r11, r6, r6	C u1 * u1
+	mulld	r4, r6, r0	C u1 * u0
+	mulhdu	r5, r6, r0	C u1 * u0
+	addc	r4, r4, r4
+	adde	r5, r5, r5
+	addze	r11, r11
+	addc	r9, r9, r4
+	adde	r10, r10, r5
+	addze	r11, r11
+	std	r8, 0(rp)
+	std	r9, 8(rp)
+	std	r10, 16(rp)
+	std	r11, 24(rp)
+	blr
+
+	ALIGN(16)
+L(gt2):	std	r31, -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+	std	r26, -48(r1)
+	std	r25, -56(r1)
+	std	r24, -64(r1)
+	std	r23, -72(r1)
+	std	r22, -80(r1)
+	std	r21, -88(r1)
+
+	mr	rp_saved, rp
+	mr	up_saved, up
+	mr	n_saved, n
+	mr	rp_outer, rp
+	mr	up_outer, up
+
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addic	r7, n, 2	C compute count...
+	srdi	r7, r7, 2	C ...for ctr
+	mtctr	r7		C copy count into ctr
+	beq-	cr0, L(b0)
+	blt-	cr6, L(b1)
+	beq-	cr6, L(b2)
+
+L(b3):	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	addi	up, up, 24
+	li	r12, 0		C carry limb
+	bdz	L(em3)
+
+	ALIGN(16)
+L(tm3):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm3)
+
+L(em3):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop)
+
+L(b0):	ld	r6, 0(up)
+	ld	r27, 8(up)
+	mulld	r7, r27, r6
+	mulhdu	r12, r27, r6
+	std	r7, 8(rp)
+	addi	rp, rp, 8
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	addi	up, up, 32
+	bdz	L(em0)
+
+	ALIGN(16)
+L(tm0):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm0)
+
+L(em0):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop_ent_2)
+
+L(b1):	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r12, r27, r6
+	addc	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addi	rp, rp, 16
+	ld	r9, 24(up)
+	ld	r27, 32(up)
+	addi	up, up, 40
+	bdz	L(em1)
+
+	ALIGN(16)
+L(tm1):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm1)
+
+L(em1):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop_ent_3)
+
+L(b2):	addi	r7, r7, -1	C FIXME
+	mtctr	r7		C FIXME
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 24(up)
+	mulld	r11, r9, r6
+	mulhdu	r10, r9, r6
+	addc	r7, r7, r26
+	adde	r11, r11, r8
+	addze	r12, r10
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	std	r11, 24(rp)
+	addi	rp, rp, 24
+	ld	r9, 32(up)
+	ld	r27, 40(up)
+	addi	up, up, 48
+	bdz	L(em2)
+
+	ALIGN(16)
+L(tm2):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm2)
+
+L(em2):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop_ent_0)
+
+
+L(outer_loop):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8
+	addi	rp_outer, rp_outer, 16
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	bdz	L(outer_end)
+
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 24(up)
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	ld	r30, 16(rp)
+	mulld	r11, r9, r6
+	mulhdu	r10, r9, r6
+	addc	r7, r7, r26
+	adde	r11, r11, r8
+	addze	r12, r10
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	adde	r11, r11, r30
+	std	r11, 16(rp)
+	addi	rp, rp, 24
+	ld	r9, 32(up)
+	ld	r27, 40(up)
+	addi	up, up, 48
+	bdz	L(ea1)
+
+	ALIGN(16)
+L(ta1):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta1)
+
+L(ea1):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+L(outer_loop_ent_0):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8
+	addi	rp_outer, rp_outer, 16
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	addc	r0, r0, r28
+	adde	r7, r7, r26
+	addze	r12, r8
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addi	rp, rp, 16
+	ld	r9, 24(up)
+	ld	r27, 32(up)
+	addi	up, up, 40
+	bdz	L(ea0)
+
+	ALIGN(16)
+L(ta0):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta0)
+
+L(ea0):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+L(outer_loop_ent_3):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8
+	addi	rp_outer, rp_outer, 16
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r28, 0(rp)
+	mulld	r0, r9, r6
+	mulhdu	r12, r9, r6
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	addi	rp, rp, 8
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	addi	up, up, 32
+	bdz	L(ea3)
+
+	ALIGN(16)
+L(ta3):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta3)
+
+L(ea3):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+
+L(outer_loop_ent_2):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8
+	addi	rp_outer, rp_outer, 16
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	addic	r0, r0, 0
+	li	r12, 0		C cy_limb = 0
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	bdz	L(ea2)
+	addi	up, up, 24
+
+	ALIGN(16)
+L(ta2):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta2)
+
+L(ea2):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+	b	L(outer_loop)
+
+L(outer_end):
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r11, 0(rp)
+	mulld	r0, r9, r6
+	mulhdu	r8, r9, r6
+	addc	r0, r0, r11
+	std	r0, 0(rp)
+	addze	r8, r8
+	std	r8, 8(rp)
+
+define(`rp',    `rp_saved')
+define(`up',    `r5')
+define(`n',     `r6')
+define(`climb', `r0')
+
+	addi	r4, rp_saved, 8
+	mr	r5, up_saved
+	mr	r6, n_saved
+
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	n, n, 2		C compute count...
+	srdi	n, n, 2		C ...for ctr
+	mtctr	n		C put loop count into ctr
+	beq	cr0, L(xb0)
+	blt	cr6, L(xb1)
+	beq	cr6, L(xb2)
+
+L(xb3):	ld	r6, 0(up)
+	ld	r7, 8(up)
+	ld	r12, 16(up)
+	addi	up, up, 24
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	ld	r10, 8(rp)
+	ld	r11, 16(rp)
+	ld	r6, 24(rp)
+	ld	r7, 32(rp)
+	addc	r10, r10, r10
+	adde	r11, r11, r11
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	addze	climb, r29
+	addc	r10, r10, r25
+	adde	r11, r11, r26
+	adde	r6, r6, r27
+	adde	r7, r7, r28
+	std	r24, 0(rp)
+	std	r10, 8(rp)
+	std	r11, 16(rp)
+	std	r6, 24(rp)
+	std	r7, 32(rp)
+	addi	rp, rp, 40
+	bdnz	L(top)
+	b	L(end)
+
+L(xb2):	ld	r6, 0(up)
+	ld	r7, 8(up)
+	addi	up, up, 16
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	ld	r10, 8(rp)
+	ld	r11, 16(rp)
+	addc	r10, r10, r10
+	adde	r11, r11, r11
+	addze	climb, r27
+	addc	r10, r10, r25
+	adde	r11, r11, r26
+	std	r24, 0(rp)
+	std	r10, 8(rp)
+	std	r11, 16(rp)
+	addi	rp, rp, 24
+	bdnz	L(top)
+	b	L(end)
+
+L(xb0):	ld	r6, 0(up)
+	ld	r7, 8(up)
+	ld	r12, 16(up)
+	ld	r23, 24(up)
+	addi	up, up, 32
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	mulld	r30, r23, r23
+	mulhdu	r31, r23, r23
+	ld	r10, 8(rp)
+	ld	r11, 16(rp)
+	ld	r6, 24(rp)
+	ld	r7, 32(rp)
+	ld	r12, 40(rp)
+	ld	r23, 48(rp)
+	addc	r10, r10, r10
+	adde	r11, r11, r11
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	adde	r12, r12, r12
+	adde	r23, r23, r23
+	addze	climb, r31
+	std	r24, 0(rp)
+	addc	r10, r10, r25
+	std	r10, 8(rp)
+	adde	r11, r11, r26
+	std	r11, 16(rp)
+	adde	r6, r6, r27
+	std	r6, 24(rp)
+	adde	r7, r7, r28
+	std	r7, 32(rp)
+	adde	r12, r12, r29
+	std	r12, 40(rp)
+	adde	r23, r23, r30
+	std	r23, 48(rp)
+	addi	rp, rp, 56
+	bdnz	L(top)
+	b	L(end)
+
+L(xb1):	ld	r6, 0(up)
+	addi	up, up, 8
+	mulld	r24, r6, r6
+	mulhdu	climb, r6, r6
+	std	r24, 0(rp)
+	addic	rp, rp, 8	C clear carry as side-effect
+
+	ALIGN(32)
+L(top):	ld	r6, 0(up)
+	ld	r7, 8(up)
+	ld	r12, 16(up)
+	ld	r23, 24(up)
+	addi	up, up, 32
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	mulld	r30, r23, r23
+	mulhdu	r31, r23, r23
+	ld	r8, 0(rp)
+	ld	r9, 8(rp)
+	adde	r8, r8, r8
+	adde	r9, r9, r9
+	ld	r10, 16(rp)
+	ld	r11, 24(rp)
+	adde	r10, r10, r10
+	adde	r11, r11, r11
+	ld	r6, 32(rp)
+	ld	r7, 40(rp)
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	ld	r12, 48(rp)
+	ld	r23, 56(rp)
+	adde	r12, r12, r12
+	adde	r23, r23, r23
+	addze	r31, r31
+	addc	r8, r8, climb
+	std	r8, 0(rp)
+	adde	r9, r9, r24
+	std	r9, 8(rp)
+	adde	r10, r10, r25
+	std	r10, 16(rp)
+	adde	r11, r11, r26
+	std	r11, 24(rp)
+	adde	r6, r6, r27
+	std	r6, 32(rp)
+	adde	r7, r7, r28
+	std	r7, 40(rp)
+	adde	r12, r12, r29
+	std	r12, 48(rp)
+	adde	r23, r23, r30
+	std	r23, 56(rp)
+	mr	climb, r31
+	addi	rp, rp, 64
+	bdnz	L(top)
+
+L(end):	addze	climb, climb
+	std	climb, 0(rp)
+
+	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	ld	r26, -48(r1)
+	ld	r25, -56(r1)
+	ld	r24, -64(r1)
+	ld	r23, -72(r1)
+	ld	r22, -80(r1)
+	ld	r21, -88(r1)
+	blr
+EPILOGUE()
--
cgit v1.2.3
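
For reference, the assembly above follows the usual basecase-squaring structure: accumulate the off-diagonal products with addmul_1-style rows, then double that triangle and add the diagonal squares (the sqr_diag_addlsh1 pass). The C sketch below illustrates that structure using only documented public mpn calls; the name ref_sqr_basecase and the explicit 2*n-limb scratch area tp are assumptions made for clarity, since the real mpn_sqr_basecase works in place in rp and fuses the doubling with the diagonal additions, as the L(top) loop does above.

/* Hedged reference sketch of the basecase squaring structure; not the
   PowerPC-64 routine itself.  rp has room for the 2*n result limbs, tp is
   2*n limbs of scratch (an assumption made for clarity only).  */
#include <string.h>
#include <gmp.h>

static void
ref_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr tp)
{
  mp_size_t i;

  /* Phase 1: off-diagonal triangle, tp[1..2n-2] = sum of u[i]*u[j] for i < j,
     one addmul_1-style row per outer iteration (the L(outer_loop) rows).  */
  memset (tp, 0, 2 * n * sizeof (mp_limb_t));
  for (i = 0; i < n - 1; i++)
    tp[n + i] = mpn_addmul_1 (tp + 2 * i + 1, up + i + 1, n - 1 - i, up[i]);

  /* Diagonal squares: rp[2i] and rp[2i+1] get the low and high limbs of
     u[i]^2 (the mulld/mulhdu pairs in the L(top) loop).  */
  for (i = 0; i < n; i++)
    rp[2 * i + 1] = mpn_mul_1 (rp + 2 * i, up + i, 1, up[i]);

  /* Phase 2 (sqr_diag_addlsh1): add twice the triangle; no carry leaves the
     2*n-limb result because the full square fits in 2*n limbs.  */
  mpn_lshift (tp, tp, 2 * n, 1);
  mpn_add_n (rp, rp, tp, 2 * n);
}

A caller would invoke it as ref_sqr_basecase (rp, up, n, tp) with non-overlapping rp and up and n >= 1, mirroring how mpn_sqr_basecase is used internally by GMP.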