From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/alpha/divrem_2.asm | 177 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/alpha/divrem_2.asm

diff --git a/gmp-6.3.0/mpn/alpha/divrem_2.asm b/gmp-6.3.0/mpn/alpha/divrem_2.asm
new file mode 100644
index 0000000..046b246
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/divrem_2.asm
@@ -0,0 +1,177 @@
+dnl Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C        norm   frac
+C ev4
+C ev5     70     70
+C ev6     29     29
+
+C TODO
+C  * Perhaps inline mpn_invert_limb, that would allow us to not save/restore
+C    any registers (thus save ~10 cycles per call).
+C  * Use negated d1 and/or d0 to speed carry propagation.  Might save a cycle
+C    or two.
+C  * Check cluster delays (for ev6).  We very likely could save some cycles.
+C  * Use branch-free code for computing di.
+C  * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call.
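+
+C OPERATION (in outline)
+C  Divide the un_param-limb number at up_param, extended by fn zero
+C  fraction limbs, by the normalized two-limb divisor at dp.  Quotient
+C  limbs are stored at qp, working down from the most significant one,
+C  the two-limb remainder is left in the low two limbs of the numerator
+C  area, and the most significant quotient limb (0 or 1) is returned in
+C  r0.  Each iteration of L(loop) produces one quotient limb from an
+C  inverse of dp[1] obtained via mpn_invert_limb and then adjusted in
+C  the L(L22) loop so that it also accounts for dp[0].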
+ +C INPUT PARAMETERS +define(`qp', `r16') +define(`fn', `r17') +define(`up_param', `r18') +define(`un_param', `r19') +define(`dp', `r20') + +ASM_START() +PROLOGUE(mpn_divrem_2,gp) + lda r30, -80(r30) + stq r26, 0(r30) + stq r9, 8(r30) + stq r10, 16(r30) + stq r11, 24(r30) + stq r12, 32(r30) + stq r13, 40(r30) +C stq r14, 48(r30) + stq r15, 56(r30) + .prologue 1 + stq r16, 64(r30) + bis r31, r17, r15 + s8addq r19, r18, r13 + lda r13, -24(r13) + ldq r12, 8(r20) + ldq r10, 0(r20) + ldq r11, 16(r13) + ldq r9, 8(r13) + + bis r31, r31, r3 C most_significant_q_limb = 0 + cmpult r11, r12, r1 + bne r1, L(L8) + cmpule r11, r12, r1 + cmpult r9, r10, r2 + and r1, r2, r1 + bne r1, L(L8) + subq r11, r12, r11 + subq r11, r2, r11 + subq r9, r10, r9 + lda r3, 1(r31) C most_significant_q_limb = 1 +L(L8): stq r3, 72(r30) + + addq r15, r19, r19 + lda r19, -3(r19) + blt r19, L(L10) + bis r31, r12, r16 + jsr r26, mpn_invert_limb + LDGP( r29, 0(r26)) + mulq r0, r12, r4 C t0 = LO(di * d1) + umulh r0, r10, r2 C s1 = HI(di * d0) + addq r4, r10, r4 C t0 += d0 + cmpule r10, r4, r7 C (t0 < d0) + addq r4, r2, r4 C t0 += s1 + cmpult r4, r2, r1 + subq r1, r7, r7 C t1 (-1, 0, or 1) + blt r7, L(L42) +L(L22): + lda r0, -1(r0) C di-- + cmpult r4, r12, r1 C cy for: t0 -= d1 (below) + subq r7, r1, r7 C t1 -= cy + subq r4, r12, r4 C t0 -= d1 + bge r7, L(L22) +L(L42): + ldq r16, 64(r30) + s8addq r19, r16, r16 + ALIGN(16) +L(loop): + mulq r11, r0, r5 C q0 (early) + umulh r11, r0, r6 C q (early) + addq r5, r9, r8 C q0 += n1 + addq r6, r11, r6 C q += n2 + cmpult r8, r5, r1 C cy for: q0 += n1 + addq r6, r1, r6 C q += cy + unop + mulq r12, r6, r1 C LO(d1 * q) + umulh r10, r6, r7 C t1 = HI(d0 * q) + subq r9, r1, r9 C n1 -= LO(d1 * q) + mulq r10, r6, r4 C t0 = LO(d0 * q) + unop + cmple r15, r19, r5 C condition and n0... + beq r5, L(L31) + ldq r5, 0(r13) + lda r13, -8(r13) +L(L31): subq r9, r12, r9 C n1 -= d1 + cmpult r5, r10, r1 C + subq r9, r1, r9 C + subq r5, r10, r5 C n0 -= d0 + subq r9, r7, r9 C n1 -= t0 + cmpult r5, r4, r1 C + subq r9, r1, r2 C + subq r5, r4, r5 C n0 -= t1 + cmpult r2, r8, r1 C (n1 < q0) + addq r6, r1, r6 C q += cond + lda r1, -1(r1) C -(n1 >= q0) + and r1, r10, r4 C + addq r5, r4, r9 C n0 += mask & d0 + and r1, r12, r1 C + cmpult r9, r5, r11 C cy for: n0 += mask & d0 + addq r2, r1, r1 C n1 += mask & d1 + addq r1, r11, r11 C n1 += cy + cmpult r11, r12, r1 C + beq r1, L(fix) C +L(bck): stq r6, 0(r16) + lda r16, -8(r16) + lda r19, -1(r19) + bge r19, L(loop) + +L(L10): stq r9, 8(r13) + stq r11, 16(r13) + ldq r0, 72(r30) + ldq r26, 0(r30) + ldq r9, 8(r30) + ldq r10, 16(r30) + ldq r11, 24(r30) + ldq r12, 32(r30) + ldq r13, 40(r30) +C ldq r14, 48(r30) + ldq r15, 56(r30) + lda r30, 80(r30) + ret r31, (r26), 1 + +L(fix): cmpule r11, r12, r1 + cmpult r9, r10, r2 + and r1, r2, r1 + bne r1, L(bck) + subq r11, r12, r11 + subq r11, r2, r11 + subq r9, r10, r9 + lda r6, 1(r6) + br L(bck) +EPILOGUE() +ASM_END() -- cgit v1.2.3
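
The code between the mpn_invert_limb call and L(loop) turns the single-limb
inverse of the high divisor limb into one that can be used against the full
two-limb divisor.  Below is a rough C model of that adjustment (the L(L22)
loop); invert_limb_model and adjust_inverse are illustrative names rather
than GMP functions, and the sketch assumes a compiler that provides
unsigned __int128 for the 64x64->128 bit products the assembly gets from
umulh.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t limb;
typedef unsigned __int128 limb2;

/* Model of mpn_invert_limb for a normalized limb d (high bit set):
   floor((B^2 - 1) / d) - B, with B = 2^64.  */
static limb
invert_limb_model (limb d)
{
  return (limb) ((((limb2) ~d << 64) | ~(limb) 0) / d);
}

/* C rendering of the correction loop at L(L22): decrement di until the
   two-limb quantity {t1,t0}, tracked with explicit carries exactly as the
   assembly does, goes negative.  */
static limb
adjust_inverse (limb di, limb d1, limb d0)
{
  limb t0, s1;
  int c1, c2;
  int64_t t1;

  t0 = di * d1;                            /* LO(di * d1) */
  s1 = (limb) (((limb2) di * d0) >> 64);   /* HI(di * d0) */
  t0 += d0;
  c1 = t0 < d0;                            /* carry from t0 += d0 */
  t0 += s1;
  c2 = t0 < s1;                            /* carry from t0 += s1 */
  t1 = (int64_t) c1 + c2 - 1;              /* -1, 0, or 1 */

  while (t1 >= 0)                          /* the L(L22) loop */
    {
      di -= 1;
      t1 -= t0 < d1;                       /* borrow from t0 -= d1 */
      t0 -= d1;
    }
  return di;
}

int
main (void)
{
  limb d1 = 0x8000000000000001ull;         /* sample normalized divisor */
  limb d0 = 0x0123456789abcdefull;
  limb di = adjust_inverse (invert_limb_model (d1), d1, d0);
  printf ("adjusted inverse: 0x%016llx\n", (unsigned long long) di);
  return 0;
}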