From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/ia64/divrem_2.asm | 280 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 280 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/ia64/divrem_2.asm

(limited to 'gmp-6.3.0/mpn/ia64/divrem_2.asm')

diff --git a/gmp-6.3.0/mpn/ia64/divrem_2.asm b/gmp-6.3.0/mpn/ia64/divrem_2.asm
new file mode 100644
index 0000000..9864311
--- /dev/null
+++ b/gmp-6.3.0/mpn/ia64/divrem_2.asm
@@ -0,0 +1,280 @@
+dnl  IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               norm   frac
+C itanium 1
+C itanium 2     29     29
+
+
+C TODO
+C  * Inline and interleave limb inversion code with loop setup code.
+C  * We should use explicit bundling in much of the code, since it typically
+C    cuts some cycles with the GNU assembler.
+
+
+ASM_START()
+
+C HP's assembler requires these declarations for importing mpn_invert_limb
+	.global	mpn_invert_limb
+	.type	mpn_invert_limb,@function
+
+C INPUT PARAMETERS
+C qp  = r32
+C fn  = r33
+C np  = r34
+C nn  = r35
+C dp  = r36
+
+define(`f0x1', `f15')
+
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+	.prologue
+ifdef(`HAVE_ABI_32',
+`	addp4	r32 = 0, r32		C M I
+	addp4	r34 = 0, r34		C M I
+	zxt4	r35 = r35		C I
+	addp4	r36 = 0, r36		C M I
+	nop.m	0
+	zxt4	r33 = r33		C I
+	;;
+')
+	.save ar.pfs, r42
+	alloc	r42 = ar.pfs, 5, 9, 1, 0
+	shladd	r34 = r35, 3, r34	// r34 = np + 8*nn (past msl)
+	adds	r14 = 8, r36		// r14 = &dp[1]
+	mov	r43 = r1
+	;;
+	adds	r15 = -8, r34		// r15 = &np[nn-1]
+	ld8	r39 = [r14]		// d1 = dp[1]
+	.save ar.lc, r45
+	mov	r45 = ar.lc
+	adds	r14 = -16, r34		// r14 = &np[nn-2]
+	mov	r40 = r0		// retval (msl of quotient) = 0
+	adds	r34 = -24, r34		// r34 = &np[nn-3]
+	;;
+	ld8	r38 = [r15]		// n2 = np[nn-1]
+	.save rp, r41
+	mov	r41 = b0
+	.body
+	ld8	r36 = [r36]		// d0 = dp[0]
+	ld8	r37 = [r14]		// n1 = np[nn-2]
+	;;
+	cmp.gtu	p6, p7 = r39, r38	// d1 > n2?
+   (p6)	br.cond.dptk	.L8
+	;;
+	cmp.leu	p8, p9 = r36, r37	// d0 <= n1?
+	cmp.geu	p6, p7 = r39, r38	// d1 >= n2?
+	;;
+   (p8)	cmp4.ne.and.orcm p6, p7 = 0, r0
+   (p7)	br.cond.dptk	.L51
+.L8:
+	add	r14 = r33, r35		// un + fn
+	mov	r46 = r39		// argument to mpn_invert_limb
+	;;
+	adds	r35 = -3, r14
+	;;
+	cmp.gt	p12, p0 = r0, r35
+  (p12)	br.cond.dpnt	L(end)
+	br.call.sptk.many b0 = mpn_invert_limb
+	;;
+	setf.sig f11 = r8		// di (non-final)
+	setf.sig f34 = r39		// d1
+	setf.sig f33 = r36		// d0
+	mov	r1 = r43
+	;;
+	mov	r17 = 1
+	setf.sig f9 = r38		// n2
+	xma.l	f6 = f11, f34, f0	// t0 = LO(di * d1)
+	;;
+	setf.sig f10 = r37		// n1
+	setf.sig f15 = r17		// 1
+	xma.hu	f8 = f11, f33, f0	// s0 = HI(di * d0)
+	;;
+	getf.sig r17 = f6
+	getf.sig r16 = f8
+	mov	ar.lc = r35		// loop trip count
+	;;
+	sub	r18 = r0, r39		// -d1
+	add	r14 = r17, r36
+	;;
+	setf.sig f14 = r18		// -d1
+	cmp.leu	p8, p9 = r17, r14
+	add	r16 = r14, r16
+	;;
+   (p9)	adds	r19 = 0, r0
+   (p8)	adds	r19 = -1, r0
+	cmp.gtu	p6, p7 = r14, r16
+	;;
+   (p6)	adds	r19 = 1, r19
+	;;
+ifelse(1,1,`
+	cmp.gt	p7, p6 = r0, r19
+	;;
+   (p6)	adds	r8 = -1, r8		// di--
+   (p6)	sub	r14 = r16, r39		// t0 -= d1
+   (p6)	cmp.ltu	p6, p7 = r16, r39	// cy for: t0 - d1
+	;;
+   (p6)	cmp.gt	p9, p8 = 1, r19
+   (p7)	cmp.gt	p9, p8 = 0, r19
+   (p6)	adds	r19 = -1, r19		// t1 -= cy
+	mov	r16 = r14
+	;;
+   (p8)	adds	r8 = -1, r8		// di--
+   (p8)	sub	r14 = r16, r39		// t0 -= d1
+   (p8)	cmp.ltu	p8, p9 = r16, r39	// cy for: t0 - d1
+	;;
+   (p8)	cmp.gt	p7, p6 = 1, r19
+   (p9)	cmp.gt	p7, p6 = 0, r19
+   (p8)	adds	r19 = -1, r19		// t1 -= cy
+	mov	r16 = r14
+	;;
+   (p6)	adds	r8 = -1, r8		// di--
+   (p6)	sub	r14 = r16, r39		// t0 -= d1
+   (p6)	cmp.ltu	p6, p7 = r16, r39	// cy for: t0 - d1
+	;;
+   (p6)	cmp.gt	p9, p8 = 1, r19
+   (p7)	cmp.gt	p9, p8 = 0, r19
+   (p6)	adds	r19 = -1, r19		// t1 -= cy
+	mov	r16 = r14
+	;;
+   (p8)	adds	r8 = -1, r8		// di--
+   (p8)	sub	r14 = r16, r39		// t0 -= d1
+   (p8)	cmp.ltu	p8, p9 = r16, r39	// cy for: t0 - d1
+	;;
+   (p8)	adds	r19 = -1, r19		// t1 -= cy
+	mov	r16 = r14
+',`
+	cmp.gt	p8, p9 = r0, r19
+   (p8)	br.cond.dpnt	.L46
+.L52:
+	cmp.leu	p6, p7 = r39, r16
+	sub	r14 = r16, r39
+	adds	r8 = -1, r8
+	;;
+   (p7)	adds	r19 = -1, r19
+	mov	r16 = r14
+	;;
+   (p7)	cmp.gt	p8, p9 = r0, r19
+   (p9)	br.cond.dptk	.L52
+.L46:
+')
+	setf.sig f32 = r8		// di
+	shladd	r32 = r35, 3, r32	// qp += 8 * trip count
+	;;
+
+	ALIGN(16)
+L(top):	nop 0
+	nop 0
+	cmp.gt	p8, p9 = r33, r35	// in fraction part?
+	;;
+   (p8)	mov	r37 = r0		// fraction: next limb is 0
+   (p9)	ld8	r37 = [r34], -8		// else fetch next limb
+	xma.hu	f8 = f9, f32, f10	// 0,29
+	xma.l	f12 = f9, f32, f10	// 0
+	;;
+	getf.sig r20 = f12		// q0			 4
+	xma.l	f13 = f15, f8, f9	// q += n2		 4
+	sub	r8 = -1, r36		// bitnot d0
+	;;
+	getf.sig r18 = f13		//			 8
+	xma.l	f7 = f14, f13, f10	//			 8
+	xma.l	f6 = f33, f13, f33	// t0 = LO(d0*q+d0)	 8
+	xma.hu	f9 = f33, f13, f33	// t1 = HI(d0*q+d0)	 9
+	;;
+	getf.sig r38 = f7		// n1			12
+	getf.sig r16 = f6		//			13
+	getf.sig r19 = f9		//			14
+	;;
+	sub	r38 = r38, r39		// n1 -= d1		17
+	;;
+	cmp.ne	p9, p0 = r0, r0		// clear p9
+	cmp.leu	p10, p11 = r16, r37	// cy for: n0 - t0	18
+	;;
+	sub	r37 = r37, r16		// n0 -= t0		19
+  (p11)	sub	r38 = r38, r19, 1	// n1 -= t1 - cy	19
+  (p10)	sub	r38 = r38, r19		// n1 -= t1		19
+	;;
+	cmp.gtu	p6, p7 = r20, r38	// n1 >= q0		20
+	;;
+   (p7)	cmp.ltu	p9, p0 = r8, r37	//			21
+   (p6)	add	r18 = 1, r18		//
+   (p7)	add	r37 = r37, r36		//			21
+   (p7)	add	r38 = r38, r39		//			21
+	;;
+	setf.sig f10 = r37		// n1			22
+   (p9)	add	r38 = 1, r38		//			22
+	;;
+	setf.sig f9 = r38		// n2			23
+	cmp.gtu	p6, p7 = r39, r38	//			23
+   (p7)	br.cond.spnt	L(fix)
+L(bck):	st8	[r32] = r18, -8		// store quotient limb
+	adds	r35 = -1, r35
+	br.cloop.sptk.few L(top)
+	;;
+
+L(end):	add	r14 = 8, r34		// remainder low limb addr
+	add	r15 = 16, r34		// remainder high limb addr
+	mov	b0 = r41
+	;;
+	st8	[r14] = r37
+	st8	[r15] = r38
+	mov	ar.pfs = r42
+	mov	r8 = r40		// return most significant q limb
+	mov	ar.lc = r45
+	br.ret.sptk.many b0
+	;;
+.L51:
+	.pred.rel "mutex", p8, p9
+	sub	r37 = r37, r36		// n1 -= d0
+   (p9)	sub	r38 = r38, r39, 1	// n2 -= d1 - borrow
+   (p8)	sub	r38 = r38, r39		// n2 -= d1
+	adds	r40 = 1, r0		// retval = 1
+	br	.L8
+	;;
+
+L(fix):	cmp.geu	p6, p7 = r39, r38
+	cmp.leu	p8, p9 = r36, r37
+	;;
+   (p8)	cmp4.ne.and.orcm p6, p7 = 0, r0
+   (p6)	br.cond.dptk	L(bck)
+	sub	r37 = r37, r36		// n1 -= d0
+   (p9)	sub	r38 = r38, r39, 1	// n2 -= d1 - borrow
+   (p8)	sub	r38 = r38, r39		// n2 -= d1
+	adds	r18 = 1, r18		// q++
+	;;
+	setf.sig f9 = r38		// n2
+	setf.sig f10 = r37		// n1
+	br	L(bck)
+
+EPILOGUE()
+ASM_END()
-- 
cgit v1.2.3