From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/ia64/mul_1.asm | 584 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 584 insertions(+) create mode 100644 gmp-6.3.0/mpn/ia64/mul_1.asm (limited to 'gmp-6.3.0/mpn/ia64/mul_1.asm') diff --git a/gmp-6.3.0/mpn/ia64/mul_1.asm b/gmp-6.3.0/mpn/ia64/mul_1.asm new file mode 100644 index 0000000..21bf6d0 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/mul_1.asm @@ -0,0 +1,584 @@ +dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and +dnl store the result in a second limb vector. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 4.0 +C Itanium 2: 2.0 + +C TODO +C * Further optimize feed-in and wind-down code, both for speed and code size. +C * Handle low limb input and results specially, using a common stf8 in the +C epilogue. +C * Use 1 c/l carry propagation scheme in wind-down code. +C * Use extra pointer register for `up' to speed up feed-in loads. +C * Work out final differences with addmul_1.asm. + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`vl', `r35') +define(`cy', `r36') C for mpn_mul_1c + +ASM_START() +PROLOGUE(mpn_mul_1) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mfi + adds r15 = -1, n C M I + mov f9 = f0 C F + mov.i r2 = ar.lc C I0 +} +{.mmi + ldf8 f7 = [up], 8 C M + nop.m 0 C M + and r14 = 3, n C M I + ;; +} +.Lcommon: +{.mii + setf.sig f6 = vl C M2 M3 + shr.u r31 = r15, 2 C I0 + cmp.eq p10, p0 = 0, r14 C M I +} +{.mii + cmp.eq p11, p0 = 2, r14 C M I + cmp.eq p12, p0 = 3, r14 C M I + nop.i 0 C I + ;; +} +{.mii + cmp.ne p6, p7 = r0, r0 C M I + mov.i ar.lc = r31 C I0 + cmp.ne p8, p9 = r0, r0 C M I +} +{.bbb + (p10) br.dptk .Lb00 C B + (p11) br.dptk .Lb10 C B + (p12) br.dptk .Lb11 C B + ;; +} + +.Lb01: mov r20 = 0 + br.cloop.dptk .grt1 C B + + xma.l f39 = f7, f6, f9 C F + xma.hu f43 = f7, f6, f9 C F + ;; + getf.sig r8 = f43 C M2 + stf8 [rp] = f39 C M2 M3 + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B + +.grt1: + ldf8 f32 = [up], 8 + ;; + ldf8 f33 = [up], 8 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f7, f6, f9 + xma.hu f43 = f7, f6, f9 + ;; + ldf8 f35 = [up], 8 + br.cloop.dptk .grt5 + + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + stf8 [rp] = f39, 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + getf.sig r16 = f38 + br .Lcj5 + +.grt5: + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r17 = f39 + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + ;; + getf.sig r18 = f36 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + ;; + getf.sig r19 = f37 + xma.hu f43 = f35, f6, f0 + br .LL01 + + +.Lb10: ldf8 f35 = [up], 8 + mov r23 = 0 + br.cloop.dptk .grt2 + + xma.l f38 = f7, f6, f9 + xma.hu f42 = f7, f6, f9 + ;; + stf8 [rp] = f38, 8 + xma.l f39 = f35, f6, f42 + xma.hu f43 = f35, f6, f42 + ;; + getf.sig r8 = f43 + stf8 [rp] = f39 + mov.i ar.lc = r2 + br.ret.sptk.many b0 + + +.grt2: + ldf8 f32 = [up], 8 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f7, f6, f9 + xma.hu f42 = f7, f6, f9 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + ldf8 f35 = [up], 8 + br.cloop.dptk .grt6 + + stf8 [rp] = f38, 8 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r20 = f42 + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + br .Lcj6 + +.grt6: + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r20 = f42 + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + ;; + getf.sig r17 = f39 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + ;; + getf.sig r18 = f36 + xma.hu f42 = f34, f6, f0 + br .LL10 + + +.Lb11: ldf8 f34 = [up], 8 + mov r22 = 0 + ;; + ldf8 f35 = [up], 8 + br.cloop.dptk .grt3 + ;; + + xma.l f37 = f7, f6, f9 + xma.hu f41 = f7, f6, f9 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + stf8 [rp] = f37, 8 + getf.sig r16 = f38 + getf.sig r20 = f42 + getf.sig r17 = f39 + getf.sig r8 = f43 + br .Lcj3 + +.grt3: + ldf8 f32 = [up], 8 + xma.l f37 = f7, f6, f9 + xma.hu f41 = f7, f6, f9 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r19 = f37 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + ldf8 f35 = [up], 8 + br.cloop.dptk .grt7 + + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + getf.sig r20 = f42 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + getf.sig r21 = f43 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r18 = f36 + st8 [rp] = r19, 8 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + br .Lcj7 + +.grt7: + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r20 = f42 + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + ;; + getf.sig r17 = f39 + xma.hu f41 = f33, f6, f0 + br .LL11 + + +.Lb00: ldf8 f33 = [up], 8 + mov r21 = 0 + ;; + ldf8 f34 = [up], 8 + ;; + ldf8 f35 = [up], 8 + xma.l f36 = f7, f6, f9 + xma.hu f40 = f7, f6, f9 + br.cloop.dptk .grt4 + + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + stf8 [rp] = f36, 8 + xma.l f39 = f35, f6, f0 + getf.sig r19 = f37 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + getf.sig r16 = f38 + getf.sig r20 = f42 + getf.sig r17 = f39 + br .Lcj4 + +.grt4: + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r18 = f36 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + ;; + getf.sig r19 = f37 + getf.sig r23 = f41 + xma.hu f43 = f35, f6, f0 + ldf8 f35 = [up], 8 + br.cloop.dptk .grt8 + + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + getf.sig r20 = f42 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r17 = f39 + st8 [rp] = r18, 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + br .Lcj8 + +.grt8: + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + br .LL00 + + +C *** MAIN LOOP START *** + ALIGN(32) +.Loop: + .pred.rel "mutex",p6,p7 + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + (p6) cmp.leu p8, p9 = r24, r17 + st8 [rp] = r24, 8 + xma.hu f40 = f32, f6, f0 + (p7) cmp.ltu p8, p9 = r24, r17 + ;; +.LL00: + .pred.rel "mutex",p8,p9 + getf.sig r20 = f42 + (p8) add r24 = r18, r21, 1 + nop.b 0 + ldf8 f32 = [up], 8 + (p9) add r24 = r18, r21 + nop.b 0 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + (p8) cmp.leu p6, p7 = r24, r18 + st8 [rp] = r24, 8 + xma.hu f41 = f33, f6, f0 + (p9) cmp.ltu p6, p7 = r24, r18 + ;; +.LL11: + .pred.rel "mutex",p6,p7 + getf.sig r21 = f43 + (p6) add r24 = r19, r22, 1 + nop.b 0 + ldf8 f33 = [up], 8 + (p7) add r24 = r19, r22 + nop.b 0 + ;; + .pred.rel "mutex",p6,p7 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + (p6) cmp.leu p8, p9 = r24, r19 + st8 [rp] = r24, 8 + xma.hu f42 = f34, f6, f0 + (p7) cmp.ltu p8, p9 = r24, r19 + ;; +.LL10: + .pred.rel "mutex",p8,p9 + getf.sig r22 = f40 + (p8) add r24 = r16, r23, 1 + nop.b 0 + ldf8 f34 = [up], 8 + (p9) add r24 = r16, r23 + nop.b 0 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + (p8) cmp.leu p6, p7 = r24, r16 + st8 [rp] = r24, 8 + xma.hu f43 = f35, f6, f0 + (p9) cmp.ltu p6, p7 = r24, r16 + ;; +.LL01: + .pred.rel "mutex",p6,p7 + getf.sig r23 = f41 + (p6) add r24 = r17, r20, 1 + nop.b 0 + ldf8 f35 = [up], 8 + (p7) add r24 = r17, r20 + br.cloop.dptk .Loop +C *** MAIN LOOP END *** + ;; + +.Lcj9: + .pred.rel "mutex",p6,p7 + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + (p6) cmp.leu p8, p9 = r24, r17 + st8 [rp] = r24, 8 + xma.hu f40 = f32, f6, f0 + (p7) cmp.ltu p8, p9 = r24, r17 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r20 = f42 + (p8) add r24 = r18, r21, 1 + (p9) add r24 = r18, r21 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + (p8) cmp.leu p6, p7 = r24, r18 + st8 [rp] = r24, 8 + xma.hu f41 = f33, f6, f0 + (p9) cmp.ltu p6, p7 = r24, r18 + ;; +.Lcj8: + .pred.rel "mutex",p6,p7 + getf.sig r21 = f43 + (p6) add r24 = r19, r22, 1 + (p7) add r24 = r19, r22 + ;; + .pred.rel "mutex",p6,p7 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + (p6) cmp.leu p8, p9 = r24, r19 + st8 [rp] = r24, 8 + xma.hu f42 = f34, f6, f0 + (p7) cmp.ltu p8, p9 = r24, r19 + ;; +.Lcj7: + .pred.rel "mutex",p8,p9 + getf.sig r22 = f40 + (p8) add r24 = r16, r23, 1 + (p9) add r24 = r16, r23 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + (p8) cmp.leu p6, p7 = r24, r16 + st8 [rp] = r24, 8 + xma.hu f43 = f35, f6, f0 + (p9) cmp.ltu p6, p7 = r24, r16 + ;; +.Lcj6: + .pred.rel "mutex",p6,p7 + getf.sig r23 = f41 + (p6) add r24 = r17, r20, 1 + (p7) add r24 = r17, r20 + ;; + .pred.rel "mutex",p6,p7 + (p6) cmp.leu p8, p9 = r24, r17 + (p7) cmp.ltu p8, p9 = r24, r17 + getf.sig r16 = f38 + st8 [rp] = r24, 8 + ;; +.Lcj5: + .pred.rel "mutex",p8,p9 + getf.sig r20 = f42 + (p8) add r24 = r18, r21, 1 + (p9) add r24 = r18, r21 + ;; + .pred.rel "mutex",p8,p9 + (p8) cmp.leu p6, p7 = r24, r18 + (p9) cmp.ltu p6, p7 = r24, r18 + getf.sig r17 = f39 + st8 [rp] = r24, 8 + ;; +.Lcj4: + .pred.rel "mutex",p6,p7 + getf.sig r8 = f43 + (p6) add r24 = r19, r22, 1 + (p7) add r24 = r19, r22 + ;; + .pred.rel "mutex",p6,p7 + st8 [rp] = r24, 8 + (p6) cmp.leu p8, p9 = r24, r19 + (p7) cmp.ltu p8, p9 = r24, r19 + ;; +.Lcj3: + .pred.rel "mutex",p8,p9 + (p8) add r24 = r16, r23, 1 + (p9) add r24 = r16, r23 + ;; + .pred.rel "mutex",p8,p9 + st8 [rp] = r24, 8 + (p8) cmp.leu p6, p7 = r24, r16 + (p9) cmp.ltu p6, p7 = r24, r16 + ;; +.Lcj2: + .pred.rel "mutex",p6,p7 + (p6) add r24 = r17, r20, 1 + (p7) add r24 = r17, r20 + ;; + .pred.rel "mutex",p6,p7 + st8 [rp] = r24, 8 + (p6) cmp.leu p8, p9 = r24, r17 + (p7) cmp.ltu p8, p9 = r24, r17 + ;; + (p8) add r8 = 1, r8 + mov.i ar.lc = r2 + br.ret.sptk.many b0 +EPILOGUE() + +PROLOGUE(mpn_mul_1c) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mmi + adds r15 = -1, n C M I + setf.sig f9 = cy C M2 M3 + mov.i r2 = ar.lc C I0 +} +{.mmb + ldf8 f7 = [up], 8 C M + and r14 = 3, n C M I + br.sptk .Lcommon + ;; +} +EPILOGUE() +ASM_END() -- cgit v1.2.3