From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/ia64/README               | 281 +++++
 gmp-6.3.0/mpn/ia64/add_n_sub_n.asm      | 307 ++++++
 gmp-6.3.0/mpn/ia64/addmul_1.asm         | 602 ++++++++++
 gmp-6.3.0/mpn/ia64/addmul_2.asm         | 715 +++++++++++++++
 gmp-6.3.0/mpn/ia64/aors_n.asm           | 852 ++++++++++++++++++++++++
 gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm    |  48 ++
 gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm    |  48 ++
 gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm    | 412 +++++++++++
 gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm       | 516 +++++++++++++++
 gmp-6.3.0/mpn/ia64/cnd_aors_n.asm       | 264 ++++++++
 gmp-6.3.0/mpn/ia64/copyd.asm            | 186 +++++++
 gmp-6.3.0/mpn/ia64/copyi.asm            | 182 +++++++
 gmp-6.3.0/mpn/ia64/dive_1.asm           | 236 +++++++++
 gmp-6.3.0/mpn/ia64/divrem_1.asm         | 477 ++++++++++++++++++
 gmp-6.3.0/mpn/ia64/divrem_2.asm         | 280 +++++++++++
 gmp-6.3.0/mpn/ia64/gcd_11.asm           | 110 +++++
 gmp-6.3.0/mpn/ia64/gmp-mparam.h         | 212 ++++++++
 gmp-6.3.0/mpn/ia64/hamdist.asm          | 365 ++++++++++++++
 gmp-6.3.0/mpn/ia64/ia64-defs.m4         | 147 ++++++
 gmp-6.3.0/mpn/ia64/invert_limb.asm      | 105 ++++
 gmp-6.3.0/mpn/ia64/logops_n.asm         | 292 +++++++++++
 gmp-6.3.0/mpn/ia64/lorrshift.asm        | 358 ++++++++++++++
 gmp-6.3.0/mpn/ia64/lshiftc.asm          | 463 +++++++++++++++++
 gmp-6.3.0/mpn/ia64/mod_34lsub1.asm      | 237 +++++++++
 gmp-6.3.0/mpn/ia64/mode1o.asm           | 342 +++++++++++++
 gmp-6.3.0/mpn/ia64/mul_1.asm            | 584 ++++++++++++++++
 gmp-6.3.0/mpn/ia64/mul_2.asm            | 625 +++++++++++++++++++++++
 gmp-6.3.0/mpn/ia64/popcount.asm         | 200 ++++++++
 gmp-6.3.0/mpn/ia64/rsh1aors_n.asm       | 447 +++++++++++++++++
 gmp-6.3.0/mpn/ia64/sec_tabselect.asm    | 148 ++++++
 gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm | 156 ++++++
 gmp-6.3.0/mpn/ia64/submul_1.asm         | 647 ++++++++++++++++++++++++
 32 files changed, 10844 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/ia64/README
 create mode 100644 gmp-6.3.0/mpn/ia64/add_n_sub_n.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/addmul_1.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/addmul_2.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/aors_n.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/cnd_aors_n.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/copyd.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/copyi.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/dive_1.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/divrem_1.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/divrem_2.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/gcd_11.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/gmp-mparam.h
 create mode 100644 gmp-6.3.0/mpn/ia64/hamdist.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/ia64-defs.m4
 create mode 100644 gmp-6.3.0/mpn/ia64/invert_limb.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/logops_n.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/lorrshift.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/lshiftc.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/mod_34lsub1.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/mode1o.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/mul_1.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/mul_2.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/popcount.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/rsh1aors_n.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/sec_tabselect.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm
 create mode 100644 gmp-6.3.0/mpn/ia64/submul_1.asm
diff --git a/gmp-6.3.0/mpn/ia64/README b/gmp-6.3.0/mpn/ia64/README
new file mode 100644
index 0000000..45c2d63
--- /dev/null
+++ b/gmp-6.3.0/mpn/ia64/README
@@ -0,0 +1,281 @@
+Copyright 2000-2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+                      IA-64 MPN SUBROUTINES
+
+
+This directory contains mpn functions for the IA-64 architecture.
+
+
+CODE ORGANIZATION
+
+	mpn/ia64          itanium-2, and generic ia64
+
+The code here has been optimized primarily for Itanium 2.  Very few
+Itanium 1 chips were ever sold, and Itanium 2 is more powerful, so the
+latter is what we concentrate on.
+
+
+
+CHIP NOTES
+
+The IA-64 ISA packs instructions in groups of three into 128-bit bundles.
+Programmers/compilers need to insert explicit breaks `;;' when there are
+WAW or RAW dependencies, with some notable exceptions.  Such "breaks" are
+typically placed at the end of a bundle, but can be put between operations
+within some bundle types too.
+
+The Itanium 1 and Itanium 2 implementations can under ideal conditions
+execute two bundles per cycle.  The Itanium 1 allows 4 of these 6
+instructions to do integer operations, while the Itanium 2 allows all 6 to
+be integer operations.
+
+Taken cloop branches seem to insert a bubble into the pipeline most of the
+time on Itanium 1.
+
+Loads to the fp registers bypass the L1 cache and thus get extremely long
+latencies, 9 cycles on the Itanium 1 and 6 cycles on the Itanium 2.
+
+The software pipelining support using the br.ctop instruction causes
+delays, since many issue slots are taken up by instructions with zero
+predicates, and since many extra instructions are needed to set things up.
+These features are clearly designed for code density, not speed.
+
+Misc pipeline limitations (Itanium 1):
+* The getf.sig instruction can only execute in M0.
+* At most four integer instructions/cycle.
+* Nops take up resources like any plain instruction.
+
+Misc pipeline limitations (Itanium 2):
+* The getf.sig instruction can only execute in M0.
+* Nops take up resources like any plain instruction.
+
+
+ASSEMBLY SYNTAX
+
+.align pads with nops in a text segment, but gas 2.14 and earlier
+incorrectly byte-swaps its nop bundle in big endian mode (e.g. hpux),
+making it come out as break instructions.  We use the ALIGN() macro in
+mpn/ia64/ia64-defs.m4 where execution might run across the padding.  That
+macro suppresses any .align if the problem is detected by configure.  Lack
+of alignment might hurt performance but will at least be correct.
+
+foo:: to create a global symbol is not accepted by gas.  Use separate
+".global foo" and "foo:" instead.
+
+.global is the standard global directive.  gas accepts .globl, but hpux
+"as" doesn't.
+
+.proc / .endp generates the appropriate .type and .size information for
+ELF, so the latter directives don't need to be given explicitly.
+
+.pred.rel "mutex"... is standard for annotating predicate register
+relationships.  gas also accepts .pred.rel.mutex, but hpux "as" doesn't.
+
+.pred directives can't be put on a line with a label, like
+".Lfoo: .pred ..."; the HP assembler on HP-UX 11.23 rejects that.  gas is
+happy with it, and past versions of the HP assembler seemed OK with it
+too.
+
+// is the standard comment sequence, but we prefer "C" since it inhibits
+m4 macro expansion.  See comments in ia64-defs.m4.
+
+
+REGISTER USAGE
+
+Special:
+	r0:  constant 0
+	r1:  global pointer (gp)
+	r8:  return value
+	r12: stack pointer (sp)
+	r13: thread pointer (tp)
+Caller-saves: r8-r11 r14-r31 f6-f15 f32-f127
+Caller-saves but rotating: r32-
+
+
+================================================================
+mpn_add_n, mpn_sub_n:
+
+The current code runs at 1.25 c/l on Itanium 2.
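+
+For reference, the operation these functions perform, as a portable C
+sketch (illustrative only, using the mp_limb_t/mp_size_t types from
+gmp.h; this is not the code actually used):
+
+	mp_limb_t
+	ref_add_n (mp_limb_t *rp, const mp_limb_t *up,
+		   const mp_limb_t *vp, mp_size_t n)
+	{
+	  mp_limb_t cy = 0;
+	  mp_size_t i;
+	  for (i = 0; i < n; i++)
+	    {
+	      mp_limb_t s = up[i] + vp[i] + cy;
+	      /* carry out: the add wrapped, or cy pushed it exactly around */
+	      cy = (s < up[i]) | (cy & (s == up[i]));
+	      rp[i] = s;
+	    }
+	  return cy;		/* 0 or 1 */
+	}
+
+mpn_sub_n is analogous, with subtraction and a borrow in place of the
+carry.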
+
+================================================================
+mpn_mul_1:
+
+The current code runs at 2 c/l on Itanium 2.
+
+Using a blocked approach, working off of 4 separate places in the
+operands, one could make use of the xma accumulation, and approach 1 c/l.
+
+	ldf8		[up]
+	xma.l
+	xma.hu
+	stf8		[wrp]
+
+================================================================
+mpn_addmul_1:
+
+The current code runs at 2 c/l on Itanium 2.
+
+It seems possible to use a blocked approach, as with mpn_mul_1.  We should
+read rp[] into integer registers, allowing for just one getf.sig per
+cycle.
+
+	ld8		[rp]
+	ldf8		[up]
+	xma.l
+	xma.hu
+	getf.sig
+	add+add+cmp+cmp
+	st8		[wrp]
+
+These 10 instructions can be scheduled to approach 1.667 cycles/limb, and
+with the 4-cycle latency of xma, this means we need at least 3 blocks.
+Using ldfp8 we could approach 1.583 c/l.
+
+================================================================
+mpn_submul_1:
+
+The current code runs at 2.25 c/l on Itanium 2.  Getting to 2 c/l requires
+ldfp8, with all the alignment headaches that implies.
+
+================================================================
+mpn_addmul_N
+
+For best speed, we need to give up using mpn_addmul_2 as the main multiply
+building block, and instead take multiple v limbs per loop.  For the
+Itanium 1, we need to take about 8 limbs at a time for full speed.  For
+the Itanium 2, something like mpn_addmul_4 should be enough.
+
+The add+cmp+cmp+add we use in the other code is optimal for shortening
+recurrences (1 cycle), but the sequence takes up 4 execution slots.  When
+recurrence depth is not critical, a more standard 3-cycle add+cmp+add is
+better.
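+
+In portable C terms, the shortened recurrence looks roughly like this (an
+illustrative sketch with made-up variable names; the assembly expresses
+the compares with predicate registers):
+
+	w0 = u + v;		/* add: independent of carry-in */
+	c0 = w0 < u;		/* cmp: independent of carry-in */
+	w  = w0 + cy;		/* add */
+	co = c0 | (cy & (w0 == ~(mp_limb_t) 0));	/* cmp */
+
+Only the last two operations depend on the incoming carry cy, so carry-out
+is a single operation away from carry-in; the price is four slots per limb
+where the chained add+cmp+add variant needs only three.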
+
+/* First load the 8 values from v */
+	ldfp8		v0, v1 = [r35], 16;;
+	ldfp8		v2, v3 = [r35], 16;;
+	ldfp8		v4, v5 = [r35], 16;;
+	ldfp8		v6, v7 = [r35], 16;;
+
+/* In the inner loop, get a new U limb and store a result limb. */
+	mov		lc = un
+Loop:	ldf8		u0 = [r33], 8
+	ld8		r0 = [r32]
+	xma.l		lp0 = v0, u0, hp0
+	xma.hu		hp0 = v0, u0, hp0
+	xma.l		lp1 = v1, u0, hp1
+	xma.hu		hp1 = v1, u0, hp1
+	xma.l		lp2 = v2, u0, hp2
+	xma.hu		hp2 = v2, u0, hp2
+	xma.l		lp3 = v3, u0, hp3
+	xma.hu		hp3 = v3, u0, hp3
+	xma.l		lp4 = v4, u0, hp4
+	xma.hu		hp4 = v4, u0, hp4
+	xma.l		lp5 = v5, u0, hp5
+	xma.hu		hp5 = v5, u0, hp5
+	xma.l		lp6 = v6, u0, hp6
+	xma.hu		hp6 = v6, u0, hp6
+	xma.l		lp7 = v7, u0, hp7
+	xma.hu		hp7 = v7, u0, hp7
+	getf.sig	l0 = lp0
+	getf.sig	l1 = lp1
+	getf.sig	l2 = lp2
+	getf.sig	l3 = lp3
+	getf.sig	l4 = lp4
+	getf.sig	l5 = lp5
+	getf.sig	l6 = lp6
+	add+cmp+add	xx, l0, r0
+	add+cmp+add	acc0, acc1, l1
+	add+cmp+add	acc1, acc2, l2
+	add+cmp+add	acc2, acc3, l3
+	add+cmp+add	acc3, acc4, l4
+	add+cmp+add	acc4, acc5, l5
+	add+cmp+add	acc5, acc6, l6
+	getf.sig	acc6 = lp7
+	st8		[r32] = xx, 8
+	br.cloop	Loop
+
+	49 insn at max 6 insn/cycle:		8.167 cycles/limb8
+	11 memops at max 2 memops/cycle:	5.5   cycles/limb8
+	16 fpops at max 2 fpops/cycle:		8     cycles/limb8
+	21 intops at max 4 intops/cycle:	5.25  cycles/limb8
+	11+21 memops+intops at max 4/cycle:	8     cycles/limb8
+
+================================================================
+mpn_lshift, mpn_rshift
+
+The current code runs at 1 cycle/limb on Itanium 2.
+
+Using 63 separate loops, we could use the double-word shrp instruction.
+That instruction has a plain single-cycle latency.  We need 63 loops since
+it only accepts an immediate shift count.  That would lead to a somewhat
+silly code size, but the speed would be 0.75 c/l on Itanium 2 (by using
+shrp each cycle plus shl/shr going down I1 for a further limb every second
+cycle).
+
+================================================================
+mpn_copyi, mpn_copyd
+
+The current code runs at 0.5 c/l on Itanium 2, but that is only for the L1
+cache-hit case.  The 4-way unrolled loop takes just 2 cycles, and thus
+load-use scheduling isn't great.  It might be best to use modulo-scheduled
+loops, since that would allow better load-use scheduling without too much
+unrolling.
+
+Depending on size or operand alignment, we get 1 c/l or 0.5 c/l on Itanium
+2, according to tune/speed.  Cache bank conflicts?
+
+
+
+REFERENCES
+
+Intel Itanium Architecture Software Developer's Manual, volumes 1 to 3,
+Intel documents 245317-004, 245318-004, 245319-004, October 2002.  Volume
+1 includes an Itanium optimization guide.
+
+Intel Itanium Processor-specific Application Binary Interface (ABI), Intel
+document 245370-003, May 2001.  Describes C type sizes, dynamic linking,
+etc.
+
+Intel Itanium Architecture Assembly Language Reference Guide, Intel
+document 248801-004, 2000-2002.  Describes assembly instruction syntax and
+other directives.
+
+Itanium Software Conventions and Runtime Architecture Guide, Intel
+document 245358-003, May 2001.  Describes calling conventions, including
+stack unwinding requirements.
+
+Intel Itanium Processor Reference Manual for Software Optimization, Intel
+document 245473-003, November 2001.
+
+Intel Itanium-2 Processor Reference Manual for Software Development and
+Optimization, Intel document 251110-003, May 2004.
+
+All the above documents can be found online at
+
+	http://developer.intel.com/design/itanium/manuals.htm
diff --git a/gmp-6.3.0/mpn/ia64/add_n_sub_n.asm b/gmp-6.3.0/mpn/ia64/add_n_sub_n.asm
new file mode 100644
index 0000000..c15afaa
--- /dev/null
+++ b/gmp-6.3.0/mpn/ia64/add_n_sub_n.asm
@@ -0,0 +1,307 @@
+dnl  IA-64 mpn_add_n_sub_n -- mpn parallel addition and subtraction.
+ +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 2.25 + +C INPUT PARAMETERS +define(`sp', `r32') +define(`dp', `r33') +define(`up', `r34') +define(`vp', `r35') +define(`n', `r36') + +C Some useful aliases for registers we use +define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19') +define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23') +define(`s0',`r24') define(`s1',`r25') define(`s2',`r26') define(`s3',`r27') +define(`d0',`r28') define(`d1',`r29') define(`d2',`r30') define(`d3',`r31') +define(`up0',`up') +define(`up1',`r14') +define(`vp0',`vp') +define(`vp1',`r15') + + +ASM_START() +PROLOGUE(mpn_add_n_sub_n) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 sp = 0, sp C M I + addp4 dp = 0, dp C M I + nop.i 0 + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I + zxt4 n = n C I + ;; +') + + and r9 = 3, n C M I + mov.i r2 = ar.lc C I0 + add up1 = 8, up0 C M I + add vp1 = 8, vp0 C M I + add r8 = -2, n C M I + add r10 = 256, up C M I + ;; + shr.u r8 = r8, 2 C I0 + cmp.eq p10, p0 = 0, r9 C M I + cmp.eq p11, p0 = 2, r9 C M I + cmp.eq p12, p0 = 3, r9 C M I + add r11 = 256, vp C M I + ;; + mov.i ar.lc = r8 C I0 + (p10) br L(b0) C B + (p11) br L(b2) C B + (p12) br L(b3) C B + +L(b1): ld8 u3 = [up0], 8 C M01 + add up1 = 8, up1 C M I + cmpltu p14, p15 = 4, n C M I + ld8 v3 = [vp0], 8 C M01 + add vp1 = 8, vp1 C M I + ;; + add s3 = u3, v3 C M I + sub d3 = u3, v3 C M I + mov r8 = 0 C M I + ;; + cmpltu p9, p0 = s3, v3 C carry from add3 M I + cmpltu p13, p0 = u3, v3 C borrow from sub3 M I + (p15) br L(cj1) C B + st8 [sp] = s3, 8 C M23 + st8 [dp] = d3, 8 C M23 + br L(c0) C B + +L(b0): cmp.ne p9, p0 = r0, r0 C M I + cmp.ne p13, p0 = r0, r0 C M I +L(c0): ld8 u0 = [up0], 16 C M01 + ld8 u1 = [up1], 16 C M01 + ;; + ld8 v0 = [vp0], 16 C M01 + ld8 v1 = [vp1], 16 C M01 + ;; + ld8 u2 = [up0], 16 C M01 + ld8 u3 = [up1], 16 C M01 + ;; + ld8 v2 = [vp0], 16 C M01 + ld8 v3 = [vp1], 16 C M01 + ;; + add s0 = u0, v0 C M I + add s1 = u1, v1 C M I + sub d0 = u0, v0 C M I + sub d1 = u1, v1 C M I + ;; + cmpltu p6, p0 = s0, v0 C carry from add0 M I + cmpltu p7, p0 = s1, v1 C carry from add1 M I + cmpltu p10, p0 = u0, v0 C borrow from sub0 M I + cmpltu p11, p0 = u1, v1 C borrow from sub1 M I + ;; + nop 0 C + br.cloop.dptk L(top) C B + br L(end) C B + +L(b3): ld8 u1 = [up0], 8 C M01 + add up1 = 8, up1 C M I + ld8 v1 = 
[vp0], 8 C M01 + ;; + add vp1 = 8, vp1 C M I + add s1 = u1, v1 C M I + sub d1 = u1, v1 C M I + ;; + cmpltu p7, p0 = s1, v1 C carry from add1 M I + cmpltu p11, p0 = u1, v1 C borrow from sub1 M I + ;; + st8 [sp] = s1, 8 C M23 + st8 [dp] = d1, 8 C M23 + br L(c2) C B + + ALIGN(32) +L(b2): cmp.ne p7, p0 = r0, r0 C M I + cmp.ne p11, p0 = r0, r0 C M I + nop 0 +L(c2): ld8 u2 = [up0], 16 C M01 + ld8 u3 = [up1], 16 C M01 + cmpltu p14, p0 = 4, n C M I + ;; + ld8 v2 = [vp0], 16 C M01 + ld8 v3 = [vp1], 16 C M01 + (p14) br L(gt4) C B + ;; + add s2 = u2, v2 C M I + add s3 = u3, v3 C M I + sub d2 = u2, v2 C M I + sub d3 = u3, v3 C M I + ;; + cmpltu p8, p0 = s2, v2 C carry from add0 M I + cmpltu p9, p0 = s3, v3 C carry from add3 M I + cmpltu p12, p0 = u2, v2 C borrow from sub2 M I + cmpltu p13, p0 = u3, v3 C borrow from sub3 M I + br L(cj2) C B + ;; +L(gt4): ld8 u0 = [up0], 16 C M01 + ld8 u1 = [up1], 16 C M01 + ;; + ld8 v0 = [vp0], 16 C M01 + ld8 v1 = [vp1], 16 C M01 + ;; + add s2 = u2, v2 C M I + add s3 = u3, v3 C M I + sub d2 = u2, v2 C M I + sub d3 = u3, v3 C M I + ;; + cmpltu p8, p0 = s2, v2 C carry from add0 M I + cmpltu p9, p0 = s3, v3 C carry from add1 M I + cmpltu p12, p0 = u2, v2 C borrow from sub0 M I + cmpltu p13, p0 = u3, v3 C borrow from sub1 M I + br.cloop.dptk L(mid) C B + + ALIGN(32) +L(top): + ld8 u0 = [up0], 16 C M01 + ld8 u1 = [up1], 16 C M01 + (p9) cmpeqor p6, p0 = -1, s0 C M I + (p9) add s0 = 1, s0 C M I + (p13) cmpeqor p10, p0 = 0, d0 C M I + (p13) add d0 = -1, d0 C M I + ;; + ld8 v0 = [vp0], 16 C M01 + ld8 v1 = [vp1], 16 C M01 + (p6) cmpeqor p7, p0 = -1, s1 C M I + (p6) add s1 = 1, s1 C M I + (p10) cmpeqor p11, p0 = 0, d1 C M I + (p10) add d1 = -1, d1 C M I + ;; + st8 [sp] = s0, 8 C M23 + st8 [dp] = d0, 8 C M23 + add s2 = u2, v2 C M I + add s3 = u3, v3 C M I + sub d2 = u2, v2 C M I + sub d3 = u3, v3 C M I + ;; + st8 [sp] = s1, 8 C M23 + st8 [dp] = d1, 8 C M23 + cmpltu p8, p0 = s2, v2 C carry from add2 M I + cmpltu p9, p0 = s3, v3 C carry from add3 M I + cmpltu p12, p0 = u2, v2 C borrow from sub2 M I + cmpltu p13, p0 = u3, v3 C borrow from sub3 M I + ;; +L(mid): + ld8 u2 = [up0], 16 C M01 + ld8 u3 = [up1], 16 C M01 + (p7) cmpeqor p8, p0 = -1, s2 C M I + (p7) add s2 = 1, s2 C M I + (p11) cmpeqor p12, p0 = 0, d2 C M I + (p11) add d2 = -1, d2 C M I + ;; + ld8 v2 = [vp0], 16 C M01 + ld8 v3 = [vp1], 16 C M01 + (p8) cmpeqor p9, p0 = -1, s3 C M I + (p8) add s3 = 1, s3 C M I + (p12) cmpeqor p13, p0 = 0, d3 C M I + (p12) add d3 = -1, d3 C M I + ;; + st8 [sp] = s2, 8 C M23 + st8 [dp] = d2, 8 C M23 + add s0 = u0, v0 C M I + add s1 = u1, v1 C M I + sub d0 = u0, v0 C M I + sub d1 = u1, v1 C M I + ;; + st8 [sp] = s3, 8 C M23 + st8 [dp] = d3, 8 C M23 + cmpltu p6, p0 = s0, v0 C carry from add0 M I + cmpltu p7, p0 = s1, v1 C carry from add1 M I + cmpltu p10, p0 = u0, v0 C borrow from sub0 M I + cmpltu p11, p0 = u1, v1 C borrow from sub1 M I + ;; + lfetch [r10], 32 C M? + lfetch [r11], 32 C M? 
+ br.cloop.dptk L(top) C B + ;; + +L(end): + nop 0 + nop 0 + (p9) cmpeqor p6, p0 = -1, s0 C M I + (p9) add s0 = 1, s0 C M I + (p13) cmpeqor p10, p0 = 0, d0 C M I + (p13) add d0 = -1, d0 C M I + ;; + nop 0 + nop 0 + (p6) cmpeqor p7, p0 = -1, s1 C M I + (p6) add s1 = 1, s1 C M I + (p10) cmpeqor p11, p0 = 0, d1 C M I + (p10) add d1 = -1, d1 C M I + ;; + st8 [sp] = s0, 8 C M23 + st8 [dp] = d0, 8 C M23 + add s2 = u2, v2 C M I + add s3 = u3, v3 C M I + sub d2 = u2, v2 C M I + sub d3 = u3, v3 C M I + ;; + st8 [sp] = s1, 8 C M23 + st8 [dp] = d1, 8 C M23 + cmpltu p8, p0 = s2, v2 C carry from add2 M I + cmpltu p9, p0 = s3, v3 C carry from add3 M I + cmpltu p12, p0 = u2, v2 C borrow from sub2 M I + cmpltu p13, p0 = u3, v3 C borrow from sub3 M I + ;; +L(cj2): + (p7) cmpeqor p8, p0 = -1, s2 C M I + (p7) add s2 = 1, s2 C M I + (p11) cmpeqor p12, p0 = 0, d2 C M I + (p11) add d2 = -1, d2 C M I + mov r8 = 0 C M I + nop 0 + ;; + st8 [sp] = s2, 8 C M23 + st8 [dp] = d2, 8 C M23 + (p8) cmpeqor p9, p0 = -1, s3 C M I + (p8) add s3 = 1, s3 C M I + (p12) cmpeqor p13, p0 = 0, d3 C M I + (p12) add d3 = -1, d3 C M I + ;; +L(cj1): + (p9) mov r8 = 2 C M I + ;; + mov.i ar.lc = r2 C I0 + (p13) add r8 = 1, r8 C M I + st8 [sp] = s3 C M23 + st8 [dp] = d3 C M23 + br.ret.sptk.many b0 C B +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/addmul_1.asm b/gmp-6.3.0/mpn/ia64/addmul_1.asm new file mode 100644 index 0000000..ffa3297 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/addmul_1.asm @@ -0,0 +1,602 @@ +dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2005, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 3.0 +C Itanium 2: 2.0 + +C TODO +C * Further optimize feed-in and wind-down code, both for speed and code size. +C * Handle low limb input and results specially, using a common stf8 in the +C epilogue. +C * Use 1 c/l carry propagation scheme in wind-down code. +C * Use extra pointer registers for `up' and rp to speed up feed-in loads. +C * Work out final differences with mul_1.asm. That function is 300 bytes +C smaller than this due to better loop scheduling and thus simpler feed-in +C code. 
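+
+C For reference, the operation implemented here, as a portable C sketch
+C (illustrative only; umul_ppmm is the 64x64->128 multiply primitive from
+C longlong.h):
+C
+C	mp_limb_t
+C	ref_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
+C	{
+C	  mp_limb_t cy = 0;
+C	  mp_size_t i;
+C	  for (i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t hi, lo;
+C	      umul_ppmm (hi, lo, up[i], vl);	C hi:lo = up[i] * vl
+C	      lo += cy;    hi += lo < cy;	C add carry-in limb
+C	      lo += rp[i]; hi += lo < rp[i];	C add old rp limb
+C	      rp[i] = lo;
+C	      cy = hi;				C carry-out limb
+C	    }
+C	  return cy;
+C	}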
+ +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`vl', `r35') + +ASM_START() +PROLOGUE(mpn_addmul_1) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mmi + adds r15 = -1, n C M I + mov r20 = rp C M I + mov.i r2 = ar.lc C I0 +} +{.mmi + ldf8 f7 = [up], 8 C M + ldf8 f8 = [rp], 8 C M + and r14 = 3, n C M I + ;; +} +{.mmi + setf.sig f6 = vl C M2 M3 + cmp.eq p10, p0 = 0, r14 C M I + shr.u r31 = r15, 2 C I0 +} +{.mmi + cmp.eq p11, p0 = 2, r14 C M I + cmp.eq p12, p0 = 3, r14 C M I + nop.i 0 C I + ;; +} +{.mii + cmp.ne p6, p7 = r0, r0 C M I + mov.i ar.lc = r31 C I0 + cmp.ne p8, p9 = r0, r0 C M I +} +{.bbb + (p10) br.dptk .Lb00 C B + (p11) br.dptk .Lb10 C B + (p12) br.dptk .Lb11 C B + ;; +} + +.Lb01: br.cloop.dptk .grt1 C B + + xma.l f39 = f7, f6, f8 C F + xma.hu f43 = f7, f6, f8 C F + ;; + getf.sig r8 = f43 C M2 + stf8 [r20] = f39 C M2 M3 + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B + +.grt1: + ldf8 f32 = [up], 8 + ldf8 f44 = [rp], 8 + ;; + ldf8 f33 = [up], 8 + ldf8 f45 = [rp], 8 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f7, f6, f8 + ldf8 f46 = [rp], 8 + xma.hu f43 = f7, f6, f8 + ;; + ldf8 f35 = [up], 8 + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt5 + + xma.l f36 = f32, f6, f44 + xma.hu f40 = f32, f6, f44 + ;; + stf8 [r20] = f39, 8 + xma.l f37 = f33, f6, f45 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 + getf.sig r24 = f36 + xma.l f38 = f34, f6, f46 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 + getf.sig r25 = f37 + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 + getf.sig r26 = f38 + br .Lcj5 + +.grt5: + mov r30 = 0 + xma.l f36 = f32, f6, f44 + xma.hu f40 = f32, f6, f44 + ;; + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f45 + ldf8 f44 = [rp], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f33 = [up], 8 + getf.sig r27 = f39 + ;; + getf.sig r31 = f43 + xma.l f38 = f34, f6, f46 + ldf8 f45 = [rp], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f34 = [up], 8 + getf.sig r24 = f36 + ;; + getf.sig r28 = f40 + xma.l f39 = f35, f6, f47 + ldf8 f46 = [rp], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f35 = [up], 8 + getf.sig r25 = f37 + br.cloop.dptk .Loop + br .Le0 + + +.Lb10: ldf8 f35 = [up], 8 + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt2 + + xma.l f38 = f7, f6, f8 + xma.hu f42 = f7, f6, f8 + ;; + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r30 = f42 + stf8 [r20] = f38, 8 + getf.sig r27 = f39 + getf.sig r8 = f43 + br .Lcj2 + +.grt2: + ldf8 f32 = [up], 8 + ldf8 f44 = [rp], 8 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f7, f6, f8 + ldf8 f45 = [rp], 8 + xma.hu f42 = f7, f6, f8 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f47 + ldf8 f46 = [rp], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f35 = [up], 8 + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt6 + + stf8 [r20] = f38, 8 + xma.l f36 = f32, f6, f44 + xma.hu f40 = f32, f6, f44 + ;; + getf.sig r30 = f42 + getf.sig r27 = f39 + xma.l f37 = f33, f6, f45 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 + getf.sig r24 = f36 + xma.l f38 = f34, f6, f46 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 + getf.sig r25 = f37 + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + br .Lcj6 + +.grt6: + mov r29 = 0 + xma.l f36 = f32, f6, f44 + xma.hu f40 = f32, f6, f44 + ;; + ldf8 f32 = [up], 8 + getf.sig r26 = f38 + ;; + getf.sig r30 = f42 + xma.l f37 = f33, f6, f45 + ldf8 f44 = [rp], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f33 = [up], 8 + getf.sig r27 = f39 + ;; + getf.sig r31 = 
f43 + xma.l f38 = f34, f6, f46 + ldf8 f45 = [rp], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f34 = [up], 8 + getf.sig r24 = f36 + br .LL10 + + +.Lb11: ldf8 f34 = [up], 8 + ldf8 f46 = [rp], 8 + ;; + ldf8 f35 = [up], 8 + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt3 + ;; + + xma.l f37 = f7, f6, f8 + xma.hu f41 = f7, f6, f8 + xma.l f38 = f34, f6, f46 + xma.hu f42 = f34, f6, f46 + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 + stf8 [r20] = f37, 8 + getf.sig r26 = f38 + getf.sig r30 = f42 + getf.sig r27 = f39 + getf.sig r8 = f43 + br .Lcj3 + +.grt3: + ldf8 f32 = [up], 8 + xma.l f37 = f7, f6, f8 + ldf8 f44 = [rp], 8 + xma.hu f41 = f7, f6, f8 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f46 + ldf8 f45 = [rp], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f47 + ldf8 f46 = [rp], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f35 = [up], 8 + getf.sig r25 = f37 C FIXME + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt7 + + getf.sig r29 = f41 + stf8 [r20] = f37, 8 C FIXME + xma.l f36 = f32, f6, f44 + getf.sig r26 = f38 + xma.hu f40 = f32, f6, f44 + ;; + getf.sig r30 = f42 + xma.l f37 = f33, f6, f45 + getf.sig r27 = f39 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 + xma.l f38 = f34, f6, f46 + getf.sig r24 = f36 + xma.hu f42 = f34, f6, f46 + br .Lcj7 + +.grt7: + getf.sig r29 = f41 + xma.l f36 = f32, f6, f44 + mov r28 = 0 + xma.hu f40 = f32, f6, f44 + ;; + ldf8 f32 = [up], 8 + getf.sig r26 = f38 + ;; + getf.sig r30 = f42 + xma.l f37 = f33, f6, f45 + ldf8 f44 = [rp], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f33 = [up], 8 + getf.sig r27 = f39 + br .LL11 + + +.Lb00: ldf8 f33 = [up], 8 + ldf8 f45 = [rp], 8 + ;; + ldf8 f34 = [up], 8 + ldf8 f46 = [rp], 8 + ;; + ldf8 f35 = [up], 8 + xma.l f36 = f7, f6, f8 + ldf8 f47 = [rp], 8 + xma.hu f40 = f7, f6, f8 + br.cloop.dptk .grt4 + + xma.l f37 = f33, f6, f45 + xma.hu f41 = f33, f6, f45 + xma.l f38 = f34, f6, f46 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 + stf8 [r20] = f36, 8 + xma.l f39 = f35, f6, f47 + getf.sig r25 = f37 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 + getf.sig r26 = f38 + getf.sig r30 = f42 + getf.sig r27 = f39 + br .Lcj4 + +.grt4: + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f45 + ldf8 f44 = [rp], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f46 + ldf8 f45 = [rp], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f34 = [up], 8 + getf.sig r24 = f36 C FIXME + xma.l f39 = f35, f6, f47 + ldf8 f46 = [rp], 8 + getf.sig r28 = f40 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f35 = [up], 8 + getf.sig r25 = f37 + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt8 + + getf.sig r29 = f41 + stf8 [r20] = f36, 8 C FIXME + xma.l f36 = f32, f6, f44 + getf.sig r26 = f38 + getf.sig r30 = f42 + xma.hu f40 = f32, f6, f44 + ;; + xma.l f37 = f33, f6, f45 + getf.sig r27 = f39 + xma.hu f41 = f33, f6, f45 + br .Lcj8 + +.grt8: + getf.sig r29 = f41 + xma.l f36 = f32, f6, f44 + mov r31 = 0 + xma.hu f40 = f32, f6, f44 + ;; + ldf8 f32 = [up], 8 + getf.sig r26 = f38 + br .LL00 + + +C *** MAIN LOOP START *** + ALIGN(32) C insn fed cycle # +.Loop: + .pred.rel "mutex", p6, p7 C num by i1 i2 + getf.sig r29 = f41 C 00 16 0 0 + xma.l f36 = f32, f6, f44 C 01 06,15 0 0 + (p6) add r14 = r30, r27, 1 C 02 0 0 + ldf8 f47 = [rp], 8 C 03 0 0 + xma.hu f40 = f32, f6, f44 C 04 06,15 0 0 + (p7) add r14 = r30, r27 C 05 0 0 + ;; + .pred.rel "mutex", p6, p7 + ldf8 f32 = [up], 8 C 06 1 1 + (p6) cmp.leu p8, p9 = r14, r27 C 07 1 1 + (p7) cmp.ltu p8, p9 = r14, r27 C 08 1 1 + getf.sig r26 = f38 C 09 25 2 1 + st8 
[r20] = r14, 8 C 10 2 1 + nop.b 0 C 11 2 1 + ;; +.LL00: + .pred.rel "mutex", p8, p9 + getf.sig r30 = f42 C 12 28 3 2 + xma.l f37 = f33, f6, f45 C 13 18,27 3 2 + (p8) add r16 = r31, r24, 1 C 14 3 2 + ldf8 f44 = [rp], 8 C 15 3 2 + xma.hu f41 = f33, f6, f45 C 16 18,27 3 2 + (p9) add r16 = r31, r24 C 17 3 2 + ;; + .pred.rel "mutex", p8, p9 + ldf8 f33 = [up], 8 C 18 4 3 + (p8) cmp.leu p6, p7 = r16, r24 C 19 4 3 + (p9) cmp.ltu p6, p7 = r16, r24 C 20 4 3 + getf.sig r27 = f39 C 21 37 5 3 + st8 [r20] = r16, 8 C 22 5 3 + nop.b 0 C 23 5 3 + ;; +.LL11: + .pred.rel "mutex", p6, p7 + getf.sig r31 = f43 C 24 40 6 4 + xma.l f38 = f34, f6, f46 C 25 30,39 6 4 + (p6) add r14 = r28, r25, 1 C 26 6 4 + ldf8 f45 = [rp], 8 C 27 6 4 + xma.hu f42 = f34, f6, f46 C 28 30,39 6 4 + (p7) add r14 = r28, r25 C 29 6 4 + ;; + .pred.rel "mutex", p6, p7 + ldf8 f34 = [up], 8 C 30 7 5 + (p6) cmp.leu p8, p9 = r14, r25 C 31 7 5 + (p7) cmp.ltu p8, p9 = r14, r25 C 32 7 5 + getf.sig r24 = f36 C 33 01 8 5 + st8 [r20] = r14, 8 C 34 8 5 + nop.b 0 C 35 8 5 + ;; +.LL10: + .pred.rel "mutex", p8, p9 + getf.sig r28 = f40 C 36 04 9 6 + xma.l f39 = f35, f6, f47 C 37 42,03 9 6 + (p8) add r16 = r29, r26, 1 C 38 9 6 + ldf8 f46 = [rp], 8 C 39 9 6 + xma.hu f43 = f35, f6, f47 C 40 42,03 9 6 + (p9) add r16 = r29, r26 C 41 9 6 + ;; + .pred.rel "mutex", p8, p9 + ldf8 f35 = [up], 8 C 42 10 7 + (p8) cmp.leu p6, p7 = r16, r26 C 43 10 7 + (p9) cmp.ltu p6, p7 = r16, r26 C 44 10 7 + getf.sig r25 = f37 C 45 13 11 7 + st8 [r20] = r16, 8 C 46 11 7 + br.cloop.dptk .Loop C 47 11 7 +C *** MAIN LOOP END *** + ;; +.Le0: + .pred.rel "mutex", p6, p7 + getf.sig r29 = f41 C + xma.l f36 = f32, f6, f44 C + (p6) add r14 = r30, r27, 1 C + ldf8 f47 = [rp], 8 C + xma.hu f40 = f32, f6, f44 C + (p7) add r14 = r30, r27 C + ;; + .pred.rel "mutex", p6, p7 + (p6) cmp.leu p8, p9 = r14, r27 C + (p7) cmp.ltu p8, p9 = r14, r27 C + getf.sig r26 = f38 C + st8 [r20] = r14, 8 C + ;; + .pred.rel "mutex", p8, p9 + getf.sig r30 = f42 C + xma.l f37 = f33, f6, f45 C + (p8) add r16 = r31, r24, 1 C + xma.hu f41 = f33, f6, f45 C + (p9) add r16 = r31, r24 C + ;; + .pred.rel "mutex", p8, p9 + (p8) cmp.leu p6, p7 = r16, r24 C + (p9) cmp.ltu p6, p7 = r16, r24 C + getf.sig r27 = f39 C + st8 [r20] = r16, 8 C + ;; +.Lcj8: + .pred.rel "mutex", p6, p7 + getf.sig r31 = f43 C + xma.l f38 = f34, f6, f46 C + (p6) add r14 = r28, r25, 1 C + xma.hu f42 = f34, f6, f46 C + (p7) add r14 = r28, r25 C + ;; + .pred.rel "mutex", p6, p7 + (p6) cmp.leu p8, p9 = r14, r25 C + (p7) cmp.ltu p8, p9 = r14, r25 C + getf.sig r24 = f36 C + st8 [r20] = r14, 8 C + ;; +.Lcj7: + .pred.rel "mutex", p8, p9 + getf.sig r28 = f40 C + xma.l f39 = f35, f6, f47 C + (p8) add r16 = r29, r26, 1 C + xma.hu f43 = f35, f6, f47 C + (p9) add r16 = r29, r26 C + ;; + .pred.rel "mutex", p8, p9 + (p8) cmp.leu p6, p7 = r16, r26 C + (p9) cmp.ltu p6, p7 = r16, r26 C + getf.sig r25 = f37 C + st8 [r20] = r16, 8 C + ;; +.Lcj6: + .pred.rel "mutex", p6, p7 + getf.sig r29 = f41 C + (p6) add r14 = r30, r27, 1 C + (p7) add r14 = r30, r27 C + ;; + .pred.rel "mutex", p6, p7 + (p6) cmp.leu p8, p9 = r14, r27 C + (p7) cmp.ltu p8, p9 = r14, r27 C + getf.sig r26 = f38 C + st8 [r20] = r14, 8 C + ;; +.Lcj5: + .pred.rel "mutex", p8, p9 + getf.sig r30 = f42 C + (p8) add r16 = r31, r24, 1 C + (p9) add r16 = r31, r24 C + ;; + .pred.rel "mutex", p8, p9 + (p8) cmp.leu p6, p7 = r16, r24 C + (p9) cmp.ltu p6, p7 = r16, r24 C + getf.sig r27 = f39 C + st8 [r20] = r16, 8 C + ;; +.Lcj4: + .pred.rel "mutex", p6, p7 + getf.sig r8 = f43 C + (p6) add r14 = r28, r25, 1 C + (p7) add r14 = 
r28, r25 C + ;; + .pred.rel "mutex", p6, p7 + st8 [r20] = r14, 8 C + (p6) cmp.leu p8, p9 = r14, r25 C + (p7) cmp.ltu p8, p9 = r14, r25 C + ;; +.Lcj3: + .pred.rel "mutex", p8, p9 + (p8) add r16 = r29, r26, 1 C + (p9) add r16 = r29, r26 C + ;; + .pred.rel "mutex", p8, p9 + st8 [r20] = r16, 8 C + (p8) cmp.leu p6, p7 = r16, r26 C + (p9) cmp.ltu p6, p7 = r16, r26 C + ;; +.Lcj2: + .pred.rel "mutex", p6, p7 + (p6) add r14 = r30, r27, 1 C + (p7) add r14 = r30, r27 C + ;; + .pred.rel "mutex", p6, p7 + st8 [r20] = r14 C + (p6) cmp.leu p8, p9 = r14, r27 C + (p7) cmp.ltu p8, p9 = r14, r27 C + ;; + (p8) add r8 = 1, r8 C M I + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/addmul_2.asm b/gmp-6.3.0/mpn/ia64/addmul_2.asm new file mode 100644 index 0000000..86e8de4 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/addmul_2.asm @@ -0,0 +1,715 @@ +dnl IA-64 mpn_addmul_2 -- Multiply a n-limb number with a 2-limb number and +dnl add the result to a (n+1)-limb number. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2004, 2005, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 3.65 +C Itanium 2: 1.625 + +C TODO +C * Clean up variable names, and try to decrease the number of distinct +C registers used. +C * Clean up feed-in code to not require zeroing several registers. +C * Make sure we don't depend on uninitialised predicate registers. +C * Could perhaps save a few cycles by using 1 c/l carry propagation in +C wind-down code. +C * Ultimately rewrite. The problem with this code is that it first uses a +C loaded u value in one xma pair, then leaves it live over several unrelated +C xma pairs, before it uses it again. It should actually be quite possible +C to just swap some aligned xma pairs around. But we should then schedule +C u loads further from the first use. 
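+
+C For reference, the operation is equivalent to two chained mpn_addmul_1
+C calls (an illustrative C sketch; per the header comment above, {rp,n+1}
+C is the addend, and the final carry is assumed to fit in one limb):
+C
+C	mp_limb_t
+C	ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C	{
+C	  mp_limb_t c0, c1, t;
+C	  c0 = mpn_addmul_1 (rp, up, n, vp[0]);	    C vp[0] row
+C	  t = rp[n] + c0;			    C fold c0 into rp[n]
+C	  rp[n] = t;
+C	  c1 = mpn_addmul_1 (rp + 1, up, n, vp[1]); C vp[1] row
+C	  return c1 + (t < c0);			    C carry out of rp[n]
+C	}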
+ +C INPUT PARAMETERS +define(`rp',`r32') +define(`up',`r33') +define(`n',`r34') +define(`vp',`r35') + +define(`srp',`r3') + +define(`v0',`f6') +define(`v1',`f7') + +define(`s0',`r14') +define(`acc0',`r15') + +define(`pr0_0',`r16') define(`pr0_1',`r17') +define(`pr0_2',`r18') define(`pr0_3',`r19') + +define(`pr1_0',`r20') define(`pr1_1',`r21') +define(`pr1_2',`r22') define(`pr1_3',`r23') + +define(`acc1_0',`r24') define(`acc1_1',`r25') +define(`acc1_2',`r26') define(`acc1_3',`r27') + +dnl define(`',`r28') +dnl define(`',`r29') +dnl define(`',`r30') +dnl define(`',`r31') + +define(`fp0b_0',`f8') define(`fp0b_1',`f9') +define(`fp0b_2',`f10') define(`fp0b_3',`f11') + +define(`fp1a_0',`f12') define(`fp1a_1',`f13') +define(`fp1a_2',`f14') define(`fp1a_3',`f15') + +define(`fp1b_0',`f32') define(`fp1b_1',`f33') +define(`fp1b_2',`f34') define(`fp1b_3',`f35') + +define(`fp2a_0',`f36') define(`fp2a_1',`f37') +define(`fp2a_2',`f38') define(`fp2a_3',`f39') + +define(`r_0',`f40') define(`r_1',`f41') +define(`r_2',`f42') define(`r_3',`f43') + +define(`u_0',`f44') define(`u_1',`f45') +define(`u_2',`f46') define(`u_3',`f47') + +define(`rx',`f48') +define(`ux',`f49') +define(`ry',`f50') +define(`uy',`f51') + +ASM_START() +PROLOGUE(mpn_addmul_2s) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32',` + {.mmi; addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I +}{.mmi; nop 1 + nop 1 + zxt4 n = n C I + ;; +}') + + {.mmi; ldf8 ux = [up], 8 C M + ldf8 v0 = [vp], 8 C M + mov r2 = ar.lc C I0 +}{.mmi; ldf8 rx = [rp], 8 C M + and r14 = 3, n C M I + add n = -2, n C M I + ;; +}{.mmi; ldf8 uy = [up], 8 C M + ldf8 v1 = [vp] C M + shr.u n = n, 2 C I0 +}{.mmi; ldf8 ry = [rp], -8 C M + cmp.eq p14, p0 = 1, r14 C M I + cmp.eq p11, p0 = 2, r14 C M I + ;; +}{.mmi; add srp = 16, rp C M I + cmp.eq p15, p0 = 3, r14 C M I + mov ar.lc = n C I0 +}{.bbb; (p14) br.dptk L(x01) C B + (p11) br.dptk L(x10) C B + (p15) br.dptk L(x11) C B + ;; +} +L(x00): cmp.ne p6, p0 = r0, r0 C suppress initial xma pair + mov fp2a_3 = f0 + br L(b00) +L(x01): cmp.ne p14, p0 = r0, r0 C suppress initial xma pair + mov fp2a_2 = f0 + br L(b01) +L(x10): cmp.ne p11, p0 = r0, r0 C suppress initial xma pair + mov fp2a_1 = f0 + br L(b10) +L(x11): cmp.ne p15, p0 = r0, r0 C suppress initial xma pair + mov fp2a_0 = f0 + br L(b11) + +EPILOGUE() + +PROLOGUE(mpn_addmul_2) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32',` + {.mmi; addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I +}{.mmi; nop 1 + nop 1 + zxt4 n = n C I + ;; +}') + + {.mmi; ldf8 ux = [up], 8 C M + ldf8 v0 = [vp], 8 C M + mov r2 = ar.lc C I0 +}{.mmi; ldf8 rx = [rp], 8 C M + and r14 = 3, n C M I + add n = -2, n C M I + ;; +}{.mmi; ldf8 uy = [up], 8 C M + ldf8 v1 = [vp] C M + shr.u n = n, 2 C I0 +}{.mmi; ldf8 ry = [rp], -8 C M + cmp.eq p14, p0 = 1, r14 C M I + cmp.eq p11, p0 = 2, r14 C M I + ;; +}{.mmi; add srp = 16, rp C M I + cmp.eq p15, p6 = 3, r14 C M I + mov ar.lc = n C I0 +}{.bbb; (p14) br.dptk L(b01) C B + (p11) br.dptk L(b10) C B + (p15) br.dptk L(b11) C B + ;; +} + ALIGN(32) +L(b00): + {.mmi; ldf8 r_1 = [srp], 8 + ldf8 u_1 = [up], 8 + mov acc1_2 = 0 +}{.mmi; mov pr1_2 = 0 + mov pr0_3 = 0 + cmp.ne p8, p9 = r0, r0 + ;; +}{.mfi; ldf8 r_2 = [srp], 8 + xma.l fp0b_3 = ux, v0, rx + cmp.ne p12, p13 = r0, r0 +}{.mfb; ldf8 u_2 = [up], 8 + xma.hu fp1b_3 = ux, v0, rx + br.cloop.dptk L(gt4) +} + xma.l fp0b_0 = uy, v0, ry + xma.hu fp1a_0 = uy, v0, ry + ;; + getfsig acc0 = fp0b_3 + (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s + 
(p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s + ;; + xma.l fp0b_1 = u_1, v0, r_1 + xma.hu fp1a_1 = u_1, v0, r_1 + ;; + getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = uy, v1, fp1a_0 + xma.hu fp2a_0 = uy, v1, fp1a_0 + ;; + getfsig pr1_3 = fp1b_3 + getfsig acc1_3 = fp2a_3 + xma.l fp0b_2 = u_2, v0, r_2 + xma.hu fp1a_2 = u_2, v0, r_2 + br L(cj4) + +L(gt4): xma.l fp0b_0 = uy, v0, ry + xma.hu fp1a_0 = uy, v0, ry + ;; + ldf8 r_3 = [srp], 8 + getfsig acc0 = fp0b_3 + (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s + ldf8 u_3 = [up], 8 + (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s + ;; + xma.l fp0b_1 = u_1, v0, r_1 + xma.hu fp1a_1 = u_1, v0, r_1 + ;; + ldf8 r_0 = [srp], 8 + getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = uy, v1, fp1a_0 + xma.hu fp2a_0 = uy, v1, fp1a_0 + ;; + ldf8 u_0 = [up], 8 + getfsig pr1_3 = fp1b_3 + xma.l fp0b_2 = u_2, v0, r_2 + ;; + getfsig acc1_3 = fp2a_3 + xma.hu fp1a_2 = u_2, v0, r_2 + br L(00) + + + ALIGN(32) +L(b01): + {.mmi; ldf8 r_0 = [srp], 8 C M + ldf8 u_0 = [up], 8 C M + mov acc1_1 = 0 C M I +}{.mmi; mov pr1_1 = 0 C M I + mov pr0_2 = 0 C M I + cmp.ne p6, p7 = r0, r0 C M I + ;; +}{.mfi; ldf8 r_1 = [srp], 8 C M + xma.l fp0b_2 = ux, v0, rx C F + cmp.ne p10, p11 = r0, r0 C M I +}{.mfi; ldf8 u_1 = [up], 8 C M + xma.hu fp1b_2 = ux, v0, rx C F + nop 1 + ;; +} xma.l fp0b_3 = uy, v0, ry C F + xma.hu fp1a_3 = uy, v0, ry C F + ;; + {.mmf; getfsig acc0 = fp0b_2 C M + ldf8 r_2 = [srp], 8 C M + (p14) xma.hu fp2a_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s +}{.mfb; ldf8 u_2 = [up], 8 C M + (p14) xma.l fp1b_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s + br.cloop.dptk L(gt5) +} + xma.l fp0b_0 = u_0, v0, r_0 C F + xma.hu fp1a_0 = u_0, v0, r_0 C F + ;; + getfsig pr0_3 = fp0b_3 C M + xma.l fp1b_3 = uy, v1,fp1a_3 C F + xma.hu fp2a_3 = uy, v1,fp1a_3 C F + ;; + getfsig pr1_2 = fp1b_2 C M + getfsig acc1_2 = fp2a_2 C M + xma.l fp0b_1 = u_1, v0, r_1 C F + xma.hu fp1a_1 = u_1, v0, r_1 C F + br L(cj5) + +L(gt5): xma.l fp0b_0 = u_0, v0, r_0 + xma.hu fp1a_0 = u_0, v0, r_0 + ;; + getfsig pr0_3 = fp0b_3 + ldf8 r_3 = [srp], 8 + xma.l fp1b_3 = uy, v1, fp1a_3 + xma.hu fp2a_3 = uy, v1, fp1a_3 + ;; + ldf8 u_3 = [up], 8 + getfsig pr1_2 = fp1b_2 + xma.l fp0b_1 = u_1, v0, r_1 + ;; + getfsig acc1_2 = fp2a_2 + xma.hu fp1a_1 = u_1, v0, r_1 + br L(01) + + + ALIGN(32) +L(b10): br.cloop.dptk L(gt2) + xma.l fp0b_1 = ux, v0, rx + xma.hu fp1b_1 = ux, v0, rx + ;; + xma.l fp0b_2 = uy, v0, ry + xma.hu fp1a_2 = uy, v0, ry + ;; + stf8 [rp] = fp0b_1, 8 + (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s + (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s + ;; + getfsig acc0 = fp0b_2 + xma.l fp1b_2 = uy, v1, fp1a_2 + xma.hu fp2a_2 = uy, v1, fp1a_2 + ;; + getfsig pr1_1 = fp1b_1 + getfsig acc1_1 = fp2a_1 + mov ar.lc = r2 + getfsig pr1_2 = fp1b_2 + getfsig r8 = fp2a_2 + ;; + add s0 = pr1_1, acc0 + ;; + st8 [rp] = s0, 8 + cmp.ltu p8, p9 = s0, pr1_1 + sub r31 = -1, acc1_1 + ;; + .pred.rel "mutex", p8, p9 + (p8) add acc0 = pr1_2, acc1_1, 1 + (p9) add acc0 = pr1_2, acc1_1 + (p8) cmp.leu p10, p0 = r31, pr1_2 + (p9) cmp.ltu p10, p0 = r31, pr1_2 + ;; + st8 [rp] = acc0, 8 + (p10) add r8 = 1, r8 + br.ret.sptk.many b0 + + +L(gt2): + {.mmi; ldf8 r_3 = [srp], 8 + ldf8 u_3 = [up], 8 + mov acc1_0 = 0 + ;; +}{.mfi; ldf8 r_0 = [srp], 8 + xma.l fp0b_1 = ux, v0, rx + mov pr1_0 = 0 +}{.mfi; ldf8 u_0 = [up], 8 + xma.hu fp1b_1 = ux, v0, rx + mov pr0_1 = 0 + ;; +} xma.l fp0b_2 = uy, v0, ry + xma.hu fp1a_2 = uy, v0, ry + ;; + getfsig acc0 = fp0b_1 + ldf8 r_1 = [srp], 8 + (p11) xma.hu 
fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s + (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s + ;; + ldf8 u_1 = [up], 8 + xma.l fp0b_3 = u_3, v0, r_3 + xma.hu fp1a_3 = u_3, v0, r_3 + ;; + getfsig pr0_2 = fp0b_2 + ldf8 r_2 = [srp], 8 + xma.l fp1b_2 = uy, v1, fp1a_2 + xma.hu fp2a_2 = uy, v1, fp1a_2 + ;; + ldf8 u_2 = [up], 8 + getfsig pr1_1 = fp1b_1 + ;; + {.mfi; getfsig acc1_1 = fp2a_1 + xma.l fp0b_0 = u_0, v0, r_0 + cmp.ne p8, p9 = r0, r0 +}{.mfb; cmp.ne p12, p13 = r0, r0 + xma.hu fp1a_0 = u_0, v0, r_0 + br.cloop.sptk.clr L(top) +} + br.many L(end) + + + ALIGN(32) +L(b11): ldf8 r_2 = [srp], 8 + mov pr1_3 = 0 + mov pr0_0 = 0 + ;; + ldf8 u_2 = [up], 8 + mov acc1_3 = 0 + br.cloop.dptk L(gt3) + ;; + cmp.ne p6, p7 = r0, r0 + xma.l fp0b_0 = ux, v0, rx + xma.hu fp1b_0 = ux, v0, rx + ;; + cmp.ne p10, p11 = r0, r0 + xma.l fp0b_1 = uy, v0, ry + xma.hu fp1a_1 = uy, v0, ry + ;; + getfsig acc0 = fp0b_0 + (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s + (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s + ;; + xma.l fp0b_2 = uy, v1, r_2 + xma.hu fp1a_2 = uy, v1, r_2 + ;; + getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = u_2, v0, fp1a_1 + xma.hu fp2a_1 = u_2, v0, fp1a_1 + ;; + getfsig pr1_0 = fp1b_0 + getfsig acc1_0 = fp2a_0 + br L(cj3) + +L(gt3): ldf8 r_3 = [srp], 8 + xma.l fp0b_0 = ux, v0, rx + cmp.ne p10, p11 = r0, r0 + ldf8 u_3 = [up], 8 + xma.hu fp1b_0 = ux, v0, rx + cmp.ne p6, p7 = r0, r0 + ;; + xma.l fp0b_1 = uy, v0, ry + xma.hu fp1a_1 = uy, v0, ry + ;; + getfsig acc0 = fp0b_0 + ldf8 r_0 = [srp], 8 + (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s + ldf8 u_0 = [up], 8 + (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s + ;; + xma.l fp0b_2 = u_2, v0, r_2 + xma.hu fp1a_2 = u_2, v0, r_2 + ;; + getfsig pr0_1 = fp0b_1 + ldf8 r_1 = [srp], 8 + xma.l fp1b_1 = uy, v1, fp1a_1 + xma.hu fp2a_1 = uy, v1, fp1a_1 + ;; + ldf8 u_1 = [up], 8 + getfsig pr1_0 = fp1b_0 + ;; + getfsig acc1_0 = fp2a_0 + xma.l fp0b_3 = u_3, v0, r_3 + xma.hu fp1a_3 = u_3, v0, r_3 + br L(11) + + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): C 00 + .pred.rel "mutex", p12, p13 + getfsig pr0_3 = fp0b_3 + ldf8 r_3 = [srp], 8 + xma.l fp1b_3 = u_3, v1, fp1a_3 + (p12) add s0 = pr1_0, acc0, 1 + (p13) add s0 = pr1_0, acc0 + xma.hu fp2a_3 = u_3, v1, fp1a_3 + ;; C 01 + .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + ldf8 u_3 = [up], 8 + getfsig pr1_2 = fp1b_2 + (p8) cmp.leu p6, p7 = acc0, pr0_1 + (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; C 02 + .pred.rel "mutex", p6, p7 + getfsig acc1_2 = fp2a_2 + st8 [rp] = s0, 8 + xma.l fp0b_1 = u_1, v0, r_1 + (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + xma.hu fp1a_1 = u_1, v0, r_1 + ;; C 03 +L(01): + .pred.rel "mutex", p10, p11 + getfsig pr0_0 = fp0b_0 + ldf8 r_0 = [srp], 8 + xma.l fp1b_0 = u_0, v1, fp1a_0 + (p10) add s0 = pr1_1, acc0, 1 + (p11) add s0 = pr1_1, acc0 + xma.hu fp2a_0 = u_0, v1, fp1a_0 + ;; C 04 + .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + ldf8 u_0 = [up], 8 + getfsig pr1_3 = fp1b_3 + (p6) cmp.leu p8, p9 = acc0, pr0_2 + (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; C 05 + .pred.rel "mutex", p8, p9 + getfsig acc1_3 = fp2a_3 + st8 [rp] = s0, 8 + xma.l fp0b_2 = u_2, v0, r_2 + (p8) add acc0 = pr0_3, acc1_1, 1 + (p9) add acc0 = pr0_3, acc1_1 + xma.hu fp1a_2 = u_2, v0, r_2 + ;; C 06 +L(00): + .pred.rel "mutex", p12, p13 + getfsig pr0_1 = fp0b_1 + ldf8 
r_1 = [srp], 8 + xma.l fp1b_1 = u_1, v1, fp1a_1 + (p12) add s0 = pr1_2, acc0, 1 + (p13) add s0 = pr1_2, acc0 + xma.hu fp2a_1 = u_1, v1, fp1a_1 + ;; C 07 + .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + ldf8 u_1 = [up], 8 + getfsig pr1_0 = fp1b_0 + (p8) cmp.leu p6, p7 = acc0, pr0_3 + (p9) cmp.ltu p6, p7 = acc0, pr0_3 + (p12) cmp.leu p10, p11 = s0, pr1_2 + (p13) cmp.ltu p10, p11 = s0, pr1_2 + ;; C 08 + .pred.rel "mutex", p6, p7 + getfsig acc1_0 = fp2a_0 + st8 [rp] = s0, 8 + xma.l fp0b_3 = u_3, v0, r_3 + (p6) add acc0 = pr0_0, acc1_2, 1 + (p7) add acc0 = pr0_0, acc1_2 + xma.hu fp1a_3 = u_3, v0, r_3 + ;; C 09 +L(11): + .pred.rel "mutex", p10, p11 + getfsig pr0_2 = fp0b_2 + ldf8 r_2 = [srp], 8 + xma.l fp1b_2 = u_2, v1, fp1a_2 + (p10) add s0 = pr1_3, acc0, 1 + (p11) add s0 = pr1_3, acc0 + xma.hu fp2a_2 = u_2, v1, fp1a_2 + ;; C 10 + .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + ldf8 u_2 = [up], 8 + getfsig pr1_1 = fp1b_1 + (p6) cmp.leu p8, p9 = acc0, pr0_0 + (p7) cmp.ltu p8, p9 = acc0, pr0_0 + (p10) cmp.leu p12, p13 = s0, pr1_3 + (p11) cmp.ltu p12, p13 = s0, pr1_3 + ;; C 11 + .pred.rel "mutex", p8, p9 + getfsig acc1_1 = fp2a_1 + st8 [rp] = s0, 8 + xma.l fp0b_0 = u_0, v0, r_0 + (p8) add acc0 = pr0_1, acc1_3, 1 + (p9) add acc0 = pr0_1, acc1_3 + xma.hu fp1a_0 = u_0, v0, r_0 +L(10): br.cloop.sptk.clr L(top) C 12 + ;; +C *** MAIN LOOP END *** +L(end): + .pred.rel "mutex", p12, p13 + {.mfi; getfsig pr0_3 = fp0b_3 + xma.l fp1b_3 = u_3, v1, fp1a_3 + (p12) add s0 = pr1_0, acc0, 1 +}{.mfi; (p13) add s0 = pr1_0, acc0 + xma.hu fp2a_3 = u_3, v1, fp1a_3 + nop 1 + ;; +} .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_2 = fp1b_2 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_1 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; +} .pred.rel "mutex", p6, p7 + {.mfi; getfsig acc1_2 = fp2a_2 + xma.l fp0b_1 = u_1, v0, r_1 + nop 1 +}{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + xma.hu fp1a_1 = u_1, v0, r_1 + ;; +} +L(cj5): + .pred.rel "mutex", p10, p11 + {.mfi; getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = u_0, v1, fp1a_0 + (p10) add s0 = pr1_1, acc0, 1 +}{.mfi; (p11) add s0 = pr1_1, acc0 + xma.hu fp2a_0 = u_0, v1, fp1a_0 + nop 1 + ;; +} .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + {.mmi; getfsig pr1_3 = fp1b_3 + st8 [rp] = s0, 8 + (p6) cmp.leu p8, p9 = acc0, pr0_2 +}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mfi; getfsig acc1_3 = fp2a_3 + xma.l fp0b_2 = u_2, v0, r_2 + nop 1 +}{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1 + (p9) add acc0 = pr0_3, acc1_1 + xma.hu fp1a_2 = u_2, v0, r_2 + ;; +} +L(cj4): + .pred.rel "mutex", p12, p13 + {.mfi; getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = u_1, v1, fp1a_1 + (p12) add s0 = pr1_2, acc0, 1 +}{.mfi; (p13) add s0 = pr1_2, acc0 + xma.hu fp2a_1 = u_1, v1, fp1a_1 + nop 1 + ;; +} .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_0 = fp1b_0 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_3 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 + (p12) cmp.leu p10, p11 = s0, pr1_2 + (p13) cmp.ltu p10, p11 = s0, pr1_2 + ;; +} .pred.rel "mutex", p6, p7 + {.mmi; getfsig acc1_0 = fp2a_0 + (p6) add acc0 = pr0_0, acc1_2, 1 + (p7) add acc0 = pr0_0, acc1_2 + ;; +} +L(cj3): + .pred.rel "mutex", p10, p11 + {.mfi; getfsig pr0_2 = fp0b_2 + xma.l fp1b_2 = u_2, v1, fp1a_2 + (p10) add s0 = pr1_3, acc0, 1 +}{.mfi; (p11) add 
s0 = pr1_3, acc0 + xma.hu fp2a_2 = u_2, v1, fp1a_2 + nop 1 + ;; +} .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + {.mmi; getfsig pr1_1 = fp1b_1 + st8 [rp] = s0, 8 + (p6) cmp.leu p8, p9 = acc0, pr0_0 +}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 + (p10) cmp.leu p12, p13 = s0, pr1_3 + (p11) cmp.ltu p12, p13 = s0, pr1_3 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; getfsig acc1_1 = fp2a_1 + (p8) add acc0 = pr0_1, acc1_3, 1 + (p9) add acc0 = pr0_1, acc1_3 + ;; +} .pred.rel "mutex", p12, p13 + {.mmi; (p12) add s0 = pr1_0, acc0, 1 + (p13) add s0 = pr1_0, acc0 + nop 1 + ;; +} .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_2 = fp1b_2 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_1 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; +} .pred.rel "mutex", p6, p7 + {.mmi; getfsig r8 = fp2a_2 + (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + ;; +} .pred.rel "mutex", p10, p11 + {.mmi; (p10) add s0 = pr1_1, acc0, 1 + (p11) add s0 = pr1_1, acc0 + (p6) cmp.leu p8, p9 = acc0, pr0_2 + ;; +} .pred.rel "mutex", p10, p11 + {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; st8 [rp] = s0, 8 + (p8) add acc0 = pr1_2, acc1_1, 1 + (p9) add acc0 = pr1_2, acc1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 + (p9) cmp.ltu p10, p11 = acc0, pr1_2 + (p12) add acc0 = 1, acc0 + ;; +}{.mmi; st8 [rp] = acc0, 8 + (p12) cmpeqor p10, p0 = 0, acc0 + nop 1 + ;; +}{.mib; (p10) add r8 = 1, r8 + mov ar.lc = r2 + br.ret.sptk.many b0 +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/aors_n.asm b/gmp-6.3.0/mpn/ia64/aors_n.asm new file mode 100644 index 0000000..7705ce6 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/aors_n.asm @@ -0,0 +1,852 @@ +dnl IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2.67 +C Itanium 2: 1.25 + +C TODO +C * Consider using special code for small n, using something like +C "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code. +C * The non-nc code was trimmed cycle for cycle to its current state. It is +C probably hard to save more that an odd cycle there. 
The nc code is much +C cruder (since tune/speed doesn't have any applicable direct measurements). +C * Without the nc entry points, this becomes around 1800 bytes of object +C code; the nc code adds over 1000 bytes. We should perhaps sacrifice a +C few cycles for the non-nc code and let it fall into the nc code. + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`vp', `r34') +define(`n', `r35') +define(`cy', `r36') + +ifdef(`OPERATION_add_n',` + define(ADDSUB, add) + define(CND, ltu) + define(INCR, 1) + define(LIM, -1) + define(LIM2, 0) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) +') +ifdef(`OPERATION_sub_n',` + define(ADDSUB, sub) + define(CND, gtu) + define(INCR, -1) + define(LIM, 0) + define(LIM2, -1) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) +') + +define(PFDIST, 500) + +C Some useful aliases for registers we use +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17') +define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27') +define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31') +define(`rpx',`r3') +define(`upadv',`r20') define(`vpadv',`r21') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + nop.i 0 + addp4 vp = 0, vp C M I + nop.m 0 + zxt4 n = n C I + ;; +') + + {.mmi; ld8 r11 = [vp], 8 C M01 + ld8 r10 = [up], 8 C M01 + mov r2 = ar.lc C I0 +}{.mmi; and r14 = 7, n C M I + cmp.lt p15, p14 = 8, n C M I + add n = -6, n C M I + ;; +}{.mmi; add upadv = PFDIST, up C Merging these lines into the feed-in + add vpadv = PFDIST, vp C code could save a cycle per call at + mov r23 = cy C the expense of code size. 
+ ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb; (p6) br.dptk .Lc001 C B + (p7) br.dptk .Lc010 C B + (p8) br.dptk .Lc011 C B + ;; +}{.mmi; cmp.eq p9, p0 = 4, r14 C M I + cmp.eq p10, p0 = 5, r14 C M I + cmp.eq p11, p0 = 6, r14 C M I +}{.bbb; (p9) br.dptk .Lc100 C B + (p10) br.dptk .Lc101 C B + (p11) br.dptk .Lc110 C B + ;; +}{.mmi; ld8 r19 = [vp], 8 C M01 + ld8 r18 = [up], 8 C M01 + cmp.ne p13, p0 = 0, cy C copy cy to p13 M I +}{.mmb; cmp.eq p12, p0 = 7, r14 C M I + nop 0 + (p12) br.dptk .Lc111 C B + ;; +} + +.Lc000: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add vpadv = PFDIST, vp C M I + ld8 v0 = [vp], 8 C M01 + mov ar.lc = n C I0 +}{.mmi; ld8 u0 = [up], 8 C M01 + ADDSUB w1 = r10, r11 C M I + nop 0 + ;; +}{.mmi; add upadv = PFDIST, up C M I + ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, r10 C M I +}{.mmi; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r18, r19 C M I + add rpx = 8, rp C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, r18 C M I + (p13) cmpeqor p7, p0 = LIM, w1 C M I +}{.mmi; ld8 u2 = [up], 8 C M01 + (p13) add w1 = INCR, w1 C M I + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m0) +} + +.Lc001: + {.mmi; (p15) ld8 v1 = [vp], 8 C M01 + (p15) ld8 u1 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I +}{.mmb; nop 0 + nop 0 + (p15) br L(0) + ;; +}{.mmi; cmp.ne p9, p0 = 0, r23 C M I + mov r8 = 0 + cmp.CND p6, p0 = w0, r10 C M I + ;; +}{.mmb; (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + br L(cj1) C B +} +L(0): + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + mov ar.lc = n C I0 +}{.mmi; nop 0 + cmp.ne p9, p0 = 0, r23 C M I + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I + add rpx = 16, rp C M I +}{.mmb; ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + br L(c1) C B +} + +.Lc010: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov r8 = 0 C M I +}{.mmb; ADDSUB w3 = r10, r11 C M I + cmp.ne p8, p0 = 0, r23 C M I + (p15) br L(1) C B + ;; +}{.mmi; cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I + (p8) add w3 = INCR, w3 C M I + ;; +}{.mmb; cmp.CND p6, p0 = w0, u0 C M I + (p8) cmpeqor p9, p0 = LIM2, w3 C M I + br L(cj2) C B +} +L(1): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + cmp.CND p9, p0 = w3, r10 C M I + ;; +}{.mmi; (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I +}{.mmb; add rpx = 24, rp C M I + nop 0 + br L(m23) C B +} + +.Lc011: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 3 C I0 +}{.mmi; ADDSUB w2 = r10, r11 C M I + cmp.ne p7, p0 = 0, r23 C M I + nop 0 + ;; +}{.mmb; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + (p15) br L(2) C B +}{.mmi; cmp.CND p8, p0 = w2, r10 C M I + ADDSUB w3 = u3, v3 C M I + nop 0 + ;; +}{.mmb; (p7) cmpeqor p8, p0 = LIM, w2 C M I + (p7) add w2 = INCR, w2 C M I + br L(cj3) C B +} +L(2): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I + ;; 
+}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u3 = [up], 8 C M01 + (p7) cmpeqor p8, p0 = LIM, w2 C M I + (p7) add w2 = INCR, w2 C M I + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m23) +} + +.Lc100: + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 3 C I0 +}{.mmi; ADDSUB w1 = r10, r11 C M I + nop 0 + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + add rpx = 8, rp C M I +}{.mmi; cmp.ne p6, p0 = 0, r23 C M I + cmp.CND p7, p0 = w1, r10 C M I + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + ADDSUB w2 = u2, v2 C M I +}{.mmb; (p6) cmpeqor p7, p0 = LIM, w1 C M I + (p6) add w1 = INCR, w1 C M I + (p14) br L(cj4) + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + nop 0 +}{.mmi; ld8 u2 = [up], 8 C M01 + nop 0 + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m4) +} + +.Lc101: + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I +}{.mmi; cmp.ne p9, p0 = 0, r23 C M I + add rpx = 16, rp C M I + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I +}{.mbb; ADDSUB w1 = u1, v1 C M I + (p15) br L(c5) C B + br L(end) C B +} + +.Lc110: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add upadv = PFDIST, up C M I + add vpadv = PFDIST, vp C M I + mov ar.lc = n C I0 +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = r10, r11 C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w0 = u0, v0 C M I +}{.mmi; cmp.CND p9, p0 = w3, r10 C M I + cmp.ne p8, p0 = 0, r23 C M I + add rpx = 24, rp C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + nop 0 +}{.mmb; (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I + br L(m67) C B +} + +.Lc111: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add upadv = PFDIST, up C M I + ld8 v1 = [vp], 8 C M01 + mov ar.lc = n C I0 +}{.mmi; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r10, r11 C M I + nop 0 + ;; +}{.mmi; add vpadv = PFDIST, vp C M I + ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I +}{.mmi; ld8 u2 = [up], 8 C M01 + ADDSUB w3 = r18, r19 C M I + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, r18 C M I + (p13) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmi; ld8 u3 = [up], 8 C M01 + (p13) add w2 = INCR, w2 C M I + nop 0 + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m67) +} +EPILOGUE() + +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + nop.i 0 + addp4 vp = 0, vp C M I + nop.m 0 + zxt4 n = n C I + ;; +') + + {.mmi; ld8 r11 = [vp], 8 C M01 + ld8 r10 = [up], 8 C M01 + mov r2 = ar.lc C I0 +}{.mmi; and r14 = 7, n C M I + cmp.lt p15, p14 = 8, n C M I + add n = -6, n C M I + ;; 
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb; (p6) br.dptk .Lb001 C B + (p7) br.dptk .Lb010 C B + (p8) br.dptk .Lb011 C B + ;; +}{.mmi; cmp.eq p9, p0 = 4, r14 C M I + cmp.eq p10, p0 = 5, r14 C M I + cmp.eq p11, p0 = 6, r14 C M I +}{.bbb; (p9) br.dptk .Lb100 C B + (p10) br.dptk .Lb101 C B + (p11) br.dptk .Lb110 C B + ;; +}{.mmi; ld8 r19 = [vp], 8 C M01 + ld8 r18 = [up], 8 C M01 + cmp.ne p13, p0 = r0, r0 C clear "CF" M I +}{.mmb; cmp.eq p12, p0 = 7, r14 C M I + mov r23 = 0 C M I + (p12) br.dptk .Lb111 C B + ;; +} + +.Lb000: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = r10, r11 C M I + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, r10 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r18, r19 C M I + add rpx = 8, rp C M I + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + cmp.CND p8, p0 = w2, r18 C M I +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m0) C B +} + + ALIGN(32) +.Lb001: + {.mmi; ADDSUB w0 = r10, r11 C M I + (p15) ld8 v1 = [vp], 8 C M01 + mov r8 = 0 C M I + ;; +}{.mmb; cmp.CND p6, p0 = w0, r10 C M I + (p15) ld8 u1 = [up], 8 C M01 + (p14) br L(cj1) C B + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + shr.u n = n, 3 C I0 +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + ADDSUB w2 = u2, v2 C M I +}{.mmb; ld8 u1 = [up], 8 C M01 + add rpx = 16, rp C M I + br L(m1) C B +} + + ALIGN(32) +.Lb010: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 +}{.mmb; ADDSUB w3 = r10, r11 C M I + nop 0 + (p15) br L(gt2) C B + ;; +}{.mmi; cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I + mov r8 = 0 C M I + ;; +}{.mmb; nop 0 + cmp.CND p6, p0 = w0, u0 C M I + br L(cj2) C B +} +L(gt2): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + mov ar.lc = n C I0 +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + add rpx = 24, rp C M I + br L(m23) C B +} + + ALIGN(32) +.Lb011: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ADDSUB w2 = r10, r11 C M I + ;; +}{.mmb; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + (p15) br L(3) C B +}{.mmb; cmp.CND p8, p0 = w2, r10 C M I + ADDSUB w3 = u3, v3 C M I + br L(cj3) C B +} +L(3): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + ADDSUB w3 = u3, v3 C M I +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u3 = [up], 8 C M01 + nop 0 + nop 0 + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 
= INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m23) C B +} + + ALIGN(32) +.Lb100: + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ADDSUB w1 = r10, r11 C M I + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + cmp.CND p7, p0 = w1, r10 C M I +}{.mmb; nop 0 + ADDSUB w2 = u2, v2 C M I + (p14) br L(cj4) C B + ;; +} +L(gt4): + {.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + mov ar.lc = n C I0 +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + nop 0 +}{.mmi; ld8 u2 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + add rpx = 8, rp C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m4) C B +} + + ALIGN(32) +.Lb101: + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + add rpx = 16, rp C M I +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I + nop 0 +}{.mmb; ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + (p14) br L(cj5) C B + ;; +} +L(gt5): + {.mmi; ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + mov ar.lc = n C I0 +}{.mmb; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = u2, v2 C M I + br L(m5) C B +} + + ALIGN(32) +.Lb110: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = r10, r11 C M I + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + mov ar.lc = n C I0 +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + add rpx = 24, rp C M I + br L(m67) C B +} + + ALIGN(32) +.Lb111: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r10, r11 C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u2 = [up], 8 C M01 + ADDSUB w3 = r18, r19 C M I + nop 0 + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + nop 0 +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + cmp.CND p9, p0 = w3, r18 C M I + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m67) C B +} + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): +L(c5): ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + (p9) cmpeqor p6, p0 = LIM, w0 C M I + ld8 u1 = [up], 8 C M01 + (p9) add w0 = INCR, w0 C M I + ADDSUB w2 = u2, v2 C M I + ;; +L(m5): ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + (p6) cmpeqor p7, p0 = LIM, w1 C M I + ld8 u2 = [up], 8 C M01 + (p6) add w1 = INCR, w1 C M I + ADDSUB w3 = u3, v3 C M I + ;; + st8 [rp] = w0, 8 C M23 + ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I + ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + ;; +L(m4): st8 [rp] = w1, 16 C M23 + st8 [rpx] = w2, 32 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I + lfetch [upadv], 64 + (p8) add 
w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + ;; +L(m23): st8 [rp] = w3, 8 C M23 + ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, u0 C M I + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + nop.b 0 + ;; +L(c1): ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + (p9) cmpeqor p6, p0 = LIM, w0 C M I + ld8 u1 = [up], 8 C M01 + (p9) add w0 = INCR, w0 C M I + ADDSUB w2 = u2, v2 C M I + ;; +L(m1): ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + (p6) cmpeqor p7, p0 = LIM, w1 C M I + ld8 u2 = [up], 8 C M01 + (p6) add w1 = INCR, w1 C M I + ADDSUB w3 = u3, v3 C M I + ;; + st8 [rp] = w0, 8 C M23 + ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I + ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + ;; +L(m0): st8 [rp] = w1, 16 C M23 + st8 [rpx] = w2, 32 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I + lfetch [vpadv], 64 + (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + ;; +L(m67): st8 [rp] = w3, 8 C M23 + ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, u0 C M I + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + br.cloop.dptk L(top) C B + ;; +C *** MAIN LOOP END *** + +L(end): + {.mmi; (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + mov ar.lc = r2 C I0 +} +L(cj5): + {.mmi; cmp.CND p7, p0 = w1, u1 C M I + ADDSUB w2 = u2, v2 C M I + nop 0 + ;; +}{.mmi; st8 [rp] = w0, 8 C M23 + (p6) cmpeqor p7, p0 = LIM, w1 C M I + (p6) add w1 = INCR, w1 C M I +} +L(cj4): + {.mmi; cmp.CND p8, p0 = w2, u2 C M I + ADDSUB w3 = u3, v3 C M I + nop 0 + ;; +}{.mmi; st8 [rp] = w1, 8 C M23 + (p7) cmpeqor p8, p0 = LIM, w2 C M I + (p7) add w2 = INCR, w2 C M I +} +L(cj3): + {.mmi; cmp.CND p9, p0 = w3, u3 C M I + ADDSUB w0 = u0, v0 C M I + nop 0 + ;; +}{.mmi; st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I +}{.mmi; cmp.CND p6, p0 = w0, u0 C M I + nop 0 + mov r8 = 0 C M I + ;; +} +L(cj2): + {.mmi; st8 [rp] = w3, 8 C M23 + (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + ;; +} +L(cj1): + {.mmb; st8 [rp] = w0, 8 C M23 + (p6) mov r8 = 1 C M I + br.ret.sptk.many b0 C B +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm new file mode 100644 index 0000000..9b58b9e --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm @@ -0,0 +1,48 @@ +dnl IA-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 3.0 +C Itanium 2: 1.5 + + +define(LSH, 1) + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`ia64/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm new file mode 100644 index 0000000..39b384a --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm @@ -0,0 +1,48 @@ +dnl IA-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 3.0 +C Itanium 2: 1.5 + + +define(LSH, 2) + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`ia64/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm new file mode 100644 index 0000000..2703ce2 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm @@ -0,0 +1,412 @@ +dnl IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C Itanium: ? +C Itanium 2: 1.5 + +C TODO +C * Use shladd in feed-in code (for mpn_addlshC_n). +C * Rewrite loop to schedule loads closer to use, since we do prefetch. + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`vp', `r34') +define(`n', `r35') + +ifdef(`DO_add', ` + define(`ADDSUB', `add $1 = $2, $3') + define(`CMP', `cmp.ltu $1,p0 = $2, $3') + define(`INCR', 1) + define(`LIM', -1) + define(`func', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADDSUB', `sub $1 = $2, $3') + define(`CMP', `cmp.gtu $1,p0 = $2, $3') + define(`INCR', -1) + define(`LIM', 0) + define(`func', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADDSUB', `sub $1 = $3, $2') + define(`CMP', `cmp.gtu $1,p0 = $2, $4') + define(`INCR', -1) + define(`LIM', 0) + define(`func', mpn_rsblsh`'LSH`'_n)') + +define(PFDIST, 500) + +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17') +define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21') +define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25') +define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29') +define(`x0',`r30') define(`x1',`r31') define(`x2',`r3') define(`x3',`r9') + +C r3 r8 r9 r10 r11 + +ASM_START() +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + nop.i 0 + addp4 vp = 0, vp C M I + nop.m 0 + zxt4 n = n C I + ;; +') + {.mmi; ld8 r11 = [vp], 8 C M01 + ld8 r10 = [up], 8 C M01 + mov.i r2 = ar.lc C I0 +}{.mmi; and r14 = 3, n C M I + cmp.lt p15, p0 = 4, n C M I + add n = -5, n C M I + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb + (p6) br.dptk .Lb01 C B + (p7) br.dptk .Lb10 C B + (p8) br.dptk .Lb11 C B +} + +.Lb00: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shl x3 = r11, LSH C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shrp x0 = v0, r11, 64-LSH C I0 +}{.mmb; ADDSUB( w3, r10, x3) C M I + nop 0 + (p15) br.dpnt .grt4 C B + ;; +}{.mii; CMP( p7, w3, r10, x3) C M II0 + shrp x1 = v1, v0, 64-LSH C I0 + ADDSUB( w0, u0, x0) C M I + ;; +}{.mii; CMP( p8, w0, u0, x0) C M I + shrp x2 = v2, v1, 64-LSH C I0 + ADDSUB( w1, u1, x1) C M I +}{.mmb; nop 0 + nop 0 + br .Lcj4 C B +} +ALIGN(32) +.grt4: + {.mii; ld8 v3 = [vp], 8 C M01 + shrp x0 = v0, r11, 64-LSH C I0 + CMP( p8, w3, r10, x3) C M I + ;; +}{.mmi; ld8 u3 = [up], 8 C M01 + add r11 = PFDIST, vp + shrp x1 = v1, v0, 64-LSH C I0 +}{.mmi; ld8 v0 = [vp], 8 C M01 + ADDSUB( w0, u0, x0) C M I + nop 0 + ;; +}{.mmi; CMP( p6, w0, u0, x0) C M I + add r10 = PFDIST, up + mov.i ar.lc = n C I0 +}{.mmb; ADDSUB( w1, u1, x1) C M I + ld8 u0 = [up], 8 C M01 + br .LL00 C B +} + + ALIGN(32) +.Lb01: +ifdef(`DO_add', +` shladd w2 = r11, LSH, r10 C M I + shr.u r8 = r11, 64-LSH C retval I0 + (p15) br.dpnt .grt1 C B + ;; +',` + shl x2 = r11, LSH C I0 + (p15) br.dpnt .grt1 C B + ;; + ADDSUB( w2, r10, x2) C M I + shr.u r8 = r11, 64-LSH C retval I0 + ;; +') + CMP( p6, w2, r10, x2) C M I + br .Lcj1 + +.grt1: ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; + ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov.i ar.lc = n C FIXME swap with next I0 
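With these macros the loop computes rp[] = up[] + (vp[] << LSH) for DO_add, and the subtracted variants for DO_sub and DO_rsb; the shrp instruction forms each shifted limb from an adjacent pair of v limbs. A hedged C sketch of the add case, assuming 64-bit limbs (function name and loop shape illustrative, not the file's exact structure):

    /* rp[] = up[] + (vp[] << LSH); returns the LSH bits shifted out of
       the top v limb plus the final carry, matching r8 above.  */
    mp_limb_t
    addlshC (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
             mp_size_t n, int LSH)
    {
      mp_limb_t cy = 0, vprev = 0;
      for (mp_size_t i = 0; i < n; i++)
        {
          /* shrp: funnel shift pairing v[i] with the previous limb */
          mp_limb_t x = (vp[i] << LSH) | (vprev >> (64 - LSH));
          mp_limb_t w = up[i] + x;          /* ADDSUB */
          mp_limb_t c = w < x;              /* CMP */
          if (cy)
            {
              c |= (w == ~(mp_limb_t) 0);   /* cmpeqor with LIM */
              w += 1;                       /* INCR */
            }
          rp[i] = w;
          cy = c;
          vprev = vp[i];
        }
      return (vprev >> (64 - LSH)) + cy;    /* shr.u r8 = v, 64-LSH */
    }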
+ifdef(`DO_add', +`',` + ADDSUB( w2, r10, x2) +') + ;; + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shrp x3 = v3, r11, 64-LSH C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shrp x0 = v0, v3, 64-LSH C I0 +}{.mmb; CMP( p6, w2, r10, x2) C M I + ADDSUB( w3, u3, x3) C M I + br.cloop.dptk .grt5 C B + ;; +}{.mmi; CMP( p7, w3, u3, x3) C M I + ADDSUB( w0, u0, x0) C M I + shrp x1 = v1, v0, 64-LSH C I0 +}{.mmb; nop 0 + nop 0 + br .Lcj5 C B +} +.grt5: + {.mmi; add r10 = PFDIST, up + add r11 = PFDIST, vp + shrp x0 = v0, v3, 64-LSH C I0 +}{.mmb; ld8 v3 = [vp], 8 C M01 + CMP( p8, w3, u3, x3) C M I + br .LL01 C B +} + ALIGN(32) +.Lb10: + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shl x1 = r11, LSH C I0 +}{.mmb; nop 0 + nop 0 + (p15) br.dpnt .grt2 C B + ;; +}{.mmi; ADDSUB( w1, r10, x1) C M I + nop 0 + shrp x2 = v2, r11, 64-LSH C I0 + ;; +}{.mmi; CMP( p9, w1, r10, x1) C M I + ADDSUB( w2, u2, x2) C M I + shr.u r8 = v2, 64-LSH C retval I0 + ;; +}{.mmb; CMP( p6, w2, u2, x2) C M I + nop 0 + br .Lcj2 C B +} +.grt2: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov.i ar.lc = n C I0 +}{.mmi; ADDSUB( w1, r10, x1) C M I + nop 0 + nop 0 + ;; +}{.mii; ld8 v1 = [vp], 8 C M01 + shrp x2 = v2, r11, 64-LSH C I0 + CMP( p8, w1, r10, x1) C M I + ;; +}{.mmi; add r10 = PFDIST, up + ld8 u1 = [up], 8 C M01 + shrp x3 = v3, v2, 64-LSH C I0 +}{.mmi; add r11 = PFDIST, vp + ld8 v2 = [vp], 8 C M01 + ADDSUB( w2, u2, x2) C M I + ;; +}{.mmi; CMP( p6, w2, u2, x2) C M I + ld8 u2 = [up], 8 C M01 + shrp x0 = v0, v3, 64-LSH C I0 +}{.mib; ADDSUB( w3, u3, x3) C M I + nop 0 + br.cloop.dpnt L(top) C B +} + br L(end) C B +.Lb11: + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shl x0 = r11, LSH C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 2 C I0 +}{.mmb; nop 0 + nop 0 + (p15) br.dpnt .grt3 C B + ;; +}{.mii; nop 0 + shrp x1 = v1, r11, 64-LSH C I0 + ADDSUB( w0, r10, x0) C M I + ;; +}{.mii; CMP( p8, w0, r10, x0) C M I + shrp x2 = v2, v1, 64-LSH C I0 + ADDSUB( w1, u1, x1) C M I + ;; +}{.mmb; CMP( p9, w1, u1, x1) C M I + ADDSUB( w2, u2, x2) C M I + br .Lcj3 C B +} +.grt3: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shrp x1 = v1, r11, 64-LSH C I0 +}{.mmi; ADDSUB( w0, r10, x0) C M I + nop 0 + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + CMP( p6, w0, r10, x0) C M I + mov.i ar.lc = n C I0 +}{.mmi; ld8 u0 = [up], 8 C M01 + ADDSUB( w1, u1, x1) C M I + nop 0 + ;; +}{.mmi; add r10 = PFDIST, up + add r11 = PFDIST, vp + shrp x2 = v2, v1, 64-LSH C I0 +}{.mmb; ld8 v1 = [vp], 8 C M01 + CMP( p8, w1, u1, x1) C M I + br .LL11 C B +} + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): st8 [rp] = w1, 8 C M23 + lfetch [r10], 32 + (p8) cmpeqor p6, p0 = LIM, w2 C M I + (p8) add w2 = INCR, w2 C M I + ld8 v3 = [vp], 8 C M01 + CMP( p8, w3, u3, x3) C M I + ;; +.LL01: ld8 u3 = [up], 8 C M01 + shrp x1 = v1, v0, 64-LSH C I0 + (p6) cmpeqor p8, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + ld8 v0 = [vp], 8 C M01 + ADDSUB( w0, u0, x0) C M I + ;; + st8 [rp] = w2, 8 C M23 + CMP( p6, w0, u0, x0) C M I + nop.b 0 + ld8 u0 = [up], 8 C M01 + lfetch [r11], 32 + ADDSUB( w1, u1, x1) C M I + ;; +.LL00: st8 [rp] = w3, 8 C M23 + shrp x2 = v2, v1, 64-LSH C I0 + (p8) cmpeqor p6, p0 = LIM, w0 C M I + (p8) add w0 = INCR, w0 C M I + ld8 v1 = [vp], 8 C M01 + CMP( p8, w1, u1, x1) C M I + ;; +.LL11: ld8 u1 = [up], 8 C M01 + shrp x3 = v3, v2, 64-LSH C I0 + (p6) cmpeqor p8, p0 = LIM, w1 C M I + 
(p6) add w1 = INCR, w1 C M I + ld8 v2 = [vp], 8 C M01 + ADDSUB( w2, u2, x2) C M I + ;; + {.mmi; st8 [rp] = w0, 8 C M23 + CMP( p6, w2, u2, x2) C M I + shrp x0 = v0, v3, 64-LSH C I0 +}{.mib; + ld8 u2 = [up], 8 C M01 + ADDSUB( w3, u3, x3) C M I + br.cloop.dptk L(top) C B + ;; +} +C *** MAIN LOOP END *** + +L(end): + {.mmi; st8 [rp] = w1, 8 C M23 + (p8) cmpeqor p6, p0 = LIM, w2 C M I + shrp x1 = v1, v0, 64-LSH C I0 +}{.mmi; + (p8) add w2 = INCR, w2 C M I + CMP( p7, w3, u3, x3) C M I + ADDSUB( w0, u0, x0) C M I + ;; +} +.Lcj5: + {.mmi; st8 [rp] = w2, 8 C M23 + (p6) cmpeqor p7, p0 = LIM, w3 C M I + shrp x2 = v2, v1, 64-LSH C I0 +}{.mmi; + (p6) add w3 = INCR, w3 C M I + CMP( p8, w0, u0, x0) C M I + ADDSUB( w1, u1, x1) C M I + ;; +} +.Lcj4: + {.mmi; st8 [rp] = w3, 8 C M23 + (p7) cmpeqor p8, p0 = LIM, w0 C M I + mov.i ar.lc = r2 C I0 +}{.mmi; + (p7) add w0 = INCR, w0 C M I + CMP( p9, w1, u1, x1) C M I + ADDSUB( w2, u2, x2) C M I + ;; +} +.Lcj3: + {.mmi; st8 [rp] = w0, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w1 C M I + shr.u r8 = v2, 64-LSH C I0 +}{.mmi; + (p8) add w1 = INCR, w1 C M I + CMP( p6, w2, u2, x2) C M I + nop 0 + ;; +} +.Lcj2: + {.mmi; st8 [rp] = w1, 8 C M23 + (p9) cmpeqor p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; +} +.Lcj1: + {.mmb; st8 [rp] = w2 C M23 +ifdef(`DO_rsb',` + (p6) add r8 = -1, r8 C M I +',` + (p6) add r8 = 1, r8 C M I +') br.ret.sptk.many b0 C B +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm new file mode 100644 index 0000000..47e4553 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm @@ -0,0 +1,516 @@ +dnl IA-64 mpn_bdiv_dbm1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2009 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 4 +C Itanium 2: 2 + +C TODO +C * Optimize feed-in and wind-down code, both for speed and code size. 
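For orientation before the loops: mpn_bdiv_dbm1c(qp, ap, n, bd, h) folds each two-limb product ap[i]*bd into a running difference, storing the low limb and carrying the borrow plus the high limb into the next step; GMP uses it to divide by divisors of B-1, where B is the limb base. A C sketch along the lines of the generic version, using umul_ppmm from longlong.h and the GMP-internal mp_ptr/mp_srcptr types:

    mp_limb_t
    bdiv_dbm1c_sketch (mp_ptr qp, mp_srcptr ap, mp_size_t n,
                       mp_limb_t bd, mp_limb_t h)
    {
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t p1, p0, cy;
          umul_ppmm (p1, p0, ap[i], bd);   /* the xma.hu/xma.l pairs */
          cy = h < p0;                     /* cmp.ltu p6, p7 */
          h -= p0;
          qp[i] = h;                       /* st8 [r32] */
          h = h - p1 - cy;                 /* the two predicated subs */
        }
      return h;
    }

In the asm this recurrence is 4-way unrolled and software pipelined, which is why the feed-in cases .Lb00-.Lb11 and the wind-down chain .Lcj1-.Lcj8 are so large.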
+ +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`bd', `r35') + +ASM_START() +PROLOGUE(mpn_bdiv_dbm1c) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mmb + mov r15 = r36 C M I + ldf8 f9 = [up], 8 C M + nop.b 0 C B +} +.Lcommon: +{.mii + adds r16 = -1, n C M I + mov r2 = ar.lc C I0 + and r14 = 3, n C M I + ;; +} +{.mii + setf.sig f6 = bd C M2 M3 + shr.u r31 = r16, 2 C I0 + cmp.eq p10, p0 = 0, r14 C M I +} +{.mii + nop.m 0 C M + cmp.eq p11, p0 = 2, r14 C M I + cmp.eq p12, p0 = 3, r14 C M I + ;; +} +{.mii + cmp.ne p6, p7 = r0, r0 C M I + mov.i ar.lc = r31 C I0 + cmp.ne p8, p9 = r0, r0 C M I +} +{.bbb + (p10) br.dptk .Lb00 C B + (p11) br.dptk .Lb10 C B + (p12) br.dptk .Lb11 C B + ;; +} + +.Lb01: br.cloop.dptk .grt1 + ;; + xma.l f38 = f9, f6, f0 + xma.hu f39 = f9, f6, f0 + ;; + getf.sig r26 = f38 + getf.sig r27 = f39 + br .Lcj1 + +.grt1: ldf8 f10 = [r33], 8 + ;; + ldf8 f11 = [r33], 8 + ;; + ldf8 f12 = [r33], 8 + ;; + xma.l f38 = f9, f6, f0 + xma.hu f39 = f9, f6, f0 + ;; + ldf8 f13 = [r33], 8 + ;; + xma.l f32 = f10, f6, f0 + xma.hu f33 = f10, f6, f0 + br.cloop.dptk .grt5 + + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + getf.sig r27 = f39 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + ;; + getf.sig r21 = f33 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + br .Lcj5 + +.grt5: ldf8 f10 = [r33], 8 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + getf.sig r27 = f39 + ldf8 f11 = [r33], 8 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + ;; + getf.sig r21 = f33 + ldf8 f12 = [r33], 8 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + br .LL01 + +.Lb10: ldf8 f13 = [r33], 8 + br.cloop.dptk .grt2 + ;; + + xma.l f36 = f9, f6, f0 + xma.hu f37 = f9, f6, f0 + ;; + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + getf.sig r24 = f36 + ;; + getf.sig r25 = f37 + ;; + getf.sig r26 = f38 + ;; + getf.sig r27 = f39 + br .Lcj2 + +.grt2: ldf8 f10 = [r33], 8 + ;; + ldf8 f11 = [r33], 8 + ;; + xma.l f36 = f9, f6, f0 + xma.hu f37 = f9, f6, f0 + ;; + ldf8 f12 = [r33], 8 + ;; + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + ldf8 f13 = [r33], 8 + ;; + getf.sig r24 = f36 + xma.l f32 = f10, f6, f0 + xma.hu f33 = f10, f6, f0 + br.cloop.dptk .grt6 + + getf.sig r25 = f37 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + getf.sig r27 = f39 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + br .Lcj6 + +.grt6: getf.sig r25 = f37 + ldf8 f10 = [r33], 8 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + getf.sig r27 = f39 + ldf8 f11 = [r33], 8 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + br .LL10 + + +.Lb11: ldf8 f12 = [r33], 8 + ;; + ldf8 f13 = [r33], 8 + br.cloop.dptk .grt3 + ;; + + xma.l f34 = f9, f6, f0 + xma.hu f35 = f9, f6, f0 + ;; + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + getf.sig r23 = f35 + ;; + getf.sig r24 = f36 + ;; + getf.sig r25 = f37 + ;; + getf.sig r26 = f38 + br .Lcj3 + +.grt3: ldf8 f10 = [r33], 8 + ;; + xma.l f34 = f9, f6, f0 + xma.hu f35 = f9, f6, f0 + ;; + ldf8 f11 = [r33], 8 + ;; + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + 
;; + ldf8 f12 = [r33], 8 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + getf.sig r23 = f35 + ldf8 f13 = [r33], 8 + ;; + getf.sig r24 = f36 + xma.l f32 = f10, f6, f0 + xma.hu f33 = f10, f6, f0 + br.cloop.dptk .grt7 + + getf.sig r25 = f37 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + br .Lcj7 + +.grt7: getf.sig r25 = f37 + ldf8 f10 = [r33], 8 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + br .LL11 + + +.Lb00: ldf8 f11 = [r33], 8 + ;; + ldf8 f12 = [r33], 8 + ;; + ldf8 f13 = [r33], 8 + br.cloop.dptk .grt4 + ;; + + xma.l f32 = f9, f6, f0 + xma.hu f33 = f9, f6, f0 + ;; + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + ;; + getf.sig r21 = f33 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + getf.sig r23 = f35 + ;; + getf.sig r24 = f36 + br .Lcj4 + +.grt4: xma.l f32 = f9, f6, f0 + xma.hu f33 = f9, f6, f0 + ;; + ldf8 f10 = [r33], 8 + ;; + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + ldf8 f11 = [r33], 8 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + ;; + getf.sig r21 = f33 + ldf8 f12 = [r33], 8 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + getf.sig r23 = f35 + ldf8 f13 = [r33], 8 + ;; + getf.sig r24 = f36 + xma.l f32 = f10, f6, f0 + xma.hu f33 = f10, f6, f0 + br.cloop.dptk .LL00 + br .Lcj8 + +C *** MAIN LOOP START *** + ALIGN(32) +.Ltop: + .pred.rel "mutex",p6,p7 +C .mfi + getf.sig r24 = f36 + xma.l f32 = f10, f6, f0 + (p6) sub r15 = r19, r27, 1 +C .mfi + st8 [r32] = r19, 8 + xma.hu f33 = f10, f6, f0 + (p7) sub r15 = r19, r27 + ;; +.LL00: +C .mfi + getf.sig r25 = f37 + nop.f 0 + cmp.ltu p6, p7 = r15, r20 +C .mib + ldf8 f10 = [r33], 8 + sub r16 = r15, r20 + nop.b 0 + ;; + +C .mfi + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + (p6) sub r15 = r16, r21, 1 +C .mfi + st8 [r32] = r16, 8 + xma.hu f35 = f11, f6, f0 + (p7) sub r15 = r16, r21 + ;; +.LL11: +C .mfi + getf.sig r27 = f39 + nop.f 0 + cmp.ltu p6, p7 = r15, r22 +C .mib + ldf8 f11 = [r33], 8 + sub r17 = r15, r22 + nop.b 0 + ;; + +C .mfi + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + (p6) sub r15 = r17, r23, 1 +C .mfi + st8 [r32] = r17, 8 + xma.hu f37 = f12, f6, f0 + (p7) sub r15 = r17, r23 + ;; +.LL10: +C .mfi + getf.sig r21 = f33 + nop.f 0 + cmp.ltu p6, p7 = r15, r24 +C .mib + ldf8 f12 = [r33], 8 + sub r18 = r15, r24 + nop.b 0 + ;; + +C .mfi + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + (p6) sub r15 = r18, r25, 1 +C .mfi + st8 [r32] = r18, 8 + xma.hu f39 = f13, f6, f0 + (p7) sub r15 = r18, r25 + ;; +.LL01: +C .mfi + getf.sig r23 = f35 + nop.f 0 + cmp.ltu p6, p7 = r15, r26 +C .mib + ldf8 f13 = [r33], 8 + sub r19 = r15, r26 + br.cloop.sptk.few .Ltop +C *** MAIN LOOP END *** + ;; + + getf.sig r24 = f36 + xma.l f32 = f10, f6, f0 + (p6) sub r15 = r19, r27, 1 + st8 [r32] = r19, 8 + xma.hu f33 = f10, f6, f0 + (p7) sub r15 = r19, r27 + ;; +.Lcj8: getf.sig r25 = f37 + cmp.ltu p6, p7 = r15, r20 + sub r16 = r15, r20 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + (p6) sub r15 = r16, r21, 1 + st8 [r32] = r16, 8 + xma.hu f35 = f11, f6, f0 + (p7) sub r15 = r16, r21 + ;; +.Lcj7: getf.sig r27 = f39 + cmp.ltu p6, p7 = r15, r22 + sub r17 = r15, r22 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + (p6) sub r15 = r17, r23, 1 + st8 [r32] = r17, 8 + xma.hu f37 = f12, f6, f0 + (p7) sub r15 = r17, r23 + ;; +.Lcj6: getf.sig r21 = f33 + cmp.ltu p6, 
p7 = r15, r24 + sub r18 = r15, r24 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + (p6) sub r15 = r18, r25, 1 + st8 [r32] = r18, 8 + xma.hu f39 = f13, f6, f0 + (p7) sub r15 = r18, r25 + ;; +.Lcj5: getf.sig r23 = f35 + cmp.ltu p6, p7 = r15, r26 + sub r19 = r15, r26 + ;; + getf.sig r24 = f36 + (p6) sub r15 = r19, r27, 1 + st8 [r32] = r19, 8 + (p7) sub r15 = r19, r27 + ;; +.Lcj4: getf.sig r25 = f37 + cmp.ltu p6, p7 = r15, r20 + sub r16 = r15, r20 + ;; + getf.sig r26 = f38 + (p6) sub r15 = r16, r21, 1 + st8 [r32] = r16, 8 + (p7) sub r15 = r16, r21 + ;; +.Lcj3: getf.sig r27 = f39 + cmp.ltu p6, p7 = r15, r22 + sub r17 = r15, r22 + ;; + (p6) sub r15 = r17, r23, 1 + st8 [r32] = r17, 8 + (p7) sub r15 = r17, r23 + ;; +.Lcj2: cmp.ltu p6, p7 = r15, r24 + sub r18 = r15, r24 + ;; + (p6) sub r15 = r18, r25, 1 + st8 [r32] = r18, 8 + (p7) sub r15 = r18, r25 + ;; +.Lcj1: cmp.ltu p6, p7 = r15, r26 + sub r19 = r15, r26 + ;; + (p6) sub r8 = r19, r27, 1 + st8 [r32] = r19 + (p7) sub r8 = r19, r27 + mov ar.lc = r2 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/cnd_aors_n.asm b/gmp-6.3.0/mpn/ia64/cnd_aors_n.asm new file mode 100644 index 0000000..edd0552 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/cnd_aors_n.asm @@ -0,0 +1,264 @@ +dnl IA-64 mpn_cnd_add_n/mpn_cnd_sub_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? 
+C Itanium 2: 1.5 + +C INPUT PARAMETERS +define(`cnd', `r32') +define(`rp', `r33') +define(`up', `r34') +define(`vp', `r35') +define(`n', `r36') + +ifdef(`OPERATION_cnd_add_n',` + define(ADDSUB, add) + define(CND, ltu) + define(INCR, 1) + define(LIM, -1) + define(func, mpn_cnd_add_n) +') +ifdef(`OPERATION_cnd_sub_n',` + define(ADDSUB, sub) + define(CND, gtu) + define(INCR, -1) + define(LIM, 0) + define(func, mpn_cnd_sub_n) +') + +define(PFDIST, 160) + +C Some useful aliases for registers we use +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17') +define(`x0',`r20') define(`x1',`r21') define(`x2',`r22') define(`x3',`r23') +define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27') +define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31') +define(`up1',`up') define(`up2',`r8') define(`upadv',`r1') +define(`vp1',`vp') define(`vp2',`r9') define(`vpadv',`r11') +define(`rp1',`rp') define(`rp2',`r10') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + nop.i 0 + addp4 vp = 0, vp C M I + nop.m 0 + zxt4 n = n C I + ;; +') + {.mmi; and r3 = 3, n C M I + add n = -1, n C M I + mov r2 = ar.lc C I0 +}{.mmi; cmp.ne p6, p7 = 0, cnd C M I + add vp2 = 8, vp C M I + add up2 = 8, up C M I + ;; +}{.mmi; add upadv = PFDIST, up C M I + add vpadv = PFDIST, vp C M I + shr.u n = n, 2 C I0 + .pred.rel "mutex", p6, p7 +}{.mmi; add rp2 = 8, rp C M I + (p6) mov cnd = -1 C M I + (p7) mov cnd = 0 C M I + ;; +} cmp.eq p9, p0 = 1, r3 C M I + cmp.eq p7, p0 = 2, r3 C M I + cmp.eq p8, p0 = 3, r3 C M I + (p9) br L(b1) C B + (p7) br L(b2) C B + (p8) br L(b3) C B + ;; +L(b0): + {.mmi; ld8 v2 = [vp1], 16 C M01 + ld8 v3 = [vp2], 16 C M01 + mov ar.lc = n C I0 + ;; +} ld8 u2 = [up1], 16 C M01 + ld8 u3 = [up2], 16 C M01 + and x2 = v2, cnd C M I + and x3 = v3, cnd C M I + ;; + ADDSUB w2 = u2, x2 C M I + ADDSUB w3 = u3, x3 C M I + ;; + ld8 v0 = [vp1], 16 C M01 + ld8 v1 = [vp2], 16 C M01 + cmp.CND p8, p0 = w2, u2 C M I + cmp.CND p9, p0 = w3, u3 C M I + br L(lo0) + +L(b1): ld8 v1 = [vp1], 8 C M01 + add vp2 = 8, vp2 C M I + add rp2 = 8, rp2 C M I + ;; + ld8 u1 = [up1], 8 C M01 + add up2 = 8, up2 C M I + and x1 = v1, cnd C M I + ;; + ADDSUB w1 = u1, x1 C M I + cmp.ne p10, p0 = 0, n + add n = -1, n + ;; + cmp.CND p7, p0 = w1, u1 C M I + st8 [rp1] = w1, 8 C M23 + (p10) br L(b0) + ;; + mov r8 = 0 C M I + br L(e1) + +L(b3): ld8 v3 = [vp1], 8 C M01 + add vp2 = 8, vp2 C M I + add rp2 = 8, rp2 C M I + ;; + ld8 u3 = [up1], 8 C M01 + add up2 = 8, up2 C M I + and x3 = v3, cnd C M I + ;; + ADDSUB w3 = u3, x3 C M I + ;; + cmp.CND p9, p0 = w3, u3 C M I + st8 [rp1] = w3, 8 C M23 + C fall through + +L(b2): + {.mmi; ld8 v0 = [vp1], 16 C M01 + ld8 v1 = [vp2], 16 C M01 + mov ar.lc = n C I0 + ;; +} ld8 u0 = [up1], 16 C M01 + ld8 u1 = [up2], 16 C M01 + and x0 = v0, cnd C M I + and x1 = v1, cnd C M I + ;; + ADDSUB w0 = u0, x0 C M I + ADDSUB w1 = u1, x1 C M I + br.cloop.dptk L(gt2) C B + ;; + cmp.CND p6, p0 = w0, u0 C M I + br L(e2) C B +L(gt2): + ld8 v2 = [vp1], 16 C M01 + ld8 v3 = [vp2], 16 C M01 + cmp.CND p6, p0 = w0, u0 C M I + cmp.CND p7, p0 = w1, u1 C M I + br L(lo2) C B + + +C *** MAIN LOOP START *** +C ALIGN(32) +L(top): + {.mmi; ld8 v2 = [vp1], 16 C M01 + ld8 v3 = [vp2], 16 C M01 + cmp.CND p6, p0 = w0, u0 C M I +}{.mmi; st8 [rp1] = w2, 16 C M23 + st8 [rp2] = w3, 16 C M23 + cmp.CND p7, p0 = w1, u1 C M I + ;; +} +L(lo2): + {.mmi; ld8 u2 = [up1], 16 C M01 + 
ld8 u3 = [up2], 16 C M01 + (p9) cmpeqor p6, p0 = LIM, w0 C M I +}{.mmi; and x2 = v2, cnd C M I + and x3 = v3, cnd C M I + (p9) add w0 = INCR, w0 C M I + ;; +}{.mmi; ADDSUB w2 = u2, x2 C M I + (p6) cmpeqor p7, p0 = LIM, w1 C M I + (p6) add w1 = INCR, w1 C M I +}{.mmi; ADDSUB w3 = u3, x3 C M I + lfetch [upadv], 32 + nop 0 + ;; +}{.mmi; ld8 v0 = [vp1], 16 C M01 + ld8 v1 = [vp2], 16 C M01 + cmp.CND p8, p0 = w2, u2 C M I +}{.mmi; st8 [rp1] = w0, 16 C M23 + st8 [rp2] = w1, 16 C M23 + cmp.CND p9, p0 = w3, u3 C M I + ;; +} +L(lo0): + {.mmi; ld8 u0 = [up1], 16 C M01 + ld8 u1 = [up2], 16 C M01 + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmi; and x0 = v0, cnd C M I + and x1 = v1, cnd C M I + (p7) add w2 = INCR, w2 C M I + ;; +}{.mmi; ADDSUB w0 = u0, x0 C M I + (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I +}{.mmb; ADDSUB w1 = u1, x1 C M I + lfetch [vpadv], 32 + br.cloop.dptk L(top) C B + ;; +} +C *** MAIN LOOP END *** + + +L(end): + {.mmi; st8 [rp1] = w2, 16 C M23 + st8 [rp2] = w3, 16 C M23 + cmp.CND p6, p0 = w0, u0 C M I + ;; +} +L(e2): + {.mmi; cmp.CND p7, p0 = w1, u1 C M I + (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + ;; +}{.mmi; mov r8 = 0 C M I + (p6) cmpeqor p7, p0 = LIM, w1 C M I + (p6) add w1 = INCR, w1 C M I + ;; +}{.mmi; st8 [rp1] = w0, 16 C M23 + st8 [rp2] = w1, 16 C M23 + mov ar.lc = r2 C I0 +} +L(e1): + {.mmb; nop 0 + (p7) mov r8 = 1 C M I + br.ret.sptk.many b0 C B +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/copyd.asm b/gmp-6.3.0/mpn/ia64/copyd.asm new file mode 100644 index 0000000..b94a1af --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/copyd.asm @@ -0,0 +1,186 @@ +dnl IA-64 mpn_copyd -- copy limb vector, decrementing. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 1 +C Itanium 2: 0.5 + +C INPUT PARAMETERS +C rp = r32 +C sp = r33 +C n = r34 + +ASM_START() +PROLOGUE(mpn_copyd) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 r32 = 0, r32 + addp4 r33 = 0, r33 + sxt4 r34 = r34 + ;; +') +{.mmi + shladd r32 = r34, 3, r32 + shladd r33 = r34, 3, r33 + mov.i r2 = ar.lc +} +{.mmi + and r14 = 3, r34 + cmp.ge p14, p15 = 3, r34 + add r34 = -4, r34 + ;; +} +{.mmi + cmp.eq p8, p0 = 1, r14 + cmp.eq p10, p0 = 2, r14 + cmp.eq p12, p0 = 3, r14 +} +{.bbb + (p8) br.dptk .Lb01 + (p10) br.dptk .Lb10 + (p12) br.dptk .Lb11 +} + +.Lb00: C n = 0, 4, 8, 12, ... 
+ add r32 = -8, r32 + add r33 = -8, r33 + (p14) br.dptk .Ls00 + ;; + add r21 = -8, r33 + ld8 r16 = [r33], -16 + shr r15 = r34, 2 + ;; + ld8 r17 = [r21], -16 + mov.i ar.lc = r15 + ld8 r18 = [r33], -16 + add r20 = -8, r32 + ;; + ld8 r19 = [r21], -16 + br.cloop.dptk .Loop + ;; + br.sptk .Lend + ;; + +.Lb01: C n = 1, 5, 9, 13, ... + add r21 = -8, r33 + add r20 = -8, r32 + add r33 = -16, r33 + add r32 = -16, r32 + ;; + ld8 r19 = [r21], -16 + shr r15 = r34, 2 + (p14) br.dptk .Ls01 + ;; + ld8 r16 = [r33], -16 + mov.i ar.lc = r15 + ;; + ld8 r17 = [r21], -16 + ld8 r18 = [r33], -16 + br.sptk .Li01 + ;; + +.Lb10: C n = 2,6, 10, 14, ... + add r21 = -16, r33 + shr r15 = r34, 2 + add r20 = -16, r32 + add r32 = -8, r32 + add r33 = -8, r33 + ;; + ld8 r18 = [r33], -16 + ld8 r19 = [r21], -16 + mov.i ar.lc = r15 + (p14) br.dptk .Ls10 + ;; + ld8 r16 = [r33], -16 + ld8 r17 = [r21], -16 + br.sptk .Li10 + ;; + +.Lb11: C n = 3, 7, 11, 15, ... + add r21 = -8, r33 + add r20 = -8, r32 + add r33 = -16, r33 + add r32 = -16, r32 + ;; + ld8 r17 = [r21], -16 + shr r15 = r34, 2 + ;; + ld8 r18 = [r33], -16 + mov.i ar.lc = r15 + ld8 r19 = [r21], -16 + (p14) br.dptk .Ls11 + ;; + ld8 r16 = [r33], -16 + br.sptk .Li11 + ;; + + ALIGN(32) +.Loop: +.Li00: +{.mmb + st8 [r32] = r16, -16 + ld8 r16 = [r33], -16 + nop.b 0 +} +.Li11: +{.mmb + st8 [r20] = r17, -16 + ld8 r17 = [r21], -16 + nop.b 0 + ;; +} +.Li10: +{.mmb + st8 [r32] = r18, -16 + ld8 r18 = [r33], -16 + nop.b 0 +} +.Li01: +{.mmb + st8 [r20] = r19, -16 + ld8 r19 = [r21], -16 + br.cloop.dptk .Loop + ;; +} +.Lend: st8 [r32] = r16, -16 +.Ls11: st8 [r20] = r17, -16 + ;; +.Ls10: st8 [r32] = r18, -16 +.Ls01: st8 [r20] = r19, -16 +.Ls00: mov.i ar.lc = r2 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/copyi.asm b/gmp-6.3.0/mpn/ia64/copyi.asm new file mode 100644 index 0000000..49ed192 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/copyi.asm @@ -0,0 +1,182 @@ +dnl IA-64 mpn_copyi -- copy limb vector, incrementing. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
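Both copy routines reach 0.5 c/l on Itanium 2 the same way: two independent load/store streams, offset by one limb and each stepping two limbs, keep both memory ports busy every cycle while four registers rotate between the loads and the stores. A C rendering of the steady state (a sketch; the asm additionally runs the loads a full iteration ahead of the stores):

    void
    copyi_sketch (mp_limb_t *rp, const mp_limb_t *sp, mp_size_t n)
    {
      mp_size_t i;
      for (i = 0; i + 4 <= n; i += 4)
        {
          rp[i + 0] = sp[i + 0];   /* stream 1: .Li00 / .Li10 */
          rp[i + 1] = sp[i + 1];   /* stream 2: .Li11 / .Li01 */
          rp[i + 2] = sp[i + 2];
          rp[i + 3] = sp[i + 3];
        }
      for (; i < n; i++)           /* handled by the feed-in paths */
        rp[i] = sp[i];
    }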
+ +include(`../config.m4') + +C cycles/limb +C Itanium: 1 +C Itanium 2: 0.5 + +C INPUT PARAMETERS +C rp = r32 +C sp = r33 +C n = r34 + +ASM_START() +PROLOGUE(mpn_copyi) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 r32 = 0, r32 + addp4 r33 = 0, r33 + sxt4 r34 = r34 + ;; +') +{.mmi + nop 0 + nop 0 + mov.i r2 = ar.lc +} +{.mmi + and r14 = 3, r34 + cmp.ge p14, p15 = 3, r34 + add r34 = -4, r34 + ;; +} +{.mmi + cmp.eq p8, p0 = 1, r14 + cmp.eq p10, p0 = 2, r14 + cmp.eq p12, p0 = 3, r14 +} +{.bbb + (p8) br.dptk .Lb01 + (p10) br.dptk .Lb10 + (p12) br.dptk .Lb11 +} + +.Lb00: C n = 0, 4, 8, 12, ... + (p14) br.dptk .Ls00 + ;; + add r21 = 8, r33 + ld8 r16 = [r33], 16 + shr r15 = r34, 2 + ;; + ld8 r17 = [r21], 16 + mov.i ar.lc = r15 + ld8 r18 = [r33], 16 + add r20 = 8, r32 + ;; + ld8 r19 = [r21], 16 + br.cloop.dptk .Loop + ;; + br.sptk .Lend + ;; + +.Lb01: C n = 1, 5, 9, 13, ... + add r21 = 0, r33 + add r20 = 0, r32 + add r33 = 8, r33 + add r32 = 8, r32 + ;; + ld8 r19 = [r21], 16 + shr r15 = r34, 2 + (p14) br.dptk .Ls01 + ;; + ld8 r16 = [r33], 16 + mov.i ar.lc = r15 + ;; + ld8 r17 = [r21], 16 + ld8 r18 = [r33], 16 + br.sptk .Li01 + ;; + +.Lb10: C n = 2,6, 10, 14, ... + add r21 = 8, r33 + add r20 = 8, r32 + ld8 r18 = [r33], 16 + shr r15 = r34, 2 + ;; + ld8 r19 = [r21], 16 + mov.i ar.lc = r15 + (p14) br.dptk .Ls10 + ;; + ld8 r16 = [r33], 16 + ld8 r17 = [r21], 16 + br.sptk .Li10 + ;; + +.Lb11: C n = 3, 7, 11, 15, ... + add r21 = 0, r33 + add r20 = 0, r32 + add r33 = 8, r33 + add r32 = 8, r32 + ;; + ld8 r17 = [r21], 16 + shr r15 = r34, 2 + ;; + ld8 r18 = [r33], 16 + mov.i ar.lc = r15 + ld8 r19 = [r21], 16 + (p14) br.dptk .Ls11 + ;; + ld8 r16 = [r33], 16 + br.sptk .Li11 + ;; + + ALIGN(32) +.Loop: +.Li00: +{.mmb + st8 [r32] = r16, 16 + ld8 r16 = [r33], 16 + nop.b 0 +} +.Li11: +{.mmb + st8 [r20] = r17, 16 + ld8 r17 = [r21], 16 + nop.b 0 + ;; +} +.Li10: +{.mmb + st8 [r32] = r18, 16 + ld8 r18 = [r33], 16 + nop.b 0 +} +.Li01: +{.mmb + st8 [r20] = r19, 16 + ld8 r19 = [r21], 16 + br.cloop.dptk .Loop + ;; +} +.Lend: st8 [r32] = r16, 16 +.Ls11: st8 [r20] = r17, 16 + ;; +.Ls10: st8 [r32] = r18, 16 +.Ls01: st8 [r20] = r19, 16 +.Ls00: mov.i ar.lc = r2 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/dive_1.asm b/gmp-6.3.0/mpn/ia64/dive_1.asm new file mode 100644 index 0000000..5e4a273 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/dive_1.asm @@ -0,0 +1,236 @@ +dnl IA-64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde. + +dnl Copyright 2003-2005, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 16 +C Itanium 2: 8 + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`divisor', `r35') + +define(`lshift', `r24') +define(`rshift', `r25') + +C This code is a bit messy, and not as similar to mode1o.asm as desired. + +C The critical path during initialization is for computing the inverse of the +C divisor. Since odd divisors are probably common, we conditionally execute +C the initial count_trailing_zeros code and the downshift. + +C Possible improvement: Merge more of the feed-in code into the inverse +C computation. + +ASM_START() + .text + .align 32 +.Ltab: +data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF +data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF +data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF +data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF +data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF +data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F +data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F +data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F +data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F +data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F +data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F +data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F +data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F +data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F +data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F +data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF + + +PROLOGUE(mpn_divexact_1) + .prologue + .save ar.lc, r2 + .body + + {.mmi; add r8 = -1, divisor C M0 + nop 0 C M1 + tbit.z p8, p9 = divisor, 0 C I0 +} +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M2 rp extend + addp4 up = 0, up C M3 up extend + sxt4 n = n') C I1 size extend + ;; +.Lhere: + {.mmi; ld8 r20 = [up], 8 C M0 up[0] + (p8) andcm r8 = r8, divisor C M1 + mov r15 = ip C I0 .Lhere + ;; +}{.mii + .pred.rel "mutex", p8, p9 + (p9) mov rshift = 0 C M0 + (p8) popcnt rshift = r8 C I0 r8 = cnt_lo_zeros(divisor) + cmp.eq p6, p10 = 1, n C I1 + ;; +}{.mii; add r9 = .Ltab-.Lhere, r15 C M0 + (p8) shr.u divisor = divisor, rshift C I0 + nop 0 C I1 + ;; +}{.mmi; add n = -4, n C M0 size-1 + (p10) ld8 r21 = [up], 8 C M1 up[1] + mov r14 = 2 C M1 2 +}{.mfi; setf.sig f6 = divisor C M2 divisor + mov f9 = f0 C M3 carry FIXME + zxt1 r3 = divisor C I1 divisor low byte + ;; +}{.mmi; add r3 = r9, r3 C M0 table offset ip and index + sub r16 = 0, divisor C M1 -divisor + mov r2 = ar.lc C I0 +}{.mmi; sub lshift = 64, rshift C M2 + setf.sig f13 = r14 C M3 2 in significand + mov r17 = -1 C I1 -1 + ;; +}{.mmi; ld1 r3 = [r3] C M0 inverse, 8 bits + nop 0 C M1 + mov ar.lc = n C I0 size-1 loop count +}{.mmi; setf.sig f12 = r16 C M2 -divisor + setf.sig f8 = r17 C M3 -1 + cmp.eq p7, p0 = -2, n C I1 + ;; +}{.mmi; setf.sig f7 = r3 C M2 inverse, 8 bits + cmp.eq p8, p0 = -1, n C M0 + shr.u r23 = r20, rshift C I0 + ;; +} + + C f6 divisor + C f7 inverse, being calculated + C f8 -1, will be -inverse + C f9 carry + C f12 -divisor + C f13 2 + C f14 scratch + + xmpy.l f14 = f13, f7 
C Newton 2*i + xmpy.l f7 = f7, f7 C Newton i*i + ;; + xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 16 bits + ;; + setf.sig f10 = r23 C speculative, used iff n = 1 + xmpy.l f14 = f13, f7 C Newton 2*i + shl r22 = r21, lshift C speculative, used iff n > 1 + xmpy.l f7 = f7, f7 C Newton i*i + ;; + or r31 = r22, r23 C speculative, used iff n > 1 + xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 32 bits + shr.u r23 = r21, rshift C speculative, used iff n > 1 + ;; + setf.sig f11 = r31 C speculative, used iff n > 1 + xmpy.l f14 = f13, f7 C Newton 2*i + xmpy.l f7 = f7, f7 C Newton i*i + ;; + xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 64 bits + + (p7) br.cond.dptk .Ln2 + (p10) br.cond.dptk .grt3 + ;; + +.Ln1: xmpy.l f12 = f10, f7 C q = ulimb * inverse + br .Lx1 + +.Ln2: + xmpy.l f8 = f7, f8 C -inverse = inverse * -1 + xmpy.l f12 = f11, f7 C q = ulimb * inverse + setf.sig f11 = r23 + br .Lx2 + +.grt3: + ld8 r21 = [up], 8 C up[2] + xmpy.l f8 = f7, f8 C -inverse = inverse * -1 + ;; + shl r22 = r21, lshift + ;; + xmpy.l f12 = f11, f7 C q = ulimb * inverse + ;; + or r31 = r22, r23 + shr.u r23 = r21, rshift + ;; + setf.sig f11 = r31 + (p8) br.cond.dptk .Lx3 C branch for n = 3 + ;; + ld8 r21 = [up], 8 + br .Lent + +.Ltop: ld8 r21 = [up], 8 + xma.l f12 = f9, f8, f10 C q = c * -inverse + si + nop.b 0 + ;; +.Lent: add r16 = 160, up + shl r22 = r21, lshift + nop.b 0 + ;; + stf8 [rp] = f12, 8 + xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c) + nop.b 0 + nop.m 0 + xmpy.l f10 = f11, f7 C si = ulimb * inverse + nop.b 0 + ;; + or r31 = r22, r23 + shr.u r23 = r21, rshift + nop.b 0 + ;; + lfetch [r16] + setf.sig f11 = r31 + br.cloop.sptk.few.clr .Ltop + + + xma.l f12 = f9, f8, f10 C q = c * -inverse + si + ;; +.Lx3: stf8 [rp] = f12, 8 + xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c) + xmpy.l f10 = f11, f7 C si = ulimb * inverse + ;; + setf.sig f11 = r23 + ;; + xma.l f12 = f9, f8, f10 C q = c * -inverse + si + ;; +.Lx2: stf8 [rp] = f12, 8 + xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c) + xmpy.l f10 = f11, f7 C si = ulimb * inverse + ;; + xma.l f12 = f9, f8, f10 C q = c * -inverse + si + ;; +.Lx1: stf8 [rp] = f12, 8 + mov ar.lc = r2 C I0 + br.ret.sptk.many b0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/ia64/divrem_1.asm b/gmp-6.3.0/mpn/ia64/divrem_1.asm new file mode 100644 index 0000000..e887820 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/divrem_1.asm @@ -0,0 +1,477 @@ +dnl IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an +dnl unnormalized limb. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2002, 2004, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
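The three xmpy/xma rounds in dive_1 above are Newton iterations on the inverse of the divisor modulo a power of two: if i*d == 1 (mod 2^k), then i' = 2*i - i*i*d gives i'*d = 1 - (i*d - 1)^2 == 1 (mod 2^(2k)), so each round doubles the number of correct low bits. Starting from the 8-bit seed in .Ltab (which stores the inverse of each odd byte at that byte's own offset, so the divisor's low byte can index it directly), three rounds reach 64 bits. A C sketch, written with the conventional (d >> 1) & 127 table indexing:

    /* d^{-1} mod 2^64 for odd d; binvert_tab[k] holds the 8-bit
       inverse of 2k+1, the same values as .Ltab above.  Sketch only. */
    static mp_limb_t
    binvert_limb_sketch (mp_limb_t d)
    {
      mp_limb_t i = binvert_tab[(d >> 1) & 0x7f];   /*  8 bits */
      i = 2 * i - i * i * d;                        /* 16 bits */
      i = 2 * i - i * i * d;                        /* 32 bits */
      i = 2 * i - i * i * d;                        /* 64 bits */
      return i;                  /* i * d == 1 (mod 2^64) */
    }

With the inverse in hand the main loop needs no division at all: each quotient limb is q = c * -inverse + si, computed with xma, and the only loop-carried dependency is the high half of q * divisor.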
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C Itanium: 40-42 +C Itanium 2: 29-30 + +C This was generated by gcc, then the loops were optimized. The preinv entry +C point was shoehorned into the file. Lots of things outside the loops could +C be streamlined. It would probably be a good idea to merge the loops for +C normalized and unnormalized divisor, since the shifting stuff is done for +C free in parallel with other operations. It would even be possible to merge +C all loops, if the ld8 were made conditional. + +C TODO +C * Consider delaying inversion for normalized mpn_divrem_1 entry till after +C computing leading limb. +C * Inline and interleave limb inversion code with loop setup code. + +ASM_START() + +C HP's assembler requires these declarations for importing mpn_invert_limb + .global mpn_invert_limb + .type mpn_invert_limb,@function + +C INPUT PARAMETERS +C rp = r32 +C qxn = r33 +C up = r34 +C n = r35 +C vl = r36 +C vlinv = r37 (preinv only) +C cnt = r38 (preinv only) + +PROLOGUE(mpn_preinv_divrem_1) + .prologue + .save ar.pfs, r42 + alloc r42 = ar.pfs, 7, 8, 1, 0 + .save ar.lc, r44 + mov r44 = ar.lc + .save rp, r41 + mov r41 = b0 + .body +ifdef(`HAVE_ABI_32', +` addp4 r32 = 0, r32 + sxt4 r33 = r33 + addp4 r34 = 0, r34 + sxt4 r35 = r35 + ;; +') + mov r40 = r38 + shladd r34 = r35, 3, r34 + ;; + adds r34 = -8, r34 + ;; + ld8 r39 = [r34], -8 + ;; + + add r15 = r35, r33 + ;; + mov r8 = r37 + shladd r32 = r15, 3, r32 C r32 = rp + n + qxn + cmp.le p8, p0 = 0, r36 + ;; + adds r32 = -8, r32 C r32 = rp + n + qxn - 1 + cmp.leu p6, p7 = r36, r39 + (p8) br.cond.dpnt .Lpunnorm + ;; + + (p6) addl r15 = 1, r0 + (p7) mov r15 = r0 + ;; + (p6) sub r38 = r39, r36 + (p7) mov r38 = r39 + st8 [r32] = r15, -8 + adds r35 = -2, r35 C un -= 2 + br .Lpn + +.Lpunnorm: + (p6) add r34 = 8, r34 + mov r38 = 0 C r = 0 + shl r36 = r36, r40 + (p6) br.cond.dptk .Lpu + ;; + shl r38 = r39, r40 C r = ahigh << cnt + cmp.ne p8, p0 = 1, r35 + st8 [r32] = r0, -8 + adds r35 = -1, r35 C un-- + (p8) br.cond.dpnt .Lpu + + mov r23 = 1 + ;; + setf.sig f6 = r8 + setf.sig f12 = r23 + br .L435 +EPILOGUE() + + +PROLOGUE(mpn_divrem_1) + .prologue + .save ar.pfs, r42 + alloc r42 = ar.pfs, 5, 8, 1, 0 + .save ar.lc, r44 + mov r44 = ar.lc + .save rp, r41 + mov r41 = b0 + .body +ifdef(`HAVE_ABI_32', +` addp4 r32 = 0, r32 + sxt4 r33 = r33 + addp4 r34 = 0, r34 + sxt4 r35 = r35 + ;; +') + mov r38 = r0 + add r15 = r35, r33 + ;; + cmp.ne p6, p7 = 0, r15 + ;; + (p7) mov r8 = r0 + (p7) br.cond.dpnt .Lret + shladd r14 = r15, 3, r32 C r14 = rp + n + qxn + cmp.le p6, p7 = 0, r36 + ;; + adds r32 = -8, r14 C r32 = rp + n + qxn - 1 + (p6) br.cond.dpnt .Lunnorm + cmp.eq p6, p7 = 0, r35 + (p6) br.cond.dpnt .L179 + shladd r14 = r35, 3, r34 + ;; + adds r14 = -8, r14 + adds r35 = -1, r35 + ;; + ld8 r38 = [r14] + ;; + cmp.leu p6, p7 = r36, r38 + ;; + (p6) addl r15 = 1, r0 + (p7) mov r15 = r0 + ;; + st8 [r32] = r15, -8 + (p6) sub r38 = r38, r36 + +.L179: + mov r45 = r36 + adds r35 = -1, r35 + br.call.sptk.many b0 = mpn_invert_limb + ;; + shladd r34 = r35, 3, r34 +.Lpn: + mov r23 = 1 + ;; + setf.sig f6 = r8 + setf.sig f12 = r23 + cmp.le p6, p7 = 0, r35 + mov r40 = 0 + (p7) br.cond.dpnt .L435 + setf.sig f10 = r36 + mov ar.lc = r35 + setf.sig f7 = r38 + ;; + sub r28 = -1, r36 +C Develop quotient limbs for normalized divisor +.Loop1: C 00 C q=r18 nh=r38/f7 + 
ld8 r20 = [r34], -8 + xma.hu f11 = f7, f6, f0 + ;; C 04 + xma.l f8 = f11, f12, f7 C q = q + nh + ;; C 08 + getf.sig r18 = f8 + xma.hu f9 = f8, f10, f0 + xma.l f8 = f8, f10, f0 + ;; C 12 + getf.sig r16 = f9 + C 13 + getf.sig r15 = f8 + ;; C 18 + cmp.ltu p6, p7 = r20, r15 + sub r15 = r20, r15 + sub r16 = r38, r16 + ;; C 19 + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0? + (p6) add r16 = -1, r16 + (p0) cmp.ne.unc p6, p7 = r0, r0 + ;; C 20 + (p8) cmp.ltu p6, p7 = r15, r36 + (p8) sub r15 = r15, r36 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; C 21 + .pred.rel "mutex",p6,p7 + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still? + cmp.ltu p6, p7 = r15, r36 C speculative + sub r28 = r15, r36 C speculative, just for cmp + ;; C 22 + (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed + (p8) mov r15 = r28 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; C 23 + (p6) setf.sig f7 = r15 + (p7) sub r15 = r15, r36 + (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; C 24 + (p7) setf.sig f7 = r15 + st8 [r32] = r18, -8 + mov r38 = r15 + br.cloop.dptk .Loop1 + C 29/30 + br.sptk .L435 + ;; +.Lunnorm: + mux1 r16 = r36, @rev + cmp.eq p6, p7 = 0, r35 + (p6) br.cond.dpnt .L322 + shladd r34 = r35, 3, r34 + ;; + adds r34 = -8, r34 + ;; + ld8 r39 = [r34] + ;; + cmp.leu p6, p7 = r36, r39 + (p6) br.cond.dptk .L322 + adds r34 = -8, r34 + ;; + mov r38 = r39 + ;; + cmp.ne p6, p7 = 1, r15 + st8 [r32] = r0, -8 + ;; + (p7) mov r8 = r38 + (p7) br.cond.dpnt .Lret + adds r35 = -1, r35 +.L322: + sub r14 = r0, r16 + ;; + or r14 = r16, r14 + ;; + mov r16 = -8 + czx1.l r14 = r14 + ;; + shladd r16 = r14, 3, r16 + ;; + shr.u r14 = r36, r16 + ;; + cmp.geu p6, p7 = 15, r14 + ;; + (p7) shr.u r14 = r14, 4 + (p7) adds r16 = 4, r16 + ;; + cmp.geu p6, p7 = 3, r14 + ;; + (p7) shr.u r14 = r14, 2 + (p7) adds r16 = 2, r16 + ;; + tbit.nz p6, p7 = r14, 1 + ;; + .pred.rel "mutex",p6,p7 + (p6) sub r40 = 62, r16 + (p7) sub r40 = 63, r16 + ;; + shl r45 = r36, r40 + shl r36 = r36, r40 + shl r38 = r38, r40 + br.call.sptk.many b0 = mpn_invert_limb + ;; +.Lpu: + mov r23 = 1 + ;; + setf.sig f6 = r8 + setf.sig f12 = r23 + cmp.eq p6, p7 = 0, r35 + (p6) br.cond.dpnt .L435 + sub r16 = 64, r40 + adds r35 = -2, r35 + ;; + ld8 r39 = [r34], -8 + cmp.le p6, p7 = 0, r35 + ;; + shr.u r14 = r39, r16 + ;; + or r38 = r14, r38 + (p7) br.cond.dpnt .Lend3 + ;; + mov r22 = r16 + setf.sig f10 = r36 + setf.sig f7 = r38 + mov ar.lc = r35 + ;; +C Develop quotient limbs for unnormalized divisor +.Loop3: + ld8 r14 = [r34], -8 + xma.hu f11 = f7, f6, f0 + ;; + xma.l f8 = f11, f12, f7 C q = q + nh + ;; + getf.sig r18 = f8 + xma.hu f9 = f8, f10, f0 + shl r20 = r39, r40 + xma.l f8 = f8, f10, f0 + shr.u r24 = r14, r22 + ;; + getf.sig r16 = f9 + getf.sig r15 = f8 + or r20 = r24, r20 + ;; + cmp.ltu p6, p7 = r20, r15 + sub r15 = r20, r15 + sub r16 = r38, r16 + ;; + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0? + (p6) add r16 = -1, r16 + (p0) cmp.ne.unc p6, p7 = r0, r0 + ;; + (p8) cmp.ltu p6, p7 = r15, r36 + (p8) sub r15 = r15, r36 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + .pred.rel "mutex",p6,p7 + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still? 
+ cmp.ltu p6, p7 = r15, r36 C speculative + sub r28 = r15, r36 C speculative, just for cmp + ;; + (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed + (p8) mov r15 = r28 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + (p6) setf.sig f7 = r15 + (p7) sub r15 = r15, r36 + (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + (p7) setf.sig f7 = r15 + st8 [r32] = r18, -8 + mov r39 = r14 + mov r38 = r15 + br.cloop.dptk .Loop3 + ;; +.Lend3: + setf.sig f10 = r36 + setf.sig f7 = r38 + ;; + xma.hu f11 = f7, f6, f0 + ;; + xma.l f8 = f11, f12, f7 C q = q + nh + ;; + getf.sig r18 = f8 + xma.hu f9 = f8, f10, f0 + shl r20 = r39, r40 + xma.l f8 = f8, f10, f0 + ;; + getf.sig r16 = f9 + getf.sig r15 = f8 + ;; + cmp.ltu p6, p7 = r20, r15 + sub r15 = r20, r15 + sub r16 = r38, r16 + ;; + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0? + (p6) add r16 = -1, r16 + (p0) cmp.ne.unc p6, p7 = r0, r0 + ;; + (p8) cmp.ltu p6, p7 = r15, r36 + (p8) sub r15 = r15, r36 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + .pred.rel "mutex",p6,p7 + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still? + ;; + (p8) sub r15 = r15, r36 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + cmp.ltu p6, p7 = r15, r36 + ;; + (p7) sub r15 = r15, r36 + (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + st8 [r32] = r18, -8 + mov r38 = r15 +.L435: + adds r35 = -1, r33 + cmp.le p6, p7 = 1, r33 + (p7) br.cond.dpnt .Lend4 + ;; + setf.sig f7 = r38 + setf.sig f10 = r36 + mov ar.lc = r35 + ;; +.Loop4: + xma.hu f11 = f7, f6, f0 + ;; + xma.l f8 = f11, f12, f7 C q = q + nh + ;; + getf.sig r18 = f8 + xma.hu f9 = f8, f10, f0 + xma.l f8 = f8, f10, f0 + ;; + getf.sig r16 = f9 + getf.sig r15 = f8 + ;; + cmp.ltu p6, p7 = 0, r15 + sub r15 = 0, r15 + sub r16 = r38, r16 + ;; + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0? + (p6) add r16 = -1, r16 + (p0) cmp.ne.unc p6, p7 = r0, r0 + ;; + (p8) cmp.ltu p6, p7 = r15, r36 + (p8) sub r15 = r15, r36 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + .pred.rel "mutex",p6,p7 + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still? + cmp.ltu p6, p7 = r15, r36 C speculative + sub r28 = r15, r36 C speculative, just for cmp + ;; + (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed + (p8) mov r15 = r28 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + (p6) setf.sig f7 = r15 + (p7) sub r15 = r15, r36 + (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + (p7) setf.sig f7 = r15 + st8 [r32] = r18, -8 + mov r38 = r15 + br.cloop.dptk .Loop4 + ;; +.Lend4: + shr.u r8 = r38, r40 +.Lret: + mov ar.pfs = r42 + mov ar.lc = r44 + mov b0 = r41 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/divrem_2.asm b/gmp-6.3.0/mpn/ia64/divrem_2.asm new file mode 100644 index 0000000..9864311 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/divrem_2.asm @@ -0,0 +1,280 @@ +dnl IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. + +dnl Copyright 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C norm frac +C itanium 1 +C itanium 2 29 29 + + +C TODO +C * Inline and interleave limb inversion code with loop setup code. +C * We should use explicit bundling in much of the code, since it typically +C cuts some cycles with the GNU assembler. + + +ASM_START() + +C HP's assembler requires these declarations for importing mpn_invert_limb + .global mpn_invert_limb + .type mpn_invert_limb,@function + +C INPUT PARAMETERS +C qp = r32 +C fn = r33 +C np = r34 +C nn = r35 +C dp = r36 + +define(`f0x1', `f15') + +ASM_START() +PROLOGUE(mpn_divrem_2) + .prologue +ifdef(`HAVE_ABI_32', +` addp4 r32 = 0, r32 C M I + addp4 r34 = 0, r34 C M I + zxt4 r35 = r35 C I + addp4 r36 = 0, r36 C M I + nop.m 0 + zxt4 r33 = r33 C I + ;; +') + .save ar.pfs, r42 + alloc r42 = ar.pfs, 5, 9, 1, 0 + shladd r34 = r35, 3, r34 + adds r14 = 8, r36 + mov r43 = r1 + ;; + adds r15 = -8, r34 + ld8 r39 = [r14] + .save ar.lc, r45 + mov r45 = ar.lc + adds r14 = -16, r34 + mov r40 = r0 + adds r34 = -24, r34 + ;; + ld8 r38 = [r15] + .save rp, r41 + mov r41 = b0 + .body + ld8 r36 = [r36] + ld8 r37 = [r14] + ;; + cmp.gtu p6, p7 = r39, r38 + (p6) br.cond.dptk .L8 + ;; + cmp.leu p8, p9 = r36, r37 + cmp.geu p6, p7 = r39, r38 + ;; + (p8) cmp4.ne.and.orcm p6, p7 = 0, r0 + (p7) br.cond.dptk .L51 +.L8: + add r14 = r33, r35 // un + fn + mov r46 = r39 // argument to mpn_invert_limb + ;; + adds r35 = -3, r14 + ;; + cmp.gt p12, p0 = r0, r35 + (p12) br.cond.dpnt L(end) + br.call.sptk.many b0 = mpn_invert_limb + ;; + setf.sig f11 = r8 // di (non-final) + setf.sig f34 = r39 // d1 + setf.sig f33 = r36 // d0 + mov r1 = r43 + ;; + mov r17 = 1 + setf.sig f9 = r38 // n2 + xma.l f6 = f11, f34, f0 // t0 = LO(di * d1) + ;; + setf.sig f10 = r37 // n1 + setf.sig f15 = r17 // 1 + xma.hu f8 = f11, f33, f0 // s0 = HI(di * d0) + ;; + getf.sig r17 = f6 + getf.sig r16 = f8 + mov ar.lc = r35 + ;; + sub r18 = r0, r39 // -d1 + add r14 = r17, r36 + ;; + setf.sig f14 = r18 // -d1 + cmp.leu p8, p9 = r17, r14 + add r16 = r14, r16 + ;; + (p9) adds r19 = 0, r0 + (p8) adds r19 = -1, r0 + cmp.gtu p6, p7 = r14, r16 + ;; + (p6) adds r19 = 1, r19 + ;; +ifelse(1,1,` + cmp.gt p7, p6 = r0, r19 + ;; + (p6) adds r8 = -1, r8 // di-- + (p6) sub r14 = r16, r39 // t0 -= d1 + (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1 + ;; + (p6) cmp.gt p9, p8 = 1, r19 + (p7) cmp.gt p9, p8 = 0, r19 + (p6) adds r19 = -1, r19 // t1 -= cy + mov r16 = r14 + ;; + (p8) adds r8 = -1, r8 // di-- + (p8) sub r14 = r16, r39 // t0 -= d1 + (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1 + ;; + (p8) cmp.gt p7, p6 = 1, r19 + (p9) cmp.gt p7, p6 = 0, r19 + (p8) adds r19 = -1, r19 // t1 -= cy + mov r16 = r14 + ;; + (p6) adds r8 = -1, r8 // di-- + (p6) sub r14 = r16, r39 // t0 -= d1 + (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1 + ;; + (p6) cmp.gt p9, p8 = 1, r19 + (p7) cmp.gt p9, p8 = 0, r19 + (p6) adds 
r19 = -1, r19 // t1 -= cy + mov r16 = r14 + ;; + (p8) adds r8 = -1, r8 // di-- + (p8) sub r14 = r16, r39 // t0 -= d1 + (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1 + ;; + (p8) adds r19 = -1, r19 // t1 -= cy + mov r16 = r14 +',` + cmp.gt p8, p9 = r0, r19 + (p8) br.cond.dpnt .L46 +.L52: + cmp.leu p6, p7 = r39, r16 + sub r14 = r16, r39 + adds r8 = -1, r8 + ;; + (p7) adds r19 = -1, r19 + mov r16 = r14 + ;; + (p7) cmp.gt p8, p9 = r0, r19 + (p9) br.cond.dptk .L52 +.L46: +') + setf.sig f32 = r8 // di + shladd r32 = r35, 3, r32 + ;; + + ALIGN(16) +L(top): nop 0 + nop 0 + cmp.gt p8, p9 = r33, r35 + ;; + (p8) mov r37 = r0 + (p9) ld8 r37 = [r34], -8 + xma.hu f8 = f9, f32, f10 // 0,29 + xma.l f12 = f9, f32, f10 // 0 + ;; + getf.sig r20 = f12 // q0 4 + xma.l f13 = f15, f8, f9 // q += n2 4 + sub r8 = -1, r36 // bitnot d0 + ;; + getf.sig r18 = f13 // 8 + xma.l f7 = f14, f13, f10 // 8 + xma.l f6 = f33, f13, f33 // t0 = LO(d0*q+d0) 8 + xma.hu f9 = f33, f13, f33 // t1 = HI(d0*q+d0) 9 + ;; + getf.sig r38 = f7 // n1 12 + getf.sig r16 = f6 // 13 + getf.sig r19 = f9 // 14 + ;; + sub r38 = r38, r39 // n1 -= d1 17 + ;; + cmp.ne p9, p0 = r0, r0 // clear p9 + cmp.leu p10, p11 = r16, r37 // cy for: n0 - t0 18 + ;; + sub r37 = r37, r16 // n0 -= t0 19 + (p11) sub r38 = r38, r19, 1 // n1 -= t1 - cy 19 + (p10) sub r38 = r38, r19 // n1 -= t1 19 + ;; + cmp.gtu p6, p7 = r20, r38 // n1 >= q0 20 + ;; + (p7) cmp.ltu p9, p0 = r8, r37 // 21 + (p6) add r18 = 1, r18 // + (p7) add r37 = r37, r36 // 21 + (p7) add r38 = r38, r39 // 21 + ;; + setf.sig f10 = r37 // n1 22 + (p9) add r38 = 1, r38 // 22 + ;; + setf.sig f9 = r38 // n2 23 + cmp.gtu p6, p7 = r39, r38 // 23 + (p7) br.cond.spnt L(fix) +L(bck): st8 [r32] = r18, -8 + adds r35 = -1, r35 + br.cloop.sptk.few L(top) + ;; + +L(end): add r14 = 8, r34 + add r15 = 16, r34 + mov b0 = r41 + ;; + st8 [r14] = r37 + st8 [r15] = r38 + mov ar.pfs = r42 + mov r8 = r40 + mov ar.lc = r45 + br.ret.sptk.many b0 + ;; +.L51: + .pred.rel "mutex", p8, p9 + sub r37 = r37, r36 + (p9) sub r38 = r38, r39, 1 + (p8) sub r38 = r38, r39 + adds r40 = 1, r0 + br .L8 + ;; + +L(fix): cmp.geu p6, p7 = r39, r38 + cmp.leu p8, p9 = r36, r37 + ;; + (p8) cmp4.ne.and.orcm p6, p7 = 0, r0 + (p6) br.cond.dptk L(bck) + sub r37 = r37, r36 + (p9) sub r38 = r38, r39, 1 + (p8) sub r38 = r38, r39 + adds r18 = 1, r18 + ;; + setf.sig f9 = r38 // n2 + setf.sig f10 = r37 // n1 + br L(bck) + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/gcd_11.asm b/gmp-6.3.0/mpn/ia64/gcd_11.asm new file mode 100644 index 0000000..6137227 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/gcd_11.asm @@ -0,0 +1,110 @@ +dnl Itanium-2 mpn_gcd_11 + +dnl Copyright 2002-2005, 2012, 2013, 2015, 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C            cycles/bitpair (1x1 gcd)
+C Itanium:       ?
+C Itanium 2:     4.5
+
+
+ASM_START()
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+        .rodata
+        ALIGN(m4_lshift(1,MAXSHIFT))    C align table to allow using dep
+ctz_table:
+        data1   MAXSHIFT
+forloop(i,1,MASK,
+`       data1   m4_count_trailing_zeros(i)-1
+')
+
+define(`x0', r32)
+define(`y0', r33)
+
+PROLOGUE(mpn_gcd_11)
+        .prologue
+        .body
+        addl    r22 = @ltoff(ctz_table), r1
+        ;;
+        ld8     r22 = [r22]
+        br      L(ent)
+        ;;
+
+        ALIGN(32)
+L(top):
+        .pred.rel "mutex", p6,p7
+ {.mmi;   (p7) mov     y0 = x0
+          (p6) sub     x0 = x0, y0
+               dep     r21 = r19, r22, 0, MAXSHIFT   C concat(table,lowbits)
+}{.mmi;        and     r20 = MASK, r19
+          (p7) mov     x0 = r19
+               and     r23 = 6, r19
+        ;;
+}{.mmi;        cmp.eq  p6,p0 = 4, r23
+               cmp.eq  p7,p0 = 0, r23
+               shr.u   x0 = x0, 1            C shift-by-1, always OK
+}{.mmb;        ld1     r16 = [r21]
+               cmp.eq  p10,p0 = 0, r20
+         (p10) br.spnt.few.clr L(count_better)
+        ;;
+}
+L(bck):
+        .pred.rel "mutex", p6,p7
+ {.mii;        nop 0
+          (p6) shr.u   x0 = x0, 1     C u was ...100 before shift-by-1 above
+          (p7) shr.u   x0 = x0, r16   C u was ...000 before shift-by-1 above
+        ;;
+}
+L(ent):
+ {.mmi;        sub     r19 = y0, x0
+               cmp.gtu p6,p7 = x0, y0
+               cmp.ne  p8,p0 = x0, y0
+}{.mmb;        nop 0
+               nop 0
+          (p8) br.sptk.few.clr L(top)
+}
+
+L(end): mov     r8 = y0
+        br.ret.sptk.many b0
+
+L(count_better):
+        add     r20 = -1, x0
+        ;;
+        andcm   r23 = r20, x0
+        ;;
+        popcnt  r16 = r23
+        br      L(bck)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/ia64/gmp-mparam.h b/gmp-6.3.0/mpn/ia64/gmp-mparam.h
new file mode 100644
index 0000000..34d2bf3
--- /dev/null
+++ b/gmp-6.3.0/mpn/ia64/gmp-mparam.h
@@ -0,0 +1,212 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 900MHz Itanium2 (olympic.gmplib.org) */ +/* FFT tuning limit = 59,194,709 */ +/* Generated by tuneup.c, 2019-10-13, gcc 4.2 */ + +#define MOD_1_1P_METHOD 2 /* 17.40% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 1.35% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 10 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define DIV_1_VS_MUL_1_PERCENT 316 + +#define MUL_TOOM22_THRESHOLD 47 +#define MUL_TOOM33_THRESHOLD 89 +#define MUL_TOOM44_THRESHOLD 220 +#define MUL_TOOM6H_THRESHOLD 327 +#define MUL_TOOM8H_THRESHOLD 454 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 143 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 153 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226 + +#define SQR_BASECASE_THRESHOLD 11 +#define SQR_TOOM2_THRESHOLD 98 +#define SQR_TOOM3_THRESHOLD 135 +#define SQR_TOOM4_THRESHOLD 272 +#define SQR_TOOM6_THRESHOLD 354 +#define SQR_TOOM8_THRESHOLD 490 + +#define MULMID_TOOM42_THRESHOLD 99 + +#define MULMOD_BNM1_THRESHOLD 23 +#define SQRMOD_BNM1_THRESHOLD 27 + +#define MUL_FFT_MODF_THRESHOLD 840 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 840, 5}, { 30, 6}, { 16, 5}, { 33, 6}, \ + { 17, 5}, { 36, 6}, { 35, 7}, { 18, 6}, \ + { 37, 7}, { 19, 6}, { 42, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \ + { 57, 9}, { 31, 8}, { 63, 9}, { 35, 8}, \ + { 71, 9}, { 43,10}, { 23, 9}, { 55,10}, \ + { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \ + { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \ + { 87,11}, { 47,10}, { 111,12}, { 31,11}, \ + { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ + { 95,10}, { 191,11}, { 111,12}, { 63,11}, \ + { 143,10}, { 287,11}, { 159,12}, { 95,11}, \ + { 207,13}, { 63,12}, { 127,11}, { 271,12}, \ + { 159,11}, { 335,10}, { 671,12}, { 191,10}, \ + { 799,12}, { 223,13}, { 127,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,13}, { 191,12}, \ + { 383,11}, { 799,10}, { 1599,12}, { 415,11}, \ + { 863,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1119,12}, { 607,13}, { 319,12}, { 735,11}, \ + { 1471,12}, { 863,13}, { 447,12}, { 927,11}, \ + { 1855,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1055,11}, { 2111,12}, { 1119,13}, { 575,12}, \ + { 1247,13}, { 639,12}, { 1311,13}, { 703,12}, \ + { 1471,13}, { 831,12}, { 1727,13}, { 895,12}, \ + { 1791,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2239,13}, { 1215,14}, { 639,13}, \ + { 1471,14}, { 767,13}, { 1727,14}, { 895,13}, \ + { 1855,12}, { 3711,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2111,12}, { 4223,13}, { 2175,14}, \ + { 1151,13}, { 2495,14}, { 1279,13}, { 2623,14}, \ + { 1407,15}, { 767,14}, { 1663,13}, { 3455,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4479,14}, { 2431,15}, { 1279,14}, { 2943,15}, \ + { 1535,14}, { 3455,15}, { 1791,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + 
{8388608,24} } +#define MUL_FFT_TABLE3_SIZE 149 +#define MUL_FFT_THRESHOLD 8576 + +#define SQR_FFT_MODF_THRESHOLD 765 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 765, 5}, { 36, 6}, { 37, 7}, { 19, 6}, \ + { 42, 7}, { 43, 8}, { 37, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 49, 9}, { 27, 8}, \ + { 57, 9}, { 43,10}, { 23, 9}, { 55,10}, \ + { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \ + { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \ + { 87,11}, { 47,10}, { 111,12}, { 31,11}, \ + { 63,10}, { 135,11}, { 79,10}, { 175,11}, \ + { 95,10}, { 199,11}, { 111,12}, { 63,11}, \ + { 159,12}, { 95,11}, { 191,10}, { 399,11}, \ + { 207,13}, { 63,12}, { 127,10}, { 511, 9}, \ + { 1023,10}, { 527,11}, { 271,12}, { 159,10}, \ + { 703,12}, { 191,11}, { 399,10}, { 799,11}, \ + { 431,12}, { 223,13}, { 127,12}, { 255,11}, \ + { 527,10}, { 1055,11}, { 559,12}, { 287,11}, \ + { 607,10}, { 1215,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 799,12}, { 415,11}, { 863,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1055,12}, { 543,11}, { 1119,12}, { 607,11}, \ + { 1215,12}, { 735,13}, { 383,12}, { 799,11}, \ + { 1599,12}, { 863,13}, { 447,12}, { 991,14}, \ + { 255,13}, { 511,12}, { 1055,11}, { 2111,12}, \ + { 1119,13}, { 575,12}, { 1215,13}, { 639,12}, \ + { 1311,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1599,13}, { 831,12}, { 1727,13}, \ + { 895,12}, { 1791,13}, { 959,12}, { 1919,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2239,13}, { 1151,12}, { 2303,13}, \ + { 1215,14}, { 639,13}, { 1279,12}, { 2559,13}, \ + { 1471,14}, { 767,13}, { 1727,14}, { 895,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2239,14}, \ + { 1151,13}, { 2495,14}, { 1279,13}, { 2623,14}, \ + { 1407,15}, { 767,14}, { 1663,13}, { 3455,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4479,14}, { 2431,15}, { 1279,14}, { 2943,15}, \ + { 1535,14}, { 3455,15}, { 1791,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 153 +#define SQR_FFT_THRESHOLD 6272 + +#define MULLO_BASECASE_THRESHOLD 39 +#define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */ +#define MULLO_MUL_N_THRESHOLD 17050 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 134 +#define SQRLO_SQR_THRESHOLD 12322 + +#define DC_DIV_QR_THRESHOLD 73 +#define DC_DIVAPPR_Q_THRESHOLD 262 +#define DC_BDIV_QR_THRESHOLD 111 +#define DC_BDIV_Q_THRESHOLD 315 + +#define INV_MULMOD_BNM1_THRESHOLD 92 +#define INV_NEWTON_THRESHOLD 15 +#define INV_APPR_THRESHOLD 17 + +#define BINV_NEWTON_THRESHOLD 280 +#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */ +#define REDC_2_TO_REDC_N_THRESHOLD 172 + +#define MU_DIV_QR_THRESHOLD 1470 +#define MU_DIVAPPR_Q_THRESHOLD 1210 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 1566 +#define MU_BDIV_Q_THRESHOLD 1787 + +#define POWM_SEC_TABLE 3,22,139,1867 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 42 +#define SET_STR_DC_THRESHOLD 1339 +#define SET_STR_PRECOMPUTE_THRESHOLD 3934 + +#define FAC_DSC_THRESHOLD 866 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 20 +#define HGCD2_DIV1_METHOD 3 /* 13.73% faster than 1 */ +#define HGCD_THRESHOLD 129 +#define HGCD_APPR_THRESHOLD 202 +#define HGCD_REDUCE_THRESHOLD 4455 +#define GCD_DC_THRESHOLD 658 +#define GCDEXT_DC_THRESHOLD 469 +#define JACOBI_BASE_METHOD 2 /* 0.62% faster than 4 */ + +/* Tuneup completed successfully, took 199042 seconds 
*/ diff --git a/gmp-6.3.0/mpn/ia64/hamdist.asm b/gmp-6.3.0/mpn/ia64/hamdist.asm new file mode 100644 index 0000000..477df4c --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/hamdist.asm @@ -0,0 +1,365 @@ +dnl IA-64 mpn_hamdist -- mpn hamming distance. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2003-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2 +C Itanium 2: 1 + +C INPUT PARAMETERS +define(`up', `r32') +define(`vp', `r33') +define(`n', `r34') + +define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19') +define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23') +define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27') +define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31') +define(`s',`r8') + + +ASM_START() +PROLOGUE(mpn_hamdist) + .prologue +ifdef(`HAVE_ABI_32', +` addp4 up = 0, up C M I + addp4 vp = 0, vp C M I + zxt4 n = n C I + ;; +') + + {.mmi; ld8 r10 = [up], 8 C load first ulimb M01 + ld8 r11 = [vp], 8 C load first vlimb M01 + mov.i r2 = ar.lc C save ar.lc I0 +}{.mmi; and r14 = 3, n C M I + cmp.lt p15, p0 = 4, n C small count? 
M I + add n = -5, n C M I + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb + (p6) br.dptk .Lb01 C B + (p7) br.dptk .Lb10 C B + (p8) br.dptk .Lb11 C B +} + + +.Lb00: ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + shr.u n = n, 2 C I0 + xor x0 = r10, r11 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + xor x1 = u1, v1 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + xor x2 = u2, v2 C M I + mov s = 0 C M I + (p15) br.cond.dptk .grt4 C B + ;; + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + ;; + popcnt c1 = x1 C I0 + ;; + popcnt c2 = x2 C I0 + br .Lcj4 C B + +.grt4: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + xor x1 = u1, v1 C M I + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + xor x2 = u2, v2 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + popcnt c1 = x1 C I0 + xor x0 = u0, v0 C M I + br.cloop.dpnt .grt8 C B + + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + br .Lcj8 C B + +.grt8: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + br .LL00 C B + + +.Lb01: xor x3 = r10, r11 C M I + shr.u n = n, 2 C I0 + (p15) br.cond.dptk .grt1 C B + ;; + popcnt r8 = x3 C I0 + br.ret.sptk.many b0 C B + +.grt1: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + mov s = 0 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + xor x0 = u0, v0 C M I + br.cloop.dpnt .grt5 C B + + xor x1 = u1, v1 C M I + ;; + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + ;; + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + ;; + popcnt c1 = x1 C I0 + br .Lcj5 C B + +.grt5: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + xor x1 = u1, v1 C M I + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + popcnt c1 = x1 C I0 + xor x0 = u0, v0 C M I + br.cloop.dpnt .Loop C B + br .Lend C B + + +.Lb10: ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + xor x2 = r10, r11 C M I + (p15) br.cond.dptk .grt2 C B + ;; + xor x3 = u3, v3 C M I + ;; + popcnt c2 = x2 C I0 + ;; + popcnt c3 = x3 C I0 + ;; + add s = c2, c3 C M I + br.ret.sptk.many b0 C B + +.grt2: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + shr.u n = n, 2 C I0 + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + mov s = 0 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + xor x3 = u3, v3 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + xor x0 = u0, v0 C M I + br.cloop.dptk .grt6 C B + + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + ;; + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + ;; + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + br .Lcj6 C B + +.grt6: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + br .LL10 C B + + +.Lb11: ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + shr.u n = n, 2 C I0 + xor x1 = r10, r11 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 
C M01 + xor x2 = u2, v2 C M I + (p15) br.cond.dptk .grt3 C B + ;; + xor x3 = u3, v3 C M I + ;; + popcnt c1 = x1 C I0 + ;; + popcnt c2 = x2 C I0 + ;; + popcnt c3 = x3 C I0 + ;; + add s = c1, c2 C M I + ;; + add s = s, c3 C M I + br.ret.sptk.many b0 C B + +.grt3: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + mov s = 0 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + xor x3 = u3, v3 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + popcnt c1 = x1 C I0 + xor x0 = u0, v0 C M I + br.cloop.dptk .grt7 C B + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + ;; + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + br .Lcj7 C B + +.grt7: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + br .LL11 C B + + + ALIGN(32) +.Loop: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + popcnt c2 = x2 C I0 + add s = s, c3 C M I + xor x1 = u1, v1 C M I + nop.b 1 C - + ;; +.LL00: ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + popcnt c3 = x3 C I0 + add s = s, c0 C M I + xor x2 = u2, v2 C M I + nop.b 1 C - + ;; +.LL11: ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + popcnt c0 = x0 C I0 + add s = s, c1 C M I + xor x3 = u3, v3 C M I + nop.b 1 C - + ;; +.LL10: ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + popcnt c1 = x1 C I0 + add s = s, c2 C M I + xor x0 = u0, v0 C M I + br.cloop.dptk .Loop C B + ;; + +.Lend: popcnt c2 = x2 C I0 + add s = s, c3 C M I + xor x1 = u1, v1 C M I + ;; +.Lcj8: popcnt c3 = x3 C I0 + add s = s, c0 C M I + xor x2 = u2, v2 C M I + ;; +.Lcj7: popcnt c0 = x0 C I0 + add s = s, c1 C M I + xor x3 = u3, v3 C M I + ;; +.Lcj6: popcnt c1 = x1 C I0 + add s = s, c2 C M I + ;; +.Lcj5: popcnt c2 = x2 C I0 + add s = s, c3 C M I + ;; +.Lcj4: popcnt c3 = x3 C I0 + add s = s, c0 C M I + ;; + add s = s, c1 C M I + ;; + add s = s, c2 C M I + ;; + add s = s, c3 C M I + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/ia64-defs.m4 b/gmp-6.3.0/mpn/ia64/ia64-defs.m4 new file mode 100644 index 0000000..f71d280 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/ia64-defs.m4 @@ -0,0 +1,147 @@ +divert(-1) + + +dnl Copyright 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl ia64 assembler comments are C++ style "//" to the end of line. 
gas +dnl also accepts "#" as a comment, if it's the first non-blank on a line. +dnl +dnl BSD m4 can't handle a multi-character comment like "//" (see notes in +dnl mpn/asm-defs.m4). For now the default "#" is left, but with care taken +dnl not to put any macros after "foo#" (since of course they won't expand). + + +define(`ASM_START', +m4_assert_numargs(0) +`') + + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl 32-byte alignment is used for the benefit of itanium-2, where the code +dnl fetcher will only take 2 bundles from a 32-byte aligned target. At +dnl 16mod32 it only reads 1 in the first cycle. This might not make any +dnl difference if the rotate buffers are full or there's other work holding +dnl up execution, but we use 32-bytes to give the best chance of peak +dnl throughput. +dnl +dnl We can use .align here despite the gas bug noted in mpn/ia64/README, +dnl since we're not expecting to execute across a PROLOGUE(), at least not +dnl currently. + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) + ` + .text + .align 32 + .global $1# + .proc $1# +$1:') + +define(`EPILOGUE_cpu', +m4_assert_numargs(1) + ` + .endp $1# +') + +define(`DATASTART', + `dnl + DATA +$1:') +define(`DATAEND',`dnl') + +define(`ASM_END',`dnl') + + +dnl Usage: ALIGN(bytes) +dnl +dnl Emit a ".align" directive. "bytes" is eval()ed, so can be an +dnl expression. +dnl +dnl This version overrides the definition in mpn/asm-defs.m4. We suppress +dnl any .align if the gas byte-swapped-nops bug was detected by configure +dnl GMP_ASM_IA64_ALIGN_OK. + +define(`ALIGN', +m4_assert_numargs(1) +m4_assert_defined(`IA64_ALIGN_OK') +`ifelse(IA64_ALIGN_OK,no,, +`.align eval($1)')') + + +dnl Usage: ASSERT([pr] [,code]) +dnl +dnl Require that the given predicate register is true after executing the +dnl test code. For example, +dnl +dnl ASSERT(p6, +dnl ` cmp.eq p6,p0 = r3, r4') +dnl +dnl If the predicate register argument is empty then nothing is tested, the +dnl code is just executed. This can be used for setups required by later +dnl ASSERTs. The code argument can be omitted to just test a predicate +dnl with no special setup code. +dnl +dnl For convenience, stops are inserted before and after the code emitted. + +define(ASSERT, +m4_assert_numargs_range(1,2) +m4_assert_defined(`WANT_ASSERT') +`ifelse(WANT_ASSERT,1, +` ;; +ifelse(`$2',,, +`$2 + ;; +') +ifelse(`$1',,, +`($1) br .LASSERTok`'ASSERT_label_counter ;; + cmp.ne p6,p6 = r0, r0 C illegal instruction + ;; +.LASSERTok`'ASSERT_label_counter: +define(`ASSERT_label_counter',eval(ASSERT_label_counter+1)) +') +')') +define(`ASSERT_label_counter',1) + +define(`getfsig', `getf.sig') +define(`setfsig', `setf.sig') +define(`cmpeq', `cmp.eq') +define(`cmpne', `cmp.ne') +define(`cmpltu', `cmp.ltu') +define(`cmpleu', `cmp.leu') +define(`cmpgtu', `cmp.gtu') +define(`cmpgeu', `cmp.geu') +define(`cmple', `cmp.le') +define(`cmpgt', `cmp.gt') +define(`cmpeqor', `cmp.eq.or') +define(`cmpequc', `cmp.eq.unc') + +divert diff --git a/gmp-6.3.0/mpn/ia64/invert_limb.asm b/gmp-6.3.0/mpn/ia64/invert_limb.asm new file mode 100644 index 0000000..5effdda --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/invert_limb.asm @@ -0,0 +1,105 @@ +dnl IA-64 mpn_invert_limb -- Invert a normalized limb. + +dnl Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde. + +dnl Copyright 2000, 2002, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C d = r32
+
+C            cycles
+C Itanium:     74
+C Itanium 2:   50+6
+
+C It should be possible to avoid the xmpy.hu and the following tests by
+C explicitly chopping in the last fma.  That would save about 10 cycles.
+
+ASM_START()
+        .sdata
+        .align 16
+ifdef(`HAVE_DOUBLE_IEEE_LITTLE_ENDIAN',`
+.LC0:   data4 0x00000000, 0x80000000, 0x0000403f, 0x00000000    C 2^64
+.LC1:   data4 0x00000000, 0x80000000, 0x0000407f, 0x00000000    C 2^128
+
+',`ifdef(`HAVE_DOUBLE_IEEE_BIG_ENDIAN',`
+.LC0:   data4 0x403f8000, 0x00000000, 0x00000000, 0x00000000    C 2^64
+.LC1:   data4 0x407f8000, 0x00000000, 0x00000000, 0x00000000    C 2^128
+
+',`m4_error(`Oops, need to know float endianness
+')')')
+
+
+PROLOGUE(mpn_invert_limb)
+                C 00
+        addl    r14 = @gprel(.LC0), gp
+        addl    r15 = @gprel(.LC1), gp
+        setf.sig f7 = r32
+        add     r9 = r32, r32           C check for d = 2^63
+        ;;      C 01
+        ldfe    f10 = [r14]             C 2^64
+        ldfe    f8 = [r15]              C 2^128
+        cmp.eq  p6, p0 = 0, r9          C check for d = 2^63
+        mov     r8 = -1                 C retval for 2^63
+ (p6)   br.ret.spnt.many b0
+        ;;      C 07
+        fmpy.s1 f11 = f7, f10           C f11 = d * 2^64
+        fnma.s1 f6 = f7, f10, f8        C f6 = 2^128 - d * 2^64
+        ;;      C 11
+        frcpa.s1 f8, p6 = f6, f7
+        ;;      C 15
+ (p6)   fnma.s1 f9 = f7, f8, f1
+ (p6)   fmpy.s1 f10 = f6, f8
+        ;;      C 19
+ (p6)   fmpy.s1 f11 = f9, f9
+ (p6)   fma.s1  f10 = f9, f10, f10
+        ;;      C 23
+ (p6)   fma.s1  f8 = f9, f8, f8
+ (p6)   fma.s1  f9 = f11, f10, f10
+        ;;      C 27
+ (p6)   fma.s1  f8 = f11, f8, f8
+ (p6)   fnma.s1 f10 = f7, f9, f6
+        ;;      C 31
+ (p6)   fma.s1  f8 = f10, f8, f9
+        ;;      C 35
+        fcvt.fxu.trunc.s1 f8 = f8
+        ;;      C 39
+        getf.sig r8 = f8
+        xmpy.hu f10 = f8, f7            C di * d
+        ;;      C 43
+        getf.sig r14 = f10
+        andcm   r9 = -1, r32            C one's complement
+        ;;      C 48
+        cmp.ltu p6, p0 = r9, r14        C got overflow?
+        ;;      C 49
+ (p6)   add     r8 = -1, r8             C adjust di down
+        br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/ia64/logops_n.asm b/gmp-6.3.0/mpn/ia64/logops_n.asm
new file mode 100644
index 0000000..e4a2f61
--- /dev/null
+++ b/gmp-6.3.0/mpn/ia64/logops_n.asm
@@ -0,0 +1,292 @@
+dnl IA-64 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2 +C Itanium 2: 1 + +C TODO +C * Use rp,rpx scheme of aors_n.asm to allow parallel stores (useful in +C wind-down code). + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`vp', `r34') +define(`n', `r35') + +ifdef(`OPERATION_and_n', +` define(`func',`mpn_and_n') + define(`logop', `and $1 = $2, $3') + define(`notormov', `mov $1 = $2')') +ifdef(`OPERATION_andn_n', +` define(`func',`mpn_andn_n') + define(`logop', `andcm $1 = $2, $3') + define(`notormov', `mov $1 = $2')') +ifdef(`OPERATION_nand_n', +` define(`func',`mpn_nand_n') + define(`logop', `and $1 = $2, $3') + define(`notormov', `sub $1 = -1, $2')') +ifdef(`OPERATION_ior_n', +` define(`func',`mpn_ior_n') + define(`logop', `or $1 = $2, $3') + define(`notormov', `mov $1 = $2')') +ifdef(`OPERATION_iorn_n', +` define(`func',`mpn_iorn_n') + define(`logop', `andcm $1 = $3, $2') + define(`notormov', `sub $1 = -1, $2')') +ifdef(`OPERATION_nior_n', +` define(`func',`mpn_nior_n') + define(`logop', `or $1 = $2, $3') + define(`notormov', `sub $1 = -1, $2')') +ifdef(`OPERATION_xor_n', +` define(`func',`mpn_xor_n') + define(`logop', `xor $1 = $2, $3') + define(`notormov', `mov $1 = $2')') +ifdef(`OPERATION_xnor_n', +` define(`func',`mpn_xnor_n') + define(`logop', `xor $1 = $2, $3') + define(`notormov', `sub $1 = -1, $2')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I + nop.m 0 + nop.m 0 + zxt4 n = n C I + ;; +') +{.mmi + ld8 r10 = [up], 8 C M + ld8 r11 = [vp], 8 C M + mov.i r2 = ar.lc C I0 +} +{.mmi + and r14 = 3, n C M I + cmp.lt p15, p14 = 4, n C M I + shr.u n = n, 2 C I0 + ;; +} +{.mmi + cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +} +{.bbb + (p6) br.dptk .Lb01 C B + (p7) br.dptk .Lb10 C B + (p8) br.dptk .Lb11 C B +} + +.Lb00: ld8 r17 = [up], 8 C M + ld8 r21 = [vp], 8 C M + add n = -2, n C M I + ;; + ld8 r18 = [up], 8 C M + ld8 r22 = [vp], 8 C M + ;; + ld8 r19 = [up], 8 C M + ld8 r23 = [vp], 8 C M + (p15) br.cond.dpnt .grt4 C B + + logop( r14, r10, r11) C M I + ;; + logop( r15, r17, r21) C M I + notormov( r8, r14) C M I + br .Lcj4 C B + +.grt4: logop( r14, r10, r11) C M I + ld8 r16 = [up], 8 C M + ld8 r20 = [vp], 8 C M + ;; + logop( r15, r17, r21) C M I + ld8 r17 = [up], 8 C M + mov.i ar.lc = n C I0 + notormov( r8, r14) C M I + ld8 
r21 = [vp], 8 C M + br .LL00 C B + +.Lb01: add n = -1, n C M I + logop( r15, r10, r11) C M I + (p15) br.cond.dpnt .grt1 C B + ;; + + notormov( r9, r15) C M I + br .Lcj1 C B + +.grt1: ld8 r16 = [up], 8 C M + ld8 r20 = [vp], 8 C M + ;; + ld8 r17 = [up], 8 C M + ld8 r21 = [vp], 8 C M + mov.i ar.lc = n C I0 + ;; + ld8 r18 = [up], 8 C M + ld8 r22 = [vp], 8 C M + ;; + ld8 r19 = [up], 8 C M + ld8 r23 = [vp], 8 C M + br.cloop.dptk .grt5 C B + ;; + + logop( r14, r16, r20) C M I + notormov( r9, r15) C M I + br .Lcj5 C B + +.grt5: logop( r14, r16, r20) C M I + ld8 r16 = [up], 8 C M + notormov( r9, r15) C M I + ld8 r20 = [vp], 8 C M + br .LL01 C B + +.Lb10: ld8 r19 = [up], 8 C M + ld8 r23 = [vp], 8 C M + (p15) br.cond.dpnt .grt2 C B + + logop( r14, r10, r11) C M I + ;; + logop( r15, r19, r23) C M I + notormov( r8, r14) C M I + br .Lcj2 C B + +.grt2: ld8 r16 = [up], 8 C M + ld8 r20 = [vp], 8 C M + add n = -1, n C M I + ;; + ld8 r17 = [up], 8 C M + ld8 r21 = [vp], 8 C M + logop( r14, r10, r11) C M I + ;; + ld8 r18 = [up], 8 C M + ld8 r22 = [vp], 8 C M + mov.i ar.lc = n C I0 + ;; + logop( r15, r19, r23) C M I + ld8 r19 = [up], 8 C M + notormov( r8, r14) C M I + ld8 r23 = [vp], 8 C M + br.cloop.dptk .Loop C B + br .Lcj6 C B + +.Lb11: ld8 r18 = [up], 8 C M + ld8 r22 = [vp], 8 C M + add n = -1, n C M I + ;; + ld8 r19 = [up], 8 C M + ld8 r23 = [vp], 8 C M + logop( r15, r10, r11) C M I + (p15) br.cond.dpnt .grt3 C B + ;; + + logop( r14, r18, r22) C M I + notormov( r9, r15) C M I + br .Lcj3 C B + +.grt3: ld8 r16 = [up], 8 C M + ld8 r20 = [vp], 8 C M + ;; + ld8 r17 = [up], 8 C M + ld8 r21 = [vp], 8 C M + mov.i ar.lc = n C I0 + ;; + logop( r14, r18, r22) C M I + ld8 r18 = [up], 8 C M + notormov( r9, r15) C M I + ld8 r22 = [vp], 8 C M + br .LL11 C B + +C *** MAIN LOOP START *** + ALIGN(32) +.Loop: st8 [rp] = r8, 8 C M + logop( r14, r16, r20) C M I + notormov( r9, r15) C M I + ld8 r16 = [up], 8 C M + ld8 r20 = [vp], 8 C M + nop.b 0 + ;; +.LL01: st8 [rp] = r9, 8 C M + logop( r15, r17, r21) C M I + notormov( r8, r14) C M I + ld8 r17 = [up], 8 C M + ld8 r21 = [vp], 8 C M + nop.b 0 + ;; +.LL00: st8 [rp] = r8, 8 C M + logop( r14, r18, r22) C M I + notormov( r9, r15) C M I + ld8 r18 = [up], 8 C M + ld8 r22 = [vp], 8 C M + nop.b 0 + ;; +.LL11: st8 [rp] = r9, 8 C M + logop( r15, r19, r23) C M I + notormov( r8, r14) C M I + ld8 r19 = [up], 8 C M + ld8 r23 = [vp], 8 C M + br.cloop.dptk .Loop ;; C B +C *** MAIN LOOP END *** + +.Lcj6: st8 [rp] = r8, 8 C M + logop( r14, r16, r20) C M I + notormov( r9, r15) C M I + ;; +.Lcj5: st8 [rp] = r9, 8 C M + logop( r15, r17, r21) C M I + notormov( r8, r14) C M I + ;; +.Lcj4: st8 [rp] = r8, 8 C M + logop( r14, r18, r22) C M I + notormov( r9, r15) C M I + ;; +.Lcj3: st8 [rp] = r9, 8 C M + logop( r15, r19, r23) C M I + notormov( r8, r14) C M I + ;; +.Lcj2: st8 [rp] = r8, 8 C M + notormov( r9, r15) C M I + ;; +.Lcj1: st8 [rp] = r9, 8 C M + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/lorrshift.asm b/gmp-6.3.0/mpn/ia64/lorrshift.asm new file mode 100644 index 0000000..694aaf0 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/lorrshift.asm @@ -0,0 +1,358 @@ +dnl IA-64 mpn_lshift/mpn_rshift. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2 +C Itanium 2: 1 + +C This code is scheduled deeply since the plain shift instructions shr and shl +C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of +C these instructions cause a 10 cycle replay trap on Itanium. + +C The ld8 scheduling should probably be decreased to make the function smaller. +C Good lfetch will make sure we never stall anyway. + +C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair +C at cycle 2. Judicious use of predicates could allow us to issue more ld8's +C in the prologue. + + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`cnt',`r35') + +define(`tnc',`r9') + +ifdef(`OPERATION_lshift',` + define(`FSH',`shl') + define(`BSH',`shr.u') + define(`UPD',`-8') + define(`POFF',`-512') + define(`PUPD',`-32') + define(`func',`mpn_lshift') +') +ifdef(`OPERATION_rshift',` + define(`FSH',`shr.u') + define(`BSH',`shl') + define(`UPD',`8') + define(`POFF',`512') + define(`PUPD',`32') + define(`func',`mpn_rshift') +') + +MULFUNC_PROLOGUE(mpn_lshift mpn_rshift) + +ASM_START() +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + sxt4 n = n C M I + nop.m 0 + nop.m 0 + zxt4 cnt = cnt C I + ;; +') + + {.mmi; cmp.lt p14, p15 = 4, n C M I + and r14 = 3, n C M I + mov.i r2 = ar.lc C I0 +}{.mmi; add r15 = -1, n C M I + sub tnc = 64, cnt C M I + add r16 = -5, n + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + shr.u n = r16, 2 C I0 +}{.mmi; cmp.eq p8, p0 = 3, r14 C M I +ifdef(`OPERATION_lshift', +` shladd up = r15, 3, up C M I + shladd rp = r15, 3, rp') C M I + ;; +}{.mmi; add r11 = POFF, up C M I + ld8 r10 = [up], UPD C M01 + mov.i ar.lc = n C I0 +}{.bbb; + (p6) br.dptk .Lb01 + (p7) br.dptk .Lb10 + (p8) br.dptk .Lb11 + ;; } + +.Lb00: ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r8 = r10, tnc C function return value + ;; + FSH r24 = r10, cnt + BSH r25 = r19, tnc + (p14) br.cond.dptk .grt4 + ;; + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r14 = r25, r24 + FSH r22 = r17, cnt + BSH r23 = r10, tnc + br .Lr4 + +.grt4: ld8 r18 = [up], UPD + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + ld8 r19 = [up], UPD + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + ld8 r16 = [up], UPD + FSH r22 = r17, cnt + BSH r23 = r18, tnc + ;; + or r14 = r25, r24 + ld8 r17 = [up], UPD + br.cloop.dpnt .Ltop + br 
.Lbot + +.Lb01: + (p15) BSH r8 = r10, tnc C function return value I + (p15) FSH r22 = r10, cnt C I + (p15) br.cond.dptk .Lr1 C return B + +.grt1: ld8 r18 = [up], UPD + ;; + ld8 r19 = [up], UPD + BSH r8 = r10, tnc C function return value + ;; + ld8 r16 = [up], UPD + FSH r22 = r10, cnt + BSH r23 = r18, tnc + ;; + ld8 r17 = [up], UPD + FSH r24 = r18, cnt + BSH r25 = r19, tnc + br.cloop.dpnt .grt5 + ;; + or r15 = r23, r22 + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + br .Lr5 + +.grt5: ld8 r18 = [up], UPD + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + ld8 r19 = [up], UPD + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r15 = r23, r22 + ld8 r16 = [up], UPD + br .LL01 + + +.Lb10: ld8 r17 = [up], UPD + (p14) br.cond.dptk .grt2 + + BSH r8 = r10, tnc C function return value + ;; + FSH r20 = r10, cnt + BSH r21 = r17, tnc + ;; + or r14 = r21, r20 + FSH r22 = r17, cnt + br .Lr2 C return + +.grt2: ld8 r18 = [up], UPD + BSH r8 = r10, tnc C function return value + ;; + ld8 r19 = [up], UPD + FSH r20 = r10, cnt + BSH r21 = r17, tnc + ;; + ld8 r16 = [up], UPD + FSH r22 = r17, cnt + BSH r23 = r18, tnc + ;; + {.mmi; ld8 r17 = [up], UPD + or r14 = r21, r20 + FSH r24 = r18, cnt +}{.mib; nop 0 + BSH r25 = r19, tnc + br.cloop.dpnt .grt6 + ;; } + + FSH r26 = r19, cnt + BSH r27 = r16, tnc + br .Lr6 + +.grt6: ld8 r18 = [up], UPD + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + ld8 r19 = [up], UPD + br .LL10 + + +.Lb11: ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r8 = r10, tnc C function return value + (p14) br.cond.dptk .grt3 + ;; + + FSH r26 = r10, cnt + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r15 = r27, r26 + FSH r22 = r17, cnt + br .Lr3 C return + +.grt3: ld8 r18 = [up], UPD + FSH r26 = r10, cnt + BSH r27 = r16, tnc + ;; + ld8 r19 = [up], UPD + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + ld8 r16 = [up], UPD + FSH r22 = r17, cnt + BSH r23 = r18, tnc + ;; + ld8 r17 = [up], UPD + br.cloop.dpnt .grt7 + + or r15 = r27, r26 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + br .Lr7 + +.grt7: or r15 = r27, r26 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ld8 r18 = [up], UPD + br .LL11 + +C *** MAIN LOOP START *** + ALIGN(32) +.Ltop: + {.mmi; st8 [rp] = r14, UPD C M2 + or r15 = r27, r26 C M3 + FSH r24 = r18, cnt C I0 +}{.mmi; ld8 r18 = [up], UPD C M1 + lfetch [r11], PUPD + BSH r25 = r19, tnc C I1 + ;; } +.LL11: + {.mmi; st8 [rp] = r15, UPD + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop.m 0 + BSH r27 = r16, tnc + ;; } +.LL10: + {.mmi; st8 [rp] = r14, UPD + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop.m 0 + BSH r21 = r17, tnc + ;; } +.LL01: + {.mmi; st8 [rp] = r15, UPD + or r14 = r25, r24 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk .Ltop + ;; } +C *** MAIN LOOP END *** + +.Lbot: + {.mmi; st8 [rp] = r14, UPD + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mib; nop 0 + BSH r25 = r19, tnc + nop 0 + ;; } +.Lr7: + {.mmi; st8 [rp] = r15, UPD + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mib; nop 0 + BSH r27 = r16, tnc + nop 0 + ;; } +.Lr6: + {.mmi; st8 [rp] = r14, UPD + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mib; nop 0 + BSH r21 = r17, tnc + nop 0 + ;; } +.Lr5: st8 [rp] = r15, UPD + or r14 = r25, r24 + FSH r22 = r17, cnt + ;; +.Lr4: st8 [rp] = r14, UPD + or r15 = r27, r26 + ;; +.Lr3: st8 [rp] = r15, UPD + or r14 = r21, r20 + ;; +.Lr2: st8 [rp] = r14, UPD + ;; +.Lr1: st8 [rp] = r22, UPD C M23 + mov ar.lc = r2 C I0 + br.ret.sptk.many b0 C B 
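+
+C For reference, the net effect of the unrolled code above in portable C
+C terms.  This is an illustrative sketch only, not part of the build;
+C "ref_lshift" is a made-up name, and 64-bit limbs with n >= 1 and
+C 1 <= cnt <= 63 are assumed:
+C
+C	mp_limb_t
+C	ref_lshift (mp_limb_t *rp, const mp_limb_t *up, long n, unsigned cnt)
+C	{
+C	  unsigned tnc = 64 - cnt;		/* "tnc" as defined above */
+C	  mp_limb_t ret = up[n - 1] >> tnc;	/* bits shifted out at the top */
+C	  for (long i = n - 1; i > 0; i--)	/* high-to-low, safe for rp == up */
+C	    rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
+C	  rp[0] = up[0] << cnt;
+C	  return ret;
+C	}
+C
+C mpn_rshift mirrors this limb by limb in the other direction, returning
+C instead the bits shifted out at the low end.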
+EPILOGUE(func) +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/lshiftc.asm b/gmp-6.3.0/mpn/ia64/lshiftc.asm new file mode 100644 index 0000000..e8cec87 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/lshiftc.asm @@ -0,0 +1,463 @@ +dnl IA-64 mpn_lshiftc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2005, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 1.25 + +C This code is scheduled deeply since the plain shift instructions shr and shl +C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of +C these instructions cause a 10 cycle replay trap on Itanium. + +C The ld8 scheduling should probably be decreased to make the function smaller. +C Good lfetch will make sure we never stall anyway. + +C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair +C at cycle 2. Judicious use of predicates could allow us to issue more ld8's +C in the prologue. 
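+
+C For reference, a C model of the operation (a sketch only, assuming
+C 64-bit limbs and 1 <= cnt <= 63; the _model name is illustrative, not
+C part of GMP).  mpn_lshiftc stores the one's complement of the shifted
+C limbs and returns the bits shifted out at the top, uncomplemented:
+C
+C   #include <stdint.h>
+C
+C   uint64_t lshiftc_model (uint64_t *rp, const uint64_t *up,
+C                           long n, unsigned cnt)
+C   {
+C     uint64_t ret = up[n - 1] >> (64 - cnt);   /* out-shifted bits */
+C     for (long i = n - 1; i > 0; i--)
+C       rp[i] = ~((up[i] << cnt) | (up[i - 1] >> (64 - cnt)));
+C     rp[0] = ~(up[0] << cnt);
+C     return ret;
+C   }
+C
+C The FSH/BSH pairs below produce the two halves of each double-limb
+C shift, the or combines them, and the "sub r31 = -1, ..." instructions
+C form the complement (-1 - x == ~x).  The plain mpn_lshift/mpn_rshift
+C above share this structure, without the complement.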
+ + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`cnt',`r35') + +define(`tnc',`r9') + +define(`FSH',`shl') +define(`BSH',`shr.u') +define(`UPD',`-8') +define(`POFF',`-512') +define(`PUPD',`-32') +define(`func',`mpn_lshiftc') + +ASM_START() +PROLOGUE(mpn_lshiftc) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + sxt4 n = n C M I + nop.m 0 + nop.m 0 + zxt4 cnt = cnt C I + ;; +') + + {.mmi; nop 0 C M I + and r14 = 3, n C M I + mov.i r2 = ar.lc C I0 +}{.mmi; add r15 = -1, n C M I + sub tnc = 64, cnt C M I + nop 0 + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + shr.u n = r15, 2 C I0 +}{.mmi; cmp.eq p8, p0 = 3, r14 C M I + shladd up = r15, 3, up C M I + shladd rp = r15, 3, rp C M I + ;; +}{.mmi; add r11 = POFF, up C M I + ld8 r10 = [up], UPD C M01 + mov.i ar.lc = n C I0 +}{.bbb; + (p6) br.dptk .Lb01 + (p7) br.dptk .Lb10 + (p8) br.dptk .Lb11 + ;; } + +.Lb00: + ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r8 = r10, tnc + br.cloop.dptk L(gt4) + ;; + FSH r24 = r10, cnt + BSH r25 = r19, tnc + ;; + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r14 = r25, r24 + FSH r22 = r17, cnt + ;; + or r15 = r27, r26 + sub r31 = -1, r14 + br .Lr4 + +L(gt4): + {.mmi; nop 0 + nop 0 + FSH r24 = r10, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; nop 0 + nop 0 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop 0 + BSH r27 = r16, tnc + ;; } + {.mmi; nop 0 + nop 0 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop 0 + BSH r21 = r17, tnc + ;; } + {.mmi; nop 0 + or r14 = r25, r24 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk L(gt8) + ;; } + {.mmi; nop 0 + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mib; sub r31 = -1, r14 + BSH r25 = r19, tnc + br .Lr8 } + +L(gt8): + or r15 = r27, r26 + FSH r24 = r18, cnt + ld8 r18 = [up], UPD + sub r31 = -1, r14 + BSH r25 = r19, tnc + br .LL00 + +.Lb01: + br.cloop.dptk L(gt1) + ;; + BSH r8 = r10, tnc + FSH r22 = r10, cnt + ;; + sub r31 = -1, r22 + br .Lr1 + ;; +L(gt1): + ld8 r18 = [up], UPD + BSH r8 = r10, tnc + FSH r22 = r10, cnt + ;; + ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk L(gt5) + ;; + nop 0 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ;; + nop 0 + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + or r15 = r23, r22 + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r14 = r25, r24 + FSH r22 = r17, cnt + sub r31 = -1, r15 + br .Lr5 + +L(gt5): + {.mmi; nop 0 + nop 0 + FSH r24 = r18, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; nop 0 + nop 0 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop 0 + BSH r27 = r16, tnc + ;; } + {.mmi; nop 0 + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop 0 + BSH r21 = r17, tnc + ;; } + {.mmi; or r14 = r25, r24 + sub r31 = -1, r15 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br L(end) + ;; } + +.Lb10: + ld8 r17 = [up], UPD + br.cloop.dptk L(gt2) + ;; + BSH r8 = r10, tnc + FSH r20 = r10, cnt + ;; + BSH r21 = r17, tnc + FSH r22 = r17, cnt + ;; + or r14 = r21, r20 + ;; + sub r31 = -1, r14 + br .Lr2 + ;; +L(gt2): + ld8 r18 = [up], UPD + BSH r8 = r10, tnc + FSH r20 = r10, cnt + ;; + ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + BSH r21 = r17, tnc + FSH r22 = r17, cnt + ;; + ld8 r17 = [up], UPD + BSH r23 = 
r18, tnc + br.cloop.dptk L(gt6) + ;; + nop 0 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ;; + or r14 = r21, r20 + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + {.mmi; nop 0 + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mib; sub r31 = -1, r14 + BSH r21 = r17, tnc + br .Lr6 + ;; } +L(gt6): + {.mmi; nop 0 + nop 0 + FSH r24 = r18, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; nop 0 + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop 0 + BSH r27 = r16, tnc + ;; } + {.mmi; or r15 = r23, r22 + sub r31 = -1, r14 + FSH r20 = r16, cnt +}{.mib; ld8 r16 = [up], UPD + BSH r21 = r17, tnc + br .LL10 +} + +.Lb11: + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r8 = r10, tnc + FSH r26 = r10, cnt + br.cloop.dptk L(gt3) + ;; + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + FSH r22 = r17, cnt + ;; + or r15 = r27, r26 + ;; + or r14 = r21, r20 + sub r31 = -1, r15 + br .Lr3 + ;; +L(gt3): + ld8 r18 = [up], UPD + ;; + ld8 r19 = [up], UPD + BSH r27 = r16, tnc + ;; + {.mmi; nop 0 + nop 0 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop 0 + BSH r21 = r17, tnc + ;; +}{.mmi; nop 0 + nop 0 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk L(gt7) + ;; } + or r15 = r27, r26 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ;; + {.mmi; nop 0 + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mib; sub r31 = -1, r15 + BSH r27 = r16, tnc + br .Lr7 +} +L(gt7): + {.mmi; nop 0 + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; or r14 = r21, r20 + sub r31 = -1, r15 + FSH r26 = r19, cnt +}{.mib; ld8 r19 = [up], UPD + BSH r27 = r16, tnc + br .LL11 +} + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): +.LL01: + {.mmi; st8 [rp] = r31, UPD C M2 + or r15 = r27, r26 C M3 + FSH r24 = r18, cnt C I0 +}{.mmi; ld8 r18 = [up], UPD C M0 + sub r31 = -1, r14 C M1 + BSH r25 = r19, tnc C I1 + ;; } +.LL00: + {.mmi; st8 [rp] = r31, UPD + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + sub r31 = -1, r15 + BSH r27 = r16, tnc + ;; } +.LL11: + {.mmi; st8 [rp] = r31, UPD + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + sub r31 = -1, r14 + BSH r21 = r17, tnc + ;; } +.LL10: + {.mmi; st8 [rp] = r31, UPD + or r14 = r25, r24 + FSH r22 = r17, cnt +}{.mmi; ld8 r17 = [up], UPD + sub r31 = -1, r15 + BSH r23 = r18, tnc + ;; } +L(end): lfetch [r11], PUPD + br.cloop.dptk L(top) +C *** MAIN LOOP END *** + + {.mmi; st8 [rp] = r31, UPD + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mib; sub r31 = -1, r14 + BSH r25 = r19, tnc + nop 0 + ;; } +.Lr8: + {.mmi; st8 [rp] = r31, UPD + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mib; sub r31 = -1, r15 + BSH r27 = r16, tnc + nop 0 + ;; } +.Lr7: + {.mmi; st8 [rp] = r31, UPD + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mib; sub r31 = -1, r14 + BSH r21 = r17, tnc + nop 0 + ;; } +.Lr6: st8 [rp] = r31, UPD + or r14 = r25, r24 + FSH r22 = r17, cnt + sub r31 = -1, r15 + ;; +.Lr5: st8 [rp] = r31, UPD + or r15 = r27, r26 + sub r31 = -1, r14 + ;; +.Lr4: st8 [rp] = r31, UPD + or r14 = r21, r20 + sub r31 = -1, r15 + ;; +.Lr3: st8 [rp] = r31, UPD + sub r31 = -1, r14 + ;; +.Lr2: st8 [rp] = r31, UPD + sub r31 = -1, r22 + ;; +.Lr1: st8 [rp] = r31, UPD C M23 + mov ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE(func) +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm b/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm new file mode 100644 index 0000000..7789117 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm @@ -0,0 
+1,237 @@ +dnl IA-64 mpn_mod_34lsub1 + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2003-2005, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 1 + + +C INPUT PARAMETERS +define(`up', `r32') +define(`n', `r33') + +C Some useful aliases for registers we use +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') +define(`a0',`r17') define(`a1',`r18') define(`a2',`r19') +define(`c0',`r20') define(`c1',`r21') define(`c2',`r22') + +C This is a fairly simple-minded implementation. One could approach 0.67 c/l +C with a more sophisticated implementation. If we're really crazy, we could +C super-unroll, storing carries just in predicate registers, then copy them to +C a general register, and population count them from there. That'd bring us +C close to 3 insn/limb, for nearly 0.5 c/l. + +C Computing n/3 needs 16 cycles, which is a lot of startup overhead. +C We therefore use a plain while-style loop: +C add n = -3, n +C cmp.le p9, p0 = 3, n +C (p9) br.cond .Loop +C Alternatively, we could table n/3 for, say, n < 256, and predicate the +C 16-cycle code. + +C The summing-up code at the end was written quickly, and could surely be +C vastly improved. 
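+
+C For reference, a C model of the result (a sketch only, assuming 64-bit
+C limbs; unsigned __int128 is a GCC extension, and the _model name is
+C illustrative).  All the function must do is return some value congruent
+C to {up,n} modulo 2^48-1; since 2^48 == 1 mod 2^48-1, limb i's weight
+C 2^(64*i) reduces to 2^(16*i mod 48):
+C
+C   #include <stdint.h>
+C
+C   uint64_t mod_34lsub1_model (const uint64_t *up, long n)
+C   {
+C     unsigned __int128 acc = 0;
+C     unsigned s = 0;                      /* 16*i mod 48 */
+C     for (long i = 0; i < n; i++)
+C       {
+C         acc += (unsigned __int128) up[i] << s;
+C         s = (s + 16) % 48;
+C         acc = (acc & 0xffffffffffff) + (acc >> 48);  /* fold 2^48 -> 1 */
+C       }
+C     return (uint64_t) acc;
+C   }
+C
+C The code below lands in the same congruence class by accumulating limbs
+C three at a time with carry counts, then summing 16/32/48-bit fields.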
+ +ASM_START() +PROLOGUE(mpn_mod_34lsub1) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 up = 0, up C M I + nop.m 0 + zxt4 n = n C I + ;; +') + +ifelse(0,1,` + movl r14 = 0xAAAAAAAAAAAAAAAB + ;; + setf.sig f6 = r14 + setf.sig f7 = r33 + ;; + xmpy.hu f6 = f6, f7 + ;; + getf.sig r8 = f6 + ;; + shr.u r8 = r8, 1 C Loop count + ;; + mov.i ar.lc = r8 +') + + ld8 u0 = [up], 8 + cmp.ne p9, p0 = 1, n + (p9) br L(gt1) + ;; + shr.u r8 = u0, 48 + dep.z r27 = u0, 0, 48 + ;; + add r8 = r8, r27 + br.ret.sptk.many b0 + + +L(gt1): + {.mmi; nop.m 0 + mov a0 = 0 + add n = -2, n +}{.mmi; mov c0 = 0 + mov c1 = 0 + mov c2 = 0 + ;; +}{.mmi; ld8 u1 = [up], 8 + mov a1 = 0 + cmp.ltu p6, p0 = r0, r0 C clear p6 +}{.mmb; cmp.gt p9, p0 = 3, n + mov a2 = 0 + (p9) br.cond.dptk L(end) + ;; +} + ALIGN(32) +L(top): + {.mmi; ld8 u2 = [up], 8 + (p6) add c0 = 1, c0 + cmp.ltu p7, p0 = a0, u0 +}{.mmb; sub a0 = a0, u0 + add n = -3, n + nop.b 0 + ;; +}{.mmi; ld8 u0 = [up], 8 + (p7) add c1 = 1, c1 + cmp.ltu p8, p0 = a1, u1 +}{.mmb; sub a1 = a1, u1 + cmp.le p9, p0 = 3, n + nop.b 0 + ;; +}{.mmi; ld8 u1 = [up], 8 + (p8) add c2 = 1, c2 + cmp.ltu p6, p0 = a2, u2 +}{.mmb; sub a2 = a2, u2 + nop.m 0 +dnl br.cloop.dptk L(top) + (p9) br.cond.dptk L(top) + ;; +} +L(end): + cmp.eq p10, p0 = 0, n + cmp.eq p11, p0 = 1, n + (p10) br L(0) + +L(2): + {.mmi; ld8 u2 = [up], 8 + (p6) add c0 = 1, c0 + cmp.ltu p7, p0 = a0, u0 +}{.mmb; sub a0 = a0, u0 + nop.m 0 + (p11) br L(1) + ;; +} ld8 u0 = [up], 8 + (p7) add c1 = 1, c1 + cmp.ltu p8, p0 = a1, u1 + sub a1 = a1, u1 + ;; + (p8) add c2 = 1, c2 + cmp.ltu p6, p0 = a2, u2 + sub a2 = a2, u2 + ;; + (p6) add c0 = 1, c0 + cmp.ltu p7, p0 = a0, u0 + sub a0 = a0, u0 + ;; + (p7) add c1 = 1, c1 + br L(com) + + +L(1): + (p7) add c1 = 1, c1 + cmp.ltu p8, p0 = a1, u1 + sub a1 = a1, u1 + ;; + (p8) add c2 = 1, c2 + cmp.ltu p6, p0 = a2, u2 + sub a2 = a2, u2 + ;; + (p6) add c0 = 1, c0 + br L(com) + + +L(0): + (p6) add c0 = 1, c0 + cmp.ltu p7, p0 = a0, u0 + sub a0 = a0, u0 + ;; + (p7) add c1 = 1, c1 + cmp.ltu p8, p0 = a1, u1 + sub a1 = a1, u1 + ;; + (p8) add c2 = 1, c2 + +L(com): +C | a2 | a1 | a0 | +C | | | | | + shr.u r24 = a0, 48 C 16 bits + shr.u r25 = a1, 32 C 32 bits + shr.u r26 = a2, 16 C 48 bits + ;; + shr.u r10 = c0, 48 C 16 bits, always zero + shr.u r11 = c1, 32 C 32 bits + shr.u r30 = c2, 16 C 48 bits + ;; + dep.z r27 = a0, 0, 48 C 48 bits + dep.z r28 = a1, 16, 32 C 48 bits + dep.z r29 = a2, 32, 16 C 48 bits + dep.z r31 = c0, 0, 48 C 48 bits + dep.z r14 = c1, 16, 32 C 48 bits + dep.z r15 = c2, 32, 16 C 48 bits + ;; + {.mmi; add r24 = r24, r25 + add r26 = r26, r27 + add r28 = r28, r29 +}{.mmi; add r10 = r10, r11 + add r30 = r30, r31 + add r14 = r14, r15 + ;; +} + movl r8 = 0xffffffffffff0 + add r24 = r24, r26 + add r10 = r10, r30 + ;; + add r24 = r24, r28 + add r10 = r10, r14 + ;; + sub r8 = r8, r24 + ;; + add r8 = r8, r10 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/mode1o.asm b/gmp-6.3.0/mpn/ia64/mode1o.asm new file mode 100644 index 0000000..14d5e81 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/mode1o.asm @@ -0,0 +1,342 @@ +dnl Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder. + +dnl Contributed to the GNU project by Kevin Ryde. + +dnl Copyright 2003-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Itanium:    15
+C Itanium 2:   8
+
+
+dnl Usage: ABI32(`code')
+dnl
+dnl Emit the given code only under HAVE_ABI_32.
+dnl
+define(ABI32,
+m4_assert_onearg()
+`ifdef(`HAVE_ABI_32',`$1')')
+
+
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C The modexact algorithm is usually conceived as a dependent chain
+C
+C	l = src[i] - c
+C	q = low(l * inverse)
+C	c = high(q*divisor) + (src[i] < c)
+C
+C but the code below uses the rearranged form seen in the main loop,
+C
+C	si = src[i] * inverse
+C	q = c * -inverse + si
+C	c = high(q*divisor + c)
+C
+C which lets each src[i]*inverse be issued as soon as the limb is loaded,
+C independently of the carry chain.  The price is that si must be formed
+C one iteration ahead (so the main loop needs size >= 2), and the
+C calculation of q by the initial, different scheme.
+C
+C
+C Entry Sequence:
+C
+C In the entry sequence, the critical path is the calculation of the
+C inverse, so this is begun first and optimized.  Apart from that, ar.lc is
+C established nice and early so the br.cloop's should predict perfectly.
+C And the load for the low limbs src[0] and src[1] can be initiated long
+C ahead of where they're needed.
+C
+C
+C Inverse Calculation:
+C
+C The initial 8-bit inverse is calculated using a table lookup.  If it hits
+C L1 (which is likely if we're called several times) then it should take a
+C total 4 cycles, otherwise hopefully L2 for 9 cycles.  This is considered
+C the best approach, on balance.  It could be done bitwise, but that would
+C probably be about 14 cycles (2 per bit beyond the first couple).  Or it
+C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
+C but that would be about 11 cycles.
+C
+C The table is not the same as binvert_limb_table, instead it's 256 bytes,
+C designed to be indexed by the low byte of the divisor.  The divisor is
+C always odd, so the relevant data is every second byte in the table.  The
+C padding lets us use zxt1 instead of extr.u; the latter would cost an extra
+C cycle because it must go down I0, and we're using the first I0 slot to get
+C ip.  The extra 128 bytes of padding should be insignificant compared to
+C typical ia64 code bloat.
+C
+C Having the table in .text allows us to use IP-relative addressing,
+C avoiding a fetch from ltoff.  .rodata is apparently not suitable for
+C IP-relative use; it gets a linker relocation overflow on GNU/Linux.
+C
+C
+C Load Scheduling:
+C
+C In the main loop, the data loads are scheduled for an L2 hit, which means
+C 6 cycles for the data ready to use.  In fact we end up 7 cycles ahead.  In
+C any case that scheduling is achieved simply by doing the load (and xmpy.l
+C for "si") in the immediately preceding iteration.
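+C
+C
+C C Model:
+C
+C A sketch of the rearranged recurrence in C (assuming 64-bit limbs and
+C GCC's __int128; binvert_model is illustrative, the real code builds the
+C inverse from the table lookup plus xmpy doublings):
+C
+C   #include <stdint.h>
+C
+C   static uint64_t binvert_model (uint64_t d)   /* d odd */
+C   {
+C     uint64_t v = d;                /* inverse of d to 3 bits */
+C     for (int i = 0; i < 5; i++)
+C       v *= 2 - d * v;              /* Newton step doubles the bits */
+C     return v;                      /* d * v == 1 mod 2^64 */
+C   }
+C
+C   uint64_t modexact_1c_model (const uint64_t *src, long size,
+C                               uint64_t d, uint64_t c)
+C   {
+C     uint64_t inv = binvert_model (d);
+C     for (long i = 0; i < size; i++)
+C       {
+C         uint64_t si = src[i] * inv;    /* independent of c */
+C         uint64_t q = c * -inv + si;    /* q = (src[i]-c) * inv */
+C         c = (uint64_t) (((unsigned __int128) q * d + c) >> 64);
+C       }
+C     return c;
+C   }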
+C +C The main loop requires size >= 2, and we handle size==1 by an initial +C br.cloop to enter the loop only if size>1. Since ar.lc is established +C early, this should predict perfectly. +C +C +C Not done: +C +C Consideration was given to using a plain "(src[0]-c) % divisor" for +C size==1, but cycle counting suggests about 50 for the sort of approach +C taken by gcc __umodsi3, versus about 47 for the modexact. (Both assuming +C L1 hits for their respective fetching.) +C +C Consideration was given to a test for high 1 + ;; + + C size==1, finish up now + xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c) + mov ar.lc = r2 C I0 + ;; + getf.sig r8 = f9 C M2 return c + br.ret.sptk.many b0 + + + +.Ltop: + C r2 saved ar.lc + C f6 divisor + C f7 inverse + C f8 -inverse + C f9 carry + C f10 src[i] * inverse + C f11 scratch src[i+1] + + add r16 = 160, r32 + ldf8 f11 = [r32], 8 C src[i+1] + ;; + C 2 cycles + + lfetch [r16] + xma.l f10 = f9, f8, f10 C q = c * -inverse + si + ;; + C 3 cycles + +.Lentry: + xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c) + xmpy.l f10 = f11, f7 C si = src[i] * inverse + br.cloop.sptk.few.clr .Ltop + ;; + + + + xma.l f10 = f9, f8, f10 C q = c * -inverse + si + mov ar.lc = r2 C I0 + ;; + xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c) + ;; + getf.sig r8 = f9 C M2 return c + br.ret.sptk.many b0 + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/ia64/mul_1.asm b/gmp-6.3.0/mpn/ia64/mul_1.asm new file mode 100644 index 0000000..21bf6d0 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/mul_1.asm @@ -0,0 +1,584 @@ +dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and +dnl store the result in a second limb vector. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 4.0 +C Itanium 2: 2.0 + +C TODO +C * Further optimize feed-in and wind-down code, both for speed and code size. +C * Handle low limb input and results specially, using a common stf8 in the +C epilogue. +C * Use 1 c/l carry propagation scheme in wind-down code. +C * Use extra pointer register for `up' to speed up feed-in loads. +C * Work out final differences with addmul_1.asm. 
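+
+C For reference, a C model of what mpn_mul_1 computes (a sketch only,
+C assuming 64-bit limbs and GCC's __int128; the _model name is
+C illustrative).  mpn_mul_1c is identical except cy starts at the
+C carry-in argument:
+C
+C   #include <stdint.h>
+C
+C   uint64_t mul_1_model (uint64_t *rp, const uint64_t *up,
+C                         long n, uint64_t vl)
+C   {
+C     uint64_t cy = 0;
+C     for (long i = 0; i < n; i++)
+C       {
+C         unsigned __int128 p = (unsigned __int128) up[i] * vl + cy;
+C         rp[i] = (uint64_t) p;           /* low limb out */
+C         cy = (uint64_t) (p >> 64);      /* high limb carries on */
+C       }
+C     return cy;                          /* n'th limb of the product */
+C   }
+C
+C The xma.l/xma.hu pairs below form exactly these low and high halves in
+C the fp multiplier, with the carry additions done in integer registers
+C under predicates p6-p9.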
+ +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`vl', `r35') +define(`cy', `r36') C for mpn_mul_1c + +ASM_START() +PROLOGUE(mpn_mul_1) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mfi + adds r15 = -1, n C M I + mov f9 = f0 C F + mov.i r2 = ar.lc C I0 +} +{.mmi + ldf8 f7 = [up], 8 C M + nop.m 0 C M + and r14 = 3, n C M I + ;; +} +.Lcommon: +{.mii + setf.sig f6 = vl C M2 M3 + shr.u r31 = r15, 2 C I0 + cmp.eq p10, p0 = 0, r14 C M I +} +{.mii + cmp.eq p11, p0 = 2, r14 C M I + cmp.eq p12, p0 = 3, r14 C M I + nop.i 0 C I + ;; +} +{.mii + cmp.ne p6, p7 = r0, r0 C M I + mov.i ar.lc = r31 C I0 + cmp.ne p8, p9 = r0, r0 C M I +} +{.bbb + (p10) br.dptk .Lb00 C B + (p11) br.dptk .Lb10 C B + (p12) br.dptk .Lb11 C B + ;; +} + +.Lb01: mov r20 = 0 + br.cloop.dptk .grt1 C B + + xma.l f39 = f7, f6, f9 C F + xma.hu f43 = f7, f6, f9 C F + ;; + getf.sig r8 = f43 C M2 + stf8 [rp] = f39 C M2 M3 + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B + +.grt1: + ldf8 f32 = [up], 8 + ;; + ldf8 f33 = [up], 8 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f7, f6, f9 + xma.hu f43 = f7, f6, f9 + ;; + ldf8 f35 = [up], 8 + br.cloop.dptk .grt5 + + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + stf8 [rp] = f39, 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + getf.sig r16 = f38 + br .Lcj5 + +.grt5: + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r17 = f39 + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + ;; + getf.sig r18 = f36 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + ;; + getf.sig r19 = f37 + xma.hu f43 = f35, f6, f0 + br .LL01 + + +.Lb10: ldf8 f35 = [up], 8 + mov r23 = 0 + br.cloop.dptk .grt2 + + xma.l f38 = f7, f6, f9 + xma.hu f42 = f7, f6, f9 + ;; + stf8 [rp] = f38, 8 + xma.l f39 = f35, f6, f42 + xma.hu f43 = f35, f6, f42 + ;; + getf.sig r8 = f43 + stf8 [rp] = f39 + mov.i ar.lc = r2 + br.ret.sptk.many b0 + + +.grt2: + ldf8 f32 = [up], 8 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f7, f6, f9 + xma.hu f42 = f7, f6, f9 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + ldf8 f35 = [up], 8 + br.cloop.dptk .grt6 + + stf8 [rp] = f38, 8 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r20 = f42 + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + br .Lcj6 + +.grt6: + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r20 = f42 + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + ;; + getf.sig r17 = f39 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + ;; + getf.sig r18 = f36 + xma.hu f42 = f34, f6, f0 + br .LL10 + + +.Lb11: ldf8 f34 = [up], 8 + mov r22 = 0 + ;; + ldf8 f35 = [up], 8 + br.cloop.dptk .grt3 + ;; + + xma.l f37 = f7, f6, f9 + xma.hu f41 = f7, f6, f9 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + xma.l f39 = f35, 
f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + stf8 [rp] = f37, 8 + getf.sig r16 = f38 + getf.sig r20 = f42 + getf.sig r17 = f39 + getf.sig r8 = f43 + br .Lcj3 + +.grt3: + ldf8 f32 = [up], 8 + xma.l f37 = f7, f6, f9 + xma.hu f41 = f7, f6, f9 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r19 = f37 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + ldf8 f35 = [up], 8 + br.cloop.dptk .grt7 + + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + getf.sig r20 = f42 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + getf.sig r21 = f43 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r18 = f36 + st8 [rp] = r19, 8 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + br .Lcj7 + +.grt7: + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r20 = f42 + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + ;; + getf.sig r17 = f39 + xma.hu f41 = f33, f6, f0 + br .LL11 + + +.Lb00: ldf8 f33 = [up], 8 + mov r21 = 0 + ;; + ldf8 f34 = [up], 8 + ;; + ldf8 f35 = [up], 8 + xma.l f36 = f7, f6, f9 + xma.hu f40 = f7, f6, f9 + br.cloop.dptk .grt4 + + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + stf8 [rp] = f36, 8 + xma.l f39 = f35, f6, f0 + getf.sig r19 = f37 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + getf.sig r16 = f38 + getf.sig r20 = f42 + getf.sig r17 = f39 + br .Lcj4 + +.grt4: + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r18 = f36 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + ;; + getf.sig r19 = f37 + getf.sig r23 = f41 + xma.hu f43 = f35, f6, f0 + ldf8 f35 = [up], 8 + br.cloop.dptk .grt8 + + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + getf.sig r20 = f42 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r17 = f39 + st8 [rp] = r18, 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + br .Lcj8 + +.grt8: + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + br .LL00 + + +C *** MAIN LOOP START *** + ALIGN(32) +.Loop: + .pred.rel "mutex",p6,p7 + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + (p6) cmp.leu p8, p9 = r24, r17 + st8 [rp] = r24, 8 + xma.hu f40 = f32, f6, f0 + (p7) cmp.ltu p8, p9 = r24, r17 + ;; +.LL00: + .pred.rel "mutex",p8,p9 + getf.sig r20 = f42 + (p8) add r24 = r18, r21, 1 + nop.b 0 + ldf8 f32 = [up], 8 + (p9) add r24 = r18, r21 + nop.b 0 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + (p8) cmp.leu p6, p7 = r24, r18 + st8 [rp] = r24, 8 + xma.hu f41 = f33, f6, f0 + (p9) cmp.ltu p6, p7 = r24, r18 + ;; +.LL11: + .pred.rel "mutex",p6,p7 + getf.sig r21 = f43 + (p6) add r24 = r19, r22, 1 + nop.b 0 + ldf8 f33 = [up], 8 + (p7) add r24 = r19, r22 + nop.b 0 + ;; + .pred.rel "mutex",p6,p7 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + (p6) cmp.leu p8, p9 = r24, r19 + st8 [rp] = r24, 8 + xma.hu f42 = f34, f6, f0 + (p7) cmp.ltu p8, p9 = r24, r19 + ;; +.LL10: + .pred.rel "mutex",p8,p9 + getf.sig r22 = f40 + (p8) add r24 = r16, r23, 1 + nop.b 0 + ldf8 f34 = [up], 8 + (p9) add r24 = r16, r23 + nop.b 0 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + (p8) cmp.leu p6, p7 = r24, r16 + st8 [rp] = r24, 8 + xma.hu f43 = f35, f6, f0 + (p9) cmp.ltu p6, p7 = r24, r16 + ;; +.LL01: + .pred.rel "mutex",p6,p7 + getf.sig r23 = f41 + (p6) 
add r24 = r17, r20, 1 + nop.b 0 + ldf8 f35 = [up], 8 + (p7) add r24 = r17, r20 + br.cloop.dptk .Loop +C *** MAIN LOOP END *** + ;; + +.Lcj9: + .pred.rel "mutex",p6,p7 + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + (p6) cmp.leu p8, p9 = r24, r17 + st8 [rp] = r24, 8 + xma.hu f40 = f32, f6, f0 + (p7) cmp.ltu p8, p9 = r24, r17 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r20 = f42 + (p8) add r24 = r18, r21, 1 + (p9) add r24 = r18, r21 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + (p8) cmp.leu p6, p7 = r24, r18 + st8 [rp] = r24, 8 + xma.hu f41 = f33, f6, f0 + (p9) cmp.ltu p6, p7 = r24, r18 + ;; +.Lcj8: + .pred.rel "mutex",p6,p7 + getf.sig r21 = f43 + (p6) add r24 = r19, r22, 1 + (p7) add r24 = r19, r22 + ;; + .pred.rel "mutex",p6,p7 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + (p6) cmp.leu p8, p9 = r24, r19 + st8 [rp] = r24, 8 + xma.hu f42 = f34, f6, f0 + (p7) cmp.ltu p8, p9 = r24, r19 + ;; +.Lcj7: + .pred.rel "mutex",p8,p9 + getf.sig r22 = f40 + (p8) add r24 = r16, r23, 1 + (p9) add r24 = r16, r23 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + (p8) cmp.leu p6, p7 = r24, r16 + st8 [rp] = r24, 8 + xma.hu f43 = f35, f6, f0 + (p9) cmp.ltu p6, p7 = r24, r16 + ;; +.Lcj6: + .pred.rel "mutex",p6,p7 + getf.sig r23 = f41 + (p6) add r24 = r17, r20, 1 + (p7) add r24 = r17, r20 + ;; + .pred.rel "mutex",p6,p7 + (p6) cmp.leu p8, p9 = r24, r17 + (p7) cmp.ltu p8, p9 = r24, r17 + getf.sig r16 = f38 + st8 [rp] = r24, 8 + ;; +.Lcj5: + .pred.rel "mutex",p8,p9 + getf.sig r20 = f42 + (p8) add r24 = r18, r21, 1 + (p9) add r24 = r18, r21 + ;; + .pred.rel "mutex",p8,p9 + (p8) cmp.leu p6, p7 = r24, r18 + (p9) cmp.ltu p6, p7 = r24, r18 + getf.sig r17 = f39 + st8 [rp] = r24, 8 + ;; +.Lcj4: + .pred.rel "mutex",p6,p7 + getf.sig r8 = f43 + (p6) add r24 = r19, r22, 1 + (p7) add r24 = r19, r22 + ;; + .pred.rel "mutex",p6,p7 + st8 [rp] = r24, 8 + (p6) cmp.leu p8, p9 = r24, r19 + (p7) cmp.ltu p8, p9 = r24, r19 + ;; +.Lcj3: + .pred.rel "mutex",p8,p9 + (p8) add r24 = r16, r23, 1 + (p9) add r24 = r16, r23 + ;; + .pred.rel "mutex",p8,p9 + st8 [rp] = r24, 8 + (p8) cmp.leu p6, p7 = r24, r16 + (p9) cmp.ltu p6, p7 = r24, r16 + ;; +.Lcj2: + .pred.rel "mutex",p6,p7 + (p6) add r24 = r17, r20, 1 + (p7) add r24 = r17, r20 + ;; + .pred.rel "mutex",p6,p7 + st8 [rp] = r24, 8 + (p6) cmp.leu p8, p9 = r24, r17 + (p7) cmp.ltu p8, p9 = r24, r17 + ;; + (p8) add r8 = 1, r8 + mov.i ar.lc = r2 + br.ret.sptk.many b0 +EPILOGUE() + +PROLOGUE(mpn_mul_1c) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mmi + adds r15 = -1, n C M I + setf.sig f9 = cy C M2 M3 + mov.i r2 = ar.lc C I0 +} +{.mmb + ldf8 f7 = [up], 8 C M + and r14 = 3, n C M I + br.sptk .Lcommon + ;; +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/mul_2.asm b/gmp-6.3.0/mpn/ia64/mul_2.asm new file mode 100644 index 0000000..5343f64 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/mul_2.asm @@ -0,0 +1,625 @@ +dnl IA-64 mpn_mul_2 -- Multiply a n-limb number with a 2-limb number and store +dnl store the result to a (n+1)-limb number. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2004, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 1.5 + +C TODO +C * Clean up variable names, and try to decrease the number of distinct +C registers used. +C * Clean up feed-in code to not require zeroing several registers. +C * Make sure we don't depend on uninitialized predicate registers. +C * Could perhaps save a few cycles by using 1 c/l carry propagation in +C wind-down code. +C * Ultimately rewrite. The problem with this code is that it first uses a +C loaded u value in one xma pair, then leaves it live over several unrelated +C xma pairs, before it uses it again. It should actually be quite possible +C to just swap some aligned xma pairs around. But we should then schedule +C u loads further from the first use. + +C INPUT PARAMETERS +define(`rp',`r32') +define(`up',`r33') +define(`n',`r34') +define(`vp',`r35') + +define(`srp',`r3') + +define(`v0',`f6') +define(`v1',`f7') + +define(`s0',`r14') +define(`acc0',`r15') + +define(`pr0_0',`r16') define(`pr0_1',`r17') +define(`pr0_2',`r18') define(`pr0_3',`r19') + +define(`pr1_0',`r20') define(`pr1_1',`r21') +define(`pr1_2',`r22') define(`pr1_3',`r23') + +define(`acc1_0',`r24') define(`acc1_1',`r25') +define(`acc1_2',`r26') define(`acc1_3',`r27') + +dnl define(`',`r28') +dnl define(`',`r29') +dnl define(`',`r30') +dnl define(`',`r31') + +define(`fp0b_0',`f8') define(`fp0b_1',`f9') +define(`fp0b_2',`f10') define(`fp0b_3',`f11') + +define(`fp1a_0',`f12') define(`fp1a_1',`f13') +define(`fp1a_2',`f14') define(`fp1a_3',`f15') + +define(`fp1b_0',`f32') define(`fp1b_1',`f33') +define(`fp1b_2',`f34') define(`fp1b_3',`f35') + +define(`fp2a_0',`f36') define(`fp2a_1',`f37') +define(`fp2a_2',`f38') define(`fp2a_3',`f39') + +define(`u_0',`f44') define(`u_1',`f45') +define(`u_2',`f46') define(`u_3',`f47') + +define(`ux',`f49') +define(`uy',`f51') + +ASM_START() +PROLOGUE(mpn_mul_2) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32',` + {.mmi; addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I +}{.mmi; nop 1 + nop 1 + zxt4 n = n C I + ;; +}') + + {.mmi; ldf8 ux = [up], 8 C M + ldf8 v0 = [vp], 8 C M + mov r2 = ar.lc C I0 +}{.mmi; nop 1 C M + and r14 = 3, n C M I + add n = -2, n C M I + ;; +}{.mmi; ldf8 uy = [up], 8 C M + ldf8 v1 = [vp] C M + shr.u n = n, 2 C I0 +}{.mmi; nop 1 C M + cmp.eq p10, p0 = 1, r14 C M I + cmp.eq p11, p0 = 2, r14 C M I + ;; +}{.mmi; nop 1 C M + cmp.eq p12, p0 = 3, r14 C M I + mov ar.lc = n C I0 +}{.bbb; (p10) br.dptk L(b01) C B + (p11) br.dptk L(b10) C B + (p12) br.dptk L(b11) C B + ;; +} + ALIGN(32) +L(b00): 
ldf8 u_1 = [up], 8 + mov acc1_2 = 0 + mov pr1_2 = 0 + mov pr0_3 = 0 + cmp.ne p8, p9 = r0, r0 + ;; + xma.l fp0b_3 = ux, v0, f0 + cmp.ne p12, p13 = r0, r0 + ldf8 u_2 = [up], 8 + xma.hu fp1a_3 = ux, v0, f0 + br.cloop.dptk L(gt4) + + xma.l fp0b_0 = uy, v0, f0 + xma.hu fp1a_0 = uy, v0, f0 + ;; + getfsig acc0 = fp0b_3 + xma.l fp1b_3 = ux, v1, fp1a_3 + xma.hu fp2a_3 = ux, v1, fp1a_3 + ;; + xma.l fp0b_1 = u_1, v0, f0 + xma.hu fp1a_1 = u_1, v0, f0 + ;; + getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = uy, v1, fp1a_0 + xma.hu fp2a_0 = uy, v1, fp1a_0 + ;; + getfsig pr1_3 = fp1b_3 + getfsig acc1_3 = fp2a_3 + xma.l fp0b_2 = u_2, v0, f0 + xma.hu fp1a_2 = u_2, v0, f0 + br L(cj4) + +L(gt4): xma.l fp0b_0 = uy, v0, f0 + xma.hu fp1a_0 = uy, v0, f0 + ;; + getfsig acc0 = fp0b_3 + xma.l fp1b_3 = ux, v1, fp1a_3 + ldf8 u_3 = [up], 8 + xma.hu fp2a_3 = ux, v1, fp1a_3 + ;; + xma.l fp0b_1 = u_1, v0, f0 + xma.hu fp1a_1 = u_1, v0, f0 + ;; + getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = uy, v1, fp1a_0 + xma.hu fp2a_0 = uy, v1, fp1a_0 + ;; + ldf8 u_0 = [up], 8 + getfsig pr1_3 = fp1b_3 + xma.l fp0b_2 = u_2, v0, f0 + ;; + getfsig acc1_3 = fp2a_3 + xma.hu fp1a_2 = u_2, v0, f0 + br L(00) + + + ALIGN(32) +L(b01): ldf8 u_0 = [up], 8 C M + mov acc1_1 = 0 C M I + mov pr1_1 = 0 C M I + mov pr0_2 = 0 C M I + cmp.ne p6, p7 = r0, r0 C M I + ;; + xma.l fp0b_2 = ux, v0, f0 C F + cmp.ne p10, p11 = r0, r0 C M I + ldf8 u_1 = [up], 8 C M + xma.hu fp1a_2 = ux, v0, f0 C F + ;; + xma.l fp0b_3 = uy, v0, f0 C F + xma.hu fp1a_3 = uy, v0, f0 C F + ;; + getfsig acc0 = fp0b_2 C M + xma.l fp1b_2 = ux, v1,fp1a_2 C F + ldf8 u_2 = [up], 8 C M + xma.hu fp2a_2 = ux, v1,fp1a_2 C F + br.cloop.dptk L(gt5) + + xma.l fp0b_0 = u_0, v0, f0 C F + xma.hu fp1a_0 = u_0, v0, f0 C F + ;; + getfsig pr0_3 = fp0b_3 C M + xma.l fp1b_3 = uy, v1,fp1a_3 C F + xma.hu fp2a_3 = uy, v1,fp1a_3 C F + ;; + getfsig pr1_2 = fp1b_2 C M + getfsig acc1_2 = fp2a_2 C M + xma.l fp0b_1 = u_1, v0, f0 C F + xma.hu fp1a_1 = u_1, v0, f0 C F + br L(cj5) + +L(gt5): xma.l fp0b_0 = u_0, v0, f0 + xma.hu fp1a_0 = u_0, v0, f0 + ;; + getfsig pr0_3 = fp0b_3 + xma.l fp1b_3 = uy, v1, fp1a_3 + xma.hu fp2a_3 = uy, v1, fp1a_3 + ;; + ldf8 u_3 = [up], 8 + getfsig pr1_2 = fp1b_2 + xma.l fp0b_1 = u_1, v0, f0 + ;; + getfsig acc1_2 = fp2a_2 + xma.hu fp1a_1 = u_1, v0, f0 + br L(01) + + + ALIGN(32) +L(b10): br.cloop.dptk L(gt2) + xma.l fp0b_1 = ux, v0, f0 + xma.hu fp1a_1 = ux, v0, f0 + ;; + xma.l fp0b_2 = uy, v0, f0 + xma.hu fp1a_2 = uy, v0, f0 + ;; + stf8 [rp] = fp0b_1, 8 + xma.l fp1b_1 = ux, v1, fp1a_1 + xma.hu fp2a_1 = ux, v1, fp1a_1 + ;; + getfsig acc0 = fp0b_2 + xma.l fp1b_2 = uy, v1, fp1a_2 + xma.hu fp2a_2 = uy, v1, fp1a_2 + ;; + getfsig pr1_1 = fp1b_1 + getfsig acc1_1 = fp2a_1 + mov ar.lc = r2 + getfsig pr1_2 = fp1b_2 + getfsig r8 = fp2a_2 + ;; + add s0 = pr1_1, acc0 + ;; + st8 [rp] = s0, 8 + cmp.ltu p8, p9 = s0, pr1_1 + sub r31 = -1, acc1_1 + ;; + .pred.rel "mutex", p8, p9 + (p8) add acc0 = pr1_2, acc1_1, 1 + (p9) add acc0 = pr1_2, acc1_1 + (p8) cmp.leu p10, p0 = r31, pr1_2 + (p9) cmp.ltu p10, p0 = r31, pr1_2 + ;; + st8 [rp] = acc0, 8 + (p10) add r8 = 1, r8 + br.ret.sptk.many b0 + +L(gt2): ldf8 u_3 = [up], 8 + mov acc1_0 = 0 + mov pr1_0 = 0 + ;; + mov pr0_1 = 0 + xma.l fp0b_1 = ux, v0, f0 + ldf8 u_0 = [up], 8 + xma.hu fp1a_1 = ux, v0, f0 + ;; + xma.l fp0b_2 = uy, v0, f0 + xma.hu fp1a_2 = uy, v0, f0 + ;; + getfsig acc0 = fp0b_1 + xma.l fp1b_1 = ux, v1, fp1a_1 + xma.hu fp2a_1 = ux, v1, fp1a_1 + ;; + ldf8 u_1 = [up], 8 + xma.l fp0b_3 = u_3, v0, f0 + xma.hu fp1a_3 = u_3, v0, f0 + ;; + getfsig pr0_2 = fp0b_2 + xma.l fp1b_2 = 
uy, v1, fp1a_2 + xma.hu fp2a_2 = uy, v1, fp1a_2 + ;; + ldf8 u_2 = [up], 8 + getfsig pr1_1 = fp1b_1 + ;; + {.mfi; getfsig acc1_1 = fp2a_1 + xma.l fp0b_0 = u_0, v0, f0 + cmp.ne p8, p9 = r0, r0 +}{.mfb; cmp.ne p12, p13 = r0, r0 + xma.hu fp1a_0 = u_0, v0, f0 + br L(10) +} + + ALIGN(32) +L(b11): mov acc1_3 = 0 + mov pr1_3 = 0 + mov pr0_0 = 0 + ldf8 u_2 = [up], 8 + cmp.ne p6, p7 = r0, r0 + br.cloop.dptk L(gt3) + ;; + xma.l fp0b_0 = ux, v0, f0 + xma.hu fp1a_0 = ux, v0, f0 + ;; + cmp.ne p10, p11 = r0, r0 + xma.l fp0b_1 = uy, v0, f0 + xma.hu fp1a_1 = uy, v0, f0 + ;; + getfsig acc0 = fp0b_0 + xma.l fp1b_0 = ux, v1, fp1a_0 + xma.hu fp2a_0 = ux, v1, fp1a_0 + ;; + xma.l fp0b_2 = u_2, v0, f0 + xma.hu fp1a_2 = u_2, v0, f0 + ;; + getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = uy, v1, fp1a_1 + xma.hu fp2a_1 = uy, v1, fp1a_1 + ;; + getfsig pr1_0 = fp1b_0 + getfsig acc1_0 = fp2a_0 + br L(cj3) + +L(gt3): xma.l fp0b_0 = ux, v0, f0 + cmp.ne p10, p11 = r0, r0 + ldf8 u_3 = [up], 8 + xma.hu fp1a_0 = ux, v0, f0 + ;; + xma.l fp0b_1 = uy, v0, f0 + xma.hu fp1a_1 = uy, v0, f0 + ;; + getfsig acc0 = fp0b_0 + xma.l fp1b_0 = ux, v1, fp1a_0 + ldf8 u_0 = [up], 8 + xma.hu fp2a_0 = ux, v1, fp1a_0 + ;; + xma.l fp0b_2 = u_2, v0, f0 + xma.hu fp1a_2 = u_2, v0, f0 + ;; + getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = uy, v1, fp1a_1 + xma.hu fp2a_1 = uy, v1, fp1a_1 + ;; + ldf8 u_1 = [up], 8 + getfsig pr1_0 = fp1b_0 + ;; + getfsig acc1_0 = fp2a_0 + xma.l fp0b_3 = u_3, v0, f0 + xma.hu fp1a_3 = u_3, v0, f0 + br L(11) + + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): C 00 + .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + ldf8 u_3 = [up], 8 + getfsig pr1_2 = fp1b_2 + (p8) cmp.leu p6, p7 = acc0, pr0_1 + (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; C 01 + .pred.rel "mutex", p6, p7 + getfsig acc1_2 = fp2a_2 + st8 [rp] = s0, 8 + xma.l fp0b_1 = u_1, v0, f0 + (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + xma.hu fp1a_1 = u_1, v0, f0 + ;; C 02 +L(01): + .pred.rel "mutex", p10, p11 + getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = u_0, v1, fp1a_0 + (p10) add s0 = pr1_1, acc0, 1 + (p11) add s0 = pr1_1, acc0 + xma.hu fp2a_0 = u_0, v1, fp1a_0 + nop 1 + ;; C 03 + .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + ldf8 u_0 = [up], 8 + getfsig pr1_3 = fp1b_3 + (p6) cmp.leu p8, p9 = acc0, pr0_2 + (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; C 04 + .pred.rel "mutex", p8, p9 + getfsig acc1_3 = fp2a_3 + st8 [rp] = s0, 8 + xma.l fp0b_2 = u_2, v0, f0 + (p8) add acc0 = pr0_3, acc1_1, 1 + (p9) add acc0 = pr0_3, acc1_1 + xma.hu fp1a_2 = u_2, v0, f0 + ;; C 05 +L(00): + .pred.rel "mutex", p12, p13 + getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = u_1, v1, fp1a_1 + (p12) add s0 = pr1_2, acc0, 1 + (p13) add s0 = pr1_2, acc0 + xma.hu fp2a_1 = u_1, v1, fp1a_1 + nop 1 + ;; C 06 + .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + ldf8 u_1 = [up], 8 + getfsig pr1_0 = fp1b_0 + (p8) cmp.leu p6, p7 = acc0, pr0_3 + (p9) cmp.ltu p6, p7 = acc0, pr0_3 + (p12) cmp.leu p10, p11 = s0, pr1_2 + (p13) cmp.ltu p10, p11 = s0, pr1_2 + ;; C 07 + .pred.rel "mutex", p6, p7 + getfsig acc1_0 = fp2a_0 + st8 [rp] = s0, 8 + xma.l fp0b_3 = u_3, v0, f0 + (p6) add acc0 = pr0_0, acc1_2, 1 + (p7) add acc0 = pr0_0, acc1_2 + xma.hu fp1a_3 = u_3, v0, f0 + ;; C 08 +L(11): + .pred.rel "mutex", p10, p11 + getfsig pr0_2 = fp0b_2 + xma.l fp1b_2 = u_2, v1, fp1a_2 + (p10) add s0 = pr1_3, acc0, 1 + (p11) add s0 = pr1_3, acc0 + xma.hu fp2a_2 = u_2, v1, fp1a_2 
+ nop 1 + ;; C 09 + .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + ldf8 u_2 = [up], 8 + getfsig pr1_1 = fp1b_1 + (p6) cmp.leu p8, p9 = acc0, pr0_0 + (p7) cmp.ltu p8, p9 = acc0, pr0_0 + (p10) cmp.leu p12, p13 = s0, pr1_3 + (p11) cmp.ltu p12, p13 = s0, pr1_3 + ;; C 10 + .pred.rel "mutex", p8, p9 + getfsig acc1_1 = fp2a_1 + st8 [rp] = s0, 8 + xma.l fp0b_0 = u_0, v0, f0 + (p8) add acc0 = pr0_1, acc1_3, 1 + (p9) add acc0 = pr0_1, acc1_3 + xma.hu fp1a_0 = u_0, v0, f0 + ;; C 11 +L(10): + .pred.rel "mutex", p12, p13 + getfsig pr0_3 = fp0b_3 + xma.l fp1b_3 = u_3, v1, fp1a_3 + (p12) add s0 = pr1_0, acc0, 1 + (p13) add s0 = pr1_0, acc0 + xma.hu fp2a_3 = u_3, v1, fp1a_3 + br.cloop.dptk L(top) + ;; +C *** MAIN LOOP END *** + + .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_2 = fp1b_2 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_1 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; +} .pred.rel "mutex", p6, p7 + {.mfi; getfsig acc1_2 = fp2a_2 + xma.l fp0b_1 = u_1, v0, f0 + nop 1 +}{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + xma.hu fp1a_1 = u_1, v0, f0 + ;; +} +L(cj5): + .pred.rel "mutex", p10, p11 + {.mfi; getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = u_0, v1, fp1a_0 + (p10) add s0 = pr1_1, acc0, 1 +}{.mfi; (p11) add s0 = pr1_1, acc0 + xma.hu fp2a_0 = u_0, v1, fp1a_0 + nop 1 + ;; +} .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + {.mmi; getfsig pr1_3 = fp1b_3 + st8 [rp] = s0, 8 + (p6) cmp.leu p8, p9 = acc0, pr0_2 +}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mfi; getfsig acc1_3 = fp2a_3 + xma.l fp0b_2 = u_2, v0, f0 + nop 1 +}{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1 + (p9) add acc0 = pr0_3, acc1_1 + xma.hu fp1a_2 = u_2, v0, f0 + ;; +} +L(cj4): + .pred.rel "mutex", p12, p13 + {.mfi; getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = u_1, v1, fp1a_1 + (p12) add s0 = pr1_2, acc0, 1 +}{.mfi; (p13) add s0 = pr1_2, acc0 + xma.hu fp2a_1 = u_1, v1, fp1a_1 + nop 1 + ;; +} .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_0 = fp1b_0 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_3 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 + (p12) cmp.leu p10, p11 = s0, pr1_2 + (p13) cmp.ltu p10, p11 = s0, pr1_2 + ;; +} .pred.rel "mutex", p6, p7 + {.mmi; getfsig acc1_0 = fp2a_0 + (p6) add acc0 = pr0_0, acc1_2, 1 + (p7) add acc0 = pr0_0, acc1_2 + ;; +} +L(cj3): + .pred.rel "mutex", p10, p11 + {.mfi; getfsig pr0_2 = fp0b_2 + xma.l fp1b_2 = u_2, v1, fp1a_2 + (p10) add s0 = pr1_3, acc0, 1 +}{.mfi; (p11) add s0 = pr1_3, acc0 + xma.hu fp2a_2 = u_2, v1, fp1a_2 + nop 1 + ;; +} .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + {.mmi; getfsig pr1_1 = fp1b_1 + st8 [rp] = s0, 8 + (p6) cmp.leu p8, p9 = acc0, pr0_0 +}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 + (p10) cmp.leu p12, p13 = s0, pr1_3 + (p11) cmp.ltu p12, p13 = s0, pr1_3 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; getfsig acc1_1 = fp2a_1 + (p8) add acc0 = pr0_1, acc1_3, 1 + (p9) add acc0 = pr0_1, acc1_3 + ;; +} .pred.rel "mutex", p12, p13 + {.mmi; (p12) add s0 = pr1_0, acc0, 1 + (p13) add s0 = pr1_0, acc0 + nop 1 + ;; +} .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_2 = fp1b_2 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_1 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; +} .pred.rel 
"mutex", p6, p7 + {.mmi; getfsig r8 = fp2a_2 + (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + ;; +} .pred.rel "mutex", p10, p11 + {.mmi; (p10) add s0 = pr1_1, acc0, 1 + (p11) add s0 = pr1_1, acc0 + (p6) cmp.leu p8, p9 = acc0, pr0_2 + ;; +} .pred.rel "mutex", p10, p11 + {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; st8 [rp] = s0, 8 + (p8) add acc0 = pr1_2, acc1_1, 1 + (p9) add acc0 = pr1_2, acc1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 + (p9) cmp.ltu p10, p11 = acc0, pr1_2 + (p12) add acc0 = 1, acc0 + ;; +}{.mmi; st8 [rp] = acc0, 8 + (p12) cmpeqor p10, p0 = 0, acc0 + nop 1 + ;; +}{.mib; (p10) add r8 = 1, r8 + mov ar.lc = r2 + br.ret.sptk.many b0 +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/popcount.asm b/gmp-6.3.0/mpn/ia64/popcount.asm new file mode 100644 index 0000000..c0b5c5c --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/popcount.asm @@ -0,0 +1,200 @@ +dnl IA-64 mpn_popcount -- mpn population count. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 1.5 +C Itanium 2: 1 + +C INPUT PARAMETERS +define(`up', `r32') +define(`n', `r33') + +define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19') +define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31') +define(`s',`r8') + + +ASM_START() +PROLOGUE(mpn_popcount) + .prologue +ifdef(`HAVE_ABI_32', +` addp4 up = 0, up C M I + nop.m 0 + zxt4 n = n C I + ;; +') + + {.mmi; add r9 = 512, up C prefetch pointer M I + ld8 r10 = [up], 8 C load first limb M01 + mov.i r2 = ar.lc C save ar.lc I0 +}{.mmi; and r14 = 3, n C M I + cmp.lt p15, p14 = 4, n C small count? 
M I + add n = -5, n C M I + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb + (p6) br.dptk .Lb01 C B + (p7) br.dptk .Lb10 C B + (p8) br.dptk .Lb11 C B +} + + +.Lb00: ld8 u1 = [up], 8 C M01 + shr.u n = n, 2 C I0 + mov s = 0 C M I + ;; + ld8 u2 = [up], 8 C M01 + popcnt c0 = r10 C I0 + mov.i ar.lc = n C I0 + ;; + ld8 u3 = [up], 8 C M01 + popcnt c1 = u1 C I0 + (p15) br.cond.dptk .grt4 C B + ;; + nop.m 0 C - + nop.m 0 C - + popcnt c2 = u2 C I0 + ;; + mov s = c0 C M I + popcnt c3 = u3 C I0 + br .Lcj4 C B + +.grt4: ld8 u0 = [up], 8 C M01 + popcnt c2 = u2 C I0 + br .LL00 C B + + +.Lb01: + popcnt s = r10 C I0 + (p14) br.ret.sptk.many b0 C B + +.grt1: ld8 u0 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; + ld8 u1 = [up], 8 C M01 + mov.i ar.lc = n C I0 + ;; + ld8 u2 = [up], 8 C M01 + popcnt c0 = u0 C I0 + mov c3 = 0 C I0 + + ;; + ld8 u3 = [up], 8 C M01 + popcnt c1 = u1 C I0 + br.cloop.dptk .Loop C B + br .Lend C B + + +.Lb10: ld8 u3 = [up], 8 C M01 + shr.u n = n, 2 C I0 + (p15) br.cond.dptk .grt2 C B + + popcnt s = r10 C I0 + ;; + popcnt c3 = u3 C I0 + br .Lcj2 C B + +.grt2: ld8 u0 = [up], 8 C M01 + mov.i ar.lc = n C I0 + popcnt c2 = r10 C I0 + ;; + ld8 u1 = [up], 8 C M01 + popcnt c3 = u3 C I0 + mov s = 0 C M I + ;; + ld8 u2 = [up], 8 C M01 + popcnt c0 = u0 C I0 + br .LL10 C B + + +.Lb11: ld8 u2 = [up], 8 C M01 + shr.u n = n, 2 C I0 + mov s = 0 C M I + ;; + ld8 u3 = [up], 8 C M01 + popcnt s = r10 C I0 + (p15) br.cond.dptk .grt3 C B + + popcnt c2 = u2 C I0 + ;; + popcnt c3 = u3 C I0 + br .Lcj3 C B + +.grt3: ld8 u0 = [up], 8 C M01 + popcnt c2 = u2 C I0 + mov.i ar.lc = n C I0 + mov c1 = 0 + ;; + ld8 u1 = [up], 8 C M01 + popcnt c3 = u3 C I0 + br .LL11 C B + + +.Loop: ld8 u0 = [up], 8 C M01 + popcnt c2 = u2 C I0 + add s = s, c3 C M I + ;; +.LL00: ld8 u1 = [up], 8 C M01 + popcnt c3 = u3 C I0 + add s = s, c0 C M I + ;; +.LL11: ld8 u2 = [up], 8 C M01 + popcnt c0 = u0 C I0 + add s = s, c1 C M I + ;; +.LL10: ld8 u3 = [up], 8 C M01 + popcnt c1 = u1 C I0 + add s = s, c2 C M I + lfetch [r9], 32 C M01 + nop.m 0 C - + br.cloop.dptk .Loop C B + ;; + +.Lend: popcnt c2 = u2 C I0 + add s = s, c3 C M I + ;; + popcnt c3 = u3 C I0 + add s = s, c0 C M I + ;; +.Lcj4: add s = s, c1 C M I + ;; +.Lcj3: add s = s, c2 C M I + ;; +.Lcj2: add s = s, c3 C M I + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/rsh1aors_n.asm b/gmp-6.3.0/mpn/ia64/rsh1aors_n.asm new file mode 100644 index 0000000..3c7defb --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/rsh1aors_n.asm @@ -0,0 +1,447 @@ +dnl IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2003-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2.5 +C Itanium 2: 1.5 + +C TODO +C * Rewrite function entry code using aorslsh1_n.asm style. +C * Micro-optimize feed-in and wind-down code. + +C INPUT PARAMETERS +define(`rp',`r32') +define(`up',`r33') +define(`vp',`r34') +define(`n',`r35') + +ifdef(`OPERATION_rsh1add_n',` + define(ADDSUB, add) + define(PRED, ltu) + define(INCR, 1) + define(LIM, -1) + define(func, mpn_rsh1add_n) +') +ifdef(`OPERATION_rsh1sub_n',` + define(ADDSUB, sub) + define(PRED, gtu) + define(INCR, -1) + define(LIM, 0) + define(func, mpn_rsh1sub_n) +') + +C Some useful aliases for registers we use +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17') +define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21') +define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25') +define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I + nop.m 0 + nop.m 0 + zxt4 n = n C I + ;; +') + {.mmi; ld8 r11 = [vp], 8 C M01 + ld8 r10 = [up], 8 C M01 + mov.i r2 = ar.lc C I0 +}{.mmi; and r14 = 3, n C M I + cmp.lt p15, p0 = 4, n C M I + add n = -4, n C M I + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb + (p6) br.dptk .Lb01 C B + (p7) br.dptk .Lb10 C B + (p8) br.dptk .Lb11 C B +} + +.Lb00: ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; + ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = r10, r11 C M I + ;; + ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + (p15) br.dpnt .grt4 C B + ;; + + cmp.PRED p7, p0 = w3, r10 C M I + and r8 = 1, w3 C M I + ADDSUB w0 = u0, v0 C M I + ;; + cmp.PRED p8, p0 = w0, u0 C M I + ADDSUB w1 = u1, v1 C M I + ;; + cmp.PRED p9, p0 = w1, u1 C M I + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + ;; + shrp x3 = w0, w3, 1 C I0 + ADDSUB w2 = u2, v2 C M I + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + br .Lcj4 C B + +.grt4: ld8 v3 = [vp], 8 C M01 + cmp.PRED p7, p0 = w3, r10 C M I + ld8 u3 = [up], 8 C M01 + and r8 = 1, w3 C M I + ;; + ADDSUB w0 = u0, v0 C M I + ld8 v0 = [vp], 8 C M01 + add n = -1, n + ;; + cmp.PRED p8, p0 = w0, u0 C M I + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + ;; + ld8 v1 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + cmp.PRED p9, p0 = w1, u1 C M I + ld8 u1 = [up], 8 C M01 + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + ;; + ADDSUB w2 = u2, v2 C M I + ld8 v2 = [vp], 8 C M01 + shrp x3 = w0, w3, 1 C I0 + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + br .LL00 C B + + +.Lb01: ADDSUB w2 = r10, r11 C M I + shr.u n = n, 2 C I0 + (p15) br.dpnt .grt1 C B + ;; + + cmp.PRED p6, p7 = w2, r10 C M I + shr.u x2 = w2, 1 C I0 + and r8 = 1, w2 C M I + ;; + (p6) dep x2 = -1, x2, 63, 1 C I0 + br .Lcj1 C B + +.grt1: ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ;; + ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov.i ar.lc = n C FIXME swap with next I0 + ;; + ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ;; + ld8 
v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.PRED p6, p0 = w2, r10 C M I + and r8 = 1, w2 C M I + ADDSUB w3 = u3, v3 C M I + br.cloop.dptk .grt5 C B + ;; + + cmp.PRED p7, p0 = w3, u3 C M I + ;; + ADDSUB w0 = u0, v0 C M I + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + ;; + cmp.PRED p8, p0 = w0, u0 C M I + shrp x2 = w3, w2, 1 C I0 + ADDSUB w1 = u1, v1 C M I + ;; + cmp.PRED p9, p0 = w1, u1 C M I + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + br .Lcj5 C B + +.grt5: ld8 v3 = [vp], 8 C M01 + cmp.PRED p7, p0 = w3, u3 C M I + ld8 u3 = [up], 8 C M01 + ;; + ADDSUB w0 = u0, v0 C M I + ld8 v0 = [vp], 8 C M01 + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + ;; + cmp.PRED p8, p0 = w0, u0 C M I + shrp x2 = w3, w2, 1 C I0 + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + ;; + ld8 v1 = [vp], 8 C M01 + cmp.PRED p9, p0 = w1, u1 C M I + ld8 u1 = [up], 8 C M01 + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + br .LL01 C B + + +.Lb10: ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ADDSUB w1 = r10, r11 C M I + (p15) br.dpnt .grt2 C B + ;; + + cmp.PRED p9, p0 = w1, r10 C M I + and r8 = 1, w1 C M I + ADDSUB w2 = u2, v2 C M I + ;; + cmp.PRED p6, p0 = w2, u2 C M I + ;; + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; + shrp x1 = w2, w1, 1 C I0 + shr.u x2 = w2, 1 C I0 + br .Lcj2 C B + +.grt2: ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ;; + ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov.i ar.lc = n C I0 + ;; + ld8 v1 = [vp], 8 C M01 + cmp.PRED p9, p0 = w1, r10 C M I + ld8 u1 = [up], 8 C M01 + and r8 = 1, w1 C M I + ;; + ADDSUB w2 = u2, v2 C M I + ld8 v2 = [vp], 8 C M01 + ;; + cmp.PRED p6, p0 = w2, u2 C M I + ld8 u2 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + br.cloop.dptk .grt6 C B + ;; + + cmp.PRED p7, p0 = w3, u3 C M I + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; + shrp x1 = w2, w1, 1 C I0 + ADDSUB w0 = u0, v0 C M I + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + br .Lcj6 C B + +.grt6: ld8 v3 = [vp], 8 C M01 + cmp.PRED p7, p0 = w3, u3 C M I + ld8 u3 = [up], 8 C M01 + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; + shrp x1 = w2, w1, 1 C I0 + ADDSUB w0 = u0, v0 C M I + ld8 v0 = [vp], 8 C M01 + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + br .LL10 C B + + +.Lb11: ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; + ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I + (p15) br.dpnt .grt3 C B + ;; + + cmp.PRED p8, p0 = w0, r10 C M I + ADDSUB w1 = u1, v1 C M I + and r8 = 1, w0 C M I + ;; + cmp.PRED p9, p0 = w1, u1 C M I + ;; + ADDSUB w2 = u2, v2 C M I + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + ;; + cmp.PRED p6, p0 = w2, u2 C M I + shrp x0 = w1, w0, 1 C I0 + ;; + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + br .Lcj3 C B + +.grt3: ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ;; + ld8 v0 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + cmp.PRED p8, p0 = w0, r10 C M I + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + and r8 = 1, w0 C M I + ;; + ld8 v1 = [vp], 8 C M01 + cmp.PRED p9, p0 = w1, u1 C M I + ld8 u1 = [up], 8 C M01 + ;; + ADDSUB w2 = u2, v2 C M I + ld8 v2 = [vp], 8 C M01 + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + ;; + cmp.PRED p6, p0 = w2, u2 C M I + shrp x0 = w1, w0, 1 C I0 + ld8 u2 = [up], 8 C M01 + 
ADDSUB w3 = u3, v3 C M I + br.cloop.dptk .grt7 C B + ;; + + cmp.PRED p7, p0 = w3, u3 C M I + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + br .Lcj7 C B + +.grt7: ld8 v3 = [vp], 8 C M01 + cmp.PRED p7, p0 = w3, u3 C M I + ld8 u3 = [up], 8 C M01 + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + br .LL11 C B + + +C *** MAIN LOOP START *** + ALIGN(32) +.Loop: st8 [rp] = x3, 8 C M23 + ld8 v3 = [vp], 8 C M01 + cmp.PRED p7, p0 = w3, u3 C M I + ld8 u3 = [up], 8 C M01 + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; +.LL11: st8 [rp] = x0, 8 C M23 + shrp x1 = w2, w1, 1 C I0 + ADDSUB w0 = u0, v0 C M I + ld8 v0 = [vp], 8 C M01 + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + ;; +.LL10: cmp.PRED p8, p0 = w0, u0 C M I + shrp x2 = w3, w2, 1 C I0 + nop.b 0 + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + nop.b 0 + ;; + st8 [rp] = x1, 8 C M23 + ld8 v1 = [vp], 8 C M01 + cmp.PRED p9, p0 = w1, u1 C M I + ld8 u1 = [up], 8 C M01 + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + ;; +.LL01: st8 [rp] = x2, 8 C M23 + shrp x3 = w0, w3, 1 C I0 + ADDSUB w2 = u2, v2 C M I + ld8 v2 = [vp], 8 C M01 + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + ;; +.LL00: cmp.PRED p6, p0 = w2, u2 C M I + shrp x0 = w1, w0, 1 C I0 + nop.b 0 + ld8 u2 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + br.cloop.dptk .Loop C B + ;; +C *** MAIN LOOP END *** + +.Lskip: st8 [rp] = x3, 8 C M23 + cmp.PRED p7, p0 = w3, u3 C M I + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; +.Lcj7: st8 [rp] = x0, 8 C M23 + shrp x1 = w2, w1, 1 C I0 + ADDSUB w0 = u0, v0 C M I + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + ;; +.Lcj6: cmp.PRED p8, p0 = w0, u0 C M I + shrp x2 = w3, w2, 1 C I0 + ADDSUB w1 = u1, v1 C M I + ;; + st8 [rp] = x1, 8 C M23 + cmp.PRED p9, p0 = w1, u1 C M I + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + ;; +.Lcj5: st8 [rp] = x2, 8 C M23 + shrp x3 = w0, w3, 1 C I0 + ADDSUB w2 = u2, v2 C M I + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + ;; +.Lcj4: cmp.PRED p6, p0 = w2, u2 C M I + shrp x0 = w1, w0, 1 C I0 + ;; + st8 [rp] = x3, 8 C M23 + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; +.Lcj3: st8 [rp] = x0, 8 C M23 + shrp x1 = w2, w1, 1 C I0 + shr.u x2 = w2, 1 C I0 + ;; +.Lcj2: st8 [rp] = x1, 8 C M23 + (p6) dep x2 = -1, x2, 63, 1 C I0 + ;; +.Lcj1: st8 [rp] = x2 C M23 + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE() diff --git a/gmp-6.3.0/mpn/ia64/sec_tabselect.asm b/gmp-6.3.0/mpn/ia64/sec_tabselect.asm new file mode 100644 index 0000000..9b11cde --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/sec_tabselect.asm @@ -0,0 +1,148 @@ +dnl IA-64 mpn_sec_tabselect. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 2.5
+
+C NOTES
+C * Using software pipelining could trivially yield 2 c/l without unrolling,
+C   or 1+epsilon with unrolling. (This code was modelled after the powerpc64
+C   code, for simplicity.)
+
+C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp', `r32')
+define(`tp', `r33')
+define(`n', `r34')
+define(`nents', `r35')
+define(`which', `r36')
+
+define(`mask', `r8')
+
+define(`rp1', `r32')
+define(`tp1', `r33')
+define(`rp2', `r14')
+define(`tp2', `r15')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+ .prologue
+ .save ar.lc, r2
+ .body
+ifdef(`HAVE_ABI_32',`
+ {.mmi; addp4 rp = 0, rp C M I
+ addp4 tp = 0, tp C M I
+ zxt4 n = n C I
+}{.mii; nop 0
+ zxt4 nents = nents C I
+ zxt4 which = which C I
+ ;;
+}')
+ {.mmi; add rp2 = 8, rp1
+ add tp2 = 8, tp1
+ add r6 = -2, n
+ ;;
+}{.mmi; cmp.eq p10, p0 = 1, n
+ and r9 = 1, n C low bit of n chooses loop entry below
+ shr.u r6 = r6, 1 C inner loop count
+ ;;
+}{.mmi; cmp.eq p8, p0 = 0, r9
+ sub which = nents, which
+ shl n = n, 3
+ ;;
+}
+L(outer):
+ {.mmi; cmp.eq p6, p7 = which, nents C are we at the selected table entry?
+ nop 0
+ mov ar.lc = r6 C I0
+ ;;
+}{.mmb;
+ (p6) mov mask = -1
+ (p7) mov mask = 0
+ (p8) br.dptk L(top) C branch to loop entry if n even
+ ;;
+}{.mmi; ld8 r16 = [tp1], 8
+ add tp2 = 8, tp2
+ nop 0
+ ;;
+}{.mmi; ld8 r18 = [rp1]
+ and r16 = r16, mask
+ nop 0
+ ;;
+}{.mmi; andcm r18 = r18, mask
+ ;;
+ or r16 = r16, r18
+ nop 0
+ ;;
+}{.mmb; st8 [rp1] = r16, 8
+ add rp2 = 8, rp2
+ (p10) br.dpnt L(end)
+}
+ ALIGN(32)
+L(top):
+ {.mmi; ld8 r16 = [tp1], 16
+ ld8 r17 = [tp2], 16
+ nop 0
+ ;;
+}{.mmi; ld8 r18 = [rp1]
+ and r16 = r16, mask
+ nop 0
+}{.mmi; ld8 r19 = [rp2]
+ and r17 = r17, mask
+ nop 0
+ ;;
+}{.mmi; andcm r18 = r18, mask
+ andcm r19 = r19, mask
+ nop 0
+ ;;
+}{.mmi; or r16 = r16, r18
+ or r17 = r17, r19
+ nop 0
+ ;;
+}{.mmb; st8 [rp1] = r16, 16
+ st8 [rp2] = r17, 16
+ br.cloop.dptk L(top)
+ ;;
+}
+L(end):
+ {.mmi; sub rp1 = rp1, n C move rp back to beginning
+ sub rp2 = rp2, n C move rp back to beginning
+ cmp.ne p9, p0 = 1, nents
+}{.mmb; add nents = -1, nents
+ nop 0
+ (p9) br.dptk L(outer)
+ ;;
+}{.mib; nop 0
+ nop 0
+ br.ret.sptk.many b0
+}
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm b/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..727f489
--- /dev/null
+++ b/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm
@@ -0,0 +1,156 @@
+dnl IA-64 mpn_sqr_diag_addlsh1
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: ?
+C Itanium 2: 2    Unrolling could bring it to 1.5 + epsilon
+
+C Exact performance table. The 2nd line is this code, the 3rd line is ctop-
+C less code. In an assembly sqr_basecase, the ctop-full numbers will become a
+C few cycles better since we can mitigate the many I0 instructions.
+C
+C     1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
+C     -  20  22  24  26  28  30  32  34  36  38  40  42  44  46  48  50  52  54  56  Needs updating
+C     -  13  16  17  18  20  21  23  25  26  30  31  31  33  34  36  38  39  42  43
+
+C We should keep in mind that this code takes linear time in a O(n^2) context
+C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
+C around 60. Keeping overhead down for smallish operands (< 10) is more
+C important than optimal cycle counts.
+
+C TODO
+C * Make sure we don't depend on uninitialised r-registers, f-registers, or
+C   p-registers.
+C * Optimise by doing first two loop iterations in function header.
+
+C INPUT PARAMETERS
+define(`rp_param', `r32') define(`rp', `r14') C size: 2n
+define(`tp_param', `r33') define(`tp', `r15') C size: 2n - 2
+define(`up_param', `r34') define(`up', `r31') C size: n
+define(`n', `r35')
+
+ifdef(`HAVE_ABI_32',`
+ define(`ABI64', `')
+ define(`ABI32', `$1')
+',`
+ define(`ABI64', `$1')
+ define(`ABI32', `')
+')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+
+ .prologue
+ .save ar.pfs, r2
+ .save ar.lc, r3
+ .body
+
+ {.mii; alloc r2 = ar.pfs, 4,24,0,24 C M
+ mov r3 = ar.lc C I0
+ ABI64(` nop 4711 ')
+ ABI32(` zxt4 n = n ')
+}{.mmi; ABI64(` mov tp = tp_param ') C M I
+ ABI32(` addp4 tp = 0, tp_param') C M I
+ ABI64(` mov up = up_param ') C M I
+ ABI32(` addp4 up = 0, up_param') C M I
+ ABI64(` mov rp = rp_param ') C M I
+ ABI32(` addp4 rp = 0, rp_param') C M I
+ ;;
+}{.mmi; ld8 r36 = [tp], 8 C M
+ add r20 = -2, n C M I
+ mov r9 = ar.ec C I0
+ ;;
+}{.mmi; ld8 r32 = [tp], 8 C M
+ mov r16 = 0 C M I
+ mov ar.ec = 7 C I0
+ ;;
+}{.mmi; nop 4711
+ mov r44 = 0 C M I
+ mov ar.lc = r20 C I0
+ ;;
+}{.mii; mov r33 = 0
+ mov r10 = pr C I0
+ mov pr.rot = 0x30000 C I0
+ ;;
+} br.cexit.spnt.few.clr L(end)
+
+dnl *** MAIN LOOP START ***
+ ALIGN(32)
+L(top):
+ {.mfi; (p18) ldf8 f33 = [up], 8 C M
+ (p20) xma.l f36 = f35, f35, f42 C F
+ (p41) cmpequc p50, p0 = -1, r44 C M I
+}{.mfi; setfsig f40 = r16 C M23
+ (p20) xma.hu f38 = f35, f35, f42 C F
+ (p23) add r50 = r41, r49 C M I
+ ;;
+}{.mmi; (p16) ld8 r36 = [tp], 8 C M
+ (p23) cmpltu p40, p0 = r50, r41 C cyout hi M I
+ (p19) shrp r45 = r38, r35, 63 C non-critical I0
+}{.mmi; (p21) getfsig r39 = f39 C hi M2
+ (p24) st8 [rp] = r51, 8 C hi M23
+ (p41) add r44 = 1, r44 C M I
+ ;;
+}{.mmi; (p16) ld8 r32 = [tp], 8 C M
+ (p50) cmpeqor p40, p0 = -1, r50 C cyout hi M I
+ (p17) shrp r16 = r33, r37, 63 C critical I0
+}{.mmi; (p21) getfsig r42 = f37 C lo M2
+ (p23) st8 [rp] = r44, 8 C lo M23
+ (p50) add r50 = 1, r50 C M I
+ ;;
+} br.ctop.sptk.few.clr L(top) C B
+dnl *** MAIN LOOP END ***
+ ;;
+L(end):
+ {.mmi; nop 4711
+ (p41) add r44 = 1, r44 C M I
+ shr.u r48 = r39, 63 C I0
+ ;;
+}{.mmi; st8 [rp] = r51, 8 C M23
+ (p41) cmpequc p6, p0 = 0, r44 C M I
+ add r50 = r41, r48 C M I
+ ;;
+}{.mmi; st8 [rp] = r44, 8 C M23
+ (p6) add r50 = 1, r50 C M I
+ mov ar.lc = r3 C I0
+ ;;
+}{.mii; st8 [rp] = r50 C M23
+ mov ar.ec = r9 C I0
+ mov pr = r10 C I0
+ ;;
+}{.mib; nop 4711
+ mov ar.pfs = r2 C I0
+ br.ret.sptk.many b0 C B
+}
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/ia64/submul_1.asm b/gmp-6.3.0/mpn/ia64/submul_1.asm
new file mode 100644
index 0000000..cb2a552
--- /dev/null
+++ b/gmp-6.3.0/mpn/ia64/submul_1.asm
@@ -0,0 +1,647 @@
+dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
+dnl result from a second limb vector.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2000-2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 4.0
+C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l)
+
+C TODO
+C * Optimize feed-in and wind-down code, both for speed and code size.
+C * Handle low limb input and results specially, using a common stf8 in the
+C   epilogue.
+C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
+C   2nd bundle. This will allow the bbb bundle to be one cycle earlier and
+C   save a cycle.
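+
+C For reference, the contract here is: subtract vl * {up,n} from {rp,n} and
+C return the most significant limb of the product plus the borrow out of the
+C top limb. A minimal C sketch of that contract, not of this file's method;
+C the name ref_submul_1, the 64-bit limb type, and the use of a GCC-style
+C unsigned __int128 are illustrative assumptions:
+C
+C   typedef unsigned long long limb_t;   /* assume 64-bit limbs */
+C
+C   limb_t
+C   ref_submul_1 (limb_t *rp, const limb_t *up, long n, limb_t vl)
+C   {
+C     limb_t cy = 0;                     /* limb owed at current position */
+C     for (long i = 0; i < n; i++)
+C       {
+C         /* two-limb product plus carried-in limb; cannot overflow */
+C         unsigned __int128 p = (unsigned __int128) up[i] * vl + cy;
+C         limb_t plo = (limb_t) p;       /* low product limb */
+C         limb_t phi = (limb_t) (p >> 64); /* high product limb */
+C         limb_t r = rp[i];
+C         rp[i] = r - plo;               /* subtract low limb in place */
+C         cy = phi + (r < plo);          /* add borrow out of this limb */
+C       }
+C     return cy;
+C   }
+C
+C The code below reaches the same result differently: it negates vl up front
+C (sub vl = r0, vl) so that xma.l/xma.hu can fold each rp limb into the
+C product, and then fixes up borrows with cmp.ltu in integer registers.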
+ +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`vl', `r35') + +ASM_START() +PROLOGUE(mpn_submul_1) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mmi + mov r10 = rp C M I + mov r9 = up C M I + sub vl = r0, vl C M I negate vl +} +{.mmi + ldf8 f8 = [rp], 8 C M + ldf8 f7 = [up], 8 C M + add r19 = -1, n C M I n - 1 + ;; +} +{.mmi + cmp.eq p6, p0 = 0, vl C M I + mov r8 = 0 C M I zero cylimb + mov r2 = ar.lc C I0 +} +{.mmi + setf.sig f6 = vl C M2 M3 + and r14 = 3, n C M I + shr.u r19 = r19, 2 C I0 + ;; +} +{.mmb + nop 0 + cmp.eq p10, p0 = 0, r14 C M I + (p6) br.spnt .Ldone C B vl == 0 +} +{.mmi + cmp.eq p11, p0 = 2, r14 C M I + cmp.eq p12, p0 = 3, r14 C M I + mov ar.lc = r19 C I0 +} +{.bbb + (p10) br.dptk .Lb00 C B + (p11) br.dptk .Lb10 C B + (p12) br.dptk .Lb11 C B + ;; +} + +.Lb01: br.cloop.dptk .grt1 + + xma.l f39 = f7, f6, f8 + xma.hu f43 = f7, f6, f8 + ;; + getf.sig r27 = f39 C lo + getf.sig r31 = f43 C hi + ld8 r20 = [r9], 8 + br .Lcj1 + +.grt1: ldf8 f44 = [rp], 8 + ldf8 f32 = [up], 8 + ;; + ldf8 f45 = [rp], 8 + ldf8 f33 = [up], 8 + ;; + ldf8 f46 = [rp], 8 + xma.l f39 = f7, f6, f8 + ldf8 f34 = [up], 8 + xma.hu f43 = f7, f6, f8 + ;; + ldf8 f47 = [rp], 8 + xma.l f36 = f32, f6, f44 + ldf8 f35 = [up], 8 + xma.hu f40 = f32, f6, f44 + br.cloop.dptk .grt5 + ;; + + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 C hi + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 C hi + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 C hi + getf.sig r26 = f38 C lo + ld8 r23 = [r9], 8 + br .Lcj5 + +.grt5: ldf8 f44 = [rp], 8 + ldf8 f32 = [up], 8 + ;; + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f45 = [rp], 8 + getf.sig r31 = f43 C hi + ldf8 f33 = [up], 8 + ;; + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f46 = [rp], 8 + getf.sig r28 = f40 C hi + ldf8 f34 = [up], 8 + ;; + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f47 = [rp], 8 + getf.sig r29 = f41 C hi + ldf8 f35 = [up], 8 + ;; + getf.sig r26 = f38 C lo + xma.l f36 = f32, f6, f44 + ld8 r23 = [r9], 8 + xma.hu f40 = f32, f6, f44 + br.cloop.dptk .Loop + br .Lend + + +.Lb10: ldf8 f47 = [rp], 8 + ldf8 f35 = [up], 8 + br.cloop.dptk .grt2 + + xma.l f38 = f7, f6, f8 + xma.hu f42 = f7, f6, f8 + ;; + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r26 = f38 C lo + getf.sig r30 = f42 C hi + ld8 r23 = [r9], 8 + ;; + getf.sig r27 = f39 C lo + getf.sig r31 = f43 C hi + ld8 r20 = [r9], 8 + br .Lcj2 + +.grt2: ldf8 f44 = [rp], 8 + ldf8 f32 = [up], 8 + ;; + ldf8 f45 = [rp], 8 + ldf8 f33 = [up], 8 + xma.l f38 = f7, f6, f8 + xma.hu f42 = f7, f6, f8 + ;; + ldf8 f46 = [rp], 8 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f47 = [rp], 8 + ldf8 f35 = [up], 8 + ;; + getf.sig r26 = f38 C lo + xma.l f36 = f32, f6, f44 + ld8 r23 = [r9], 8 + xma.hu f40 = f32, f6, f44 + br.cloop.dptk .grt6 + + getf.sig r30 = f42 C hi + ;; + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 C hi + getf.sig r24 = 
f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 C hi + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + br .Lcj6 + +.grt6: ldf8 f44 = [rp], 8 + getf.sig r30 = f42 C hi + ldf8 f32 = [up], 8 + ;; + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f45 = [rp], 8 + getf.sig r31 = f43 C hi + ldf8 f33 = [up], 8 + ;; + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f46 = [rp], 8 + getf.sig r28 = f40 C hi + ldf8 f34 = [up], 8 + ;; + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + br .LL10 + + +.Lb11: ldf8 f46 = [rp], 8 + ldf8 f34 = [up], 8 + ;; + ldf8 f47 = [rp], 8 + ldf8 f35 = [up], 8 + br.cloop.dptk .grt3 + + xma.l f37 = f7, f6, f8 + xma.hu f41 = f7, f6, f8 + ;; + xma.l f38 = f34, f6, f46 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 C hi + ld8 r22 = [r9], 8 + ;; + getf.sig r26 = f38 C lo + getf.sig r30 = f42 C hi + ld8 r23 = [r9], 8 + ;; + getf.sig r27 = f39 C lo + getf.sig r31 = f43 C hi + ld8 r20 = [r9], 8 + br .Lcj3 + +.grt3: ldf8 f44 = [rp], 8 + xma.l f37 = f7, f6, f8 + ldf8 f32 = [up], 8 + xma.hu f41 = f7, f6, f8 + ;; + ldf8 f45 = [rp], 8 + xma.l f38 = f34, f6, f46 + ldf8 f33 = [up], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f46 = [rp], 8 + ldf8 f34 = [up], 8 + ;; + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f47 = [rp], 8 + getf.sig r29 = f41 C hi + ldf8 f35 = [up], 8 + ;; + getf.sig r26 = f38 C lo + xma.l f36 = f32, f6, f44 + ld8 r23 = [r9], 8 + xma.hu f40 = f32, f6, f44 + br.cloop.dptk .grt7 + ;; + + getf.sig r30 = f42 C hi + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 C hi + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + br .Lcj7 + +.grt7: ldf8 f44 = [rp], 8 + getf.sig r30 = f42 C hi + ldf8 f32 = [up], 8 + ;; + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f45 = [rp], 8 + getf.sig r31 = f43 C hi + ldf8 f33 = [up], 8 + ;; + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + br .LL11 + + +.Lb00: ldf8 f45 = [rp], 8 + ldf8 f33 = [up], 8 + ;; + ldf8 f46 = [rp], 8 + ldf8 f34 = [up], 8 + ;; + ldf8 f47 = [rp], 8 + xma.l f36 = f7, f6, f8 + ldf8 f35 = [up], 8 + xma.hu f40 = f7, f6, f8 + br.cloop.dptk .grt4 + + xma.l f37 = f33, f6, f45 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 C hi + xma.l f39 = f35, f6, f47 + getf.sig r25 = f37 C lo + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 C hi + getf.sig r26 = f38 C lo + ld8 r23 = [r9], 8 + ;; + getf.sig r30 = f42 C hi + getf.sig r27 = f39 C lo + ld8 r20 = [r9], 8 + br .Lcj4 + +.grt4: ldf8 f44 = [rp], 8 + xma.l f37 = f33, f6, f45 + ldf8 f32 = [up], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f45 = [rp], 8 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f46 + getf.sig r24 = f36 C lo + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f46 = [rp], 8 + getf.sig r28 = f40 C hi + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f47 + 
getf.sig r25 = f37 C lo + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f47 = [rp], 8 + getf.sig r29 = f41 C hi + ldf8 f35 = [up], 8 + ;; + getf.sig r26 = f38 C lo + xma.l f36 = f32, f6, f44 + ld8 r23 = [r9], 8 + xma.hu f40 = f32, f6, f44 + br.cloop.dptk .grt8 + ;; + + getf.sig r30 = f42 C hi + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + br .Lcj8 + +.grt8: ldf8 f44 = [rp], 8 + getf.sig r30 = f42 C hi + ldf8 f32 = [up], 8 + ;; + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + br .LL00 + + ALIGN(32) +.Loop: +{.mmi + ldf8 f44 = [rp], 8 + cmp.ltu p6, p0 = r27, r8 C lo cmp + sub r14 = r27, r8 C lo sub +} +{.mmi + getf.sig r30 = f42 C hi + ldf8 f32 = [up], 8 + sub r8 = r20, r31 C hi sub + ;; C 01 +} +{.mmf + getf.sig r27 = f39 C lo + st8 [r10] = r14, 8 + xma.l f37 = f33, f6, f45 +} +{.mfi + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + (p6) add r8 = 1, r8 + ;; C 02 +} +{.mmi +.LL00: ldf8 f45 = [rp], 8 + cmp.ltu p6, p0 = r24, r8 + sub r14 = r24, r8 +} +{.mmi + getf.sig r31 = f43 C hi + ldf8 f33 = [up], 8 + sub r8 = r21, r28 + ;; C 03 +} +{.mmf + getf.sig r24 = f36 C lo + st8 [r10] = r14, 8 + xma.l f38 = f34, f6, f46 +} +{.mfi + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + (p6) add r8 = 1, r8 + ;; C 04 +} +{.mmi +.LL11: ldf8 f46 = [rp], 8 + cmp.ltu p6, p0 = r25, r8 + sub r14 = r25, r8 +} +{.mmi + getf.sig r28 = f40 C hi + ldf8 f34 = [up], 8 + sub r8 = r22, r29 + ;; C 05 +} +{.mmf + getf.sig r25 = f37 C lo + st8 [r10] = r14, 8 + xma.l f39 = f35, f6, f47 +} +{.mfi + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + (p6) add r8 = 1, r8 + ;; C 06 +} +{.mmi +.LL10: ldf8 f47 = [rp], 8 + cmp.ltu p6, p0 = r26, r8 + sub r14 = r26, r8 +} +{.mmi + getf.sig r29 = f41 C hi + ldf8 f35 = [up], 8 + sub r8 = r23, r30 + ;; C 07 +} +{.mmf + getf.sig r26 = f38 C lo + st8 [r10] = r14, 8 + xma.l f36 = f32, f6, f44 +} +{.mfi + ld8 r23 = [r9], 8 + xma.hu f40 = f32, f6, f44 + (p6) add r8 = 1, r8 +} + br.cloop.dptk .Loop + ;; + +.Lend: + cmp.ltu p6, p0 = r27, r8 + sub r14 = r27, r8 + getf.sig r30 = f42 + sub r8 = r20, r31 + ;; + getf.sig r27 = f39 + st8 [r10] = r14, 8 + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + (p6) add r8 = 1, r8 + ;; +.Lcj8: + cmp.ltu p6, p0 = r24, r8 + sub r14 = r24, r8 + getf.sig r31 = f43 + sub r8 = r21, r28 + ;; + getf.sig r24 = f36 + st8 [r10] = r14, 8 + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + (p6) add r8 = 1, r8 + ;; +.Lcj7: + cmp.ltu p6, p0 = r25, r8 + sub r14 = r25, r8 + getf.sig r28 = f40 + sub r8 = r22, r29 + ;; + getf.sig r25 = f37 + st8 [r10] = r14, 8 + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + (p6) add r8 = 1, r8 + ;; +.Lcj6: + cmp.ltu p6, p0 = r26, r8 + sub r14 = r26, r8 + getf.sig r29 = f41 + sub r8 = r23, r30 + ;; + getf.sig r26 = f38 + st8 [r10] = r14, 8 + ld8 r23 = [r9], 8 + (p6) add r8 = 1, r8 + ;; +.Lcj5: + cmp.ltu p6, p0 = r27, r8 + sub r14 = r27, r8 + getf.sig r30 = f42 + sub r8 = r20, r31 + ;; + getf.sig r27 = f39 + st8 [r10] = r14, 8 + ld8 r20 = [r9], 8 + (p6) add r8 = 1, r8 + ;; +.Lcj4: + cmp.ltu p6, p0 = r24, r8 + sub r14 = r24, r8 + getf.sig r31 = f43 + sub r8 = r21, r28 + ;; + st8 [r10] = r14, 8 + (p6) add r8 = 1, r8 + ;; +.Lcj3: + cmp.ltu p6, p0 = r25, r8 + sub r14 = r25, r8 + sub r8 = r22, r29 + ;; + st8 [r10] = r14, 8 + (p6) add r8 = 1, r8 + ;; +.Lcj2: + cmp.ltu p6, p0 = r26, r8 + sub r14 = r26, r8 + sub r8 = r23, r30 + ;; + st8 [r10] = r14, 8 + (p6) 
add r8 = 1, r8 + ;; +.Lcj1: + cmp.ltu p6, p0 = r27, r8 + sub r14 = r27, r8 + sub r8 = r20, r31 + ;; + st8 [r10] = r14, 8 + mov ar.lc = r2 + (p6) add r8 = 1, r8 + br.ret.sptk.many b0 +.Ldone: mov ar.lc = r2 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() -- cgit v1.2.3