From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/ia64/aors_n.asm | 852 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 852 insertions(+) create mode 100644 gmp-6.3.0/mpn/ia64/aors_n.asm (limited to 'gmp-6.3.0/mpn/ia64/aors_n.asm') diff --git a/gmp-6.3.0/mpn/ia64/aors_n.asm b/gmp-6.3.0/mpn/ia64/aors_n.asm new file mode 100644 index 0000000..7705ce6 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/aors_n.asm @@ -0,0 +1,852 @@ +dnl IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C Itanium: 2.67 +C Itanium 2: 1.25 + +C TODO +C * Consider using special code for small n, using something like +C "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code. +C * The non-nc code was trimmed cycle for cycle to its current state. It is +C probably hard to save more than an odd cycle there. The nc code is much +C cruder (since tune/speed doesn't have any applicable direct measurements). +C * Without the nc entry points, this becomes around 1800 bytes of object +C code; the nc code adds over 1000 bytes. We should perhaps sacrifice a +C few cycles for the non-nc code and let it fall into the nc code. + +C INPUT PARAMETERS (cy is read only by the nc entry point) +define(`rp', `r32') +define(`up', `r33') +define(`vp', `r34') +define(`n', `r35') +define(`cy', `r36') + +ifdef(`OPERATION_add_n',` + define(ADDSUB, add) + define(CND, ltu) + define(INCR, 1) + define(LIM, -1) + define(LIM2, 0) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) +') +ifdef(`OPERATION_sub_n',` + define(ADDSUB, sub) + define(CND, gtu) + define(INCR, -1) + define(LIM, 0) + define(LIM2, -1) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) +') + +define(PFDIST, 500) + +C Some useful aliases for registers we use +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17') +define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27') +define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31') +define(`rpx',`r3') +define(`upadv',`r20') define(`vpadv',`r21') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + nop.i 0 + addp4 vp = 0, vp C M I + nop.m 0 + zxt4 n = n C I + ;; +') + + {.mmi; ld8 r11 = [vp], 8 C M01 + ld8 r10 = [up], 8 C M01 + mov r2 = ar.lc C I0 +}{.mmi; and r14 = 7, n C M I + cmp.lt p15, p14 = 8, n C M I + add n = -6, n C M I + ;; +}{.mmi; add upadv 
= PFDIST, up C Merging these lines into the feed-in + add vpadv = PFDIST, vp C code could save a cycle per call at + mov r23 = cy C the expense of code size. + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb; (p6) br.dptk .Lc001 C B + (p7) br.dptk .Lc010 C B + (p8) br.dptk .Lc011 C B + ;; +}{.mmi; cmp.eq p9, p0 = 4, r14 C M I + cmp.eq p10, p0 = 5, r14 C M I + cmp.eq p11, p0 = 6, r14 C M I +}{.bbb; (p9) br.dptk .Lc100 C B + (p10) br.dptk .Lc101 C B + (p11) br.dptk .Lc110 C B + ;; +}{.mmi; ld8 r19 = [vp], 8 C M01 + ld8 r18 = [up], 8 C M01 + cmp.ne p13, p0 = 0, cy C copy cy to p13 M I +}{.mmb; cmp.eq p12, p0 = 7, r14 C M I + nop 0 + (p12) br.dptk .Lc111 C B + ;; +} + +.Lc000: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add vpadv = PFDIST, vp C M I + ld8 v0 = [vp], 8 C M01 + mov ar.lc = n C I0 +}{.mmi; ld8 u0 = [up], 8 C M01 + ADDSUB w1 = r10, r11 C M I + nop 0 + ;; +}{.mmi; add upadv = PFDIST, up C M I + ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, r10 C M I +}{.mmi; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r18, r19 C M I + add rpx = 8, rp C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, r18 C M I + (p13) cmpeqor p7, p0 = LIM, w1 C M I +}{.mmi; ld8 u2 = [up], 8 C M01 + (p13) add w1 = INCR, w1 C M I + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m0) +} + +.Lc001: + {.mmi; (p15) ld8 v1 = [vp], 8 C M01 + (p15) ld8 u1 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I +}{.mmb; nop 0 + nop 0 + (p15) br L(0) + ;; +}{.mmi; cmp.ne p9, p0 = 0, r23 C M I + mov r8 = 0 + cmp.CND p6, p0 = w0, r10 C M I + ;; +}{.mmb; (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + br L(cj1) C B +} +L(0): + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v3 = 
[vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + mov ar.lc = n C I0 +}{.mmi; nop 0 + cmp.ne p9, p0 = 0, r23 C M I + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I + add rpx = 16, rp C M I +}{.mmb; ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + br L(c1) C B +} + +.Lc010: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov r8 = 0 C M I +}{.mmb; ADDSUB w3 = r10, r11 C M I + cmp.ne p8, p0 = 0, r23 C M I + (p15) br L(1) C B + ;; +}{.mmi; cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I + (p8) add w3 = INCR, w3 C M I + ;; +}{.mmb; cmp.CND p6, p0 = w0, u0 C M I + (p8) cmpeqor p9, p0 = LIM2, w3 C M I + br L(cj2) C B +} +L(1): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + cmp.CND p9, p0 = w3, r10 C M I + ;; +}{.mmi; (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I +}{.mmb; add rpx = 24, rp C M I + nop 0 + br L(m23) C B +} + +.Lc011: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 3 C I0 +}{.mmi; ADDSUB w2 = r10, r11 C M I + cmp.ne p7, p0 = 0, r23 C M I + nop 0 + ;; +}{.mmb; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + (p15) br L(2) C B +}{.mmi; cmp.CND p8, p0 = w2, r10 C M I + ADDSUB w3 = u3, v3 C M I + nop 0 + ;; +}{.mmb; (p7) cmpeqor p8, p0 = LIM, w2 C M I + (p7) add w2 = INCR, w2 C M I + br L(cj3) C B +} +L(2): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u3 = [up], 8 C M01 + (p7) cmpeqor p8, p0 = LIM, w2 C M I + (p7) add w2 = INCR, w2 C M I + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I 
+}{.mmb; (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m23) +} + +.Lc100: + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 3 C I0 +}{.mmi; ADDSUB w1 = r10, r11 C M I + nop 0 + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + add rpx = 8, rp C M I +}{.mmi; cmp.ne p6, p0 = 0, r23 C M I + cmp.CND p7, p0 = w1, r10 C M I + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + ADDSUB w2 = u2, v2 C M I +}{.mmb; (p6) cmpeqor p7, p0 = LIM, w1 C M I + (p6) add w1 = INCR, w1 C M I + (p14) br L(cj4) + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + nop 0 +}{.mmi; ld8 u2 = [up], 8 C M01 + nop 0 + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m4) +} + +.Lc101: + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I +}{.mmi; cmp.ne p9, p0 = 0, r23 C M I + add rpx = 16, rp C M I + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I +}{.mbb; ADDSUB w1 = u1, v1 C M I + (p15) br L(c5) C B + br L(end) C B +} + +.Lc110: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add upadv = PFDIST, up C M I + add vpadv = PFDIST, vp C M I + mov ar.lc = n C I0 +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = r10, r11 C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w0 = u0, v0 C M I +}{.mmi; cmp.CND p9, p0 = w3, r10 C M I + cmp.ne p8, p0 = 0, r23 C M I + add rpx = 24, rp C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C 
M01 + nop 0 +}{.mmb; (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I + br L(m67) C B +} + +.Lc111: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add upadv = PFDIST, up C M I + ld8 v1 = [vp], 8 C M01 + mov ar.lc = n C I0 +}{.mmi; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r10, r11 C M I + nop 0 + ;; +}{.mmi; add vpadv = PFDIST, vp C M I + ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I +}{.mmi; ld8 u2 = [up], 8 C M01 + ADDSUB w3 = r18, r19 C M I + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, r18 C M I + (p13) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmi; ld8 u3 = [up], 8 C M01 + (p13) add w2 = INCR, w2 C M I + nop 0 + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m67) +} +EPILOGUE() + +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + nop.i 0 + addp4 vp = 0, vp C M I + nop.m 0 + zxt4 n = n C I + ;; +') + + {.mmi; ld8 r11 = [vp], 8 C M01 + ld8 r10 = [up], 8 C M01 + mov r2 = ar.lc C I0 +}{.mmi; and r14 = 7, n C M I + cmp.lt p15, p14 = 8, n C M I + add n = -6, n C M I + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb; (p6) br.dptk .Lb001 C B + (p7) br.dptk .Lb010 C B + (p8) br.dptk .Lb011 C B + ;; +}{.mmi; cmp.eq p9, p0 = 4, r14 C M I + cmp.eq p10, p0 = 5, r14 C M I + cmp.eq p11, p0 = 6, r14 C M I +}{.bbb; (p9) br.dptk .Lb100 C B + (p10) br.dptk .Lb101 C B + (p11) br.dptk .Lb110 C B + ;; +}{.mmi; ld8 r19 = [vp], 8 C M01 + ld8 r18 = [up], 8 C M01 + cmp.ne p13, p0 = r0, r0 C clear "CF" M I +}{.mmb; cmp.eq p12, p0 = 7, r14 C M I + mov r23 = 0 C M I + (p12) br.dptk .Lb111 C B + ;; +} + +.Lb000: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C 
M01 + ADDSUB w1 = r10, r11 C M I + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, r10 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r18, r19 C M I + add rpx = 8, rp C M I + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + cmp.CND p8, p0 = w2, r18 C M I +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m0) C B +} + + ALIGN(32) +.Lb001: + {.mmi; ADDSUB w0 = r10, r11 C M I + (p15) ld8 v1 = [vp], 8 C M01 + mov r8 = 0 C M I + ;; +}{.mmb; cmp.CND p6, p0 = w0, r10 C M I + (p15) ld8 u1 = [up], 8 C M01 + (p14) br L(cj1) C B + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + shr.u n = n, 3 C I0 +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + ADDSUB w2 = u2, v2 C M I +}{.mmb; ld8 u1 = [up], 8 C M01 + add rpx = 16, rp C M I + br L(m1) C B +} + + ALIGN(32) +.Lb010: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 +}{.mmb; ADDSUB w3 = r10, r11 C M I + nop 0 + (p15) br L(gt2) C B + ;; +}{.mmi; cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I + mov r8 = 0 C M I + ;; +}{.mmb; nop 0 + cmp.CND p6, p0 = w0, u0 C M I + br L(cj2) C B +} +L(gt2): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + mov ar.lc = n C I0 +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + add rpx = 24, rp 
C M I + br L(m23) C B +} + + ALIGN(32) +.Lb011: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ADDSUB w2 = r10, r11 C M I + ;; +}{.mmb; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + (p15) br L(3) C B +}{.mmb; cmp.CND p8, p0 = w2, r10 C M I + ADDSUB w3 = u3, v3 C M I + br L(cj3) C B +} +L(3): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + ADDSUB w3 = u3, v3 C M I +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u3 = [up], 8 C M01 + nop 0 + nop 0 + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m23) C B +} + + ALIGN(32) +.Lb100: + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ADDSUB w1 = r10, r11 C M I + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + cmp.CND p7, p0 = w1, r10 C M I +}{.mmb; nop 0 + ADDSUB w2 = u2, v2 C M I + (p14) br L(cj4) C B + ;; +} +L(gt4): + {.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + mov ar.lc = n C I0 +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + nop 0 +}{.mmi; ld8 u2 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + add rpx = 8, rp C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m4) C B +} + + ALIGN(32) +.Lb101: + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = 
PFDIST, vp + add rpx = 16, rp C M I +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I + nop 0 +}{.mmb; ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + (p14) br L(cj5) C B + ;; +} +L(gt5): + {.mmi; ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + mov ar.lc = n C I0 +}{.mmb; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = u2, v2 C M I + br L(m5) C B +} + + ALIGN(32) +.Lb110: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = r10, r11 C M I + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + mov ar.lc = n C I0 +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + add rpx = 24, rp C M I + br L(m67) C B +} + + ALIGN(32) +.Lb111: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r10, r11 C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u2 = [up], 8 C M01 + ADDSUB w3 = r18, r19 C M I + nop 0 + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + nop 0 +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + cmp.CND p9, p0 = w3, r18 C M I + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m67) C B +} + +C *** MAIN LOOP START *** (shared by the nc and non-nc feed-in code, 8 limbs stored per pass) + ALIGN(32) +L(top): +L(c5): ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + (p9) cmpeqor p6, p0 = LIM, w0 C M I + ld8 u1 = [up], 8 C M01 + (p9) add w0 = INCR, w0 C M I + ADDSUB w2 = u2, v2 C M I + ;; +L(m5): ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + (p6) cmpeqor p7, p0 = LIM, w1 C M I + ld8 u2 
= [up], 8 C M01 + (p6) add w1 = INCR, w1 C M I + ADDSUB w3 = u3, v3 C M I + ;; + st8 [rp] = w0, 8 C M23 + ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I + ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + ;; +L(m4): st8 [rp] = w1, 16 C M23 + st8 [rpx] = w2, 32 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I + lfetch [upadv], 64 + (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + ;; +L(m23): st8 [rp] = w3, 8 C M23 + ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, u0 C M I + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + nop.b 0 + ;; +L(c1): ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + (p9) cmpeqor p6, p0 = LIM, w0 C M I + ld8 u1 = [up], 8 C M01 + (p9) add w0 = INCR, w0 C M I + ADDSUB w2 = u2, v2 C M I + ;; +L(m1): ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + (p6) cmpeqor p7, p0 = LIM, w1 C M I + ld8 u2 = [up], 8 C M01 + (p6) add w1 = INCR, w1 C M I + ADDSUB w3 = u3, v3 C M I + ;; + st8 [rp] = w0, 8 C M23 + ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I + ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + ;; +L(m0): st8 [rp] = w1, 16 C M23 + st8 [rpx] = w2, 32 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I + lfetch [vpadv], 64 + (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + ;; +L(m67): st8 [rp] = w3, 8 C M23 + ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, u0 C M I + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + br.cloop.dptk L(top) C B + ;; +C *** MAIN LOOP END *** + +L(end): + {.mmi; (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + mov ar.lc = r2 C I0 +} +L(cj5): + {.mmi; cmp.CND p7, p0 = w1, u1 C M I + ADDSUB w2 = u2, v2 C M I + nop 0 + ;; +}{.mmi; st8 [rp] = w0, 8 C M23 + (p6) cmpeqor p7, p0 = LIM, w1 C M I + (p6) add w1 = INCR, w1 C M I +} +L(cj4): + {.mmi; cmp.CND p8, p0 = w2, u2 C M I + ADDSUB w3 = u3, v3 C M I + nop 0 + ;; +}{.mmi; st8 [rp] = w1, 8 C M23 + (p7) cmpeqor p8, p0 = LIM, w2 C M I + 
(p7) add w2 = INCR, w2 C M I +} +L(cj3): + {.mmi; cmp.CND p9, p0 = w3, u3 C M I + ADDSUB w0 = u0, v0 C M I + nop 0 + ;; +}{.mmi; st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I +}{.mmi; cmp.CND p6, p0 = w0, u0 C M I + nop 0 + mov r8 = 0 C M I + ;; +} +L(cj2): + {.mmi; st8 [rp] = w3, 8 C M23 + (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + ;; +} +L(cj1): + {.mmb; st8 [rp] = w0, 8 C M23 + (p6) mov r8 = 1 C carry/borrow out of last limb M I + br.ret.sptk.many b0 C B +} +EPILOGUE() +ASM_END() -- cgit v1.2.3