From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm | 412 +++++++++++++++++++++++++++++++++++
 1 file changed, 412 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm

diff --git a/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm
new file mode 100644
index 0000000..2703ce2
--- /dev/null
+++ b/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm
@@ -0,0 +1,412 @@
+dnl  IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+C	     cycles/limb
+C Itanium:       ?
+C Itanium 2:     1.5
+
+C TODO
+C  * Use shladd in feed-in code (for mpn_addlshC_n).
+C  * Rewrite loop to schedule loads closer to use, since we do prefetch.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`vp', `r34')
+define(`n',  `r35')
+
+ifdef(`DO_add', `
+  define(`ADDSUB',	`add	$1 = $2, $3')
+  define(`CMP',		`cmp.ltu $1,p0 = $2, $3')
+  define(`INCR',	1)
+  define(`LIM',		-1)
+  define(`func', mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+  define(`ADDSUB',	`sub	$1 = $2, $3')
+  define(`CMP',		`cmp.gtu $1,p0 = $2, $3')
+  define(`INCR',	-1)
+  define(`LIM',		0)
+  define(`func', mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+  define(`ADDSUB',	`sub	$1 = $3, $2')
+  define(`CMP',		`cmp.gtu $1,p0 = $2, $4')
+  define(`INCR',	-1)
+  define(`LIM',		0)
+  define(`func', mpn_rsblsh`'LSH`'_n)')
+
+define(PFDIST, 500)
+
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
+define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
+define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29')
+define(`x0',`r30') define(`x1',`r31') define(`x2',`r3') define(`x3',`r9')
+
+C r3 r8 r9 r10 r11
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4	rp = 0, rp		C M I
+	addp4	up = 0, up		C M I
+	nop.i	0
+	addp4	vp = 0, vp		C M I
+	nop.m	0
+	zxt4	n = n			C I
+	;;
+')
+ {.mmi;	ld8	r11 = [vp], 8		C M01
+	ld8	r10 = [up], 8		C M01
+	mov.i	r2 = ar.lc		C I0
+}{.mmi;	and	r14 = 3, n		C M I
+	cmp.lt	p15, p0 = 4, n		C M I
+	add	n = -5, n		C M I
+	;;
+}{.mmi;	cmp.eq	p6, p0 = 1, r14		C M I
+	cmp.eq	p7, p0 = 2, r14		C M I
+	cmp.eq	p8, p0 = 3, r14		C M I
+}{.bbb
+  (p6)	br.dptk	.Lb01			C B
+  (p7)	br.dptk	.Lb10			C B
+  (p8)	br.dptk	.Lb11			C B
+}
+
+.Lb00:
+ {.mmi;	ld8	v0 = [vp], 8		C M01
+	ld8	u0 = [up], 8		C M01
+	shr.u	n = n, 2		C I0
+	;;
+}{.mmi;	ld8	v1 = [vp], 8		C M01
+	ld8	u1 = [up], 8		C M01
+	shl	x3 = r11, LSH		C I0
+	;;
+}{.mmi;	ld8	v2 = [vp], 8		C M01
+	ld8	u2 = [up], 8		C M01
+	shrp	x0 = v0, r11, 64-LSH	C I0
+}{.mmb;	ADDSUB(	w3, r10, x3)		C M I
+	nop	0
+  (p15)	br.dpnt	.grt4			C B
+	;;
+}{.mii;	CMP(	p7, w3, r10, x3)	C M I
+	shrp	x1 = v1, v0, 64-LSH	C I0
+	ADDSUB(	w0, u0, x0)		C M I
+	;;
+}{.mii;	CMP(	p8, w0, u0, x0)		C M I
+	shrp	x2 = v2, v1, 64-LSH	C I0
+	ADDSUB(	w1, u1, x1)		C M I
+}{.mmb;	nop	0
+	nop	0
+	br	.Lcj4			C B
+}
+ALIGN(32)
+.grt4:
+ {.mii;	ld8	v3 = [vp], 8		C M01
+	shrp	x0 = v0, r11, 64-LSH	C I0
+	CMP(	p8, w3, r10, x3)	C M I
+	;;
+}{.mmi;	ld8	u3 = [up], 8		C M01
+	add	r11 = PFDIST, vp
+	shrp	x1 = v1, v0, 64-LSH	C I0
+}{.mmi;	ld8	v0 = [vp], 8		C M01
+	ADDSUB(	w0, u0, x0)		C M I
+	nop	0
+	;;
+}{.mmi;	CMP(	p6, w0, u0, x0)		C M I
+	add	r10 = PFDIST, up
+	mov.i	ar.lc = n		C I0
+}{.mmb;	ADDSUB(	w1, u1, x1)		C M I
+	ld8	u0 = [up], 8		C M01
+	br	.LL00			C B
+}
+
+	ALIGN(32)
+.Lb01:
+ifdef(`DO_add',
+`	shladd	w2 = r11, LSH, r10	C M I
+	shr.u	r8 = r11, 64-LSH	C retval I0
+  (p15)	br.dpnt	.grt1			C B
+	;;
+',`
+	shl	x2 = r11, LSH		C I0
+  (p15)	br.dpnt	.grt1			C B
+	;;
+	ADDSUB(	w2, r10, x2)		C M I
+	shr.u	r8 = r11, 64-LSH	C retval I0
+	;;
+')
+	CMP(	p6, w2, r10, x2)	C M I
+	br	.Lcj1
+
+.grt1:	ld8	v3 = [vp], 8		C M01
+	ld8	u3 = [up], 8		C M01
+	shr.u	n = n, 2		C I0
+	;;
+	ld8	v0 = [vp], 8		C M01
+	ld8	u0 = [up], 8		C M01
+	mov.i	ar.lc = n		C FIXME swap with next	I0
+ifdef(`DO_add',
+`',`
+	ADDSUB(	w2, r10, x2)
+')
+	;;
+ {.mmi;	ld8	v1 = [vp], 8		C M01
+	ld8	u1 = [up], 8		C M01
+	shrp	x3 = v3, r11, 64-LSH	C I0
+	;;
+}{.mmi;	ld8	v2 = [vp], 8		C M01
+	ld8	u2 = [up], 8		C M01
+	shrp	x0 = v0, v3, 64-LSH	C I0
+}{.mmb;	CMP(	p6, w2, r10, x2)	C M I
+	ADDSUB(	w3, u3, x3)		C M I
+	br.cloop.dptk	.grt5		C B
+	;;
+}{.mmi;	CMP(	p7, w3, u3, x3)		C M I
+	ADDSUB(	w0, u0, x0)		C M I
+	shrp	x1 = v1, v0, 64-LSH	C I0
+}{.mmb;	nop	0
+	nop	0
+	br	.Lcj5			C B
+}
+.grt5:
+ {.mmi;	add	r10 = PFDIST, up
+	add	r11 = PFDIST, vp
+	shrp	x0 = v0, v3, 64-LSH	C I0
+}{.mmb;	ld8	v3 = [vp], 8		C M01
+	CMP(	p8, w3, u3, x3)		C M I
+	br	.LL01			C B
+}
+	ALIGN(32)
+.Lb10:
+ {.mmi;	ld8	v2 = [vp], 8		C M01
+	ld8	u2 = [up], 8		C M01
+	shl	x1 = r11, LSH		C I0
+}{.mmb;	nop	0
+	nop	0
+  (p15)	br.dpnt	.grt2			C B
+	;;
+}{.mmi;	ADDSUB(	w1, r10, x1)		C M I
+	nop	0
+	shrp	x2 = v2, r11, 64-LSH	C I0
+	;;
+}{.mmi;	CMP(	p9, w1, r10, x1)	C M I
+	ADDSUB(	w2, u2, x2)		C M I
+	shr.u	r8 = v2, 64-LSH		C retval I0
+	;;
+}{.mmb;	CMP(	p6, w2, u2, x2)		C M I
+	nop	0
+	br	.Lcj2			C B
+}
+.grt2:
+ {.mmi;	ld8	v3 = [vp], 8		C M01
+	ld8	u3 = [up], 8		C M01
+	shr.u	n = n, 2		C I0
+	;;
+}{.mmi;	ld8	v0 = [vp], 8		C M01
+	ld8	u0 = [up], 8		C M01
+	mov.i	ar.lc = n		C I0
+}{.mmi;	ADDSUB(	w1, r10, x1)		C M I
+	nop	0
+	nop	0
+	;;
+}{.mii;	ld8	v1 = [vp], 8		C M01
+	shrp	x2 = v2, r11, 64-LSH	C I0
+	CMP(	p8, w1, r10, x1)	C M I
+	;;
+}{.mmi;	add	r10 = PFDIST, up
+	ld8	u1 = [up], 8		C M01
+	shrp	x3 = v3, v2, 64-LSH	C I0
+}{.mmi;	add	r11 = PFDIST, vp
+	ld8	v2 = [vp], 8		C M01
+	ADDSUB(	w2, u2, x2)		C M I
+	;;
+}{.mmi;	CMP(	p6, w2, u2, x2)		C M I
+	ld8	u2 = [up], 8		C M01
+	shrp	x0 = v0, v3, 64-LSH	C I0
+}{.mib;	ADDSUB(	w3, u3, x3)		C M I
+	nop	0
+	br.cloop.dpnt	L(top)		C B
+}
+	br	L(end)			C B
+.Lb11:
+ {.mmi;	ld8	v1 = [vp], 8		C M01
+	ld8	u1 = [up], 8		C M01
+	shl	x0 = r11, LSH		C I0
+	;;
+}{.mmi;	ld8	v2 = [vp], 8		C M01
+	ld8	u2 = [up], 8		C M01
+	shr.u	n = n, 2		C I0
+}{.mmb;	nop	0
+	nop	0
+  (p15)	br.dpnt	.grt3			C B
+	;;
+}{.mii;	nop	0
+	shrp	x1 = v1, r11, 64-LSH	C I0
+	ADDSUB(	w0, r10, x0)		C M I
+	;;
+}{.mii;	CMP(	p8, w0, r10, x0)	C M I
+	shrp	x2 = v2, v1, 64-LSH	C I0
+	ADDSUB(	w1, u1, x1)		C M I
+	;;
+}{.mmb;	CMP(	p9, w1, u1, x1)		C M I
+	ADDSUB(	w2, u2, x2)		C M I
+	br	.Lcj3			C B
+}
+.grt3:
+ {.mmi;	ld8	v3 = [vp], 8		C M01
+	ld8	u3 = [up], 8		C M01
+	shrp	x1 = v1, r11, 64-LSH	C I0
+}{.mmi;	ADDSUB(	w0, r10, x0)		C M I
+	nop	0
+	nop	0
+	;;
+}{.mmi;	ld8	v0 = [vp], 8		C M01
+	CMP(	p6, w0, r10, x0)	C M I
+	mov.i	ar.lc = n		C I0
+}{.mmi;	ld8	u0 = [up], 8		C M01
+	ADDSUB(	w1, u1, x1)		C M I
+	nop	0
+	;;
+}{.mmi;	add	r10 = PFDIST, up
+	add	r11 = PFDIST, vp
+	shrp	x2 = v2, v1, 64-LSH	C I0
+}{.mmb;	ld8	v1 = [vp], 8		C M01
+	CMP(	p8, w1, u1, x1)		C M I
+	br	.LL11			C B
+}
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):	st8	[rp] = w1, 8		C M23
+	lfetch	[r10], 32
+  (p8)	cmpeqor	p6, p0 = LIM, w2	C M I
+  (p8)	add	w2 = INCR, w2		C M I
+	ld8	v3 = [vp], 8		C M01
+	CMP(	p8, w3, u3, x3)		C M I
+	;;
+.LL01:	ld8	u3 = [up], 8		C M01
+	shrp	x1 = v1, v0, 64-LSH	C I0
+  (p6)	cmpeqor	p8, p0 = LIM, w3	C M I
+  (p6)	add	w3 = INCR, w3		C M I
+	ld8	v0 = [vp], 8		C M01
+	ADDSUB(	w0, u0, x0)		C M I
+	;;
+	st8	[rp] = w2, 8		C M23
+	CMP(	p6, w0, u0, x0)		C M I
+	nop.b	0
+	ld8	u0 = [up], 8		C M01
+	lfetch	[r11], 32
+	ADDSUB(	w1, u1, x1)		C M I
+	;;
+.LL00:	st8	[rp] = w3, 8		C M23
+	shrp	x2 = v2, v1, 64-LSH	C I0
+  (p8)	cmpeqor	p6, p0 = LIM, w0	C M I
+  (p8)	add	w0 = INCR, w0		C M I
+	ld8	v1 = [vp], 8		C M01
+	CMP(	p8, w1, u1, x1)		C M I
+	;;
+.LL11:	ld8	u1 = [up], 8		C M01
+	shrp	x3 = v3, v2, 64-LSH	C I0
+  (p6)	cmpeqor	p8, p0 = LIM, w1	C M I
+  (p6)	add	w1 = INCR, w1		C M I
+	ld8	v2 = [vp], 8		C M01
+	ADDSUB(	w2, u2, x2)		C M I
+	;;
+ {.mmi;	st8	[rp] = w0, 8		C M23
+	CMP(	p6, w2, u2, x2)		C M I
+	shrp	x0 = v0, v3, 64-LSH	C I0
+}{.mib;
+	ld8	u2 = [up], 8		C M01
+	ADDSUB(	w3, u3, x3)		C M I
+	br.cloop.dptk	L(top)		C B
+	;;
+}
+C *** MAIN LOOP END ***
+
+L(end):
+ {.mmi;	st8	[rp] = w1, 8		C M23
+  (p8)	cmpeqor	p6, p0 = LIM, w2	C M I
+	shrp	x1 = v1, v0, 64-LSH	C I0
+}{.mmi;
+  (p8)	add	w2 = INCR, w2		C M I
+	CMP(	p7, w3, u3, x3)		C M I
+	ADDSUB(	w0, u0, x0)		C M I
+	;;
+}
+.Lcj5:
+ {.mmi;	st8	[rp] = w2, 8		C M23
+  (p6)	cmpeqor	p7, p0 = LIM, w3	C M I
+	shrp	x2 = v2, v1, 64-LSH	C I0
+}{.mmi;
+  (p6)	add	w3 = INCR, w3		C M I
+	CMP(	p8, w0, u0, x0)		C M I
+	ADDSUB(	w1, u1, x1)		C M I
+	;;
+}
+.Lcj4:
+ {.mmi;	st8	[rp] = w3, 8		C M23
+  (p7)	cmpeqor	p8, p0 = LIM, w0	C M I
+	mov.i	ar.lc = r2		C I0
+}{.mmi;
+  (p7)	add	w0 = INCR, w0		C M I
+	CMP(	p9, w1, u1, x1)		C M I
+	ADDSUB(	w2, u2, x2)		C M I
+	;;
+}
+.Lcj3:
+ {.mmi;	st8	[rp] = w0, 8		C M23
+  (p8)	cmpeqor	p9, p0 = LIM, w1	C M I
+	shr.u	r8 = v2, 64-LSH		C I0
+}{.mmi;
+  (p8)	add	w1 = INCR, w1		C M I
+	CMP(	p6, w2, u2, x2)		C M I
+	nop	0
+	;;
+}
+.Lcj2:
+ {.mmi;	st8	[rp] = w1, 8		C M23
+  (p9)	cmpeqor	p6, p0 = LIM, w2	C M I
+  (p9)	add	w2 = INCR, w2		C M I
+	;;
+}
+.Lcj1:
+ {.mmb;	st8	[rp] = w2		C M23
+ifdef(`DO_rsb',`
+  (p6)	add	r8 = -1, r8		C M I
+',`
+  (p6)	add	r8 = 1, r8		C M I
+')	br.ret.sptk.many b0		C B
+}
+EPILOGUE()
+ASM_END()
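The file above is a template: building it with DO_add, DO_sub, or DO_rsb defined (together with a shift count LSH, where 0 < LSH < 64, since the shrp instructions use 64-LSH) expands ADDSUB/CMP/INCR/LIM so that func becomes mpn_addlshLSH_n, mpn_sublshLSH_n, or mpn_rsblshLSH_n. The loop forms each limb sum or difference without a carry-in; the predicated cmpeqor/add pairs then propagate the pending carry or borrow into the following limb, with LIM as the sentinel value (all-ones when adding, zero when subtracting) and INCR as the +/-1 adjustment, and the shifted-out high bits plus the final carry are returned in r8. As a reading aid, here is a rough C model of what the generated functions compute. This is a sketch, not GMP code: the *_ref names are invented here, 64-bit limbs are assumed, and only 0 < lsh < 64 is handled.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;          /* assumption: 64-bit limbs, as on IA-64 */

/* {rp,n} = {up,n} + ({vp,n} << lsh); returns the limb carried out, i.e.
   the bits shifted off the top of vp plus the final add carry.  */
static limb
addlsh_ref (limb *rp, const limb *up, const limb *vp, size_t n, unsigned lsh)
{
  limb out = 0, cy = 0;                 /* shifted-out bits; pending carry */
  for (size_t i = 0; i < n; i++)
    {
      limb x = (vp[i] << lsh) | out;    /* next limb of vp << lsh */
      out = vp[i] >> (64 - lsh);
      limb s = up[i] + x;
      limb c = s < x;                   /* carry out of this add */
      rp[i] = s + cy;
      cy = c + (rp[i] < s);             /* fold in the incoming carry */
    }
  return out + cy;
}

/* {rp,n} = {up,n} - ({vp,n} << lsh); returns b such that
   {up,n} - ({vp,n} << lsh) = {rp,n} - b * 2^(64n).  */
static limb
sublsh_ref (limb *rp, const limb *up, const limb *vp, size_t n, unsigned lsh)
{
  limb out = 0, bw = 0;                 /* shifted-out bits; pending borrow */
  for (size_t i = 0; i < n; i++)
    {
      limb x = (vp[i] << lsh) | out;
      out = vp[i] >> (64 - lsh);
      limb d = up[i] - x;
      limb b = up[i] < x;
      rp[i] = d - bw;
      bw = b + (d < bw);
    }
  return out + bw;
}

/* {rp,n} = ({vp,n} << lsh) - {up,n}; the return value is the high limb
   of the difference and may wrap past zero (compare the
   "(p6) add r8 = -1, r8" adjustment in the DO_rsb epilogue above).  */
static limb
rsblsh_ref (limb *rp, const limb *up, const limb *vp, size_t n, unsigned lsh)
{
  limb out = 0, bw = 0;
  for (size_t i = 0; i < n; i++)
    {
      limb x = (vp[i] << lsh) | out;
      out = vp[i] >> (64 - lsh);
      limb d = x - up[i];
      limb b = x < up[i];
      rp[i] = d - bw;
      bw = b + (d < bw);
    }
  return out - bw;
}

One file per three functions is a deliberate design choice: the add, sub, and reverse-sub variants differ only in the ADDSUB operation, the direction of the unsigned compare, and the sign of the carry adjustment, so a single scheduled loop body serves all of them.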