From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/ia64/lshiftc.asm | 463 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 463 insertions(+) create mode 100644 gmp-6.3.0/mpn/ia64/lshiftc.asm (limited to 'gmp-6.3.0/mpn/ia64/lshiftc.asm') diff --git a/gmp-6.3.0/mpn/ia64/lshiftc.asm b/gmp-6.3.0/mpn/ia64/lshiftc.asm new file mode 100644 index 0000000..e8cec87 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/lshiftc.asm @@ -0,0 +1,463 @@ +dnl IA-64 mpn_lshiftc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2005, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 1.25 + +C This code is scheduled deeply since the plain shift instructions shr and shl +C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of +C these instructions cause a 10 cycle replay trap on Itanium. + +C The ld8 scheduling should probably be decreased to make the function smaller. +C Good lfetch will make sure we never stall anyway. + +C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair +C at cycle 2. Judicious use of predicates could allow us to issue more ld8's +C in the prologue. + + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`cnt',`r35') + +define(`tnc',`r9') + +define(`FSH',`shl') +define(`BSH',`shr.u') +define(`UPD',`-8') +define(`POFF',`-512') +define(`PUPD',`-32') +define(`func',`mpn_lshiftc') + +ASM_START() +PROLOGUE(mpn_lshiftc) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + sxt4 n = n C M I + nop.m 0 + nop.m 0 + zxt4 cnt = cnt C I + ;; +') + + {.mmi; nop 0 C M I + and r14 = 3, n C M I + mov.i r2 = ar.lc C I0 +}{.mmi; add r15 = -1, n C M I + sub tnc = 64, cnt C M I + nop 0 + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + shr.u n = r15, 2 C I0 +}{.mmi; cmp.eq p8, p0 = 3, r14 C M I + shladd up = r15, 3, up C M I + shladd rp = r15, 3, rp C M I + ;; +}{.mmi; add r11 = POFF, up C M I + ld8 r10 = [up], UPD C M01 + mov.i ar.lc = n C I0 +}{.bbb; + (p6) br.dptk .Lb01 + (p7) br.dptk .Lb10 + (p8) br.dptk .Lb11 + ;; } + +.Lb00: + ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r8 = r10, tnc + br.cloop.dptk L(gt4) + ;; + FSH r24 = r10, cnt + BSH r25 = r19, tnc + ;; + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r14 = r25, r24 + FSH r22 = r17, cnt + ;; + or r15 = r27, r26 + sub r31 = -1, r14 + br .Lr4 + +L(gt4): + {.mmi; nop 0 + nop 0 + FSH r24 = r10, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; nop 0 + nop 0 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop 0 + BSH r27 = r16, tnc + ;; } + {.mmi; nop 0 + nop 0 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop 0 + BSH r21 = r17, tnc + ;; } + {.mmi; nop 0 + or r14 = r25, r24 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk L(gt8) + ;; } + {.mmi; nop 0 + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mib; sub r31 = -1, r14 + BSH r25 = r19, tnc + br .Lr8 } + +L(gt8): + or r15 = r27, r26 + FSH r24 = r18, cnt + ld8 r18 = [up], UPD + sub r31 = -1, r14 + BSH r25 = r19, tnc + br .LL00 + +.Lb01: + br.cloop.dptk L(gt1) + ;; + BSH r8 = r10, tnc + FSH r22 = r10, cnt + ;; + sub r31 = -1, r22 + br .Lr1 + ;; +L(gt1): + ld8 r18 = [up], UPD + BSH r8 = r10, tnc + FSH r22 = r10, cnt + ;; + ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk L(gt5) + ;; + nop 0 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ;; + nop 0 + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + or r15 = r23, r22 + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r14 = r25, r24 + FSH r22 = r17, cnt + sub r31 = -1, r15 + br .Lr5 + +L(gt5): + {.mmi; nop 0 + nop 0 + FSH r24 = r18, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; nop 0 + nop 0 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop 0 + BSH r27 = r16, tnc + ;; } + {.mmi; nop 0 + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop 0 + BSH r21 = r17, tnc + ;; } + {.mmi; or r14 = r25, r24 + sub r31 = -1, r15 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br L(end) + ;; } + +.Lb10: + ld8 r17 = [up], UPD + br.cloop.dptk L(gt2) + ;; + BSH r8 = r10, tnc + FSH r20 = r10, cnt + ;; + BSH r21 = r17, tnc + FSH r22 = r17, cnt + ;; + or r14 = r21, r20 + ;; + sub r31 = -1, r14 + br .Lr2 + ;; +L(gt2): + ld8 r18 = [up], UPD + BSH r8 = r10, tnc + FSH r20 = r10, cnt + ;; + ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + BSH r21 = r17, tnc + FSH r22 = r17, cnt + ;; + ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk L(gt6) + ;; + nop 0 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ;; + or r14 = r21, r20 + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + {.mmi; nop 0 + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mib; sub r31 = -1, r14 + BSH r21 = r17, tnc + br .Lr6 + ;; } +L(gt6): + {.mmi; nop 0 + nop 0 + FSH r24 = r18, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; nop 0 + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop 0 + BSH r27 = r16, tnc + ;; } + {.mmi; or r15 = r23, r22 + sub r31 = -1, r14 + FSH r20 = r16, cnt +}{.mib; ld8 r16 = [up], UPD + BSH r21 = r17, tnc + br .LL10 +} + +.Lb11: + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r8 = r10, tnc + FSH r26 = r10, cnt + br.cloop.dptk L(gt3) + ;; + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + FSH r22 = r17, cnt + ;; + or r15 = r27, r26 + ;; + or r14 = r21, r20 + sub r31 = -1, r15 + br .Lr3 + ;; +L(gt3): + ld8 r18 = [up], UPD + ;; + ld8 r19 = [up], UPD + BSH r27 = r16, tnc + ;; + {.mmi; nop 0 + nop 0 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop 0 + BSH r21 = r17, tnc + ;; +}{.mmi; nop 0 + nop 0 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk L(gt7) + ;; } + or r15 = r27, r26 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ;; + {.mmi; nop 0 + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mib; sub r31 = -1, r15 + BSH r27 = r16, tnc + br .Lr7 +} +L(gt7): + {.mmi; nop 0 + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; or r14 = r21, r20 + sub r31 = -1, r15 + FSH r26 = r19, cnt +}{.mib; ld8 r19 = [up], UPD + BSH r27 = r16, tnc + br .LL11 +} + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): +.LL01: + {.mmi; st8 [rp] = r31, UPD C M2 + or r15 = r27, r26 C M3 + FSH r24 = r18, cnt C I0 +}{.mmi; ld8 r18 = [up], UPD C M0 + sub r31 = -1, r14 C M1 + BSH r25 = r19, tnc C I1 + ;; } +.LL00: + {.mmi; st8 [rp] = r31, UPD + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + sub r31 = -1, r15 + BSH r27 = r16, tnc + ;; } +.LL11: + {.mmi; st8 [rp] = r31, UPD + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + sub r31 = -1, r14 + BSH r21 = r17, tnc + ;; } +.LL10: + {.mmi; st8 [rp] = r31, UPD + or r14 = r25, r24 + FSH r22 = r17, cnt +}{.mmi; ld8 r17 = [up], UPD + sub r31 = -1, r15 + BSH r23 = r18, tnc + ;; } +L(end): lfetch [r11], PUPD + br.cloop.dptk L(top) +C *** MAIN LOOP END *** + + {.mmi; st8 [rp] = r31, UPD + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mib; sub r31 = -1, r14 + BSH r25 = r19, tnc + nop 0 + ;; } +.Lr8: + {.mmi; st8 [rp] = r31, UPD + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mib; sub r31 = -1, r15 + BSH r27 = r16, tnc + nop 0 + ;; } +.Lr7: + {.mmi; st8 [rp] = r31, UPD + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mib; sub r31 = -1, r14 + BSH r21 = r17, tnc + nop 0 + ;; } +.Lr6: st8 [rp] = r31, UPD + or r14 = r25, r24 + FSH r22 = r17, cnt + sub r31 = -1, r15 + ;; +.Lr5: st8 [rp] = r31, UPD + or r15 = r27, r26 + sub r31 = -1, r14 + ;; +.Lr4: st8 [rp] = r31, UPD + or r14 = r21, r20 + sub r31 = -1, r15 + ;; +.Lr3: st8 [rp] = r31, UPD + sub r31 = -1, r14 + ;; +.Lr2: st8 [rp] = r31, UPD + sub r31 = -1, r22 + ;; +.Lr1: st8 [rp] = r31, UPD C M23 + mov ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE(func) +ASM_END() -- cgit v1.2.3