aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/ia64/lorrshift.asm
diff options
context:
space:
mode:
authorDuncan Wilkie <antigravityd@gmail.com>2023-11-18 06:11:09 -0600
committerDuncan Wilkie <antigravityd@gmail.com>2023-11-18 06:11:09 -0600
commit11da511c784eca003deb90c23570f0873954e0de (patch)
treee14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/ia64/lorrshift.asm
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/ia64/lorrshift.asm')
-rw-r--r--gmp-6.3.0/mpn/ia64/lorrshift.asm358
1 files changed, 358 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/ia64/lorrshift.asm b/gmp-6.3.0/mpn/ia64/lorrshift.asm
new file mode 100644
index 0000000..694aaf0
--- /dev/null
+++ b/gmp-6.3.0/mpn/ia64/lorrshift.asm
@@ -0,0 +1,358 @@
+dnl IA-64 mpn_lshift/mpn_rshift.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2000-2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Itanium: 2
+C Itanium 2: 1
+
+C This code is scheduled deeply since the plain shift instructions shr and shl
+C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of
+C these instructions causes a 10 cycle replay trap on Itanium.
+
+C The ld8 scheduling should probably be decreased to make the function smaller.
+C Good lfetch will make sure we never stall anyway.
+
+C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
+C at cycle 2. Judicious use of predicates could allow us to issue more ld8's
+C in the prologue.
+
+
+C INPUT PARAMETERS
+C rp   pointer the result limbs are stored through (st8)
+C up   pointer the source limbs are loaded through (ld8)
+C n    number of limbs to process
+C cnt  shift count used by every FSH
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`cnt',`r35')
+
+C tnc is set to 64 - cnt and is the shift count used by every BSH.
+define(`tnc',`r9')
+
+C Direction-dependent macros, selected by which OPERATION_* symbol is defined:
+C   FSH   shift by cnt in the direction of the operation
+C   BSH   shift by tnc in the opposite direction; its result is ORed with the
+C         FSH result of the previously loaded limb to form one output limb
+C   UPD   post-increment in bytes applied to up and rp on every ld8 and st8
+C         (negative for lshift, positive for rshift)
+C   POFF  byte offset added to up to form the initial lfetch pointer r11
+C   PUPD  post-increment in bytes applied to the lfetch pointer
+C   func  symbol name handed to PROLOGUE and EPILOGUE
+ifdef(`OPERATION_lshift',`
+ define(`FSH',`shl')
+ define(`BSH',`shr.u')
+ define(`UPD',`-8')
+ define(`POFF',`-512')
+ define(`PUPD',`-32')
+ define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+ define(`FSH',`shr.u')
+ define(`BSH',`shl')
+ define(`UPD',`8')
+ define(`POFF',`512')
+ define(`PUPD',`32')
+ define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+C func is mpn_lshift or mpn_rshift depending on the OPERATION_* symbol.
+C It shifts the n limbs read through up by cnt bits and stores n result limbs
+C through rp.  Each stored limb is FSH(limb, cnt) ORed with BSH(next limb, tnc),
+C except the last one, which is just FSH(last limb, cnt).
+C Return value (r8): BSH(first limb loaded, tnc), i.e. the bits shifted out.
+C NOTE(review): no range checks are done here; presumably callers guarantee
+C n >= 1 and 1 <= cnt <= 63 -- confirm against the mpn interface.
+ASM_START()
+PROLOGUE(func)
+ .prologue
+ .save ar.lc, r2
+ .body
+C 32-bit ABI only: addp4 widens the rp and up pointers, sxt4 sign-extends n
+C and zxt4 zero-extends cnt before they are used as 64-bit values.
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ sxt4 n = n C M I
+ nop.m 0
+ nop.m 0
+ zxt4 cnt = cnt C I
+ ;;
+')
+
+C Setup and dispatch.
+C   p14 = (n > 4), p15 = (n <= 4)
+C   r14 = n mod 4, tested into p6 (1), p7 (2), p8 (3)
+C   r2  = caller value of ar.lc, restored before returning
+C   tnc = 64 - cnt
+C   n   = (n - 5) >> 2 (unsigned), copied to ar.lc as the br.cloop count
+C   lshift only: up and rp are advanced by 8*(n-1) bytes, so that together
+C   with the negative UPD the operand is walked from its last limb downwards
+C   r11 = up + POFF is the lfetch pointer, r10 = first limb loaded
+C The final bundle branches on n mod 4; a remainder of 0 falls into .Lb00.
+ {.mmi; cmp.lt p14, p15 = 4, n C M I
+ and r14 = 3, n C M I
+ mov.i r2 = ar.lc C I0
+}{.mmi; add r15 = -1, n C M I
+ sub tnc = 64, cnt C M I
+ add r16 = -5, n
+ ;;
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I
+ cmp.eq p7, p0 = 2, r14 C M I
+ shr.u n = r16, 2 C I0
+}{.mmi; cmp.eq p8, p0 = 3, r14 C M I
+ifdef(`OPERATION_lshift',
+` shladd up = r15, 3, up C M I
+ shladd rp = r15, 3, rp') C M I
+ ;;
+}{.mmi; add r11 = POFF, up C M I
+ ld8 r10 = [up], UPD C M01
+ mov.i ar.lc = n C I0
+}{.bbb;
+ (p6) br.dptk .Lb01
+ (p7) br.dptk .Lb10
+ (p8) br.dptk .Lb11
+ ;; }
+
+C n mod 4 = 0.  Three more limbs are loaded (r19, r16, r17).  With p14 set
+C (n > 4) control goes to .grt4; otherwise all four results are prepared here
+C and the wind-down is entered at .Lr4.
+C NOTE(review): the BSH into r23 just before the branch to .Lr4 produces a
+C value that is not read anywhere from .Lr4 onwards; it looks like a slot
+C filler -- confirm before touching it.
+.Lb00: ld8 r19 = [up], UPD
+ ;;
+ ld8 r16 = [up], UPD
+ ;;
+ ld8 r17 = [up], UPD
+ BSH r8 = r10, tnc C function return value
+ ;;
+ FSH r24 = r10, cnt
+ BSH r25 = r19, tnc
+ (p14) br.cond.dptk .grt4
+ ;;
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+ BSH r23 = r10, tnc
+ br .Lr4
+
+C n mod 4 = 0 and n > 4: keep loading and shifting, then enter the main loop
+C at .Ltop if br.cloop finds ar.lc non-zero, else go to the wind-down at .Lbot.
+.grt4: ld8 r18 = [up], UPD
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ ld8 r19 = [up], UPD
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ ld8 r16 = [up], UPD
+ FSH r22 = r17, cnt
+ BSH r23 = r18, tnc
+ ;;
+ or r14 = r25, r24
+ ld8 r17 = [up], UPD
+ br.cloop.dpnt .Ltop
+ br .Lbot
+
+C n mod 4 = 1.  With p15 set (n <= 4, hence n = 1 on this path) the return
+C value and the single result limb r22 are computed and .Lr1 stores it.
+.Lb01:
+ (p15) BSH r8 = r10, tnc C function return value I
+ (p15) FSH r22 = r10, cnt C I
+ (p15) br.cond.dptk .Lr1 C return B
+
+C n mod 4 = 1 and n > 4: load four more limbs; br.cloop selects between the
+C loop feed-in at .grt5 and the straight-line finish through .Lr5.
+.grt1: ld8 r18 = [up], UPD
+ ;;
+ ld8 r19 = [up], UPD
+ BSH r8 = r10, tnc C function return value
+ ;;
+ ld8 r16 = [up], UPD
+ FSH r22 = r10, cnt
+ BSH r23 = r18, tnc
+ ;;
+ ld8 r17 = [up], UPD
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ br.cloop.dpnt .grt5
+ ;;
+ or r15 = r23, r22
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ br .Lr5
+
+C Feed-in for n mod 4 = 1: joins the main loop at its .LL01 entry point.
+.grt5: ld8 r18 = [up], UPD
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ ld8 r19 = [up], UPD
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r15 = r23, r22
+ ld8 r16 = [up], UPD
+ br .LL01
+
+
+C n mod 4 = 2.  One more limb is loaded (r17).  With p14 set (n > 4) control
+C goes to .grt2; otherwise both results are prepared and .Lr2 stores them.
+.Lb10: ld8 r17 = [up], UPD
+ (p14) br.cond.dptk .grt2
+
+ BSH r8 = r10, tnc C function return value
+ ;;
+ FSH r20 = r10, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r14 = r21, r20
+ FSH r22 = r17, cnt
+ br .Lr2 C return
+
+C n mod 4 = 2 and n > 4: br.cloop selects between the loop feed-in at .grt6
+C and the straight-line finish through .Lr6.
+.grt2: ld8 r18 = [up], UPD
+ BSH r8 = r10, tnc C function return value
+ ;;
+ ld8 r19 = [up], UPD
+ FSH r20 = r10, cnt
+ BSH r21 = r17, tnc
+ ;;
+ ld8 r16 = [up], UPD
+ FSH r22 = r17, cnt
+ BSH r23 = r18, tnc
+ ;;
+ {.mmi; ld8 r17 = [up], UPD
+ or r14 = r21, r20
+ FSH r24 = r18, cnt
+}{.mib; nop 0
+ BSH r25 = r19, tnc
+ br.cloop.dpnt .grt6
+ ;; }
+
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ br .Lr6
+
+C Feed-in for n mod 4 = 2: joins the main loop at its .LL10 entry point.
+.grt6: ld8 r18 = [up], UPD
+ FSH r26 = r19, cnt
+ BSH r27 = r16, tnc
+ ;;
+ ld8 r19 = [up], UPD
+ br .LL10
+
+
+C n mod 4 = 3.  Two more limbs are loaded (r16, r17).  With p14 set (n > 4)
+C control goes to .grt3; otherwise the three results are prepared and .Lr3
+C stores them.
+.Lb11: ld8 r16 = [up], UPD
+ ;;
+ ld8 r17 = [up], UPD
+ BSH r8 = r10, tnc C function return value
+ (p14) br.cond.dptk .grt3
+ ;;
+
+ FSH r26 = r10, cnt
+ BSH r27 = r16, tnc
+ ;;
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ or r15 = r27, r26
+ FSH r22 = r17, cnt
+ br .Lr3 C return
+
+C n mod 4 = 3 and n > 4: br.cloop selects between the loop feed-in at .grt7
+C and the straight-line finish through .Lr7.
+.grt3: ld8 r18 = [up], UPD
+ FSH r26 = r10, cnt
+ BSH r27 = r16, tnc
+ ;;
+ ld8 r19 = [up], UPD
+ FSH r20 = r16, cnt
+ BSH r21 = r17, tnc
+ ;;
+ ld8 r16 = [up], UPD
+ FSH r22 = r17, cnt
+ BSH r23 = r18, tnc
+ ;;
+ ld8 r17 = [up], UPD
+ br.cloop.dpnt .grt7
+
+ or r15 = r27, r26
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ br .Lr7
+
+C Feed-in for n mod 4 = 3: joins the main loop at its .LL11 entry point.
+.grt7: or r15 = r27, r26
+ FSH r24 = r18, cnt
+ BSH r25 = r19, tnc
+ ld8 r18 = [up], UPD
+ br .LL11
+
+C *** MAIN LOOP START ***
+C One pass stores four result limbs and loads four source limbs.  Each pair
+C of bundles stores one finished limb, ORs the FSH/BSH halves of the next one,
+C starts one FSH and one BSH, and loads one limb.  The lfetch pointer r11 is
+C advanced by PUPD once per pass.  .LL11, .LL10 and .LL01 are the entry points
+C used by the .grt7, .grt6 and .grt5 feed-in code; br.cloop repeats the pass
+C while ar.lc is non-zero.
+ ALIGN(32)
+.Ltop:
+ {.mmi; st8 [rp] = r14, UPD C M2
+ or r15 = r27, r26 C M3
+ FSH r24 = r18, cnt C I0
+}{.mmi; ld8 r18 = [up], UPD C M1
+ lfetch [r11], PUPD
+ BSH r25 = r19, tnc C I1
+ ;; }
+.LL11:
+ {.mmi; st8 [rp] = r15, UPD
+ or r14 = r21, r20
+ FSH r26 = r19, cnt
+}{.mmi; ld8 r19 = [up], UPD
+ nop.m 0
+ BSH r27 = r16, tnc
+ ;; }
+.LL10:
+ {.mmi; st8 [rp] = r14, UPD
+ or r15 = r23, r22
+ FSH r20 = r16, cnt
+}{.mmi; ld8 r16 = [up], UPD
+ nop.m 0
+ BSH r21 = r17, tnc
+ ;; }
+.LL01:
+ {.mmi; st8 [rp] = r15, UPD
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+}{.mib; ld8 r17 = [up], UPD
+ BSH r23 = r18, tnc
+ br.cloop.dptk .Ltop
+ ;; }
+C *** MAIN LOOP END ***
+
+C Wind-down: same store/or/shift pattern as the loop but with no further
+C loads.  .Lbot stores eight limbs in total, and each .LrK label is entered
+C with K result limbs still to be stored.  The last store at .Lr1 writes r22,
+C the FSH of the final limb with nothing ORed in.  ar.lc is then restored
+C from r2 and the function returns with r8 as set on entry to its path.
+.Lbot:
+ {.mmi; st8 [rp] = r14, UPD
+ or r15 = r27, r26
+ FSH r24 = r18, cnt
+}{.mib; nop 0
+ BSH r25 = r19, tnc
+ nop 0
+ ;; }
+.Lr7:
+ {.mmi; st8 [rp] = r15, UPD
+ or r14 = r21, r20
+ FSH r26 = r19, cnt
+}{.mib; nop 0
+ BSH r27 = r16, tnc
+ nop 0
+ ;; }
+.Lr6:
+ {.mmi; st8 [rp] = r14, UPD
+ or r15 = r23, r22
+ FSH r20 = r16, cnt
+}{.mib; nop 0
+ BSH r21 = r17, tnc
+ nop 0
+ ;; }
+.Lr5: st8 [rp] = r15, UPD
+ or r14 = r25, r24
+ FSH r22 = r17, cnt
+ ;;
+.Lr4: st8 [rp] = r14, UPD
+ or r15 = r27, r26
+ ;;
+.Lr3: st8 [rp] = r15, UPD
+ or r14 = r21, r20
+ ;;
+.Lr2: st8 [rp] = r14, UPD
+ ;;
+.Lr1: st8 [rp] = r22, UPD C M23
+ mov ar.lc = r2 C I0
+ br.ret.sptk.many b0 C B
+EPILOGUE(func)
+ASM_END()