aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/alpha/ev6/nails
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/alpha/ev6/nails')
-rw-r--r--gmp-6.3.0/mpn/alpha/ev6/nails/README65
-rw-r--r--gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm396
-rw-r--r--gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm146
-rw-r--r--gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm169
-rw-r--r--gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm210
-rw-r--r--gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm233
-rw-r--r--gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h72
-rw-r--r--gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm364
-rw-r--r--gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm396
9 files changed, 2051 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/README b/gmp-6.3.0/mpn/alpha/ev6/nails/README
new file mode 100644
index 0000000..b214ac5
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/README
@@ -0,0 +1,65 @@
+Copyright 2002, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains assembly code for nails-enabled 21264. The code is not
+very well optimized.
+
+For addmul_N, as N grows larger, we could make multiple loads together, then do
+about 3.3 i/c. 10 cycles after the last load, we can increase to 4 i/c. This
+would surely allow addmul_4 to run at 2 c/l, but the same should be possible
+also for addmul_3 and perhaps even addmul_2.
+
+
+ current fair best
+Routine c/l unroll c/l unroll c/l i/c
+mul_1 3.25 2.75 2.75 3.273
+addmul_1 4.0 4 3.5 4 14 3.25 3.385
+addmul_2 4.0 1 2.5 2 10 2.25 3.333
+addmul_3 3.0 1 2.33 2 14 2 3.333
+addmul_4 2.5 1 2.125 2 17 2 3.135
+
+addmul_5 2 1 10
+addmul_6 2 1 12
+addmul_7 2 1 14
+
+(The "best" column doesn't account for bookkeeping instructions and
+thereby assumes infinite unrolling.)
+
+Basecase usages:
+
+1 addmul_1
+2 addmul_2
+3 addmul_3
+4 addmul_4
+5 addmul_3 + addmul_2 2.3998
+6 addmul_4 + addmul_2
+7 addmul_4 + addmul_3
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm
new file mode 100644
index 0000000..711d4e6
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm
@@ -0,0 +1,396 @@
+dnl Alpha ev6 nails mpn_addmul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 4
+
+C TODO
+C * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ sll vl0, NAIL_BITS, vl0
+ lda numb_mask, -1(r31)
+ srl numb_mask, NAIL_BITS, numb_mask
+
+ and n, 3, r25
+ cmpeq r25, 1, r21
+ bne r21, L(1m4)
+ cmpeq r25, 2, r21
+ bne r21, L(2m4)
+ beq r25, L(0m4)
+
+L(3m4): ldq ul3, 0(up)
+ lda n, -4(n)
+ ldq ul0, 8(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 16(up)
+ lda up, 24(up)
+ lda rp, -8(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge3)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ addq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(ta3)
+
+L(ge3): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, r31, acc1
+ umulh vl0, ul2, m2b
+ addq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(el3)
+
+L(0m4): lda n, -8(n)
+ ldq ul2, 0(up)
+ ldq ul3, 8(up)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge4)
+
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ addq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta4)
+
+L(ge4): ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ addq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(el0)
+
+L(2m4): lda n, -4(n)
+ ldq ul0, 0(up)
+ ldq ul1, 8(up)
+ lda up, 16(up)
+ lda rp, -16(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge2)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, r31, acc0
+ addq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta2)
+
+L(ge2): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, r31, acc0
+ umulh vl0, ul3, m3b
+ addq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ bge n, L(el2)
+
+ br r31, L(ta6)
+
+L(1m4): lda n, -4(n)
+ ldq ul1, 0(up)
+ lda up, 8(up)
+ lda rp, -24(rp)
+ bge n, L(ge1)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ addq rl1, t0, acc1
+ and acc1,numb_mask, r28
+ srl acc1,NUMB_BITS, t1
+ stq r28, 24(rp)
+ addq t1, m1b, r0
+ ret r31, (r26), 1
+
+L(ge1): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, r31, acc1
+ umulh vl0, ul0, m0b
+ addq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ blt n, L(ta5)
+
+L(ge5): ldq ul2, 0(up)
+ br r31, L(el1)
+
+ ALIGN(16)
+L(top): mulq vl0, ul0, m0a C U1
+ addq t0, m0b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -24(rp) C L1
+C
+L(el2): umulh vl0, ul0, m0b C U1
+ and acc0,numb_mask, r28 C L0
+ addq rl1, acc1, acc1 C U0
+ ldq rl2, 0(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m2a,NAIL_BITS, t0 C U0
+ ldq ul2, 0(up) C L1
+C
+ mulq vl0, ul1, m1a C U1
+ addq t0, m1b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, -16(rp) C L1
+C
+L(el1): umulh vl0, ul1, m1b C U1
+ and acc1,numb_mask, r28 C L0
+ addq rl2, acc0, acc0 C U0
+ ldq rl3, 8(rp) C L1
+C
+ lda n, -4(n) C L1
+ addq t1, acc0, acc0 C L0
+ srl m3a,NAIL_BITS, t0 C U0
+ ldq ul3, 8(up) C L1
+C
+ mulq vl0, ul2, m2a C U1
+ addq t0, m2b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -8(rp) C L1
+C
+L(el0): umulh vl0, ul2, m2b C U1
+ and acc0,numb_mask, r28 C L0
+ addq rl3, acc1, acc1 C U0
+ ldq rl0, 16(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m0a,NAIL_BITS, t0 C U0
+ ldq ul0, 16(up) C L1
+C
+ mulq vl0, ul3, m3a C U1
+ addq t0, m3b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, 0(rp) C L1
+C
+L(el3): umulh vl0, ul3, m3b C U1
+ and acc1,numb_mask, r28 C L0
+ addq rl0, acc0, acc0 C U0
+ ldq rl1, 24(rp) C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m1a,NAIL_BITS, t0 C U0
+ ldq ul1, 24(up) C L1
+C
+ lda up, 32(up) C L0
+ unop C U1
+ lda rp, 32(rp) C L1
+ bge n, L(top) C U0
+
+L(end): mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -24(rp)
+L(ta6): umulh vl0, ul0, m0b
+ and acc0,numb_mask, r28
+ addq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ addq t1, acc1, acc1
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, -16(rp)
+L(ta5): umulh vl0, ul1, m1b
+ and acc1,numb_mask, r28
+ addq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ addq t1, acc0, acc0
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -8(rp)
+ unop
+ ALIGN(16)
+L(ta4): and acc0,numb_mask, r28
+ addq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ addq t1, acc1, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, 0(rp)
+ unop
+ ALIGN(16)
+L(ta3): and acc1,numb_mask, r28
+ addq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ addq t1, acc0, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, 8(rp)
+ unop
+ ALIGN(16)
+L(ta2): and acc0,numb_mask, r28
+ addq rl1, acc1, acc1
+ addq t1, acc1, acc1
+ srl acc1,NUMB_BITS, t1
+ stq r28, 16(rp)
+ and acc1,numb_mask, r28
+ addq t1, m1b, r0
+ stq r28, 24(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm
new file mode 100644
index 0000000..6ff6b3a
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm
@@ -0,0 +1,146 @@
+dnl Alpha ev6 nails mpn_addmul_2.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 4.0 cycles/limb.
+
+C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l,
+C or 4-way unrolling over 20 cycles, for 2.5 c/l.
+
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+
+define(`v0',`r6')
+define(`v1',`r7')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+ lda numb_mask,-1(r31)
+ srl numb_mask,NAIL_BITS,numb_mask
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, r19
+
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ beq n, L(end) C U0
+
+ ALIGN(16)
+L(top): bis r31, r31, r31 C U1 nop
+ addq r19, acc0, acc0 C U0 propagate nail
+ ldq rlimb, 0(rp) C L0
+ ldq ulimb, 0(up) C L1
+
+ lda rp, 8(rp) C L1
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up) C L0
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19 C U0
+ addq m0b, acc1, acc0 C L1
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C L0 nop
+
+ addq rlimb, r19, r19 C L1 FINAL PROD-SUM
+ srl m1a,NAIL_BITS, r8 C U0
+ lda n, -1(n) C L0
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0 C U0
+ bis r31, m1b, acc1 C L1
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C L0 extract numb part
+
+ unop
+ srl r19,NUMB_BITS, r19 C U1 extract nail part
+ stq r28, -8(rp) C L1
+ bne n, L(top) C U0
+
+L(end): ldq rlimb, 0(rp)
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ bis r31, m1b, acc1
+ and r19,numb_mask, r28 C extract limb
+
+ srl r19,NUMB_BITS, r19 C extract nail
+ stq r28, -8(rp)
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp)
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, r0
+
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm
new file mode 100644
index 0000000..a1ffb68
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm
@@ -0,0 +1,169 @@
+dnl Alpha ev6 nails mpn_addmul_3.
+
+dnl Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 3.0 cycles/limb.
+
+C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c).
+
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+ lda numb_mask,-1(r31)
+ srl numb_mask,NAIL_BITS,numb_mask
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+ ldq v2, 16(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, acc2 C zero acc2
+ sll v2,NAIL_BITS, v2
+ bis r31, r31, r19
+
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ mulq v2, ulimb, m2a C U1
+ umulh v2, ulimb, m2b C U1
+ beq n, L(end) C U0
+
+ ALIGN(16)
+L(top): ldq rlimb, 0(rp) C L1
+ ldq ulimb, 0(up) C L0
+ bis r31, r31, r31 C U0 nop
+ addq r19, acc0, acc0 C U1 propagate nail
+
+ lda rp, 8(rp) C L1
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up) C L0
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19 C U0
+ addq m0b, acc1, acc0 C L1
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C L0 nop
+
+ addq rlimb, r19, r19 C L1
+ srl m1a,NAIL_BITS, r8 C U0
+ bis r31, r31, r31 C L0 nop
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0 C U0
+ addq m1b, acc2, acc1 C L1
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C L0 extract numb part
+
+ bis r31, r31, r31 C L1 nop
+ srl m2a,NAIL_BITS, r8 C U0
+ lda n, -1(n) C L0
+ mulq v2, ulimb, m2a C U1
+
+ addq r8, acc1, acc1 C L0
+ bis r31, m2b, acc2 C L1
+ umulh v2, ulimb, m2b C U1
+ srl r19,NUMB_BITS, r19 C U0 extract nail part
+
+ stq r28, -8(rp) C L
+ bne n, L(top) C U0
+
+L(end): ldq rlimb, 0(rp)
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp)
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ addq m1b, acc2, acc1
+ and r19,numb_mask, r28 C extract limb
+ srl m2a,NAIL_BITS, r8 C U0
+ addq r8, acc1, acc1
+ bis r31, m2b, acc2
+ srl r19,NUMB_BITS, r19 C extract nail
+ stq r28, -8(rp)
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp)
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, acc1
+
+ and acc1,numb_mask, r28
+ stq r28, 8(rp)
+ srl acc1,NUMB_BITS, r19
+ addq r19, acc2, m0a
+
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm
new file mode 100644
index 0000000..77e02a4
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm
@@ -0,0 +1,210 @@
+dnl Alpha ev6 nails mpn_addmul_4.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 2.5 cycles/limb.
+
+C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
+C to 3.24 insn/cycle.
+
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r12')
+define(`m3b',`r13')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+define(`acc3',`r14')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+define(`v3',`r15')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(4-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_4)
+ lda r30, -240(r30)
+ stq r12, 32(r30)
+ stq r13, 40(r30)
+ stq r14, 48(r30)
+ stq r15, 56(r30)
+
+ lda numb_mask,-1(r31)
+ srl numb_mask,NAIL_BITS,numb_mask
+
+ ldq v0, 0(vp)
+ ldq v1, 8(vp)
+ ldq v2, 16(vp)
+ ldq v3, 24(vp)
+
+ bis r31, r31, acc0 C zero acc0
+ sll v0,NAIL_BITS, v0
+ bis r31, r31, acc1 C zero acc1
+ sll v1,NAIL_BITS, v1
+ bis r31, r31, acc2 C zero acc2
+ sll v2,NAIL_BITS, v2
+ bis r31, r31, acc3 C zero acc3
+ sll v3,NAIL_BITS, v3
+ bis r31, r31, r19
+
+ ldq ulimb, 0(up)
+ lda up, 8(up)
+ mulq v0, ulimb, m0a C U1
+ umulh v0, ulimb, m0b C U1
+ mulq v1, ulimb, m1a C U1
+ umulh v1, ulimb, m1b C U1
+ lda n, -1(n)
+ mulq v2, ulimb, m2a C U1
+ umulh v2, ulimb, m2b C U1
+ mulq v3, ulimb, m3a C U1
+ umulh v3, ulimb, m3b C U1
+ beq n, L(end) C U0
+
+ ALIGN(16)
+L(top): bis r31, r31, r31 C U1 nop
+ ldq rlimb, 0(rp) C L0
+ ldq ulimb, 0(up) C L1
+ addq r19, acc0, acc0 C U0 propagate nail
+
+ bis r31, r31, r31 C L0 nop
+ bis r31, r31, r31 C U1 nop
+ bis r31, r31, r31 C L1 nop
+ bis r31, r31, r31 C U0 nop
+
+ lda rp, 8(rp) C L0
+ srl m0a,NAIL_BITS, r8 C U0
+ lda up, 8(up) C L1
+ mulq v0, ulimb, m0a C U1
+
+ addq r8, acc0, r19 C U0
+ addq m0b, acc1, acc0 C L0
+ umulh v0, ulimb, m0b C U1
+ bis r31, r31, r31 C L1 nop
+
+ addq rlimb, r19, r19 C L0
+ srl m1a,NAIL_BITS, r8 C U0
+ bis r31, r31, r31 C L1 nop
+ mulq v1, ulimb, m1a C U1
+
+ addq r8, acc0, acc0 C U0
+ addq m1b, acc2, acc1 C L0
+ umulh v1, ulimb, m1b C U1
+ and r19,numb_mask, r28 C L1 extract numb part
+
+ bis r31, r31, r31 C L0 nop
+ srl m2a,NAIL_BITS, r8 C U0
+ lda n, -1(n) C L1
+ mulq v2, ulimb, m2a C U1
+
+ addq r8, acc1, acc1 C L1
+ addq m2b, acc3, acc2 C L0
+ umulh v2, ulimb, m2b C U1
+ srl r19,NUMB_BITS, r19 C U0 extract nail part
+
+ bis r31, r31, r31 C L0 nop
+ srl m3a,NAIL_BITS, r8 C U0
+ stq r28, -8(rp) C L1
+ mulq v3, ulimb, m3a C U1
+
+ addq r8, acc2, acc2 C L0
+ bis r31, m3b, acc3 C L1
+ umulh v3, ulimb, m3b C U1
+ bne n, L(top) C U0
+
+L(end): ldq rlimb, 0(rp)
+ addq r19, acc0, acc0 C propagate nail
+ lda rp, 8(rp) C FIXME: DELETE
+ srl m0a,NAIL_BITS, r8 C U0
+ addq r8, acc0, r19
+ addq m0b, acc1, acc0
+ addq rlimb, r19, r19
+ srl m1a,NAIL_BITS, r8 C U0
+ addq r8, acc0, acc0
+ addq m1b, acc2, acc1
+ and r19,numb_mask, r28 C extract limb
+ srl m2a,NAIL_BITS, r8 C U0
+ addq r8, acc1, acc1
+ addq m2b, acc3, acc2
+ srl r19,NUMB_BITS, r19 C extract nail
+ srl m3a,NAIL_BITS, r8 C U0
+ stq r28, -8(rp)
+ addq r8, acc2, acc2
+ bis r31, m3b, acc3
+
+ addq r19, acc0, acc0 C propagate nail
+ and acc0,numb_mask, r28
+ stq r28, 0(rp)
+ srl acc0,NUMB_BITS, r19
+ addq r19, acc1, acc1
+
+ and acc1,numb_mask, r28
+ stq r28, 8(rp)
+ srl acc1,NUMB_BITS, r19
+ addq r19, acc2, acc2
+
+ and acc2,numb_mask, r28
+ stq r28, 16(rp)
+ srl acc2,NUMB_BITS, r19
+ addq r19, acc3, r0
+
+ ldq r12, 32(r30)
+ ldq r13, 40(r30)
+ ldq r14, 48(r30)
+ ldq r15, 56(r30)
+ lda r30, 240(r30)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm
new file mode 100644
index 0000000..f658677
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm
@@ -0,0 +1,233 @@
+dnl Alpha ev6 nails mpn_add_n and mpn_sub_n.
+
+dnl Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb
+dnl with 8-way unrolling.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n',`r19')
+
+define(`rl0',`r0')
+define(`rl1',`r1')
+define(`rl2',`r2')
+define(`rl3',`r3')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r6')
+define(`ul3',`r7')
+
+define(`vl0',`r22')
+define(`vl1',`r23')
+define(`vl2',`r24')
+define(`vl3',`r25')
+
+define(`numb_mask',`r21')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`CYSH',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+ifdef(`OPERATION_add_n', `
+ define(`OP', addq)
+ define(`CYSH',`GMP_NUMB_BITS')
+ define(`func', mpn_add_n)')
+ifdef(`OPERATION_sub_n', `
+ define(`OP', subq)
+ define(`CYSH',63)
+ define(`func', mpn_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ lda numb_mask, -1(r31)
+ srl numb_mask, NAIL_BITS, numb_mask
+ bis r31, r31, r20
+
+ and n, 3, r25
+ lda n, -4(n)
+ beq r25, L(ge4)
+
+L(lp0): ldq ul0, 0(up)
+ lda up, 8(up)
+ ldq vl0, 0(vp)
+ lda vp, 8(vp)
+ lda rp, 8(rp)
+ lda r25, -1(r25)
+ OP ul0, vl0, rl0
+ OP rl0, r20, rl0
+ and rl0, numb_mask, r28
+ stq r28, -8(rp)
+ srl rl0, CYSH, r20
+ bne r25, L(lp0)
+
+ blt n, L(ret)
+
+L(ge4): ldq ul0, 0(up)
+ ldq vl0, 0(vp)
+ ldq ul1, 8(up)
+ ldq vl1, 8(vp)
+ ldq ul2, 16(up)
+ ldq vl2, 16(vp)
+ ldq ul3, 24(up)
+ ldq vl3, 24(vp)
+ lda up, 32(up)
+ lda vp, 32(vp)
+ lda n, -4(n)
+ bge n, L(ge8)
+
+ OP ul0, vl0, rl0 C main-add 0
+ OP rl0, r20, rl0 C cy-add 0
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ br r31, L(cj0)
+
+L(ge8): OP ul0, vl0, rl0 C main-add 0
+ ldq ul0, 0(up)
+ ldq vl0, 0(vp)
+ OP rl0, r20, rl0 C cy-add 0
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ ldq ul1, 8(up)
+ ldq vl1, 8(vp)
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ OP ul2, vl2, rl2 C main-add 2
+ srl rl1, CYSH, r20 C gen cy 1
+ ldq ul2, 16(up)
+ ldq vl2, 16(vp)
+ OP rl2, r20, rl2 C cy-add 2
+ and rl1,numb_mask, r28
+ stq r27, 0(rp)
+ OP ul3, vl3, rl3 C main-add 3
+ srl rl2, CYSH, r20 C gen cy 2
+ ldq ul3, 24(up)
+ ldq vl3, 24(vp)
+ OP rl3, r20, rl3 C cy-add 3
+ and rl2,numb_mask, r27
+ stq r28, 8(rp)
+ lda rp, 32(rp)
+ lda up, 32(up)
+ lda vp, 32(vp)
+ lda n, -4(n)
+ blt n, L(end)
+
+ ALIGN(32)
+L(top): OP ul0, vl0, rl0 C main-add 0
+ srl rl3, CYSH, r20 C gen cy 3
+ ldq ul0, 0(up)
+ ldq vl0, 0(vp)
+
+ OP rl0, r20, rl0 C cy-add 0
+ and rl3,numb_mask, r28
+ stq r27, -16(rp)
+ bis r31, r31, r31
+
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ ldq ul1, 8(up)
+ ldq vl1, 8(vp)
+
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ stq r28, -8(rp)
+ bis r31, r31, r31
+
+ OP ul2, vl2, rl2 C main-add 2
+ srl rl1, CYSH, r20 C gen cy 1
+ ldq ul2, 16(up)
+ ldq vl2, 16(vp)
+
+ OP rl2, r20, rl2 C cy-add 2
+ and rl1,numb_mask, r28
+ stq r27, 0(rp)
+ bis r31, r31, r31
+
+ OP ul3, vl3, rl3 C main-add 3
+ srl rl2, CYSH, r20 C gen cy 2
+ ldq ul3, 24(up)
+ ldq vl3, 24(vp)
+
+ OP rl3, r20, rl3 C cy-add 3
+ and rl2,numb_mask, r27
+ stq r28, 8(rp)
+ bis r31, r31, r31
+
+ bis r31, r31, r31
+ lda n, -4(n)
+ lda up, 32(up)
+ lda vp, 32(vp)
+
+ bis r31, r31, r31
+ bis r31, r31, r31
+ lda rp, 32(rp)
+ bge n, L(top)
+
+L(end): OP ul0, vl0, rl0 C main-add 0
+ srl rl3, CYSH, r20 C gen cy 3
+ OP rl0, r20, rl0 C cy-add 0
+ and rl3,numb_mask, r28
+ stq r27, -16(rp)
+ OP ul1, vl1, rl1 C main-add 1
+ srl rl0, CYSH, r20 C gen cy 0
+ OP rl1, r20, rl1 C cy-add 1
+ and rl0,numb_mask, r27
+ stq r28, -8(rp)
+L(cj0): OP ul2, vl2, rl2 C main-add 2
+ srl rl1, CYSH, r20 C gen cy 1
+ OP rl2, r20, rl2 C cy-add 2
+ and rl1,numb_mask, r28
+ stq r27, 0(rp)
+ OP ul3, vl3, rl3 C main-add 3
+ srl rl2, CYSH, r20 C gen cy 2
+ OP rl3, r20, rl3 C cy-add 3
+ and rl2,numb_mask, r27
+ stq r28, 8(rp)
+
+ srl rl3, CYSH, r20 C gen cy 3
+ and rl3,numb_mask, r28
+ stq r27, 16(rp)
+ stq r28, 24(rp)
+
+L(ret): and r20, 1, r0
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h b/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h
new file mode 100644
index 0000000..7949fe8
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */
+
+#define MUL_TOOM22_THRESHOLD 40
+#define MUL_TOOM33_THRESHOLD 236
+
+#define SQR_BASECASE_THRESHOLD 7 /* karatsuba */
+#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */
+#define SQR_TOOM3_THRESHOLD 120
+
+#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIV_DC_THRESHOLD 48
+#define POWM_THRESHOLD 113
+
+#define HGCD_THRESHOLD 78
+#define GCD_ACCEL_THRESHOLD 3
+#define GCD_DC_THRESHOLD 392
+#define JACOBI_BASE_METHOD 1
+
+#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define USE_PREINV_DIVREM_1 0 /* no preinv with nails */
+#define USE_PREINV_MOD_1 0 /* no preinv with nails */
+#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+#define GET_STR_DC_THRESHOLD 15
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_THRESHOLD 6336
+
+#define MUL_FFT_TABLE { 688, 1440, 3648, 6400, 25600, 0 }
+#define MUL_FFT_MODF_THRESHOLD 488
+#define MUL_FFT_THRESHOLD 3712
+
+#define SQR_FFT_TABLE { 432, 864, 3136, 6400, 25600, 0 }
+#define SQR_FFT_MODF_THRESHOLD 480
+#define SQR_FFT_THRESHOLD 2976
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm
new file mode 100644
index 0000000..da2ee3d
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm
@@ -0,0 +1,364 @@
+dnl Alpha ev6 nails mpn_mul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3.25
+
+C TODO
+C * Reroll loop for 3.0 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ sll vl0, NAIL_BITS, vl0
+ lda numb_mask, -1(r31)
+ srl numb_mask, NAIL_BITS, numb_mask
+
+ and n, 3, r25
+ cmpeq r25, 1, r21
+ bne r21, L(1m4)
+ cmpeq r25, 2, r21
+ bne r21, L(2m4)
+ beq r25, L(0m4)
+
+L(3m4): ldq ul3, 0(up)
+ lda n, -4(n)
+ ldq ul0, 8(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 16(up)
+ lda up, 24(up)
+ lda rp, -8(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge3)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(ta3)
+
+L(ge3): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, r31, acc1
+ umulh vl0, ul2, m2b
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(el3)
+
+L(0m4): lda n, -8(n)
+ ldq ul2, 0(up)
+ ldq ul3, 8(up)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge4)
+
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta4)
+
+L(ge4): srl m2a,NAIL_BITS, t0
+ ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(el0)
+
+L(2m4): lda n, -4(n)
+ ldq ul0, 0(up)
+ ldq ul1, 8(up)
+ lda up, 16(up)
+ lda rp, -16(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge2)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m0a,NAIL_BITS, t0
+ addq t0, r31, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta2)
+
+L(ge2): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, r31, acc0
+ umulh vl0, ul3, m3b
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ bge n, L(el2)
+
+ br r31, L(ta6)
+
+L(1m4): lda n, -4(n)
+ ldq ul1, 0(up)
+ lda up, 8(up)
+ lda rp, -24(rp)
+ bge n, L(ge1)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m1a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ and acc1,numb_mask, r28
+ srl acc1,NUMB_BITS, t1
+ stq r28, 24(rp)
+ addq t1, m1b, r0
+ ret r31, (r26), 1
+
+L(ge1): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, r31, acc1
+ umulh vl0, ul0, m0b
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ blt n, L(ta5)
+
+L(ge5): ldq ul2, 0(up)
+ br r31, L(el1)
+
+ ALIGN(16)
+L(top): mulq vl0, ul0, m0a C U1
+ addq t0, m0b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -24(rp) C L1
+C
+L(el2): umulh vl0, ul0, m0b C U1
+ and acc0,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m2a,NAIL_BITS, t0 C U0
+ ldq ul2, 0(up) C L1
+C
+ mulq vl0, ul1, m1a C U1
+ addq t0, m1b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, -16(rp) C L1
+C
+L(el1): umulh vl0, ul1, m1b C U1
+ and acc1,numb_mask, r28 C L0
+ unop C U0
+ lda n, -4(n) C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m3a,NAIL_BITS, t0 C U0
+ ldq ul3, 8(up) C L1
+C
+ mulq vl0, ul2, m2a C U1
+ addq t0, m2b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -8(rp) C L1
+C
+L(el0): umulh vl0, ul2, m2b C U1
+ and acc0,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m0a,NAIL_BITS, t0 C U0
+ ldq ul0, 16(up) C L1
+C
+ mulq vl0, ul3, m3a C U1
+ addq t0, m3b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, 0(rp) C L1
+C
+L(el3): umulh vl0, ul3, m3b C U1
+ and acc1,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m1a,NAIL_BITS, t0 C U0
+ ldq ul1, 24(up) C L1
+C
+ lda up, 32(up) C L0
+ unop C U1
+ lda rp, 32(rp) C L1
+ bge n, L(top) C U0
+
+L(end): mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -24(rp)
+L(ta6): umulh vl0, ul0, m0b
+ and acc0,numb_mask, r28
+ addq t1, acc1, acc1
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, -16(rp)
+L(ta5): umulh vl0, ul1, m1b
+ and acc1,numb_mask, r28
+ addq t1, acc0, acc0
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -8(rp)
+ ALIGN(16)
+L(ta4): and acc0,numb_mask, r28
+ addq t1, acc1, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, 0(rp)
+ unop
+ ALIGN(16)
+L(ta3): and acc1,numb_mask, r28
+ addq t1, acc0, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, 8(rp)
+ unop
+ ALIGN(16)
+L(ta2): and acc0,numb_mask, r28
+ addq t1, acc1, acc1
+ srl acc1,NUMB_BITS, t1
+ stq r28, 16(rp)
+ and acc1,numb_mask, r28
+ addq t1, m1b, r0
+ stq r28, 24(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm
new file mode 100644
index 0000000..f473a59
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm
@@ -0,0 +1,396 @@
+dnl Alpha ev6 nails mpn_submul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 4
+
+C TODO
+C * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ sll vl0, NAIL_BITS, vl0
+ lda numb_mask, -1(r31)
+ srl numb_mask, NAIL_BITS, numb_mask
+
+ and n, 3, r25
+ cmpeq r25, 1, r21
+ bne r21, L(1m4)
+ cmpeq r25, 2, r21
+ bne r21, L(2m4)
+ beq r25, L(0m4)
+
+L(3m4): ldq ul3, 0(up)
+ lda n, -4(n)
+ ldq ul0, 8(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 16(up)
+ lda up, 24(up)
+ lda rp, -8(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge3)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ subq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ sra acc1,NUMB_BITS, t1
+ br r31, L(ta3)
+
+L(ge3): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, r31, acc1
+ umulh vl0, ul2, m2b
+ subq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, m3b, acc0
+ sra acc1,NUMB_BITS, t1
+ br r31, L(el3)
+
+L(0m4): lda n, -8(n)
+ ldq ul2, 0(up)
+ ldq ul3, 8(up)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge4)
+
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ subq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ sra acc0,NUMB_BITS, t1
+ br r31, L(ta4)
+
+L(ge4): ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ subq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, m2b, acc1
+ sra acc0,NUMB_BITS, t1
+ br r31, L(el0)
+
+L(2m4): lda n, -4(n)
+ ldq ul0, 0(up)
+ ldq ul1, 8(up)
+ lda up, 16(up)
+ lda rp, -16(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge2)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ addq t0, r31, acc0
+ subq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ br r31, L(ta2)
+
+L(ge2): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq rl0, 16(rp)
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, r31, acc0
+ umulh vl0, ul3, m3b
+ subq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ bge n, L(el2)
+
+ br r31, L(ta6)
+
+L(1m4): lda n, -4(n)
+ ldq ul1, 0(up)
+ lda up, 8(up)
+ lda rp, -24(rp)
+ bge n, L(ge1)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ subq rl1, t0, acc1
+ and acc1,numb_mask, r28
+ sra acc1,NUMB_BITS, t1
+ stq r28, 24(rp)
+ subq m1b, t1, r0
+ ret r31, (r26), 1
+
+L(ge1): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq rl1, 24(rp)
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, r31, acc1
+ umulh vl0, ul0, m0b
+ subq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ sra acc1,NUMB_BITS, t1
+ blt n, L(ta5)
+
+L(ge5): ldq ul2, 0(up)
+ br r31, L(el1)
+
+ ALIGN(16)
+L(top): mulq vl0, ul0, m0a C U1
+ addq t0, m0b, acc1 C L0
+ sra acc0,NUMB_BITS, t1 C U0
+ stq r28, -24(rp) C L1
+C
+L(el2): umulh vl0, ul0, m0b C U1
+ and acc0,numb_mask, r28 C L0
+ subq rl1, acc1, acc1 C U0
+ ldq rl2, 0(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m2a,NAIL_BITS, t0 C U0
+ ldq ul2, 0(up) C L1
+C
+ mulq vl0, ul1, m1a C U1
+ addq t0, m1b, acc0 C L0
+ sra acc1,NUMB_BITS, t1 C U0
+ stq r28, -16(rp) C L1
+C
+L(el1): umulh vl0, ul1, m1b C U1
+ and acc1,numb_mask, r28 C L0
+ subq rl2, acc0, acc0 C U0
+ ldq rl3, 8(rp) C L1
+C
+ lda n, -4(n) C L1
+ addq t1, acc0, acc0 C L0
+ srl m3a,NAIL_BITS, t0 C U0
+ ldq ul3, 8(up) C L1
+C
+ mulq vl0, ul2, m2a C U1
+ addq t0, m2b, acc1 C L0
+ sra acc0,NUMB_BITS, t1 C U0
+ stq r28, -8(rp) C L1
+C
+L(el0): umulh vl0, ul2, m2b C U1
+ and acc0,numb_mask, r28 C L0
+ subq rl3, acc1, acc1 C U0
+ ldq rl0, 16(rp) C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m0a,NAIL_BITS, t0 C U0
+ ldq ul0, 16(up) C L1
+C
+ mulq vl0, ul3, m3a C U1
+ addq t0, m3b, acc0 C L0
+ sra acc1,NUMB_BITS, t1 C U0
+ stq r28, 0(rp) C L1
+C
+L(el3): umulh vl0, ul3, m3b C U1
+ and acc1,numb_mask, r28 C L0
+ subq rl0, acc0, acc0 C U0
+ ldq rl1, 24(rp) C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m1a,NAIL_BITS, t0 C U0
+ ldq ul1, 24(up) C L1
+C
+ lda up, 32(up) C L0
+ unop C U1
+ lda rp, 32(rp) C L1
+ bge n, L(top) C U0
+
+L(end): mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ stq r28, -24(rp)
+L(ta6): umulh vl0, ul0, m0b
+ and acc0,numb_mask, r28
+ subq rl1, acc1, acc1
+ ldq rl2, 0(rp)
+ addq t1, acc1, acc1
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ sra acc1,NUMB_BITS, t1
+ stq r28, -16(rp)
+L(ta5): umulh vl0, ul1, m1b
+ and acc1,numb_mask, r28
+ subq rl2, acc0, acc0
+ ldq rl3, 8(rp)
+ addq t1, acc0, acc0
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ sra acc0,NUMB_BITS, t1
+ stq r28, -8(rp)
+ unop
+ ALIGN(16)
+L(ta4): and acc0,numb_mask, r28
+ subq rl3, acc1, acc1
+ ldq rl0, 16(rp)
+ addq t1, acc1, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ sra acc1,NUMB_BITS, t1
+ stq r28, 0(rp)
+ unop
+ ALIGN(16)
+L(ta3): and acc1,numb_mask, r28
+ subq rl0, acc0, acc0
+ ldq rl1, 24(rp)
+ addq t1, acc0, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ sra acc0,NUMB_BITS, t1
+ stq r28, 8(rp)
+ unop
+ ALIGN(16)
+L(ta2): and acc0,numb_mask, r28
+ subq rl1, acc1, acc1
+ addq t1, acc1, acc1
+ sra acc1,NUMB_BITS, t1
+ stq r28, 16(rp)
+ and acc1,numb_mask, r28
+ subq m1b, t1, r0
+ stq r28, 24(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()