9 files changed, 2051 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/README b/gmp-6.3.0/mpn/alpha/ev6/nails/README
new file mode 100644
index 0000000..b214ac5
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/README
@@ -0,0 +1,65 @@
+Copyright 2002, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains assembly code for nails-enabled 21264.  The code is not
+very well optimized.
+
+For addmul_N, as N grows larger, we could make multiple loads together, then do
+about 3.3 i/c.  10 cycles after the last load, we can increase to 4 i/c.  This
+would surely allow addmul_4 to run at 2 c/l, but the same should be possible
+also for addmul_3 and perhaps even addmul_2.
+
+
+		current		fair		best
+Routine		c/l  unroll	c/l  unroll	c/l  i/c
+mul_1		3.25		2.75		2.75 3.273
+addmul_1	4.0	4	3.5	4 14	3.25 3.385
+addmul_2	4.0	1	2.5	2 10	2.25 3.333
+addmul_3	3.0	1	2.33	2 14	2    3.333
+addmul_4	2.5	1	2.125	2 17	2    3.135
+
+addmul_5			2	1 10
+addmul_6			2	1 12
+addmul_7			2	1 14
+
+(The "best" column doesn't account for bookkeeping instructions and
+thereby assumes infinite unrolling.)
+
+Basecase usages:
+
+1	 addmul_1
+2	 addmul_2
+3	 addmul_3
+4	 addmul_4
+5	 addmul_3 + addmul_2	2.3998
+6	 addmul_4 + addmul_2
+7	 addmul_4 + addmul_3
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm
new file mode 100644
index 0000000..711d4e6
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm
@@ -0,0 +1,396 @@
+dnl  Alpha ev6 nails mpn_addmul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     4
+
+C TODO
+C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C    umulh.
+C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C    and would work since the loop structure is really regular.
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	sll	vl0, NAIL_BITS, vl0
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+
+	and	n,	3,	r25
+	cmpeq	r25,	1,	r21
+	bne	r21,	L(1m4)
+	cmpeq	r25,	2,	r21
+	bne	r21,	L(2m4)
+	beq	r25,	L(0m4)
+
+L(3m4):	ldq	ul3,	0(up)
+	lda	n,	-4(n)
+	ldq	ul0,	8(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	16(up)
+	lda	up,	24(up)
+	lda	rp,	-8(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge3)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	addq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(ta3)
+
+L(ge3):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul2,	m2b
+	addq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(el3)
+
+L(0m4):	lda	n,	-8(n)
+	ldq	ul2,	0(up)
+	ldq	ul3,	8(up)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge4)
+
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	addq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta4)
+
+L(ge4):	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	addq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(el0)
+
+L(2m4):	lda	n,	-4(n)
+	ldq	ul0,	0(up)
+	ldq	ul1,	8(up)
+	lda	up,	16(up)
+	lda	rp,	-16(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge2)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc0
+	addq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta2)
+
+L(ge2):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul3,	m3b
+	addq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	bge	n,	L(el2)
+
+	br	r31,	L(ta6)
+
+L(1m4):	lda	n,	-4(n)
+	ldq	ul1,	0(up)
+	lda	up,	8(up)
+	lda	rp,	-24(rp)
+	bge	n,	L(ge1)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	addq	rl1,	t0,	acc1
+	and	acc1,numb_mask,	r28
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	24(rp)
+	addq	t1,	m1b,	r0
+	ret	r31,	(r26),	1
+
+L(ge1):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul0,	m0b
+	addq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	blt	n,	L(ta5)
+
+L(ge5):	ldq	ul2,	0(up)
+	br	r31,	L(el1)
+
+	ALIGN(16)
+L(top):	mulq	vl0,	ul0,	m0a		C U1
+	addq	t0,	m0b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-24(rp)			C L1
+C
+L(el2):	umulh	vl0,	ul0,	m0b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	addq	rl1,	acc1,	acc1		C U0
+	ldq	rl2,	0(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m2a,NAIL_BITS,	t0		C U0
+	ldq	ul2,	0(up)			C L1
+C
+	mulq	vl0,	ul1,	m1a		C U1
+	addq	t0,	m1b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	-16(rp)			C L1
+C
+L(el1):	umulh	vl0,	ul1,	m1b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	addq	rl2,	acc0,	acc0		C U0
+	ldq	rl3,	8(rp)			C L1
+C
+	lda	n,	-4(n)			C L1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m3a,NAIL_BITS,	t0		C U0
+	ldq	ul3,	8(up)			C L1
+C
+	mulq	vl0,	ul2,	m2a		C U1
+	addq	t0,	m2b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-8(rp)			C L1
+C
+L(el0):	umulh	vl0,	ul2,	m2b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	addq	rl3,	acc1,	acc1		C U0
+	ldq	rl0,	16(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m0a,NAIL_BITS,	t0		C U0
+	ldq	ul0,	16(up)			C L1
+C
+	mulq	vl0,	ul3,	m3a		C U1
+	addq	t0,	m3b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	0(rp)			C L1
+C
+L(el3):	umulh	vl0,	ul3,	m3b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	addq	rl0,	acc0,	acc0		C U0
+	ldq	rl1,	24(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m1a,NAIL_BITS,	t0		C U0
+	ldq	ul1,	24(up)			C L1
+C
+	lda	up,	32(up)			C L0
+	unop					C U1
+	lda	rp,	32(rp)			C L1
+	bge	n,	L(top)			C U0
+
+L(end):	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-24(rp)
+L(ta6):	umulh	vl0,	ul0,	m0b
+	and	acc0,numb_mask,	r28
+	addq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	addq	t1,	acc1,	acc1
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	-16(rp)
+L(ta5):	umulh	vl0,	ul1,	m1b
+	and	acc1,numb_mask,	r28
+	addq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	addq	t1,	acc0,	acc0
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-8(rp)
+	unop
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask,	r28
+	addq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	addq	t1,	acc1,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask,	r28
+	addq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	addq	t1,	acc0,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	8(rp)
+	unop
+	ALIGN(16)
+L(ta2):	and	acc0,numb_mask,	r28
+	addq	rl1,	acc1,	acc1
+	addq	t1,	acc1,	acc1
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	16(rp)
+	and	acc1,numb_mask,	r28
+	addq	t1,	m1b,	r0
+	stq	r28,	24(rp)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm
new file mode 100644
index 0000000..6ff6b3a
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm
@@ -0,0 +1,146 @@
+dnl  Alpha ev6 nails mpn_addmul_2.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 4.0 cycles/limb.
+
+C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l,
+C or 4-way unrolling over 20 cycles, for 2.5 c/l.
+
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C  Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+
+define(`v0',`r6')
+define(`v1',`r7')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C  This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+	lda	numb_mask,-1(r31)
+	srl	numb_mask,NAIL_BITS,numb_mask
+
+	ldq	v0,	0(vp)
+	ldq	v1,	8(vp)
+
+	bis	r31,	r31,	acc0		C	zero acc0
+	sll	v0,NAIL_BITS,	v0
+	bis	r31,	r31,	acc1		C	zero acc1
+	sll	v1,NAIL_BITS,	v1
+	bis	r31,	r31,	r19
+
+	ldq	ulimb,	0(up)
+	lda	up,	8(up)
+	mulq	v0,	ulimb,	m0a		C U1
+	umulh	v0,	ulimb,	m0b		C U1
+	mulq	v1,	ulimb,	m1a		C U1
+	umulh	v1,	ulimb,	m1b		C U1
+	lda	n,	-1(n)
+	beq	n,	L(end)			C U0
+
+	ALIGN(16)
+L(top):	bis	r31,	r31,	r31		C U1	nop
+	addq	r19,	acc0,	acc0		C U0	propagate nail
+	ldq	rlimb,	0(rp)			C L0
+	ldq	ulimb,	0(up)			C L1
+
+	lda	rp,	8(rp)			C L1
+	srl	m0a,NAIL_BITS,	r8		C U0
+	lda	up,	8(up)			C L0
+	mulq	v0,	ulimb,	m0a		C U1
+
+	addq	r8,	acc0,	r19		C U0
+	addq	m0b,	acc1,	acc0		C L1
+	umulh	v0,	ulimb,	m0b		C U1
+	bis	r31,	r31,	r31		C L0	nop
+
+	addq	rlimb,	r19,	r19		C L1	FINAL PROD-SUM
+	srl	m1a,NAIL_BITS,	r8		C U0
+	lda	n,	-1(n)			C L0
+	mulq	v1,	ulimb,	m1a		C U1
+
+	addq	r8,	acc0,	acc0		C U0
+	bis	r31,	m1b,	acc1		C L1
+	umulh	v1,	ulimb,	m1b		C U1
+	and	r19,numb_mask,	r28		C L0	extract numb part
+
+	unop
+	srl	r19,NUMB_BITS,	r19		C U1	extract nail part
+	stq	r28,	-8(rp)			C L1
+	bne	n,	L(top)			C U0
+
+L(end):	ldq	rlimb,	0(rp)
+	addq	r19,	acc0,	acc0		C	propagate nail
+	lda	rp,	8(rp)
+	srl	m0a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	r19
+	addq	m0b,	acc1,	acc0
+	addq	rlimb,	r19,	r19
+	srl	m1a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	acc0
+	bis	r31,	m1b,	acc1
+	and	r19,numb_mask,	r28		C extract limb
+
+	srl	r19,NUMB_BITS,	r19		C extract nail
+	stq	r28,	-8(rp)
+
+	addq	r19,	acc0,	acc0		C propagate nail
+	and	acc0,numb_mask,	r28
+	stq	r28,	0(rp)
+	srl	acc0,NUMB_BITS,	r19
+	addq	r19,	acc1,	r0
+
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm
new file mode 100644
index 0000000..a1ffb68
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm
@@ -0,0 +1,169 @@
+dnl  Alpha ev6 nails mpn_addmul_3.
+
+dnl  Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 3.0 cycles/limb.
+
+C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c).
+
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C  Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C  This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+	lda	numb_mask,-1(r31)
+	srl	numb_mask,NAIL_BITS,numb_mask
+
+	ldq	v0,	0(vp)
+	ldq	v1,	8(vp)
+	ldq	v2,	16(vp)
+
+	bis	r31,	r31,	acc0		C	zero acc0
+	sll	v0,NAIL_BITS,	v0
+	bis	r31,	r31,	acc1		C	zero acc1
+	sll	v1,NAIL_BITS,	v1
+	bis	r31,	r31,	acc2		C	zero acc2
+	sll	v2,NAIL_BITS,	v2
+	bis	r31,	r31,	r19
+
+	ldq	ulimb,	0(up)
+	lda	up,	8(up)
+	mulq	v0,	ulimb,	m0a		C U1
+	umulh	v0,	ulimb,	m0b		C U1
+	mulq	v1,	ulimb,	m1a		C U1
+	umulh	v1,	ulimb,	m1b		C U1
+	lda	n,	-1(n)
+	mulq	v2,	ulimb,	m2a		C U1
+	umulh	v2,	ulimb,	m2b		C U1
+	beq	n,	L(end)			C U0
+
+	ALIGN(16)
+L(top):	ldq	rlimb,	0(rp)			C L1
+	ldq	ulimb,	0(up)			C L0
+	bis	r31,	r31,	r31		C U0	nop
+	addq	r19,	acc0,	acc0		C U1	propagate nail
+
+	lda	rp,	8(rp)			C L1
+	srl	m0a,NAIL_BITS,	r8		C U0
+	lda	up,	8(up)			C L0
+	mulq	v0,	ulimb,	m0a		C U1
+
+	addq	r8,	acc0,	r19		C U0
+	addq	m0b,	acc1,	acc0		C L1
+	umulh	v0,	ulimb,	m0b		C U1
+	bis	r31,	r31,	r31		C L0	nop
+
+	addq	rlimb,	r19,	r19		C L1
+	srl	m1a,NAIL_BITS,	r8		C U0
+	bis	r31,	r31,	r31		C L0	nop
+	mulq	v1,	ulimb,	m1a		C U1
+
+	addq	r8,	acc0,	acc0		C U0
+	addq	m1b,	acc2,	acc1		C L1
+	umulh	v1,	ulimb,	m1b		C U1
+	and	r19,numb_mask,	r28		C L0	extract numb part
+
+	bis	r31,	r31,	r31		C L1	nop
+	srl	m2a,NAIL_BITS,	r8		C U0
+	lda	n,	-1(n)			C L0
+	mulq	v2,	ulimb,	m2a		C U1
+
+	addq	r8,	acc1,	acc1		C L0
+	bis	r31,	m2b,	acc2		C L1
+	umulh	v2,	ulimb,	m2b		C U1
+	srl	r19,NUMB_BITS,	r19		C U0	extract nail part
+
+	stq	r28,	-8(rp)			C L
+	bne	n,	L(top)			C U0
+
+L(end):	ldq	rlimb,	0(rp)
+	addq	r19,	acc0,	acc0		C	propagate nail
+	lda	rp,	8(rp)
+	srl	m0a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	r19
+	addq	m0b,	acc1,	acc0
+	addq	rlimb,	r19,	r19
+	srl	m1a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	acc0
+	addq	m1b,	acc2,	acc1
+	and	r19,numb_mask,	r28		C extract limb
+	srl	m2a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc1,	acc1
+	bis	r31,	m2b,	acc2
+	srl	r19,NUMB_BITS,	r19		C extract nail
+	stq	r28,	-8(rp)
+
+	addq	r19,	acc0,	acc0		C propagate nail
+	and	acc0,numb_mask,	r28
+	stq	r28,	0(rp)
+	srl	acc0,NUMB_BITS,	r19
+	addq	r19,	acc1,	acc1
+
+	and	acc1,numb_mask,	r28
+	stq	r28,	8(rp)
+	srl	acc1,NUMB_BITS,	r19
+	addq	r19,	acc2,	m0a
+
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm
new file mode 100644
index 0000000..77e02a4
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm
@@ -0,0 +1,210 @@
+dnl  Alpha ev6 nails mpn_addmul_4.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 2.5 cycles/limb.
+
+C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
+C to 3.24 insn/cycle.
+
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C  Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r12')
+define(`m3b',`r13')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+define(`acc3',`r14')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+define(`v3',`r15')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C  This declaration is munged by configure
+NAILS_SUPPORT(4-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_4)
+	lda	r30,	-240(r30)
+	stq	r12,	32(r30)
+	stq	r13,	40(r30)
+	stq	r14,	48(r30)
+	stq	r15,	56(r30)
+
+	lda	numb_mask,-1(r31)
+	srl	numb_mask,NAIL_BITS,numb_mask
+
+	ldq	v0,	0(vp)
+	ldq	v1,	8(vp)
+	ldq	v2,	16(vp)
+	ldq	v3,	24(vp)
+
+	bis	r31,	r31,	acc0		C	zero acc0
+	sll	v0,NAIL_BITS,	v0
+	bis	r31,	r31,	acc1		C	zero acc1
+	sll	v1,NAIL_BITS,	v1
+	bis	r31,	r31,	acc2		C	zero acc2
+	sll	v2,NAIL_BITS,	v2
+	bis	r31,	r31,	acc3		C	zero acc3
+	sll	v3,NAIL_BITS,	v3
+	bis	r31,	r31,	r19
+
+	ldq	ulimb,	0(up)
+	lda	up,	8(up)
+	mulq	v0,	ulimb,	m0a		C U1
+	umulh	v0,	ulimb,	m0b		C U1
+	mulq	v1,	ulimb,	m1a		C U1
+	umulh	v1,	ulimb,	m1b		C U1
+	lda	n,	-1(n)
+	mulq	v2,	ulimb,	m2a		C U1
+	umulh	v2,	ulimb,	m2b		C U1
+	mulq	v3,	ulimb,	m3a		C U1
+	umulh	v3,	ulimb,	m3b		C U1
+	beq	n,	L(end)			C U0
+
+	ALIGN(16)
+L(top):	bis	r31,	r31,	r31		C U1	nop
+	ldq	rlimb,	0(rp)			C L0
+	ldq	ulimb,	0(up)			C L1
+	addq	r19,	acc0,	acc0		C U0	propagate nail
+
+	bis	r31,	r31,	r31		C L0	nop
+	bis	r31,	r31,	r31		C U1	nop
+	bis	r31,	r31,	r31		C L1	nop
+	bis	r31,	r31,	r31		C U0	nop
+
+	lda	rp,	8(rp)			C L0
+	srl	m0a,NAIL_BITS,	r8		C U0
+	lda	up,	8(up)			C L1
+	mulq	v0,	ulimb,	m0a		C U1
+
+	addq	r8,	acc0,	r19		C U0
+	addq	m0b,	acc1,	acc0		C L0
+	umulh	v0,	ulimb,	m0b		C U1
+	bis	r31,	r31,	r31		C L1	nop
+
+	addq	rlimb,	r19,	r19		C L0
+	srl	m1a,NAIL_BITS,	r8		C U0
+	bis	r31,	r31,	r31		C L1	nop
+	mulq	v1,	ulimb,	m1a		C U1
+
+	addq	r8,	acc0,	acc0		C U0
+	addq	m1b,	acc2,	acc1		C L0
+	umulh	v1,	ulimb,	m1b		C U1
+	and	r19,numb_mask,	r28		C L1	extract numb part
+
+	bis	r31,	r31,	r31		C L0	nop
+	srl	m2a,NAIL_BITS,	r8		C U0
+	lda	n,	-1(n)			C L1
+	mulq	v2,	ulimb,	m2a		C U1
+
+	addq	r8,	acc1,	acc1		C L1
+	addq	m2b,	acc3,	acc2		C L0
+	umulh	v2,	ulimb,	m2b		C U1
+	srl	r19,NUMB_BITS,	r19		C U0	extract nail part
+
+	bis	r31,	r31,	r31		C L0	nop
+	srl	m3a,NAIL_BITS,	r8		C U0
+	stq	r28,	-8(rp)			C L1
+	mulq	v3,	ulimb,	m3a		C U1
+
+	addq	r8,	acc2,	acc2		C L0
+	bis	r31,	m3b,	acc3		C L1
+	umulh	v3,	ulimb,	m3b		C U1
+	bne	n,	L(top)			C U0
+
+L(end):	ldq	rlimb,	0(rp)
+	addq	r19,	acc0,	acc0		C	propagate nail
+	lda	rp,	8(rp)			C FIXME: DELETE
+	srl	m0a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	r19
+	addq	m0b,	acc1,	acc0
+	addq	rlimb,	r19,	r19
+	srl	m1a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	acc0
+	addq	m1b,	acc2,	acc1
+	and	r19,numb_mask,	r28		C extract limb
+	srl	m2a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc1,	acc1
+	addq	m2b,	acc3,	acc2
+	srl	r19,NUMB_BITS,	r19		C extract nail
+	srl	m3a,NAIL_BITS,	r8		C U0
+	stq	r28,	-8(rp)
+	addq	r8,	acc2,	acc2
+	bis	r31,	m3b,	acc3
+
+	addq	r19,	acc0,	acc0		C propagate nail
+	and	acc0,numb_mask,	r28
+	stq	r28,	0(rp)
+	srl	acc0,NUMB_BITS,	r19
+	addq	r19,	acc1,	acc1
+
+	and	acc1,numb_mask,	r28
+	stq	r28,	8(rp)
+	srl	acc1,NUMB_BITS,	r19
+	addq	r19,	acc2,	acc2
+
+	and	acc2,numb_mask,	r28
+	stq	r28,	16(rp)
+	srl	acc2,NUMB_BITS,	r19
+	addq	r19,	acc3,	r0
+
+	ldq	r12,	32(r30)
+	ldq	r13,	40(r30)
+	ldq	r14,	48(r30)
+	ldq	r15,	56(r30)
+	lda	r30,	240(r30)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm
new file mode 100644
index 0000000..f658677
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm
@@ -0,0 +1,233 @@
+dnl  Alpha ev6 nails mpn_add_n and mpn_sub_n.
+
+dnl  Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Runs at 2.5 cycles/limb.  It would be possible to reach 2.0 cycles/limb
+dnl  with 8-way unrolling.
+
+include(`../config.m4')
+
+dnl  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n',`r19')
+
+define(`rl0',`r0')
+define(`rl1',`r1')
+define(`rl2',`r2')
+define(`rl3',`r3')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r6')
+define(`ul3',`r7')
+
+define(`vl0',`r22')
+define(`vl1',`r23')
+define(`vl2',`r24')
+define(`vl3',`r25')
+
+define(`numb_mask',`r21')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`CYSH',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+ifdef(`OPERATION_add_n', `
+	define(`OP',        addq)
+	define(`CYSH',`GMP_NUMB_BITS')
+	define(`func',  mpn_add_n)')
+ifdef(`OPERATION_sub_n', `
+	define(`OP',        subq)
+	define(`CYSH',63)
+	define(`func',  mpn_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+	bis	r31,	r31,	r20
+
+	and	n,	3,	r25
+	lda	n,	-4(n)
+	beq	r25,	L(ge4)
+
+L(lp0):	ldq	ul0,	0(up)
+	lda	up,	8(up)
+	ldq	vl0,	0(vp)
+	lda	vp,	8(vp)
+	lda	rp,	8(rp)
+	lda	r25,	-1(r25)
+	OP	ul0,	vl0,	rl0
+	OP	rl0,	r20,	rl0
+	and	rl0, numb_mask,	r28
+	stq	r28,	-8(rp)
+	srl	rl0,	CYSH,	r20
+	bne	r25,	L(lp0)
+
+	blt	n,	L(ret)
+
+L(ge4):	ldq	ul0,	0(up)
+	ldq	vl0,	0(vp)
+	ldq	ul1,	8(up)
+	ldq	vl1,	8(vp)
+	ldq	ul2,	16(up)
+	ldq	vl2,	16(vp)
+	ldq	ul3,	24(up)
+	ldq	vl3,	24(vp)
+	lda	up,	32(up)
+	lda	vp,	32(vp)
+	lda	n,	-4(n)
+	bge	n,	L(ge8)
+
+	OP	ul0,	vl0,	rl0	C		main-add 0
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	br	r31,	L(cj0)
+
+L(ge8):	OP	ul0,	vl0,	rl0	C		main-add 0
+	ldq	ul0,	0(up)
+	ldq	vl0,	0(vp)
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	ldq	ul1,	8(up)
+	ldq	vl1,	8(vp)
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	OP	ul2,	vl2,	rl2	C		main-add 2
+	srl	rl1,	CYSH,	r20	C		gen cy 1
+	ldq	ul2,	16(up)
+	ldq	vl2,	16(vp)
+	OP	rl2,	r20,	rl2	C		cy-add 2
+	and	rl1,numb_mask,	r28
+	stq	r27,	0(rp)
+	OP	ul3,	vl3,	rl3	C		main-add 3
+	srl	rl2,	CYSH,	r20	C		gen cy 2
+	ldq	ul3,	24(up)
+	ldq	vl3,	24(vp)
+	OP	rl3,	r20,	rl3	C		cy-add 3
+	and	rl2,numb_mask,	r27
+	stq	r28,	8(rp)
+	lda	rp,	32(rp)
+	lda	up,	32(up)
+	lda	vp,	32(vp)
+	lda	n,	-4(n)
+	blt	n,	L(end)
+
+	ALIGN(32)
+L(top):	OP	ul0,	vl0,	rl0	C		main-add 0
+	srl	rl3,	CYSH,	r20	C		gen cy 3
+	ldq	ul0,	0(up)
+	ldq	vl0,	0(vp)
+
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	and	rl3,numb_mask,	r28
+	stq	r27,	-16(rp)
+	bis	r31,	r31,	r31
+
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	ldq	ul1,	8(up)
+	ldq	vl1,	8(vp)
+
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	stq	r28,	-8(rp)
+	bis	r31,	r31,	r31
+
+	OP	ul2,	vl2,	rl2	C		main-add 2
+	srl	rl1,	CYSH,	r20	C		gen cy 1
+	ldq	ul2,	16(up)
+	ldq	vl2,	16(vp)
+
+	OP	rl2,	r20,	rl2	C		cy-add 2
+	and	rl1,numb_mask,	r28
+	stq	r27,	0(rp)
+	bis	r31,	r31,	r31
+
+	OP	ul3,	vl3,	rl3	C		main-add 3
+	srl	rl2,	CYSH,	r20	C		gen cy 2
+	ldq	ul3,	24(up)
+	ldq	vl3,	24(vp)
+
+	OP	rl3,	r20,	rl3	C		cy-add 3
+	and	rl2,numb_mask,	r27
+	stq	r28,	8(rp)
+	bis	r31,	r31,	r31
+
+	bis	r31,	r31,	r31
+	lda	n,	-4(n)
+	lda	up,	32(up)
+	lda	vp,	32(vp)
+
+	bis	r31,	r31,	r31
+	bis	r31,	r31,	r31
+	lda	rp,	32(rp)
+	bge	n,	L(top)
+
+L(end):	OP	ul0,	vl0,	rl0	C		main-add 0
+	srl	rl3,	CYSH,	r20	C		gen cy 3
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	and	rl3,numb_mask,	r28
+	stq	r27,	-16(rp)
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	stq	r28,	-8(rp)
+L(cj0):	OP	ul2,	vl2,	rl2	C		main-add 2
+	srl	rl1,	CYSH,	r20	C		gen cy 1
+	OP	rl2,	r20,	rl2	C		cy-add 2
+	and	rl1,numb_mask,	r28
+	stq	r27,	0(rp)
+	OP	ul3,	vl3,	rl3	C		main-add 3
+	srl	rl2,	CYSH,	r20	C		gen cy 2
+	OP	rl3,	r20,	rl3	C		cy-add 3
+	and	rl2,numb_mask,	r27
+	stq	r28,	8(rp)
+
+	srl	rl3,	CYSH,	r20	C		gen cy 3
+	and	rl3,numb_mask,	r28
+	stq	r27,	16(rp)
+	stq	r28,	24(rp)
+
+L(ret):	and	r20,	1,	r0
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h b/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h
new file mode 100644
index 0000000..7949fe8
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */
+
+#define MUL_TOOM22_THRESHOLD             40
+#define MUL_TOOM33_THRESHOLD            236
+
+#define SQR_BASECASE_THRESHOLD            7  /* karatsuba */
+#define SQR_TOOM2_THRESHOLD               0  /* never sqr_basecase */
+#define SQR_TOOM3_THRESHOLD             120
+
+#define DIV_SB_PREINV_THRESHOLD       MP_SIZE_T_MAX  /* no preinv with nails */
+#define DIV_DC_THRESHOLD                 48
+#define POWM_THRESHOLD                  113
+
+#define HGCD_THRESHOLD                   78
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                392
+#define JACOBI_BASE_METHOD                1
+
+#define DIVREM_1_NORM_THRESHOLD       MP_SIZE_T_MAX  /* no preinv with nails */
+#define DIVREM_1_UNNORM_THRESHOLD     MP_SIZE_T_MAX  /* no preinv with nails */
+#define MOD_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* no preinv with nails */
+#define MOD_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* no preinv with nails */
+#define USE_PREINV_DIVREM_1               0  /* no preinv with nails */
+#define USE_PREINV_MOD_1                  0  /* no preinv with nails */
+#define DIVREM_2_THRESHOLD            MP_SIZE_T_MAX  /* no preinv with nails */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             15
+#define GET_STR_PRECOMPUTE_THRESHOLD     24
+#define SET_STR_THRESHOLD              6336
+
+#define MUL_FFT_TABLE  { 688, 1440, 3648, 6400, 25600, 0 }
+#define MUL_FFT_MODF_THRESHOLD          488
+#define MUL_FFT_THRESHOLD              3712
+
+#define SQR_FFT_TABLE  { 432, 864, 3136, 6400, 25600, 0 }
+#define SQR_FFT_MODF_THRESHOLD          480
+#define SQR_FFT_THRESHOLD              2976
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm
new file mode 100644
index 0000000..da2ee3d
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm
@@ -0,0 +1,364 @@
+dnl  Alpha ev6 nails mpn_mul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     3.25
+
+C TODO
+C  * Reroll loop for 3.0 c/l with current 4-way unrolling.
+C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C    umulh.
+C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C    and would work since the loop structure is really regular.
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	sll	vl0, NAIL_BITS, vl0
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+
+	and	n,	3,	r25
+	cmpeq	r25,	1,	r21
+	bne	r21,	L(1m4)
+	cmpeq	r25,	2,	r21
+	bne	r21,	L(2m4)
+	beq	r25,	L(0m4)
+
+L(3m4):	ldq	ul3,	0(up)
+	lda	n,	-4(n)
+	ldq	ul0,	8(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	16(up)
+	lda	up,	24(up)
+	lda	rp,	-8(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge3)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(ta3)
+
+L(ge3):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul2,	m2b
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(el3)
+
+L(0m4):	lda	n,	-8(n)
+	ldq	ul2,	0(up)
+	ldq	ul3,	8(up)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge4)
+
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta4)
+
+L(ge4):	srl	m2a,NAIL_BITS,	t0
+	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(el0)
+
+L(2m4):	lda	n,	-4(n)
+	ldq	ul0,	0(up)
+	ldq	ul1,	8(up)
+	lda	up,	16(up)
+	lda	rp,	-16(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge2)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta2)
+
+L(ge2):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul3,	m3b
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	bge	n,	L(el2)
+
+	br	r31,	L(ta6)
+
+L(1m4):	lda	n,	-4(n)
+	ldq	ul1,	0(up)
+	lda	up,	8(up)
+	lda	rp,	-24(rp)
+	bge	n,	L(ge1)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	and	acc1,numb_mask,	r28
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	24(rp)
+	addq	t1,	m1b,	r0
+	ret	r31,	(r26),	1
+
+L(ge1):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul0,	m0b
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	blt	n,	L(ta5)
+
+L(ge5):	ldq	ul2,	0(up)
+	br	r31,	L(el1)
+
+	ALIGN(16)
+L(top):	mulq	vl0,	ul0,	m0a		C U1
+	addq	t0,	m0b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-24(rp)			C L1
+C
+L(el2):	umulh	vl0,	ul0,	m0b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	unop					C U0
+	unop					C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m2a,NAIL_BITS,	t0		C U0
+	ldq	ul2,	0(up)			C L1
+C
+	mulq	vl0,	ul1,	m1a		C U1
+	addq	t0,	m1b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	-16(rp)			C L1
+C
+L(el1):	umulh	vl0,	ul1,	m1b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	unop					C U0
+	lda	n,	-4(n)			C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m3a,NAIL_BITS,	t0		C U0
+	ldq	ul3,	8(up)			C L1
+C
+	mulq	vl0,	ul2,	m2a		C U1
+	addq	t0,	m2b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-8(rp)			C L1
+C
+L(el0):	umulh	vl0,	ul2,	m2b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	unop					C U0
+	unop					C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m0a,NAIL_BITS,	t0		C U0
+	ldq	ul0,	16(up)			C L1
+C
+	mulq	vl0,	ul3,	m3a		C U1
+	addq	t0,	m3b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	0(rp)			C L1
+C
+L(el3):	umulh	vl0,	ul3,	m3b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	unop					C U0
+	unop					C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m1a,NAIL_BITS,	t0		C U0
+	ldq	ul1,	24(up)			C L1
+C
+	lda	up,	32(up)			C L0
+	unop					C U1
+	lda	rp,	32(rp)			C L1
+	bge	n,	L(top)			C U0
+
+L(end):	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-24(rp)
+L(ta6):	umulh	vl0,	ul0,	m0b
+	and	acc0,numb_mask,	r28
+	addq	t1,	acc1,	acc1
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	-16(rp)
+L(ta5):	umulh	vl0,	ul1,	m1b
+	and	acc1,numb_mask,	r28
+	addq	t1,	acc0,	acc0
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-8(rp)
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask,	r28
+	addq	t1,	acc1,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask,	r28
+	addq	t1,	acc0,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	8(rp)
+	unop
+	ALIGN(16)
+L(ta2):	and	acc0,numb_mask,	r28
+	addq	t1,	acc1,	acc1
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	16(rp)
+	and	acc1,numb_mask,	r28
+	addq	t1,	m1b,	r0
+	stq	r28,	24(rp)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm
new file mode 100644
index 0000000..f473a59
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm
@@ -0,0 +1,396 @@
+dnl  Alpha ev6 nails mpn_submul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     4
+
+C TODO
+C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C    umulh.
+C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C    and would work since the loop structure is really regular.
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	sll	vl0, NAIL_BITS, vl0
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+
+	and	n,	3,	r25
+	cmpeq	r25,	1,	r21
+	bne	r21,	L(1m4)
+	cmpeq	r25,	2,	r21
+	bne	r21,	L(2m4)
+	beq	r25,	L(0m4)
+
+L(3m4):	ldq	ul3,	0(up)
+	lda	n,	-4(n)
+	ldq	ul0,	8(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	16(up)
+	lda	up,	24(up)
+	lda	rp,	-8(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge3)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	subq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	br	r31,	L(ta3)
+
+L(ge3):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul2,	m2b
+	subq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	m3b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	br	r31,	L(el3)
+
+L(0m4):	lda	n,	-8(n)
+	ldq	ul2,	0(up)
+	ldq	ul3,	8(up)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge4)
+
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	subq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta4)
+
+L(ge4):	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	subq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	m2b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	br	r31,	L(el0)
+
+L(2m4):	lda	n,	-4(n)
+	ldq	ul0,	0(up)
+	ldq	ul1,	8(up)
+	lda	up,	16(up)
+	lda	rp,	-16(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge2)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc0
+	subq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta2)
+
+L(ge2):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul3,	m3b
+	subq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	bge	n,	L(el2)
+
+	br	r31,	L(ta6)
+
+L(1m4):	lda	n,	-4(n)
+	ldq	ul1,	0(up)
+	lda	up,	8(up)
+	lda	rp,	-24(rp)
+	bge	n,	L(ge1)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	subq	rl1,	t0,	acc1
+	and	acc1,numb_mask,	r28
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	24(rp)
+	subq	m1b,	t1,	r0
+	ret	r31,	(r26),	1
+
+L(ge1):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul0,	m0b
+	subq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	blt	n,	L(ta5)
+
+L(ge5):	ldq	ul2,	0(up)
+	br	r31,	L(el1)
+
+	ALIGN(16)
+L(top):	mulq	vl0,	ul0,	m0a		C U1
+	addq	t0,	m0b,	acc1		C L0
+	sra	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-24(rp)			C L1
+C
+L(el2):	umulh	vl0,	ul0,	m0b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	subq	rl1,	acc1,	acc1		C U0
+	ldq	rl2,	0(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m2a,NAIL_BITS,	t0		C U0
+	ldq	ul2,	0(up)			C L1
+C
+	mulq	vl0,	ul1,	m1a		C U1
+	addq	t0,	m1b,	acc0		C L0
+	sra	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	-16(rp)			C L1
+C
+L(el1):	umulh	vl0,	ul1,	m1b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	subq	rl2,	acc0,	acc0		C U0
+	ldq	rl3,	8(rp)			C L1
+C
+	lda	n,	-4(n)			C L1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m3a,NAIL_BITS,	t0		C U0
+	ldq	ul3,	8(up)			C L1
+C
+	mulq	vl0,	ul2,	m2a		C U1
+	addq	t0,	m2b,	acc1		C L0
+	sra	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-8(rp)			C L1
+C
+L(el0):	umulh	vl0,	ul2,	m2b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	subq	rl3,	acc1,	acc1		C U0
+	ldq	rl0,	16(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m0a,NAIL_BITS,	t0		C U0
+	ldq	ul0,	16(up)			C L1
+C
+	mulq	vl0,	ul3,	m3a		C U1
+	addq	t0,	m3b,	acc0		C L0
+	sra	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	0(rp)			C L1
+C
+L(el3):	umulh	vl0,	ul3,	m3b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	subq	rl0,	acc0,	acc0		C U0
+	ldq	rl1,	24(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m1a,NAIL_BITS,	t0		C U0
+	ldq	ul1,	24(up)			C L1
+C
+	lda	up,	32(up)			C L0
+	unop					C U1
+	lda	rp,	32(rp)			C L1
+	bge	n,	L(top)			C U0
+
+L(end):	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	stq	r28,	-24(rp)
+L(ta6):	umulh	vl0,	ul0,	m0b
+	and	acc0,numb_mask,	r28
+	subq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	addq	t1,	acc1,	acc1
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	-16(rp)
+L(ta5):	umulh	vl0,	ul1,	m1b
+	and	acc1,numb_mask,	r28
+	subq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	addq	t1,	acc0,	acc0
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	stq	r28,	-8(rp)
+	unop
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask,	r28
+	subq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	addq	t1,	acc1,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask,	r28
+	subq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	addq	t1,	acc0,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	stq	r28,	8(rp)
+	unop
+	ALIGN(16)
+L(ta2):	and	acc0,numb_mask,	r28
+	subq	rl1,	acc1,	acc1
+	addq	t1,	acc1,	acc1
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	16(rp)
+	and	acc1,numb_mask,	r28
+	subq	m1b,	t1,	r0
+	stq	r28,	24(rp)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()