9 files changed, 1889 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/sparc32/v9/README b/gmp-6.3.0/mpn/sparc32/v9/README
new file mode 100644
index 0000000..9b39713
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/README
@@ -0,0 +1,4 @@
+Code for SPARC processors implementing version 9 of the SPARC architecture.
+This code is for systems that doesn't preserve the full 64-bit contents of
+integer register at context switch.  For other systems (such as Solaris 7 or
+later) use the code in ../../sparc64.
diff --git a/gmp-6.3.0/mpn/sparc32/v9/add_n.asm b/gmp-6.3.0/mpn/sparc32/v9/add_n.asm
new file mode 100644
index 0000000..7bd5974
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/add_n.asm
@@ -0,0 +1,129 @@
+dnl  SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl  sum in a third limb vector.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(rp,%o0)
+define(s1p,%o1)
+define(s2p,%o2)
+define(n,%o3)
+define(cy,%g1)
+
+C This code uses 64-bit operations on `o' and `g' registers.  It doesn't
+C require that `o' registers' upper 32 bits are preserved by the operating
+C system, but if they are not, they must be zeroed.  That is indeed what
+C happens at least on Slowaris 2.5 and 2.6.
+
+C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at
+C about 10 cycles/limb from the Ecache.
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+	lduw	[s1p+0],%o4
+	lduw	[s2p+0],%o5
+	addcc	n,-2,n
+	bl,pn	%icc,L(end1)
+	lduw	[s1p+4],%g2
+	lduw	[s2p+4],%g3
+	be,pn	%icc,L(end2)
+	mov	0,cy
+
+	.align	16
+L(loop):
+	add	%o4,%o5,%g4
+	add	rp,8,rp
+	lduw	[s1p+8],%o4
+	fitod	%f0,%f2
+C ---
+	add	cy,%g4,%g4
+	addcc	n,-1,n
+	lduw	[s2p+8],%o5
+	fitod	%f0,%f2
+C ---
+	srlx	%g4,32,cy
+	add	s2p,8,s2p
+	stw	%g4,[rp-8]
+	be,pn	%icc,L(exito)+4
+C ---
+	add	%g2,%g3,%g4
+	addcc	n,-1,n
+	lduw	[s1p+12],%g2
+	fitod	%f0,%f2
+C ---
+	add	cy,%g4,%g4
+	add	s1p,8,s1p
+	lduw	[s2p+4],%g3
+	fitod	%f0,%f2
+C ---
+	srlx	%g4,32,cy
+	bne,pt	%icc,L(loop)
+	stw	%g4,[rp-4]
+C ---
+L(exite):
+	add	%o4,%o5,%g4
+	add	cy,%g4,%g4
+	srlx	%g4,32,cy
+	stw	%g4,[rp+0]
+	add	%g2,%g3,%g4
+	add	cy,%g4,%g4
+	stw	%g4,[rp+4]
+	retl
+	srlx	%g4,32,%o0
+
+L(exito):
+	add	%g2,%g3,%g4
+	add	cy,%g4,%g4
+	srlx	%g4,32,cy
+	stw	%g4,[rp-4]
+	add	%o4,%o5,%g4
+	add	cy,%g4,%g4
+	stw	%g4,[rp+0]
+	retl
+	srlx	%g4,32,%o0
+
+L(end1):
+	add	%o4,%o5,%g4
+	stw	%g4,[rp+0]
+	retl
+	srlx	%g4,32,%o0
+
+L(end2):
+	add	%o4,%o5,%g4
+	srlx	%g4,32,cy
+	stw	%g4,[rp+0]
+	add	%g2,%g3,%g4
+	add	cy,%g4,%g4
+	stw	%g4,[rp+4]
+	retl
+	srlx	%g4,32,%o0
+EPILOGUE(mpn_add_n)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm
new file mode 100644
index 0000000..2adf7a8
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm
@@ -0,0 +1,306 @@
+dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl  the result to a second limb vector.
+
+dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Algorithm: We use two floating-point multiplies per limb product, with the
+C invariant v operand split into two 16-bit pieces, and the u operand split
+C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
+C the integer unit.
+
+C		   cycles/limb
+C UltraSPARC 1&2:     6.5
+C UltraSPARC 3:	      ?
+
+C Possible optimizations:
+C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
+C      memory bandwidth limited, this could save 1.5 cycles/limb.
+C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
+C      it is very straightforward to unroll, using an exit branch midways.
+C      Unrolling would allow deeper scheduling which could improve speed for L2
+C      cache case.
+C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
+C      aren't sufficiently apart-scheduled with just two temp areas.
+C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
+C      could save many operations.
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+define(`FSIZE',224)
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	add	%sp, -FSIZE, %sp
+	sethi	%hi(0xffff), %g1
+	srl	%o3, 16, %g2
+	or	%g1, %lo(0xffff), %g1
+	and	%o3, %g1, %g1
+	stx	%g1, [%sp+104]
+	stx	%g2, [%sp+112]
+	ldd	[%sp+104], %f6
+	ldd	[%sp+112], %f8
+	fxtod	%f6, %f6
+	fxtod	%f8, %f8
+	ld	[%sp+104], %f10		C zero f10
+
+	mov	0, %g3			C cy = 0
+
+define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
+
+	add	%sp, 160, %o5		C point in scratch area
+	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
+
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_two_or_more
+	fxtod	%f10, %f2
+
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L1
+	add	%o0, -16, %o0
+
+	.align	16
+.L_two_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_three_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	std	%f12, [%o5+8]
+	lduw	[%o0], %g5		C read rp[i]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	b	.L2
+	add	%o0, -12, %o0
+
+	.align	16
+.L_three_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_four_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L3
+	add	%o0, -8, %o0
+
+	.align	16
+.L_four_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_five_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L4
+	add	%o0, -4, %o0
+
+	.align	16
+.L_five_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+	b,a	.L5
+
+C BEGIN MAIN LOOP
+	.align 16
+C -- 0
+.Loop:	nop
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+C -- 1
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%o0, 4, %o0		C rp++
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+C -- 2
+	nop
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	fanop
+C -- 3
+	nop
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+C -- 4
+	nop
+	add	%g5, %g4, %g4		C p += rp[i]
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+C -- 5
+	xor	%o5, 16, %o5		C alternate scratch variables
+	add	%o1, 4, %o1		C up++
+	stw	%g4, [%o0-4]
+	fanop
+C -- 6
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+C END MAIN LOOP
+
+.L5:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g4, %g3, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	add	%g5, %g4, %g4		C p += rp[i]
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+0]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+4], %g5		C read rp[i]
+
+.L4:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	add	%g5, %g4, %g4		C p += rp[i]
+	std	%f12, [%o5+8]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+4]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+8], %g5		C read rp[i]
+
+.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	add	%g5, %g4, %g4		C p += rp[i]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+8]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+12], %g5		C read rp[i]
+
+.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	add	%g5, %g4, %g4		C p += rp[i]
+	stw	%g4, [%o0+12]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+16], %g5		C read rp[i]
+
+.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	add	%g3, %g4, %g4		C p += cy
+	add	%g5, %g4, %g4		C p += rp[i]
+	stw	%g4, [%o0+16]
+	srlx	%g4, 32, %g3		C new cy
+
+	mov	%g3, %o0
+	retl
+	sub	%sp, -FSIZE, %sp
+EPILOGUE(mpn_addmul_1)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h
new file mode 100644
index 0000000..f909e2c
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h
@@ -0,0 +1,204 @@
+/* SPARC v9 32-bit gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009-2011, 2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-16, gcc 3.4 */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            4
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         13
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         12
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     32
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                43
+#define MUL_TOOM44_THRESHOLD               126
+#define MUL_TOOM6H_THRESHOLD               161
+#define MUL_TOOM8H_THRESHOLD               208
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      80
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      55
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      72
+
+#define SQR_BASECASE_THRESHOLD               4
+#define SQR_TOOM2_THRESHOLD                 64
+#define SQR_TOOM3_THRESHOLD                 85
+#define SQR_TOOM4_THRESHOLD                152
+#define SQR_TOOM6_THRESHOLD                185
+#define SQR_TOOM8_THRESHOLD                324
+
+#define MULMID_TOOM42_THRESHOLD             64
+
+#define MULMOD_BNM1_THRESHOLD               12
+#define SQRMOD_BNM1_THRESHOLD               16
+
+#define MUL_FFT_MODF_THRESHOLD             288  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    288, 5}, {      9, 4}, {     19, 5}, {     11, 6}, \
+    {      6, 5}, {     14, 6}, {      8, 5}, {     17, 6}, \
+    {      9, 5}, {     20, 6}, {     13, 7}, {      7, 6}, \
+    {     16, 7}, {      9, 6}, {     19, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     15, 6}, \
+    {     31, 7}, {     19, 8}, {     11, 7}, {     23, 9}, \
+    {      7, 8}, {     15, 7}, {     31, 8}, {     19, 7}, \
+    {     39, 8}, {     27, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47,10}, {     31, 9}, {     71, 8}, \
+    {    143, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135, 8}, {    271, 9}, \
+    {    143, 8}, {    287,10}, {     79, 9}, {    175,10}, \
+    {     95, 9}, {    191, 8}, {    383,10}, {    111,11}, \
+    {     63,10}, {    143, 9}, {    287, 8}, {    575,10}, \
+    {    175,11}, {     95,10}, {    191, 9}, {    415, 8}, \
+    {    831,12}, {     63,11}, {    127,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    351, 9}, {    703,11}, \
+    {    191,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447, 9}, {    895, 8}, {   1791,12}, {    127,11}, \
+    {    287,10}, {    607, 9}, {   1215, 8}, {   2431,11}, \
+    {    319, 9}, {   1279,11}, {    351,12}, {    191,11}, \
+    {    415,10}, {    831,11}, {    447,10}, {    895, 9}, \
+    {   1791,11}, {    479,13}, {    127,12}, {    255,11}, \
+    {    575,10}, {   1151,11}, {    607,12}, {    319,11}, \
+    {    703,12}, {    383,11}, {    831,12}, {    447,11}, \
+    {    895,10}, {   1791,11}, {    959,13}, {    255,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    703,13}, \
+    {    383,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2175,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1407,11}, {   2943,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1151,12}, {   2431,13}, \
+    {   1407,14}, {    767,13}, {   1791,15}, {    511,14}, \
+    {   1023,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 143
+#define MUL_FFT_THRESHOLD                 2240
+
+#define SQR_FFT_MODF_THRESHOLD             244  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    244, 5}, {      8, 4}, {     17, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     17, 7}, {      9, 6}, \
+    {     20, 7}, {     11, 6}, {     23, 7}, {     13, 8}, \
+    {      7, 7}, {     19, 8}, {     11, 7}, {     25, 9}, \
+    {      7, 8}, {     15, 7}, {     33, 8}, {     19, 7}, \
+    {     39, 8}, {     23, 9}, {     15, 8}, {     39, 9}, \
+    {     23,10}, {     15, 9}, {     31, 8}, {     63, 9}, \
+    {     47,10}, {     31, 9}, {     63, 8}, {    127, 9}, \
+    {     71, 8}, {    143, 7}, {    287, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    143, 8}, {    287,10}, {     79, 9}, \
+    {    159, 8}, {    319, 9}, {    175, 8}, {    351, 7}, \
+    {    703,10}, {     95, 9}, {    191, 8}, {    383, 9}, \
+    {    207, 8}, {    415, 9}, {    223,11}, {     63,10}, \
+    {    127, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575,10}, {    159, 9}, {    319,10}, {    175, 9}, \
+    {    351, 8}, {    703,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207, 9}, {    415, 8}, {    831,10}, \
+    {    223,12}, {     63,11}, {    127,10}, {    271, 9}, \
+    {    543,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    351, 9}, {    703, 8}, \
+    {   1407,11}, {    191,10}, {    415, 9}, {    831,11}, \
+    {    223,10}, {    447, 9}, {    895,10}, {    479,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    319,10}, {    639,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    415,10}, {    831,11}, \
+    {    447,10}, {    895, 9}, {   1791,13}, {    127,12}, \
+    {    255,11}, {    575,12}, {    319,11}, {    703,10}, \
+    {   1407,12}, {    383,11}, {    831,12}, {    447,11}, \
+    {    959,10}, {   1919, 9}, {   3839,13}, {    255,12}, \
+    {    575,11}, {   1151,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1407,13}, \
+    {    767,12}, {   1599,13}, {    895,12}, {   1919,14}, \
+    {    511,13}, {   1151,12}, {   2431,13}, {   1407,12}, \
+    {   2815,14}, {    767,13}, {   1535,12}, {   3071,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 153
+#define SQR_FFT_THRESHOLD                 2112
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                 144
+#define MULLO_MUL_N_THRESHOLD             4292
+
+#define DC_DIV_QR_THRESHOLD                 74
+#define DC_DIVAPPR_Q_THRESHOLD             406
+#define DC_BDIV_QR_THRESHOLD                63
+#define DC_BDIV_Q_THRESHOLD                363
+
+#define INV_MULMOD_BNM1_THRESHOLD          108
+#define INV_NEWTON_THRESHOLD               351
+#define INV_APPR_THRESHOLD                 303
+
+#define BINV_NEWTON_THRESHOLD              354
+#define REDC_1_TO_REDC_N_THRESHOLD          61
+
+#define MU_DIV_QR_THRESHOLD                998
+#define MU_DIVAPPR_Q_THRESHOLD            1099
+#define MUPI_DIV_QR_THRESHOLD              118
+#define MU_BDIV_QR_THRESHOLD               807
+#define MU_BDIV_Q_THRESHOLD                979
+
+#define POWM_SEC_TABLE  3,22,127,624,779,2351
+
+#define MATRIX22_STRASSEN_THRESHOLD          7
+#define HGCD_THRESHOLD                      90
+#define HGCD_APPR_THRESHOLD                123
+#define HGCD_REDUCE_THRESHOLD             1494
+#define GCD_DC_THRESHOLD                   283
+#define GCDEXT_DC_THRESHOLD                192
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define SET_STR_DC_THRESHOLD               290
+#define SET_STR_PRECOMPUTE_THRESHOLD       634
+
+#define FAC_DSC_THRESHOLD                  156
+#define FAC_ODD_THRESHOLD                   25
diff --git a/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm
new file mode 100644
index 0000000..40aeffa
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm
@@ -0,0 +1,287 @@
+dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl  the result in a second limb vector.
+
+dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Algorithm: We use two floating-point multiplies per limb product, with the
+C invariant v operand split into two 16-bit pieces, and the u operand split
+C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
+C the integer unit.
+
+C		   cycles/limb
+C UltraSPARC 1&2:     6.5
+C UltraSPARC 3:	      ?
+
+C Possible optimizations:
+C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
+C      memory bandwidth limited, this could save 1.5 cycles/limb.
+C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
+C      it is very straightforward to unroll, using an exit branch midways.
+C      Unrolling would allow deeper scheduling which could improve speed for L2
+C      cache case.
+C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
+C      aren't sufficiently apart-scheduled with just two temp areas.
+C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
+C      could save many operations.
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+define(`FSIZE',224)
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	add	%sp, -FSIZE, %sp
+	sethi	%hi(0xffff), %g1
+	srl	%o3, 16, %g2
+	or	%g1, %lo(0xffff), %g1
+	and	%o3, %g1, %g1
+	stx	%g1, [%sp+104]
+	stx	%g2, [%sp+112]
+	ldd	[%sp+104], %f6
+	ldd	[%sp+112], %f8
+	fxtod	%f6, %f6
+	fxtod	%f8, %f8
+	ld	[%sp+104], %f10		C zero f10
+
+	mov	0, %g3			C cy = 0
+
+define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
+
+	add	%sp, 160, %o5		C point in scratch area
+	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
+
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_two_or_more
+	fxtod	%f10, %f2
+
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	b	.L1
+	add	%o0, -16, %o0
+
+	.align	16
+.L_two_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_three_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	std	%f12, [%o5+8]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	b	.L2
+	add	%o0, -12, %o0
+
+	.align	16
+.L_three_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_four_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	b	.L3
+	add	%o0, -8, %o0
+
+	.align	16
+.L_four_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_five_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	b	.L4
+	add	%o0, -4, %o0
+
+	.align	16
+.L_five_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+	b,a	.L5
+
+C BEGIN MAIN LOOP
+	.align 16
+C -- 0
+.Loop:	nop
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+C -- 1
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%o0, 4, %o0		C rp++
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+C -- 2
+	nop
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	fanop
+C -- 3
+	nop
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+C -- 4
+	srlx	%g4, 32, %g3		C new cy
+	add	%o1, 4, %o1		C up++
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+C -- 5
+	xor	%o5, 16, %o5		C alternate scratch variables
+	stw	%g4, [%o0-4]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+C END MAIN LOOP
+
+.L5:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g4, %g3, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+0]
+	srlx	%g4, 32, %g3		C new cy
+
+.L4:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	std	%f12, [%o5+8]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+4]
+	srlx	%g4, 32, %g3		C new cy
+
+.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+8]
+	srlx	%g4, 32, %g3		C new cy
+
+.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	stw	%g4, [%o0+12]
+	srlx	%g4, 32, %g3		C new cy
+
+.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	add	%g3, %g4, %g4		C p += cy
+	stw	%g4, [%o0+16]
+	srlx	%g4, 32, %g3		C new cy
+
+	mov	%g3, %o0
+	retl
+	sub	%sp, -FSIZE, %sp
+EPILOGUE(mpn_mul_1)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm b/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm
new file mode 100644
index 0000000..e024279
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm
@@ -0,0 +1,462 @@
+dnl  SPARC v9 32-bit mpn_sqr_diagonal.
+
+dnl  Copyright 2001, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+
+C This code uses a very deep software pipeline, due to the need for moving data
+C forth and back between the integer registers and floating-point registers.
+C
+C A VIS variant of this code would make the pipeline less deep, since the
+C masking now done in the integer unit could take place in the floating-point
+C unit using the FAND instruction.  It would be possible to save several cycles
+C too.
+C
+C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
+C not much slower from the Ecache.  It would perhaps be possible to shave off
+C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
+C used instructions, since we have 10 memory operations per limb.  But a VIS
+C variant could run three cycles faster than the corresponding non-VIS code.
+
+C This is non-pipelined code showing the algorithm:
+C
+C .Loop:
+C	lduw	[up+0],%g4		C 00000000hhhhllll
+C	sllx	%g4,16,%g3		C 0000hhhhllll0000
+C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+C	stx	%g2,[%fp+80]
+C	ldd	[%fp+80],%f0
+C	fitod	%f0,%f4			C hi16
+C	fitod	%f1,%f6			C lo16
+C	ld	[up+0],%f9
+C	fxtod	%f8,%f2
+C	fmuld	%f2,%f4,%f4
+C	fmuld	%f2,%f6,%f6
+C	fdtox	%f4,%f4
+C	fdtox	%f6,%f6
+C	std	%f4,[%fp-24]
+C	std	%f6,[%fp-16]
+C	ldx	[%fp-24],%g2
+C	ldx	[%fp-16],%g1
+C	sllx	%g2,16,%g2
+C	add	%g2,%g1,%g1
+C	stw	%g1,[rp+0]
+C	srlx	%g1,32,%l0
+C	stw	%l0,[rp+4]
+C	add	up,4,up
+C	subcc	n,1,n
+C	bne,pt	%icc,.Loop
+C	add	rp,8,rp
+
+define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe
+
+ASM_START()
+
+	TEXT
+	ALIGN(4)
+.Lnoll:
+	.word	0
+
+PROLOGUE(mpn_sqr_diagonal)
+	save	%sp,-256,%sp
+
+ifdef(`PIC',
+`.Lpc:	rd	%pc,%o7
+	ld	[%o7+.Lnoll-.Lpc],%f8',
+`	sethi	%hi(.Lnoll),%g1
+	ld	[%g1+%lo(.Lnoll)],%f8')
+
+	sethi	%hi(0xffff0000),%g5
+	add	%i1,-8,%i1
+
+	lduw	[%i1+8],%g4
+	add	%i1,4,%i1		C s1_ptr++
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	bne,pt	%icc,.L_grt_1
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	add	%i1,4,%i1		C s1_ptr++
+	stx	%g2,[%fp+80]
+	ld	[%i1],%f9
+	ldd	[%fp+80],%f0
+	fxtod	%f8,%f2
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	fmuld	%f2,%f6,%f6
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	std	%f6,[%fp-16]
+
+	add	%fp, 80, %l3
+	add	%fp, -24, %l4
+	add	%fp, 72, %l5
+	b	.L1
+	add	%fp, -40, %l6
+
+.L_grt_1:
+	stx	%g2,[%fp+80]
+	lduw	[%i1+8],%g4
+	add	%i1,4,%i1		C s1_ptr++
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	bne,pt	%icc,.L_grt_2
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	stx	%g2,[%fp+72]
+	ld	[%i1],%f9
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	fxtod	%f8,%f2
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	fmuld	%f2,%f6,%f6
+	fdtox	%f4,%f4
+
+	add	%fp, 72, %l3
+	add	%fp, -40, %l4
+	add	%fp, 80, %l5
+	b	.L2
+	add	%fp, -24, %l6
+
+.L_grt_2:
+	stx	%g2,[%fp+72]
+	lduw	[%i1+8],%g4
+	ld	[%i1],%f9
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	fxtod	%f8,%f2
+	bne,pt	%icc,.L_grt_3
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	add	%fp, 80, %l3
+	fmuld	%f2,%f6,%f6
+	add	%fp, -24, %l4
+	ldd	[%fp+80],%f0
+	add	%fp, 72, %l5
+	fdtox	%f4,%f4
+	b	.L3
+	add	%fp, -40, %l6
+
+.L_grt_3:
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	fdtox	%f6,%f6
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	bne,pt	%icc,.L_grt_4
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	add	%fp, 72, %l3
+	fmuld	%f2,%f4,%f4
+	add	%fp, -40, %l4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	add	%fp, 80, %l5
+	fdtox	%f4,%f4
+	b	.L4
+	add	%fp, -24, %l6
+
+.L_grt_4:
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	fdtox	%f4,%f4
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	fdtox	%f6,%f6
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-40]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-32]
+	be,pn	%icc,.L5
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	b,a	.Loop
+
+	.align	16
+C --- LOOP BEGIN
+.Loop:	nop
+	nop
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+C ---
+	nop
+	nop
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+C ---
+	nop
+	nop
+	ldx	[%fp-24],%g2		C p16
+	fanop
+C ---
+	nop
+	nop
+	ldx	[%fp-16],%g1		C p0
+	fmuld	%f2,%f4,%f4
+C ---
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+C ---
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+72],%f0
+	fanop
+C ---
+	srlx	%g1,32,%l0
+	nop
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+C ---
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	nop
+	stw	%l0,[%i0-4]
+	fdtox	%f6,%f6
+C ---
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+C ---
+	std	%f6,[%fp-16]
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+	be,pn	%icc,.Lend
+	fanop
+C ---  LOOP MIDDLE
+	nop
+	nop
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+C ---
+	nop
+	nop
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+C ---
+	nop
+	nop
+	ldx	[%fp-40],%g2		C p16
+	fanop
+C ---
+	nop
+	nop
+	ldx	[%fp-32],%g1		C p0
+	fmuld	%f2,%f4,%f4
+C ---
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+C ---
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	fanop
+C ---
+	srlx	%g1,32,%l0
+	nop
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+C ---
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	nop
+	stw	%l0,[%i0-4]
+	fdtox	%f6,%f6
+C ---
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-40]
+	fxtod	%f8,%f2
+C ---
+	std	%f6,[%fp-32]
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+	bne,pt	%icc,.Loop
+	fanop
+C --- LOOP END
+
+.L5:	add	%fp, 80, %l3
+	add	%fp, -24, %l4
+	add	%fp, 72, %l5
+	b	.Ltail
+	add	%fp, -40, %l6
+
+.Lend:	add	%fp, 72, %l3
+	add	%fp, -40, %l4
+	add	%fp, 80, %l5
+	add	%fp, -24, %l6
+.Ltail:	stx	%g2,[%l3]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l4],%g2		C p16
+	ldx	[%l4+8],%g1		C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%l5],%f0
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L4:	fdtox	%f6,%f6
+	std	%f4,[%l4]
+	fxtod	%f8,%f2
+	std	%f6,[%l4+8]
+
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l6],%g2		C p16
+	ldx	[%l6+8],%g1		C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	ldd	[%l3],%f0
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L3:	fdtox	%f6,%f6
+	std	%f4,[%l6]
+	fxtod	%f8,%f2
+	std	%f6,[%l6+8]
+
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l4],%g2		C p16
+	ldx	[%l4+8],%g1		C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L2:	fdtox	%f6,%f6
+	std	%f4,[%l4]
+	std	%f6,[%l4+8]
+
+	ldx	[%l6],%g2		C p16
+	ldx	[%l6+8],%g1		C p0
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	stw	%l0,[%i0-4]
+
+.L1:	ldx	[%l4],%g2		C p16
+	ldx	[%l4+8],%g1		C p0
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	stw	%l0,[%i0-4]
+
+	ret
+	restore	%g0,%g0,%o0
+
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm b/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm
new file mode 100644
index 0000000..636c73b
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm
@@ -0,0 +1,129 @@
+dnl  SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl  store difference in a third limb vector.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(rp,%o0)
+define(s1p,%o1)
+define(s2p,%o2)
+define(n,%o3)
+define(cy,%g1)
+
+C This code uses 64-bit operations on `o' and `g' registers.  It doesn't
+C require that `o' registers' upper 32 bits are preserved by the operating
+C system, but if they are not, they must be zeroed.  That is indeed what
+C happens at least on Slowaris 2.5 and 2.6.
+
+C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at
+C about 10 cycles/limb from the Ecache.
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	lduw	[s1p+0],%o4
+	lduw	[s2p+0],%o5
+	addcc	n,-2,n
+	bl,pn	%icc,L(end1)
+	lduw	[s1p+4],%g2
+	lduw	[s2p+4],%g3
+	be,pn	%icc,L(end2)
+	mov	0,cy
+
+	.align	16
+L(loop):
+	sub	%o4,%o5,%g4
+	add	rp,8,rp
+	lduw	[s1p+8],%o4
+	fitod	%f0,%f2
+C ---
+	sub	%g4,cy,%g4
+	addcc	n,-1,n
+	lduw	[s2p+8],%o5
+	fitod	%f0,%f2
+C ---
+	srlx	%g4,63,cy
+	add	s2p,8,s2p
+	stw	%g4,[rp-8]
+	be,pn	%icc,L(exito)+4
+C ---
+	sub	%g2,%g3,%g4
+	addcc	n,-1,n
+	lduw	[s1p+12],%g2
+	fitod	%f0,%f2
+C ---
+	sub	%g4,cy,%g4
+	add	s1p,8,s1p
+	lduw	[s2p+4],%g3
+	fitod	%f0,%f2
+C ---
+	srlx	%g4,63,cy
+	bne,pt	%icc,L(loop)
+	stw	%g4,[rp-4]
+C ---
+L(exite):
+	sub	%o4,%o5,%g4
+	sub	%g4,cy,%g4
+	srlx	%g4,63,cy
+	stw	%g4,[rp+0]
+	sub	%g2,%g3,%g4
+	sub	%g4,cy,%g4
+	stw	%g4,[rp+4]
+	retl
+	srlx	%g4,63,%o0
+
+L(exito):
+	sub	%g2,%g3,%g4
+	sub	%g4,cy,%g4
+	srlx	%g4,63,cy
+	stw	%g4,[rp-4]
+	sub	%o4,%o5,%g4
+	sub	%g4,cy,%g4
+	stw	%g4,[rp+0]
+	retl
+	srlx	%g4,63,%o0
+
+L(end1):
+	sub	%o4,%o5,%g4
+	stw	%g4,[rp+0]
+	retl
+	srlx	%g4,63,%o0
+
+L(end2):
+	sub	%o4,%o5,%g4
+	srlx	%g4,63,cy
+	stw	%g4,[rp+0]
+	sub	%g2,%g3,%g4
+	sub	%g4,cy,%g4
+	stw	%g4,[rp+4]
+	retl
+	srlx	%g4,63,%o0
+EPILOGUE(mpn_sub_n)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm
new file mode 100644
index 0000000..92d0ce7
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm
@@ -0,0 +1,316 @@
+dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl  subtract the result from a second limb vector.
+
+dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Algorithm: We use two floating-point multiplies per limb product, with the
+C invariant v operand split into two 16-bit pieces, and the u operand split
+C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
+C the integer unit.
+
+C		   cycles/limb
+C UltraSPARC 1&2:     6.5
+C UltraSPARC 3:	      ?
+
+C Possible optimizations:
+C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
+C      memory bandwidth limited, this could save 1.5 cycles/limb.
+C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
+C      it is very straightforward to unroll, using an exit branch midways.
+C      Unrolling would allow deeper scheduling which could improve speed for L2
+C      cache case.
+C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
+C      aren't sufficiently apart-scheduled with just two temp areas.
+C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
+C      could save many operations.
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+define(`FSIZE',224)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	add	%sp, -FSIZE, %sp
+	sethi	%hi(0xffff), %g1
+	srl	%o3, 16, %g2
+	or	%g1, %lo(0xffff), %g1
+	and	%o3, %g1, %g1
+	stx	%g1, [%sp+104]
+	stx	%g2, [%sp+112]
+	ldd	[%sp+104], %f6
+	ldd	[%sp+112], %f8
+	fxtod	%f6, %f6
+	fxtod	%f8, %f8
+	ld	[%sp+104], %f10		C zero f10
+
+	mov	0, %g3			C cy = 0
+
+define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
+
+	add	%sp, 160, %o5		C point in scratch area
+	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
+
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_two_or_more
+	fxtod	%f10, %f2
+
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L1
+	add	%o0, -16, %o0
+
+	.align	16
+.L_two_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_three_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	std	%f12, [%o5+8]
+	lduw	[%o0], %g5		C read rp[i]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	b	.L2
+	add	%o0, -12, %o0
+
+	.align	16
+.L_three_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_four_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L3
+	add	%o0, -8, %o0
+
+	.align	16
+.L_four_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_five_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L4
+	add	%o0, -4, %o0
+
+	.align	16
+.L_five_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+	b,a	.L5
+
+C BEGIN MAIN LOOP
+	.align 16
+C -- 0
+.Loop:	sub	%g0, %g3, %g3
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+C -- 1
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%o0, 4, %o0		C rp++
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+C -- 2
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	fanop
+C -- 3
+	nop
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+C -- 4
+	nop
+	sub	%g5, %g4, %g4		C p += rp[i]
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+C -- 5
+	xor	%o5, 16, %o5		C alternate scratch variables
+	add	%o1, 4, %o1		C up++
+	stw	%g4, [%o0-4]
+	fanop
+C -- 6
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+C END MAIN LOOP
+
+.L5:	sub	%g0, %g3, %g3
+	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g4, %g3, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	sub	%g5, %g4, %g4		C p += rp[i]
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+0]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+4], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L4:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	sub	%g5, %g4, %g4		C p += rp[i]
+	std	%f12, [%o5+8]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+4]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+8], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p += rp[i]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+8]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+12], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p += rp[i]
+	stw	%g4, [%o0+12]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+16], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p += rp[i]
+	stw	%g4, [%o0+16]
+	srlx	%g4, 32, %g3		C new cy
+
+	sub	%g0, %g3, %o0
+	retl
+	sub	%sp, -FSIZE, %sp
+EPILOGUE(mpn_submul_1)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/udiv.asm b/gmp-6.3.0/mpn/sparc32/v9/udiv.asm
new file mode 100644
index 0000000..61dde97
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/udiv.asm
@@ -0,0 +1,52 @@
+dnl  SPARC v9 32-bit mpn_udiv_qrnnd - division support for longlong.h.
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	o0
+C n1		o1
+C n0		o2
+C d		o3
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+	sllx	%o1, 32, %g1		C shift upper dividend limb
+	srl	%o2, 0, %g2		C zero extend lower dividend limb
+	srl	%o3, 0, %g3		C zero extend divisor
+	or	%g2, %g1, %g1		C assemble 64-bit dividend
+	udivx	%g1, %g3, %g1
+	mulx	%g1, %g3, %g4
+	sub	%g2, %g4, %g2
+	st	%g2, [%o0]		C store remainder
+	retl
+	mov	%g1, %o0		C return quotient
+EPILOGUE(mpn_udiv_qrnnd)