aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/sparc32/v9
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/sparc32/v9')
-rw-r--r--gmp-6.3.0/mpn/sparc32/v9/README4
-rw-r--r--gmp-6.3.0/mpn/sparc32/v9/add_n.asm129
-rw-r--r--gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm306
-rw-r--r--gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h204
-rw-r--r--gmp-6.3.0/mpn/sparc32/v9/mul_1.asm287
-rw-r--r--gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm462
-rw-r--r--gmp-6.3.0/mpn/sparc32/v9/sub_n.asm129
-rw-r--r--gmp-6.3.0/mpn/sparc32/v9/submul_1.asm316
-rw-r--r--gmp-6.3.0/mpn/sparc32/v9/udiv.asm52
9 files changed, 1889 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/sparc32/v9/README b/gmp-6.3.0/mpn/sparc32/v9/README
new file mode 100644
index 0000000..9b39713
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/README
@@ -0,0 +1,4 @@
+Code for SPARC processors implementing version 9 of the SPARC architecture.
+This code is for systems that don't preserve the full 64-bit contents of the
+integer registers at context switch. For other systems (such as Solaris 7 or
+later) use the code in ../../sparc64.
diff --git a/gmp-6.3.0/mpn/sparc32/v9/add_n.asm b/gmp-6.3.0/mpn/sparc32/v9/add_n.asm
new file mode 100644
index 0000000..7bd5974
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/add_n.asm
@@ -0,0 +1,129 @@
+dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl sum in a third limb vector.
+
+dnl Copyright 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(rp,%o0)
+define(s1p,%o1)
+define(s2p,%o2)
+define(n,%o3)
+define(cy,%g1)
+
+C This code uses 64-bit operations on `o' and `g' registers. It doesn't
+C require that `o' registers' upper 32 bits are preserved by the operating
+C system, but if they are not, they must be zeroed. That is indeed what
+C happens at least on Slowaris 2.5 and 2.6.
+
+C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at
+C about 10 cycles/limb from the Ecache.
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ lduw [s1p+0],%o4 C o4 = s1p[0], zero-extended to 64 bits
+ lduw [s2p+0],%o5 C o5 = s2p[0]
+ addcc n,-2,n C n -= 2; these flags drive both branches below
+ bl,pn %icc,L(end1) C negative: a single limb (n > 0 is required)
+ lduw [s1p+4],%g2 C (delay slot) g2 = s1p[1]; NOTE(review): also executed when n = 1
+ lduw [s2p+4],%g3 C g3 = s2p[1]
+ be,pn %icc,L(end2) C zero: exactly two limbs
+ mov 0,cy C (delay slot) start with no carry
+
+ .align 16
+L(loop):
+ add %o4,%o5,%g4 C 64-bit sum of the first pending limb pair
+ add rp,8,rp C rp moves two limbs per iteration; stores below use rp-8 and rp-4
+ lduw [s1p+8],%o4 C preload the s1 limb two positions ahead
+ fitod %f0,%f2 C result in f2 is never read here; presumably a filler for instruction grouping
+C ---
+ add cy,%g4,%g4 C add the incoming carry
+ addcc n,-1,n C n--; flags are consumed by the be below
+ lduw [s2p+8],%o5 C preload the matching s2 limb
+ fitod %f0,%f2
+C ---
+ srlx %g4,32,cy C carry out = bits 32 and up of the 64-bit sum
+ add s2p,8,s2p
+ stw %g4,[rp-8] C store the low 32 bits of the first limb
+ be,pn %icc,L(exito)+4 C n reached 0: finish in exito, skipping its first insn (done in the delay slot)
+C ---
+ add %g2,%g3,%g4 C (delay slot of be) sum of the second pending limb pair
+ addcc n,-1,n C n--; flags are consumed by the bne below
+ lduw [s1p+12],%g2 C s1p is not advanced yet, so +12 is three limbs past the iteration start
+ fitod %f0,%f2
+C ---
+ add cy,%g4,%g4
+ add s1p,8,s1p
+ lduw [s2p+4],%g3 C s2p was already advanced by 8, so +4 matches the s1 load above
+ fitod %f0,%f2
+C ---
+ srlx %g4,32,cy
+ bne,pt %icc,L(loop) C keep going while n != 0
+ stw %g4,[rp-4] C (delay slot) store the second limb
+C ---
+L(exite):
+ add %o4,%o5,%g4 C fall-through exit: both pending limb pairs remain
+ add cy,%g4,%g4
+ srlx %g4,32,cy
+ stw %g4,[rp+0]
+ add %g2,%g3,%g4
+ add cy,%g4,%g4
+ stw %g4,[rp+4]
+ retl
+ srlx %g4,32,%o0 C (delay slot) return the final carry
+
+L(exito):
+ add %g2,%g3,%g4 C skipped by the loop, which enters at exito+4 with this already done
+ add cy,%g4,%g4
+ srlx %g4,32,cy
+ stw %g4,[rp-4] C second limb of the interrupted iteration
+ add %o4,%o5,%g4 C then the last preloaded limb pair
+ add cy,%g4,%g4
+ stw %g4,[rp+0]
+ retl
+ srlx %g4,32,%o0 C (delay slot) return the final carry
+
+L(end1):
+ add %o4,%o5,%g4 C n = 1: one sum, no carry-in
+ stw %g4,[rp+0]
+ retl
+ srlx %g4,32,%o0 C (delay slot) return the carry
+
+L(end2):
+ add %o4,%o5,%g4 C n = 2: limb 0 has no carry-in
+ srlx %g4,32,cy
+ stw %g4,[rp+0]
+ add %g2,%g3,%g4 C limb 1 plus the carry from limb 0
+ add cy,%g4,%g4
+ stw %g4,[rp+4]
+ retl
+ srlx %g4,32,%o0 C (delay slot) return the final carry
+EPILOGUE(mpn_add_n)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm
new file mode 100644
index 0000000..2adf7a8
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm
@@ -0,0 +1,306 @@
+dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Algorithm: We use two floating-point multiplies per limb product, with the
+C invariant v operand split into two 16-bit pieces, and the u operand split
+C into 32-bit pieces. We convert the two 48-bit products and transfer them to
+C the integer unit.
+
+C cycles/limb
+C UltraSPARC 1&2: 6.5
+C UltraSPARC 3: ?
+
+C Possible optimizations:
+C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
+C memory bandwidth limited, this could save 1.5 cycles/limb.
+C 2. Unroll the inner loop. Since we already use alternate temporary areas,
+C it is very straightforward to unroll, using an exit branch midways.
+C Unrolling would allow deeper scheduling which could improve speed for L2
+C cache case.
+C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es
+C aren't sufficiently apart-scheduled with just two temp areas.
+C 4. Specialize for particular v values. If its upper 16 bits are zero, we
+C could save many operations.
+
+C INPUT PARAMETERS
+C rp o0
+C up o1
+C n o2
+C v o3
+
+define(`FSIZE',224) C bytes of stack reserved by the routine below
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ add %sp, -FSIZE, %sp C reserve FSIZE bytes of stack; no save, so arguments stay in o0-o3
+ sethi %hi(0xffff), %g1 C with the or below: g1 = 0xffff
+ srl %o3, 16, %g2 C g2 = v >> 16 (upper 16 bits of v)
+ or %g1, %lo(0xffff), %g1
+ and %o3, %g1, %g1 C g1 = v & 0xffff (lower 16 bits of v)
+ stx %g1, [%sp+104] C spill both halves as 64-bit integers...
+ stx %g2, [%sp+112]
+ ldd [%sp+104], %f6 C ...and reload them into FP registers
+ ldd [%sp+112], %f8
+ fxtod %f6, %f6 C f6 = (double) (v & 0xffff)
+ fxtod %f8, %f8 C f8 = (double) (v >> 16)
+ ld [%sp+104], %f10 C zero f10
+
+ mov 0, %g3 C cy = 0
+
+define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe
+
+ add %sp, 160, %o5 C point in scratch area
+ and %o5, -32, %o5 C align at 0 (mod 32) in scratch area
+
+ subcc %o2, 1, %o2 C n--; zero means this was the only limb
+ ld [%o1], %f11 C read up[i]
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_two_or_more
+ fxtod %f10, %f2 C (delay slot) f2 = (double) up[i], taken from the f10:f11 pair
+
+ fmuld %f2, %f8, %f16 C n = 1: f16 = up[0] * (v >> 16)
+ fmuld %f2, %f6, %f4 C f4 = up[0] * (v & 0xffff)
+ fdtox %f16, %f14 C convert both products back to 64-bit integers
+ fdtox %f4, %f12
+ std %f14, [%o5+16] C pass them to the integer unit through scratch memory
+ std %f12, [%o5+24]
+ ldx [%o5+16], %g2 C p16
+ ldx [%o5+24], %g1 C p0
+ lduw [%o0], %g5 C read rp[i]
+ b .L1
+ add %o0, -16, %o0 C (delay slot) bias rp: .L1 stores to [%o0+16]
+
+ .align 16
+.L_two_or_more:
+ subcc %o2, 1, %o2 C n--; zero means exactly two limbs
+ ld [%o1], %f11 C read up[i]
+ fmuld %f2, %f8, %f16 C products for the previously converted limb
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_three_or_more
+ fxtod %f10, %f2 C (delay slot) convert the limb just loaded
+
+ fdtox %f16, %f14 C n = 2: limb 0 products go to scratch [16..31]
+ fdtox %f4, %f12
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16 C limb 1 products
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+0] C limb 1 products go to scratch [0..15]
+ std %f12, [%o5+8]
+ lduw [%o0], %g5 C read rp[i]
+ ldx [%o5+16], %g2 C p16
+ ldx [%o5+24], %g1 C p0
+ b .L2
+ add %o0, -12, %o0 C (delay slot) bias rp: .L2 stores to [%o0+12]
+
+ .align 16
+.L_three_or_more:
+ subcc %o2, 1, %o2 C n--; zero means exactly three limbs
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_four_or_more
+ fxtod %f10, %f2 C (delay slot) convert the limb just loaded
+
+ fdtox %f16, %f14 C n = 3: drain the remaining products
+ fdtox %f4, %f12
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+ fdtox %f16, %f14
+ ldx [%o5+16], %g2 C p16
+ fdtox %f4, %f12
+ ldx [%o5+24], %g1 C p0
+ std %f14, [%o5+16] C reuse scratch [16..31] after it has been read back
+ std %f12, [%o5+24]
+ lduw [%o0], %g5 C read rp[i]
+ b .L3
+ add %o0, -8, %o0 C (delay slot) bias rp: .L3 stores to [%o0+8]
+
+ .align 16
+.L_four_or_more:
+ subcc %o2, 1, %o2 C n--; zero means exactly four limbs
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_five_or_more
+ fxtod %f10, %f2 C (delay slot) convert the limb just loaded
+
+ fdtox %f16, %f14 C n = 4: join the wind-down at .L4
+ ldx [%o5+16], %g2 C p16
+ fdtox %f4, %f12
+ ldx [%o5+24], %g1 C p0
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ lduw [%o0], %g5 C read rp[i]
+ b .L4
+ add %o0, -4, %o0 C (delay slot) bias rp: .L4 stores to [%o0+4]
+
+ .align 16
+.L_five_or_more:
+ subcc %o2, 1, %o2 C n--; zero means exactly five limbs
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+ ldx [%o5+16], %g2 C p16
+ fdtox %f4, %f12
+ ldx [%o5+24], %g1 C p0
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ lduw [%o0], %g5 C read rp[i]
+ bne,pt %icc, .Loop C more than five limbs: run the steady-state loop
+ fxtod %f10, %f2 C (delay slot) convert the limb just loaded
+ b,a .L5 C n = 5: straight to the wind-down (annulled, no delay slot insn)
+
+C BEGIN MAIN LOOP
+ .align 16
+C -- 0
+.Loop: nop
+ subcc %o2, 1, %o2 C n--; flags are consumed by the bne at the loop bottom
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+C -- 1
+ sllx %g2, 16, %g4 C (p16 << 16)
+ add %o0, 4, %o0 C rp++
+ ldx [%o5+0], %g2 C p16
+ fdtox %f4, %f12
+C -- 2
+ nop
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ fanop
+C -- 3
+ nop
+ add %g3, %g4, %g4 C p += cy
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16
+C -- 4
+ nop
+ add %g5, %g4, %g4 C p += rp[i]
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+C -- 5
+ xor %o5, 16, %o5 C alternate scratch variables
+ add %o1, 4, %o1 C up++
+ stw %g4, [%o0-4] C store low 32 bits of p; rp was already advanced
+ fanop
+C -- 6
+ srlx %g4, 32, %g3 C new cy
+ lduw [%o0], %g5 C read rp[i]
+ bne,pt %icc, .Loop
+ fxtod %f10, %f2 C (delay slot) convert the limb just loaded
+C END MAIN LOOP
+
+.L5: fdtox %f16, %f14 C wind-down: no further limbs are loaded from up
+ sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ fdtox %f4, %f12
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g4, %g3, %g4 C p += cy
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16 C products for the last limb loaded
+ add %g5, %g4, %g4 C p += rp[i]
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+ xor %o5, 16, %o5 C alternate scratch variables
+ stw %g4, [%o0+0]
+ srlx %g4, 32, %g3 C new cy
+ lduw [%o0+4], %g5 C read rp[i]
+
+.L4: fdtox %f16, %f14 C last conversions; no multiplies are issued from here on
+ sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ fdtox %f4, %f12
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g3, %g4, %g4 C p += cy
+ std %f14, [%o5+0]
+ add %g5, %g4, %g4 C p += rp[i]
+ std %f12, [%o5+8]
+ xor %o5, 16, %o5 C alternate scratch variables
+ stw %g4, [%o0+4]
+ srlx %g4, 32, %g3 C new cy
+ lduw [%o0+8], %g5 C read rp[i]
+
+.L3: sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g3, %g4, %g4 C p += cy
+ add %g5, %g4, %g4 C p += rp[i]
+ xor %o5, 16, %o5 C alternate scratch variables
+ stw %g4, [%o0+8]
+ srlx %g4, 32, %g3 C new cy
+ lduw [%o0+12], %g5 C read rp[i]
+
+.L2: sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g3, %g4, %g4 C p += cy
+ add %g5, %g4, %g4 C p += rp[i]
+ stw %g4, [%o0+12]
+ srlx %g4, 32, %g3 C new cy
+ lduw [%o0+16], %g5 C read rp[i]
+
+.L1: sllx %g2, 16, %g4 C (p16 << 16)
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ add %g3, %g4, %g4 C p += cy
+ add %g5, %g4, %g4 C p += rp[i]
+ stw %g4, [%o0+16] C last result limb
+ srlx %g4, 32, %g3 C new cy
+
+ mov %g3, %o0 C return the final carry limb
+ retl
+ sub %sp, -FSIZE, %sp C (delay slot) release the stack reserved on entry
+EPILOGUE(mpn_addmul_1)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h
new file mode 100644
index 0000000..f909e2c
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h
@@ -0,0 +1,204 @@
+/* SPARC v9 32-bit gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009-2011, 2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-16, gcc 3.4 */
+
+#define DIVREM_1_NORM_THRESHOLD 3
+#define DIVREM_1_UNNORM_THRESHOLD 4
+#define MOD_1_1P_METHOD 2
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 13
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 12
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 32
+#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD 4
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */
+
+#define MUL_TOOM22_THRESHOLD 28
+#define MUL_TOOM33_THRESHOLD 43
+#define MUL_TOOM44_THRESHOLD 126
+#define MUL_TOOM6H_THRESHOLD 161
+#define MUL_TOOM8H_THRESHOLD 208
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 80
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 55
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 72
+
+#define SQR_BASECASE_THRESHOLD 4
+#define SQR_TOOM2_THRESHOLD 64
+#define SQR_TOOM3_THRESHOLD 85
+#define SQR_TOOM4_THRESHOLD 152
+#define SQR_TOOM6_THRESHOLD 185
+#define SQR_TOOM8_THRESHOLD 324
+
+#define MULMID_TOOM42_THRESHOLD 64
+
+#define MULMOD_BNM1_THRESHOLD 12
+#define SQRMOD_BNM1_THRESHOLD 16
+
+#define MUL_FFT_MODF_THRESHOLD 288 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 288, 5}, { 9, 4}, { 19, 5}, { 11, 6}, \
+ { 6, 5}, { 14, 6}, { 8, 5}, { 17, 6}, \
+ { 9, 5}, { 20, 6}, { 13, 7}, { 7, 6}, \
+ { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \
+ { 23, 7}, { 13, 8}, { 7, 7}, { 15, 6}, \
+ { 31, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \
+ { 7, 8}, { 15, 7}, { 31, 8}, { 19, 7}, \
+ { 39, 8}, { 27, 9}, { 15, 8}, { 31, 7}, \
+ { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \
+ { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47,10}, { 31, 9}, { 71, 8}, \
+ { 143, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135, 8}, { 271, 9}, \
+ { 143, 8}, { 287,10}, { 79, 9}, { 175,10}, \
+ { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \
+ { 63,10}, { 143, 9}, { 287, 8}, { 575,10}, \
+ { 175,11}, { 95,10}, { 191, 9}, { 415, 8}, \
+ { 831,12}, { 63,11}, { 127,10}, { 287, 9}, \
+ { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \
+ { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \
+ { 447, 9}, { 895, 8}, { 1791,12}, { 127,11}, \
+ { 287,10}, { 607, 9}, { 1215, 8}, { 2431,11}, \
+ { 319, 9}, { 1279,11}, { 351,12}, { 191,11}, \
+ { 415,10}, { 831,11}, { 447,10}, { 895, 9}, \
+ { 1791,11}, { 479,13}, { 127,12}, { 255,11}, \
+ { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \
+ { 703,12}, { 383,11}, { 831,12}, { 447,11}, \
+ { 895,10}, { 1791,11}, { 959,13}, { 255,12}, \
+ { 575,11}, { 1215,10}, { 2431,12}, { 703,13}, \
+ { 383,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1087,11}, { 2175,12}, { 1215,11}, { 2431,13}, \
+ { 639,12}, { 1407,11}, { 2943,13}, { 895,12}, \
+ { 1919,14}, { 511,13}, { 1151,12}, { 2431,13}, \
+ { 1407,14}, { 767,13}, { 1791,15}, { 511,14}, \
+ { 1023,13}, { 2431,14}, { 1279,13}, { 2943,12}, \
+ { 5887,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 143
+#define MUL_FFT_THRESHOLD 2240
+
+#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 244, 5}, { 8, 4}, { 17, 5}, { 17, 6}, \
+ { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \
+ { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \
+ { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \
+ { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \
+ { 39, 8}, { 23, 9}, { 15, 8}, { 39, 9}, \
+ { 23,10}, { 15, 9}, { 31, 8}, { 63, 9}, \
+ { 47,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
+ { 71, 8}, { 143, 7}, { 287, 9}, { 79,10}, \
+ { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \
+ { 255, 9}, { 143, 8}, { 287,10}, { 79, 9}, \
+ { 159, 8}, { 319, 9}, { 175, 8}, { 351, 7}, \
+ { 703,10}, { 95, 9}, { 191, 8}, { 383, 9}, \
+ { 207, 8}, { 415, 9}, { 223,11}, { 63,10}, \
+ { 127, 9}, { 271,10}, { 143, 9}, { 287, 8}, \
+ { 575,10}, { 159, 9}, { 319,10}, { 175, 9}, \
+ { 351, 8}, { 703,11}, { 95,10}, { 191, 9}, \
+ { 383,10}, { 207, 9}, { 415, 8}, { 831,10}, \
+ { 223,12}, { 63,11}, { 127,10}, { 271, 9}, \
+ { 543,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 351, 9}, { 703, 8}, \
+ { 1407,11}, { 191,10}, { 415, 9}, { 831,11}, \
+ { 223,10}, { 447, 9}, { 895,10}, { 479,12}, \
+ { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
+ { 575,11}, { 319,10}, { 639,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 415,10}, { 831,11}, \
+ { 447,10}, { 895, 9}, { 1791,13}, { 127,12}, \
+ { 255,11}, { 575,12}, { 319,11}, { 703,10}, \
+ { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \
+ { 959,10}, { 1919, 9}, { 3839,13}, { 255,12}, \
+ { 575,11}, { 1151,12}, { 703,11}, { 1407,13}, \
+ { 383,12}, { 959,14}, { 255,13}, { 511,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1407,13}, \
+ { 767,12}, { 1599,13}, { 895,12}, { 1919,14}, \
+ { 511,13}, { 1151,12}, { 2431,13}, { 1407,12}, \
+ { 2815,14}, { 767,13}, { 1535,12}, { 3071,13}, \
+ { 1919,15}, { 511,14}, { 1023,13}, { 2431,14}, \
+ { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \
+ { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 153
+#define SQR_FFT_THRESHOLD 2112
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 144
+#define MULLO_MUL_N_THRESHOLD 4292
+
+#define DC_DIV_QR_THRESHOLD 74
+#define DC_DIVAPPR_Q_THRESHOLD 406
+#define DC_BDIV_QR_THRESHOLD 63
+#define DC_BDIV_Q_THRESHOLD 363
+
+#define INV_MULMOD_BNM1_THRESHOLD 108
+#define INV_NEWTON_THRESHOLD 351
+#define INV_APPR_THRESHOLD 303
+
+#define BINV_NEWTON_THRESHOLD 354
+#define REDC_1_TO_REDC_N_THRESHOLD 61
+
+#define MU_DIV_QR_THRESHOLD 998
+#define MU_DIVAPPR_Q_THRESHOLD 1099
+#define MUPI_DIV_QR_THRESHOLD 118
+#define MU_BDIV_QR_THRESHOLD 807
+#define MU_BDIV_Q_THRESHOLD 979
+
+#define POWM_SEC_TABLE 3,22,127,624,779,2351
+
+#define MATRIX22_STRASSEN_THRESHOLD 7
+#define HGCD_THRESHOLD 90
+#define HGCD_APPR_THRESHOLD 123
+#define HGCD_REDUCE_THRESHOLD 1494
+#define GCD_DC_THRESHOLD 283
+#define GCDEXT_DC_THRESHOLD 192
+#define JACOBI_BASE_METHOD 4
+
+#define GET_STR_DC_THRESHOLD 12
+#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define SET_STR_DC_THRESHOLD 290
+#define SET_STR_PRECOMPUTE_THRESHOLD 634
+
+#define FAC_DSC_THRESHOLD 156
+#define FAC_ODD_THRESHOLD 25
diff --git a/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm
new file mode 100644
index 0000000..40aeffa
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm
@@ -0,0 +1,287 @@
+dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Algorithm: We use two floating-point multiplies per limb product, with the
+C invariant v operand split into two 16-bit pieces, and the u operand split
+C into 32-bit pieces. We convert the two 48-bit products and transfer them to
+C the integer unit.
+
+C cycles/limb
+C UltraSPARC 1&2: 6.5
+C UltraSPARC 3: ?
+
+C Possible optimizations:
+C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
+C memory bandwidth limited, this could save 1.5 cycles/limb.
+C 2. Unroll the inner loop. Since we already use alternate temporary areas,
+C it is very straightforward to unroll, using an exit branch midways.
+C Unrolling would allow deeper scheduling which could improve speed for L2
+C cache case.
+C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es
+C aren't sufficiently apart-scheduled with just two temp areas.
+C 4. Specialize for particular v values. If its upper 16 bits are zero, we
+C could save many operations.
+
+C INPUT PARAMETERS
+C rp o0
+C up o1
+C n o2
+C v o3
+
+define(`FSIZE',224) C bytes of stack reserved by the routine below
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ add %sp, -FSIZE, %sp C reserve FSIZE bytes of stack; no save, so arguments stay in o0-o3
+ sethi %hi(0xffff), %g1 C with the or below: g1 = 0xffff
+ srl %o3, 16, %g2 C g2 = v >> 16 (upper 16 bits of v)
+ or %g1, %lo(0xffff), %g1
+ and %o3, %g1, %g1 C g1 = v & 0xffff (lower 16 bits of v)
+ stx %g1, [%sp+104] C spill both halves as 64-bit integers...
+ stx %g2, [%sp+112]
+ ldd [%sp+104], %f6 C ...and reload them into FP registers
+ ldd [%sp+112], %f8
+ fxtod %f6, %f6 C f6 = (double) (v & 0xffff)
+ fxtod %f8, %f8 C f8 = (double) (v >> 16)
+ ld [%sp+104], %f10 C zero f10
+
+ mov 0, %g3 C cy = 0
+
+define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe
+
+ add %sp, 160, %o5 C point in scratch area
+ and %o5, -32, %o5 C align at 0 (mod 32) in scratch area
+
+ subcc %o2, 1, %o2 C n--; zero means this was the only limb
+ ld [%o1], %f11 C read up[i]
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_two_or_more
+ fxtod %f10, %f2 C (delay slot) f2 = (double) up[i], taken from the f10:f11 pair
+
+ fmuld %f2, %f8, %f16 C n = 1: f16 = up[0] * (v >> 16)
+ fmuld %f2, %f6, %f4 C f4 = up[0] * (v & 0xffff)
+ fdtox %f16, %f14 C convert both products back to 64-bit integers
+ fdtox %f4, %f12
+ std %f14, [%o5+16] C pass them to the integer unit through scratch memory
+ std %f12, [%o5+24]
+ ldx [%o5+16], %g2 C p16
+ ldx [%o5+24], %g1 C p0
+ b .L1
+ add %o0, -16, %o0 C (delay slot) bias rp: .L1 stores to [%o0+16]
+
+ .align 16
+.L_two_or_more:
+ subcc %o2, 1, %o2 C n--; zero means exactly two limbs
+ ld [%o1], %f11 C read up[i]
+ fmuld %f2, %f8, %f16 C products for the previously converted limb
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_three_or_more
+ fxtod %f10, %f2 C (delay slot) convert the limb just loaded
+
+ fdtox %f16, %f14 C n = 2: limb 0 products go to scratch [16..31]
+ fdtox %f4, %f12
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16 C limb 1 products
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+0] C limb 1 products go to scratch [0..15]
+ std %f12, [%o5+8]
+ ldx [%o5+16], %g2 C p16
+ ldx [%o5+24], %g1 C p0
+ b .L2
+ add %o0, -12, %o0 C (delay slot) bias rp: .L2 stores to [%o0+12]
+
+ .align 16
+.L_three_or_more:
+ subcc %o2, 1, %o2 C n--; zero means exactly three limbs
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_four_or_more
+ fxtod %f10, %f2 C (delay slot) convert the limb just loaded
+
+ fdtox %f16, %f14 C n = 3: drain the remaining products
+ fdtox %f4, %f12
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+ fdtox %f16, %f14
+ ldx [%o5+16], %g2 C p16
+ fdtox %f4, %f12
+ ldx [%o5+24], %g1 C p0
+ std %f14, [%o5+16] C reuse scratch [16..31] after it has been read back
+ std %f12, [%o5+24]
+ b .L3
+ add %o0, -8, %o0 C (delay slot) bias rp: .L3 stores to [%o0+8]
+
+ .align 16
+.L_four_or_more:
+ subcc %o2, 1, %o2 C n--; zero means exactly four limbs
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_five_or_more
+ fxtod %f10, %f2 C (delay slot) convert the limb just loaded
+
+ fdtox %f16, %f14 C n = 4: join the wind-down at .L4
+ ldx [%o5+16], %g2 C p16
+ fdtox %f4, %f12
+ ldx [%o5+24], %g1 C p0
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ b .L4
+ add %o0, -4, %o0 C (delay slot) bias rp: .L4 stores to [%o0+4]
+
+ .align 16
+.L_five_or_more:
+ subcc %o2, 1, %o2 C n--; zero means exactly five limbs
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+ ldx [%o5+16], %g2 C p16
+ fdtox %f4, %f12
+ ldx [%o5+24], %g1 C p0
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .Loop C more than five limbs: run the steady-state loop
+ fxtod %f10, %f2 C (delay slot) convert the limb just loaded
+ b,a .L5 C n = 5: straight to the wind-down (annulled, no delay slot insn)
+
+C BEGIN MAIN LOOP
+ .align 16
+C -- 0
+.Loop: nop
+ subcc %o2, 1, %o2 C n--; flags are consumed by the bne at the loop bottom
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+C -- 1
+ sllx %g2, 16, %g4 C (p16 << 16)
+ add %o0, 4, %o0 C rp++
+ ldx [%o5+0], %g2 C p16
+ fdtox %f4, %f12
+C -- 2
+ nop
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ fanop
+C -- 3
+ nop
+ add %g3, %g4, %g4 C p += cy
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16
+C -- 4
+ srlx %g4, 32, %g3 C new cy
+ add %o1, 4, %o1 C up++
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+C -- 5
+ xor %o5, 16, %o5 C alternate scratch variables
+ stw %g4, [%o0-4] C store low 32 bits of p; rp was already advanced
+ bne,pt %icc, .Loop
+ fxtod %f10, %f2 C (delay slot) convert the limb just loaded
+C END MAIN LOOP
+
+.L5: fdtox %f16, %f14 C wind-down: no further limbs are loaded from up
+ sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ fdtox %f4, %f12
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g4, %g3, %g4 C p += cy
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16 C products for the last limb loaded
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+ xor %o5, 16, %o5 C alternate scratch variables
+ stw %g4, [%o0+0]
+ srlx %g4, 32, %g3 C new cy
+
+.L4: fdtox %f16, %f14 C last conversions; no multiplies are issued from here on
+ sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ fdtox %f4, %f12
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g3, %g4, %g4 C p += cy
+ std %f14, [%o5+0]
+ std %f12, [%o5+8]
+ xor %o5, 16, %o5 C alternate scratch variables
+ stw %g4, [%o0+4]
+ srlx %g4, 32, %g3 C new cy
+
+.L3: sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g3, %g4, %g4 C p += cy
+ xor %o5, 16, %o5 C alternate scratch variables
+ stw %g4, [%o0+8]
+ srlx %g4, 32, %g3 C new cy
+
+.L2: sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g3, %g4, %g4 C p += cy
+ stw %g4, [%o0+12]
+ srlx %g4, 32, %g3 C new cy
+
+.L1: sllx %g2, 16, %g4 C (p16 << 16)
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ add %g3, %g4, %g4 C p += cy
+ stw %g4, [%o0+16] C last result limb
+ srlx %g4, 32, %g3 C new cy
+
+ mov %g3, %o0 C return the final carry limb
+ retl
+ sub %sp, -FSIZE, %sp C (delay slot) release the stack reserved on entry
+EPILOGUE(mpn_mul_1)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm b/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm
new file mode 100644
index 0000000..e024279
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm
@@ -0,0 +1,462 @@
+dnl SPARC v9 32-bit mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rp i0
+C up i1
+C n i2
+
+C This code uses a very deep software pipeline, due to the need for moving data
+C back and forth between the integer registers and floating-point registers.
+C
+C A VIS variant of this code would make the pipeline less deep, since the
+C masking now done in the integer unit could take place in the floating-point
+C unit using the FAND instruction. It would be possible to save several cycles
+C too.
+C
+C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
+C not much slower from the Ecache. It would perhaps be possible to shave off
+C one cycle, but not easily. We cannot do better than 10 cycles/limb with the
+C used instructions, since we have 10 memory operations per limb. But a VIS
+C variant could run three cycles faster than the corresponding non-VIS code.
+
+C This is non-pipelined code showing the algorithm:
+C
+C .Loop:
+C lduw [up+0],%g4 C 00000000hhhhllll
+C sllx %g4,16,%g3 C 0000hhhhllll0000
+C or %g3,%g4,%g2 C 0000hhhhXXXXllll
+C andn %g2,%g5,%g2 C 0000hhhh0000llll
+C stx %g2,[%fp+80]
+C ldd [%fp+80],%f0
+C fitod %f0,%f4 C hi16
+C fitod %f1,%f6 C lo16
+C ld [up+0],%f9
+C fxtod %f8,%f2
+C fmuld %f2,%f4,%f4
+C fmuld %f2,%f6,%f6
+C fdtox %f4,%f4
+C fdtox %f6,%f6
+C std %f4,[%fp-24]
+C std %f6,[%fp-16]
+C ldx [%fp-24],%g2
+C ldx [%fp-16],%g1
+C sllx %g2,16,%g2
+C add %g2,%g1,%g1
+C stw %g1,[rp+0]
+C srlx %g1,32,%l0
+C stw %l0,[rp+4]
+C add up,4,up
+C subcc n,1,n
+C bne,pt %icc,.Loop
+C add rp,8,rp
+
+define(`fanop',`fitod %f12,%f10') dnl A quasi nop running in the FA pipe
+
+ASM_START()
+
+ TEXT
+ ALIGN(4)
+.Lnoll:
+ .word 0
+
+C mpn_sqr_diagonal: for each of the n limbs up[i], compute the 64-bit square
+C and store it as two 32-bit words, low word at rp[2i] and high word at
+C rp[2i+1]. Returns 0 in the caller's %o0.
+C
+C Register roles as used below:
+C %i0 rp, advanced by 8 per limb
+C %i1 up, biased by -8 on entry and advanced by 4 per limb
+C %i2 n, counted down to zero
+C %g5 mask 0xffff0000 used to clear the XXXX field of the split limb
+C %f8 zero word, so that %f8:%f9 is up[i] as a 64-bit integer
+C %f0/%f1 high/low 16-bit halves of a limb, reloaded from the split slot
+C %l3,%l5 the two split slots (%fp+80 and %fp+72), used by the tail code
+C %l4,%l6 the two product slots (%fp-24 and %fp-40), used by the tail code
+C Each product slot holds p16 at offset 0 and p0 at offset 8.
+PROLOGUE(mpn_sqr_diagonal)
+ save %sp,-256,%sp
+
+C Load the zero word at .Lnoll into %f8 (PC-relative when PIC is defined).
+ifdef(`PIC',
+`.Lpc: rd %pc,%o7
+ ld [%o7+.Lnoll-.Lpc],%f8',
+` sethi %hi(.Lnoll),%g1
+ ld [%g1+%lo(.Lnoll)],%f8')
+
+ sethi %hi(0xffff0000),%g5 C g5 = 0xffff0000
+ add %i1,-8,%i1 C bias up so [%i1+8] is the next limb to split
+
+C Split limb 0 into 0000hhhh0000llll, then dispatch on the limb count. Each
+C .L_grt_k label is reached while n is still nonzero after k decrements
+C (presumably n >= 1 on entry; n = 0 is not handled specially here).
+ lduw [%i1+8],%g4
+ add %i1,4,%i1 C s1_ptr++
+ sllx %g4,16,%g3 C 0000hhhhllll0000
+ or %g3,%g4,%g2 C 0000hhhhXXXXllll
+ subcc %i2,1,%i2
+ bne,pt %icc,.L_grt_1
+ andn %g2,%g5,%g2 C 0000hhhh0000llll
+
+C n = 1: do the single product completely, then finish at .L1.
+ add %i1,4,%i1 C s1_ptr++
+ stx %g2,[%fp+80]
+ ld [%i1],%f9
+ ldd [%fp+80],%f0
+ fxtod %f8,%f2
+ fitod %f0,%f4
+ fitod %f1,%f6
+ fmuld %f2,%f4,%f4
+ fmuld %f2,%f6,%f6
+ fdtox %f4,%f4
+ fdtox %f6,%f6
+ std %f4,[%fp-24]
+ std %f6,[%fp-16]
+
+ add %fp, 80, %l3
+ add %fp, -24, %l4
+ add %fp, 72, %l5
+ b .L1
+ add %fp, -40, %l6
+
+.L_grt_1:
+ stx %g2,[%fp+80]
+ lduw [%i1+8],%g4
+ add %i1,4,%i1 C s1_ptr++
+ sllx %g4,16,%g3 C 0000hhhhllll0000
+ or %g3,%g4,%g2 C 0000hhhhXXXXllll
+ subcc %i2,1,%i2
+ bne,pt %icc,.L_grt_2
+ andn %g2,%g5,%g2 C 0000hhhh0000llll
+
+C n = 2: first product stored at %fp-24, second left in %f4/%f6 for .L2.
+ stx %g2,[%fp+72]
+ ld [%i1],%f9
+ add %i1,4,%i1 C s1_ptr++
+ ldd [%fp+80],%f0
+ fxtod %f8,%f2
+ fitod %f0,%f4
+ fitod %f1,%f6
+ fmuld %f2,%f4,%f4
+ ld [%i1],%f9
+ fmuld %f2,%f6,%f6
+ ldd [%fp+72],%f0
+ fdtox %f4,%f4
+ fdtox %f6,%f6
+ std %f4,[%fp-24]
+ fxtod %f8,%f2
+ std %f6,[%fp-16]
+ fitod %f0,%f4
+ fitod %f1,%f6
+ fmuld %f2,%f4,%f4
+ fmuld %f2,%f6,%f6
+ fdtox %f4,%f4
+
+ add %fp, 72, %l3
+ add %fp, -40, %l4
+ add %fp, 80, %l5
+ b .L2
+ add %fp, -24, %l6
+
+.L_grt_2:
+ stx %g2,[%fp+72]
+ lduw [%i1+8],%g4
+ ld [%i1],%f9
+ add %i1,4,%i1 C s1_ptr++
+ ldd [%fp+80],%f0
+ sllx %g4,16,%g3 C 0000hhhhllll0000
+ or %g3,%g4,%g2 C 0000hhhhXXXXllll
+ subcc %i2,1,%i2
+ fxtod %f8,%f2
+ bne,pt %icc,.L_grt_3
+ andn %g2,%g5,%g2 C 0000hhhh0000llll
+
+C n = 3: join the wind-down code at .L3.
+ stx %g2,[%fp+80]
+ fitod %f0,%f4
+ fitod %f1,%f6
+ fmuld %f2,%f4,%f4
+ ld [%i1],%f9
+ fmuld %f2,%f6,%f6
+ add %i1,4,%i1 C s1_ptr++
+ ldd [%fp+72],%f0
+ fdtox %f4,%f4
+ fdtox %f6,%f6
+ std %f4,[%fp-24]
+ fxtod %f8,%f2
+ std %f6,[%fp-16]
+ fitod %f0,%f4
+ fitod %f1,%f6
+ fmuld %f2,%f4,%f4
+ ld [%i1],%f9
+ add %fp, 80, %l3
+ fmuld %f2,%f6,%f6
+ add %fp, -24, %l4
+ ldd [%fp+80],%f0
+ add %fp, 72, %l5
+ fdtox %f4,%f4
+ b .L3
+ add %fp, -40, %l6
+
+.L_grt_3:
+ stx %g2,[%fp+80]
+ fitod %f0,%f4
+ lduw [%i1+8],%g4
+ fitod %f1,%f6
+ fmuld %f2,%f4,%f4
+ ld [%i1],%f9
+ fmuld %f2,%f6,%f6
+ add %i1,4,%i1 C s1_ptr++
+ ldd [%fp+72],%f0
+ fdtox %f4,%f4
+ sllx %g4,16,%g3 C 0000hhhhllll0000
+ fdtox %f6,%f6
+ or %g3,%g4,%g2 C 0000hhhhXXXXllll
+ subcc %i2,1,%i2
+ std %f4,[%fp-24]
+ fxtod %f8,%f2
+ std %f6,[%fp-16]
+ bne,pt %icc,.L_grt_4
+ andn %g2,%g5,%g2 C 0000hhhh0000llll
+
+C n = 4: join the wind-down code at .L4.
+ stx %g2,[%fp+72]
+ fitod %f0,%f4
+ fitod %f1,%f6
+ add %fp, 72, %l3
+ fmuld %f2,%f4,%f4
+ add %fp, -40, %l4
+ ld [%i1],%f9
+ fmuld %f2,%f6,%f6
+ add %i1,4,%i1 C s1_ptr++
+ ldd [%fp+80],%f0
+ add %fp, 80, %l5
+ fdtox %f4,%f4
+ b .L4
+ add %fp, -24, %l6
+
+.L_grt_4:
+ stx %g2,[%fp+72]
+ fitod %f0,%f4
+ lduw [%i1+8],%g4
+ fitod %f1,%f6
+ fmuld %f2,%f4,%f4
+ ld [%i1],%f9
+ fmuld %f2,%f6,%f6
+ add %i1,4,%i1 C s1_ptr++
+ ldd [%fp+80],%f0
+ fdtox %f4,%f4
+ sllx %g4,16,%g3 C 0000hhhhllll0000
+ fdtox %f6,%f6
+ or %g3,%g4,%g2 C 0000hhhhXXXXllll
+ subcc %i2,1,%i2
+ std %f4,[%fp-40]
+ fxtod %f8,%f2
+ std %f6,[%fp-32]
+ be,pn %icc,.L5 C n = 5: skip the loop
+ andn %g2,%g5,%g2 C 0000hhhh0000llll
+
+ b,a .Loop
+
+C The loop body is written out twice. The first half uses split slot %fp+80
+C and product slot %fp-24, the second half uses %fp+72 and %fp-40; each half
+C retires one limb (two result words) and can be the exit point.
+ .align 16
+C --- LOOP BEGIN
+.Loop: nop
+ nop
+ stx %g2,[%fp+80]
+ fitod %f0,%f4
+C ---
+ nop
+ nop
+ lduw [%i1+8],%g4
+ fitod %f1,%f6
+C ---
+ nop
+ nop
+ ldx [%fp-24],%g2 C p16
+ fanop
+C ---
+ nop
+ nop
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f4,%f4
+C ---
+ sllx %g2,16,%g2 C align p16
+ add %i0,8,%i0 C res_ptr++
+ ld [%i1],%f9
+ fmuld %f2,%f6,%f6
+C ---
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i1,4,%i1 C s1_ptr++
+ ldd [%fp+72],%f0
+ fanop
+C ---
+ srlx %g1,32,%l0
+ nop
+ stw %g1,[%i0-8]
+ fdtox %f4,%f4
+C ---
+ sllx %g4,16,%g3 C 0000hhhhllll0000
+ nop
+ stw %l0,[%i0-4]
+ fdtox %f6,%f6
+C ---
+ or %g3,%g4,%g2 C 0000hhhhXXXXllll
+ subcc %i2,1,%i2
+ std %f4,[%fp-24]
+ fxtod %f8,%f2
+C ---
+ std %f6,[%fp-16]
+ andn %g2,%g5,%g2 C 0000hhhh0000llll
+ be,pn %icc,.Lend
+ fanop
+C --- LOOP MIDDLE
+ nop
+ nop
+ stx %g2,[%fp+72]
+ fitod %f0,%f4
+C ---
+ nop
+ nop
+ lduw [%i1+8],%g4
+ fitod %f1,%f6
+C ---
+ nop
+ nop
+ ldx [%fp-40],%g2 C p16
+ fanop
+C ---
+ nop
+ nop
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f4,%f4
+C ---
+ sllx %g2,16,%g2 C align p16
+ add %i0,8,%i0 C res_ptr++
+ ld [%i1],%f9
+ fmuld %f2,%f6,%f6
+C ---
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i1,4,%i1 C s1_ptr++
+ ldd [%fp+80],%f0
+ fanop
+C ---
+ srlx %g1,32,%l0
+ nop
+ stw %g1,[%i0-8]
+ fdtox %f4,%f4
+C ---
+ sllx %g4,16,%g3 C 0000hhhhllll0000
+ nop
+ stw %l0,[%i0-4]
+ fdtox %f6,%f6
+C ---
+ or %g3,%g4,%g2 C 0000hhhhXXXXllll
+ subcc %i2,1,%i2
+ std %f4,[%fp-40]
+ fxtod %f8,%f2
+C ---
+ std %f6,[%fp-32]
+ andn %g2,%g5,%g2 C 0000hhhh0000llll
+ bne,pt %icc,.Loop
+ fanop
+C --- LOOP END
+
+C Exit after the second loop half (also the n = 5 entry): the next split
+C slot is %fp+80 and the next product slot to read is %fp-24.
+.L5: add %fp, 80, %l3
+ add %fp, -24, %l4
+ add %fp, 72, %l5
+ b .Ltail
+ add %fp, -40, %l6
+
+C Exit after the first loop half: the slot roles are swapped.
+.Lend: add %fp, 72, %l3
+ add %fp, -40, %l4
+ add %fp, 80, %l5
+ add %fp, -24, %l6
+C Wind-down code: drain the pipeline through the slot pointers %l3..%l6. Each
+C stage below does less new work than the one before it; .L4 .. .L1 are
+C also the entry points for the short feed-in paths above.
+.Ltail: stx %g2,[%l3]
+ fitod %f0,%f4
+ fitod %f1,%f6
+ ldx [%l4],%g2 C p16
+ ldx [%l4+8],%g1 C p0
+ fmuld %f2,%f4,%f4
+ sllx %g2,16,%g2 C align p16
+ add %i0,8,%i0 C res_ptr++
+ ld [%i1],%f9
+ fmuld %f2,%f6,%f6
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i1,4,%i1 C s1_ptr++
+ ldd [%l5],%f0
+ srlx %g1,32,%l0
+ stw %g1,[%i0-8]
+ fdtox %f4,%f4
+ stw %l0,[%i0-4]
+.L4: fdtox %f6,%f6
+ std %f4,[%l4]
+ fxtod %f8,%f2
+ std %f6,[%l4+8]
+
+ fitod %f0,%f4
+ fitod %f1,%f6
+ ldx [%l6],%g2 C p16
+ ldx [%l6+8],%g1 C p0
+ fmuld %f2,%f4,%f4
+ sllx %g2,16,%g2 C align p16
+ add %i0,8,%i0 C res_ptr++
+ ld [%i1],%f9
+ fmuld %f2,%f6,%f6
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ ldd [%l3],%f0
+ srlx %g1,32,%l0
+ stw %g1,[%i0-8]
+ fdtox %f4,%f4
+ stw %l0,[%i0-4]
+.L3: fdtox %f6,%f6
+ std %f4,[%l6]
+ fxtod %f8,%f2
+ std %f6,[%l6+8]
+
+ fitod %f0,%f4
+ fitod %f1,%f6
+ ldx [%l4],%g2 C p16
+ ldx [%l4+8],%g1 C p0
+ fmuld %f2,%f4,%f4
+ sllx %g2,16,%g2 C align p16
+ add %i0,8,%i0 C res_ptr++
+ fmuld %f2,%f6,%f6
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ srlx %g1,32,%l0
+ stw %g1,[%i0-8]
+ fdtox %f4,%f4
+ stw %l0,[%i0-4]
+.L2: fdtox %f6,%f6
+ std %f4,[%l4]
+ std %f6,[%l4+8]
+
+ ldx [%l6],%g2 C p16
+ ldx [%l6+8],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ add %i0,8,%i0 C res_ptr++
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ srlx %g1,32,%l0
+ stw %g1,[%i0-8]
+ stw %l0,[%i0-4]
+
+C Last limb: only the integer recombination and the two stores remain.
+.L1: ldx [%l4],%g2 C p16
+ ldx [%l4+8],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ add %i0,8,%i0 C res_ptr++
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ srlx %g1,32,%l0
+ stw %g1,[%i0-8]
+ stw %l0,[%i0-4]
+
+ ret
+ restore %g0,%g0,%o0 C delay slot: pop the window, return 0
+
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm b/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm
new file mode 100644
index 0000000..636c73b
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm
@@ -0,0 +1,129 @@
+dnl SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(rp,%o0)
+define(s1p,%o1)
+define(s2p,%o2)
+define(n,%o3)
+define(cy,%g1)
+
+C This code uses 64-bit operations on `o' and `g' registers. It doesn't
+C require that `o' registers' upper 32 bits are preserved by the operating
+C system, but if they are not, they must be zeroed. That is indeed what
+C happens at least on Solaris 2.5 and 2.6.
+
+C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at
+C about 10 cycles/limb from the Ecache.
+
+ASM_START()
+C mpn_sub_n: store the limb-wise difference of the n-limb vectors at s1p and
+C s2p to rp, propagating the borrow, and return the final borrow (0 or 1) in
+C %o0.
+C
+C The limbs are loaded with lduw, i.e. zero-extended to 64 bits, so after a
+C 64-bit subtract bit 63 of the result is the borrow; srlx by 63 extracts it.
+C The loop handles two limbs per iteration, alternating between the register
+C pairs %o4/%o5 and %g2/%g3.
+PROLOGUE(mpn_sub_n)
+ lduw [s1p+0],%o4
+ lduw [s2p+0],%o5
+ addcc n,-2,n
+ bl,pn %icc,L(end1) C n = 1
+C NOTE(review): this delay slot is not annulled, so [s1p+4] is read even when
+C n = 1; the value is unused on that path -- confirm callers tolerate the
+C one-word over-read.
+ lduw [s1p+4],%g2
+ lduw [s2p+4],%g3
+ be,pn %icc,L(end2) C n = 2
+ mov 0,cy
+
+C The "C ---" separators presumably mark the intended issue groups, and the
+C fitod instructions (whose result %f2 is never read here) are presumably
+C filler for the floating-point slot, like fanop in the sibling files.
+ .align 16
+L(loop):
+ sub %o4,%o5,%g4
+ add rp,8,rp
+ lduw [s1p+8],%o4
+ fitod %f0,%f2
+C ---
+ sub %g4,cy,%g4
+ addcc n,-1,n
+ lduw [s2p+8],%o5
+ fitod %f0,%f2
+C ---
+ srlx %g4,63,cy
+ add s2p,8,s2p
+ stw %g4,[rp-8]
+C The delay slot below is identical to the first instruction of L(exito), so
+C the branch targets L(exito)+4 to avoid repeating it.
+ be,pn %icc,L(exito)+4
+C ---
+ sub %g2,%g3,%g4
+ addcc n,-1,n
+ lduw [s1p+12],%g2
+ fitod %f0,%f2
+C ---
+ sub %g4,cy,%g4
+ add s1p,8,s1p
+ lduw [s2p+4],%g3 C s2p was already advanced by 8 above
+ fitod %f0,%f2
+C ---
+ srlx %g4,63,cy
+ bne,pt %icc,L(loop)
+ stw %g4,[rp-4]
+C ---
+C Exit at the end of an iteration: the last two limbs are in %o4/%o5 and then
+C %g2/%g3.
+L(exite):
+ sub %o4,%o5,%g4
+ sub %g4,cy,%g4
+ srlx %g4,63,cy
+ stw %g4,[rp+0]
+ sub %g2,%g3,%g4
+ sub %g4,cy,%g4
+ stw %g4,[rp+4]
+ retl
+ srlx %g4,63,%o0
+
+C Exit from the middle of an iteration: the last two limbs are in %g2/%g3 and
+C then %o4/%o5.
+L(exito):
+ sub %g2,%g3,%g4
+ sub %g4,cy,%g4
+ srlx %g4,63,cy
+ stw %g4,[rp-4]
+ sub %o4,%o5,%g4
+ sub %g4,cy,%g4
+ stw %g4,[rp+0]
+ retl
+ srlx %g4,63,%o0
+
+C n = 1: a single subtract, no incoming borrow.
+L(end1):
+ sub %o4,%o5,%g4
+ stw %g4,[rp+0]
+ retl
+ srlx %g4,63,%o0
+
+C n = 2: two subtracts with the borrow carried between them.
+L(end2):
+ sub %o4,%o5,%g4
+ srlx %g4,63,cy
+ stw %g4,[rp+0]
+ sub %g2,%g3,%g4
+ sub %g4,cy,%g4
+ stw %g4,[rp+4]
+ retl
+ srlx %g4,63,%o0
+EPILOGUE(mpn_sub_n)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm
new file mode 100644
index 0000000..92d0ce7
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm
@@ -0,0 +1,316 @@
+dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Algorithm: We use two floating-point multiplies per limb product, with the
+C invariant v operand split into two 16-bit pieces, and the u operand split
+C into 32-bit pieces. We convert the two 48-bit products and transfer them to
+C the integer unit.
+
+C cycles/limb
+C UltraSPARC 1&2: 6.5
+C UltraSPARC 3: ?
+
+C Possible optimizations:
+C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
+C memory bandwidth limited, this could save 1.5 cycles/limb.
+C 2. Unroll the inner loop. Since we already use alternate temporary areas,
+C it is very straightforward to unroll, using an exit branch midway.
+C Unrolling would allow deeper scheduling which could improve speed for L2
+C cache case.
+C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es
+C aren't sufficiently apart-scheduled with just two temp areas.
+C 4. Specialize for particular v values. If its upper 16 bits are zero, we
+C could save many operations.
+
+C INPUT PARAMETERS
+C rp o0
+C up o1
+C n o2
+C v o3
+
+define(`FSIZE',224)
+
+ASM_START()
+C mpn_submul_1: multiply the n-limb vector at up (%o1) by the limb v (%o3),
+C subtract the product from the vector at rp (%o0) in place, and return the
+C borrow-out limb in %o0. This is a leaf routine: it lowers %sp by FSIZE for
+C scratch space instead of opening a register window.
+C
+C Register roles as used below:
+C %f6, %f8 low and high 16 bits of v, as doubles
+C %f10:%f11 up[i] as a 64-bit integer (%f10 stays zero), converted to %f2
+C %f16, %f4 the two partial products u*vhi (p16) and u*vlo (p0)
+C %o5 scratch pair pointer, flipped between two 16-byte areas by xor 16
+C %g2, %g1 p16 and p0 reloaded into the integer unit
+C %g5 rp[i]
+C %g3 carry between limbs
+PROLOGUE(mpn_submul_1)
+ add %sp, -FSIZE, %sp
+ sethi %hi(0xffff), %g1
+ srl %o3, 16, %g2 C g2 = v >> 16
+ or %g1, %lo(0xffff), %g1
+ and %o3, %g1, %g1 C g1 = v & 0xffff
+ stx %g1, [%sp+104]
+ stx %g2, [%sp+112]
+ ldd [%sp+104], %f6
+ ldd [%sp+112], %f8
+ fxtod %f6, %f6
+ fxtod %f8, %f8
+ ld [%sp+104], %f10 C zero f10
+
+ mov 0, %g3 C cy = 0
+
+define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe
+
+ add %sp, 160, %o5 C point in scratch area
+ and %o5, -32, %o5 C align at 0 (mod 32) in scratch area
+
+C Feed-in code: each stage starts one more limb and falls out to the matching
+C wind-down label (.L1 .. .L5) when n runs out. rp is biased on the way out so
+C that the fixed offsets used by the wind-down code address the right limbs.
+ subcc %o2, 1, %o2
+ ld [%o1], %f11 C read up[i]
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_two_or_more
+ fxtod %f10, %f2
+
+ fmuld %f2, %f8, %f16
+ fmuld %f2, %f6, %f4
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+16]
+ std %f12, [%o5+24]
+ ldx [%o5+16], %g2 C p16
+ ldx [%o5+24], %g1 C p0
+ lduw [%o0], %g5 C read rp[i]
+ b .L1
+ add %o0, -16, %o0 C .L1 stores at [%o0+16]
+
+ .align 16
+.L_two_or_more:
+ subcc %o2, 1, %o2
+ ld [%o1], %f11 C read up[i]
+ fmuld %f2, %f8, %f16
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_three_or_more
+ fxtod %f10, %f2
+
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+0]
+ std %f12, [%o5+8]
+ lduw [%o0], %g5 C read rp[i]
+ ldx [%o5+16], %g2 C p16
+ ldx [%o5+24], %g1 C p0
+ b .L2
+ add %o0, -12, %o0 C .L2 stores at [%o0+12]
+
+ .align 16
+.L_three_or_more:
+ subcc %o2, 1, %o2
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_four_or_more
+ fxtod %f10, %f2
+
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+ fdtox %f16, %f14
+ ldx [%o5+16], %g2 C p16
+ fdtox %f4, %f12
+ ldx [%o5+24], %g1 C p0
+ std %f14, [%o5+16]
+ std %f12, [%o5+24]
+ lduw [%o0], %g5 C read rp[i]
+ b .L3
+ add %o0, -8, %o0 C .L3 stores at [%o0+8]
+
+ .align 16
+.L_four_or_more:
+ subcc %o2, 1, %o2
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+ fdtox %f4, %f12
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ bne,pt %icc, .L_five_or_more
+ fxtod %f10, %f2
+
+ fdtox %f16, %f14
+ ldx [%o5+16], %g2 C p16
+ fdtox %f4, %f12
+ ldx [%o5+24], %g1 C p0
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++ (up is not read again on this path)
+ lduw [%o0], %g5 C read rp[i]
+ b .L4
+ add %o0, -4, %o0 C .L4 stores at [%o0+4]
+
+ .align 16
+.L_five_or_more:
+ subcc %o2, 1, %o2
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+ ldx [%o5+16], %g2 C p16
+ fdtox %f4, %f12
+ ldx [%o5+24], %g1 C p0
+ std %f14, [%o5+16]
+ fmuld %f2, %f8, %f16
+ std %f12, [%o5+24]
+ fmuld %f2, %f6, %f4
+ add %o1, 4, %o1 C up++
+ lduw [%o0], %g5 C read rp[i]
+ bne,pt %icc, .Loop
+ fxtod %f10, %f2
+ b,a .L5
+
+C On entry to each iteration %g3 holds the upper 32 bits of the previous
+C rp[i] - p difference; negating it and clearing its upper half gives the
+C amount to add to the next product.
+C BEGIN MAIN LOOP
+ .align 16
+C -- 0
+.Loop: sub %g0, %g3, %g3
+ subcc %o2, 1, %o2
+ ld [%o1], %f11 C read up[i]
+ fdtox %f16, %f14
+C -- 1
+ sllx %g2, 16, %g4 C (p16 << 16)
+ add %o0, 4, %o0 C rp++
+ ldx [%o5+0], %g2 C p16
+ fdtox %f4, %f12
+C -- 2
+ srl %g3, 0, %g3 C zero most significant 32 bits
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ fanop
+C -- 3
+ nop
+ add %g3, %g4, %g4 C p += cy
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16
+C -- 4
+ nop
+ sub %g5, %g4, %g4 C p = rp[i] - p
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+C -- 5
+ xor %o5, 16, %o5 C alternate scratch variables
+ add %o1, 4, %o1 C up++
+ stw %g4, [%o0-4]
+ fanop
+C -- 6
+ srlx %g4, 32, %g3 C new cy
+ lduw [%o0], %g5 C read rp[i]
+ bne,pt %icc, .Loop
+ fxtod %f10, %f2
+C END MAIN LOOP
+
+C Wind-down code: each stage below retires one limb and does progressively
+C less floating-point work. The "sub %g0, %g3, %g3" before each label negates
+C the carry of the preceding stage; entries from the feed-in code jump past it
+C because their %g3 is still the initial 0.
+.L5: sub %g0, %g3, %g3
+ fdtox %f16, %f14
+ sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ fdtox %f4, %f12
+ srl %g3, 0, %g3 C zero most significant 32 bits
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g4, %g3, %g4 C p += cy
+ std %f14, [%o5+0]
+ fmuld %f2, %f8, %f16
+ sub %g5, %g4, %g4 C p = rp[i] - p
+ std %f12, [%o5+8]
+ fmuld %f2, %f6, %f4
+ xor %o5, 16, %o5
+ stw %g4, [%o0+0]
+ srlx %g4, 32, %g3 C new cy
+ lduw [%o0+4], %g5 C read rp[i]
+
+ sub %g0, %g3, %g3
+.L4: fdtox %f16, %f14
+ sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ fdtox %f4, %f12
+ srl %g3, 0, %g3 C zero most significant 32 bits
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g3, %g4, %g4 C p += cy
+ std %f14, [%o5+0]
+ sub %g5, %g4, %g4 C p = rp[i] - p
+ std %f12, [%o5+8]
+ xor %o5, 16, %o5
+ stw %g4, [%o0+4]
+ srlx %g4, 32, %g3 C new cy
+ lduw [%o0+8], %g5 C read rp[i]
+
+ sub %g0, %g3, %g3
+.L3: sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ srl %g3, 0, %g3 C zero most significant 32 bits
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g3, %g4, %g4 C p += cy
+ sub %g5, %g4, %g4 C p = rp[i] - p
+ xor %o5, 16, %o5
+ stw %g4, [%o0+8]
+ srlx %g4, 32, %g3 C new cy
+ lduw [%o0+12], %g5 C read rp[i]
+
+ sub %g0, %g3, %g3
+.L2: sllx %g2, 16, %g4 C (p16 << 16)
+ ldx [%o5+0], %g2 C p16
+ srl %g3, 0, %g3 C zero most significant 32 bits
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ ldx [%o5+8], %g1 C p0
+ add %g3, %g4, %g4 C p += cy
+ sub %g5, %g4, %g4 C p = rp[i] - p
+ stw %g4, [%o0+12]
+ srlx %g4, 32, %g3 C new cy
+ lduw [%o0+16], %g5 C read rp[i]
+
+ sub %g0, %g3, %g3
+.L1: sllx %g2, 16, %g4 C (p16 << 16)
+ srl %g3, 0, %g3 C zero most significant 32 bits
+ add %g1, %g4, %g4 C p = p0 + (p16 << 16)
+ add %g3, %g4, %g4 C p += cy
+ sub %g5, %g4, %g4 C p = rp[i] - p
+ stw %g4, [%o0+16]
+ srlx %g4, 32, %g3 C new cy
+
+ sub %g0, %g3, %o0 C return value: negated final cy
+ retl
+ sub %sp, -FSIZE, %sp C delay slot: release the scratch frame
+EPILOGUE(mpn_submul_1)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/udiv.asm b/gmp-6.3.0/mpn/sparc32/v9/udiv.asm
new file mode 100644
index 0000000..61dde97
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/udiv.asm
@@ -0,0 +1,52 @@
+dnl SPARC v9 32-bit mpn_udiv_qrnnd - division support for longlong.h.
+
+dnl Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr o0
+C n1 o1
+C n0 o2
+C d o3
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+C Divide the two-limb value n1:n0 by d with a single 64-bit divide. The
+C remainder is stored through rem_ptr and the quotient is returned in %o0.
+ srl %o3, 0, %g3 C g3 = d with the upper 32 bits cleared
+ srl %o2, 0, %g2 C g2 = n0 with the upper 32 bits cleared
+ sllx %o1, 32, %g1 C g1 = n1 moved to the upper half
+ or %g1, %g2, %g1 C g1 = 64-bit dividend n1:n0
+ udivx %g1, %g3, %g1 C g1 = quotient
+ mulx %g3, %g1, %g4 C g4 = quotient * d
+ sub %g2, %g4, %g2 C low word of n0 - quotient*d is the remainder
+ st %g2, [%o0] C *rem_ptr = remainder
+ retl
+ mov %g1, %o0 C delay slot: quotient is the return value
+EPILOGUE(mpn_udiv_qrnnd)