path: root/gmp-6.3.0/mpn/sparc64/ultrasparc1234
author     Duncan Wilkie <antigravityd@gmail.com>  2023-11-18 06:11:09 -0600
committer  Duncan Wilkie <antigravityd@gmail.com>  2023-11-18 06:11:09 -0600
commit     11da511c784eca003deb90c23570f0873954e0de (patch)
tree       e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/sparc64/ultrasparc1234
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/sparc64/ultrasparc1234')
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm          241
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm       606
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm       551
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm        165
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm          580
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm   342
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm          241
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm        68
8 files changed, 2794 insertions(+), 0 deletions(-)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm
new file mode 100644
index 0000000..92374d2
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm
@@ -0,0 +1,241 @@
+dnl SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 4
+C UltraSPARC 3: 4.5
+
+C Compute carry-out from the most significant bits of u, v, and r, where
+C r = u + v + carry_in, using logic operations.
+
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2. It has a 4-insn
+C recurrence, and on UltraSPARC 1 and 2 the integer execution units are
+C 100% saturated. Therefore, it seems futile to try to optimize this any
+C further...
+
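+C A minimal C sketch of the carry recurrence above (illustrative, not
+C part of the GMP sources; u, v, r, cy are 64-bit limbs):
+C
+C   r = u + v + cy;                          /* sum limb */
+C   cy = ((u & v) | ((u | v) & ~r)) >> 63;   /* carry out of bit 63 */
+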
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n', `%i3')
+
+define(`u0', `%l0')
+define(`u1', `%l2')
+define(`u2', `%l4')
+define(`u3', `%l6')
+define(`v0', `%l1')
+define(`v1', `%l3')
+define(`v2', `%l5')
+define(`v3', `%l7')
+
+define(`cy',`%i4')
+
+define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(mpn_add_nc)
+ save %sp,-160,%sp
+
+ fitod %f0,%f0 C make sure f0 contains a small, quiet number
+ subcc n,4,%g0
+ bl,pn %xcc,.Loop0
+ nop
+ b,a L(com)
+EPILOGUE()
+
+PROLOGUE(mpn_add_n)
+ save %sp,-160,%sp
+
+ fitod %f0,%f0 C make sure f0 contains a small, quiet number
+ subcc n,4,%g0
+ bl,pn %xcc,.Loop0
+ mov 0,cy
+L(com):
+ ldx [up+0],u0
+ ldx [vp+0],v0
+ add up,32,up
+ ldx [up-24],u1
+ ldx [vp+8],v1
+ add vp,32,vp
+ ldx [up-16],u2
+ ldx [vp-16],v2
+ ldx [up-8],u3
+ ldx [vp-8],v3
+ subcc n,8,n
+ add u0,v0,%g1 C main add
+ add %g1,cy,%g5 C carry add
+ or u0,v0,%g2
+ bl,pn %xcc,.Lend4567
+ fanop
+ b,a .Loop
+
+ .align 16
+C START MAIN LOOP
+.Loop: andn %g2,%g5,%g2
+ and u0,v0,%g3
+ ldx [up+0],u0
+ fanop
+C --
+ or %g3,%g2,%g2
+ ldx [vp+0],v0
+ add up,32,up
+ fanop
+C --
+ srlx %g2,63,cy
+ add u1,v1,%g1
+ stx %g5,[rp+0]
+ fanop
+C --
+ add %g1,cy,%g5
+ or u1,v1,%g2
+ fmnop
+ fanop
+C --
+ andn %g2,%g5,%g2
+ and u1,v1,%g3
+ ldx [up-24],u1
+ fanop
+C --
+ or %g3,%g2,%g2
+ ldx [vp+8],v1
+ add vp,32,vp
+ fanop
+C --
+ srlx %g2,63,cy
+ add u2,v2,%g1
+ stx %g5,[rp+8]
+ fanop
+C --
+ add %g1,cy,%g5
+ or u2,v2,%g2
+ fmnop
+ fanop
+C --
+ andn %g2,%g5,%g2
+ and u2,v2,%g3
+ ldx [up-16],u2
+ fanop
+C --
+ or %g3,%g2,%g2
+ ldx [vp-16],v2
+ add rp,32,rp
+ fanop
+C --
+ srlx %g2,63,cy
+ add u3,v3,%g1
+ stx %g5,[rp-16]
+ fanop
+C --
+ add %g1,cy,%g5
+ or u3,v3,%g2
+ fmnop
+ fanop
+C --
+ andn %g2,%g5,%g2
+ and u3,v3,%g3
+ ldx [up-8],u3
+ fanop
+C --
+ or %g3,%g2,%g2
+ subcc n,4,n
+ ldx [vp-8],v3
+ fanop
+C --
+ srlx %g2,63,cy
+ add u0,v0,%g1
+ stx %g5,[rp-8]
+ fanop
+C --
+ add %g1,cy,%g5
+ or u0,v0,%g2
+ bge,pt %xcc,.Loop
+ fanop
+C END MAIN LOOP
+.Lend4567:
+ andn %g2,%g5,%g2
+ and u0,v0,%g3
+ or %g3,%g2,%g2
+ srlx %g2,63,cy
+ add u1,v1,%g1
+ stx %g5,[rp+0]
+ add %g1,cy,%g5
+ or u1,v1,%g2
+ andn %g2,%g5,%g2
+ and u1,v1,%g3
+ or %g3,%g2,%g2
+ srlx %g2,63,cy
+ add u2,v2,%g1
+ stx %g5,[rp+8]
+ add %g1,cy,%g5
+ or u2,v2,%g2
+ andn %g2,%g5,%g2
+ and u2,v2,%g3
+ or %g3,%g2,%g2
+ add rp,32,rp
+ srlx %g2,63,cy
+ add u3,v3,%g1
+ stx %g5,[rp-16]
+ add %g1,cy,%g5
+ or u3,v3,%g2
+ andn %g2,%g5,%g2
+ and u3,v3,%g3
+ or %g3,%g2,%g2
+ srlx %g2,63,cy
+ stx %g5,[rp-8]
+
+ addcc n,4,n
+ bz,pn %xcc,.Lret
+ fanop
+
+.Loop0: ldx [up],u0
+ add up,8,up
+ ldx [vp],v0
+ add vp,8,vp
+ add rp,8,rp
+ subcc n,1,n
+ add u0,v0,%g1
+ or u0,v0,%g2
+ add %g1,cy,%g5
+ and u0,v0,%g3
+ andn %g2,%g5,%g2
+ stx %g5,[rp-8]
+ or %g3,%g2,%g2
+ bnz,pt %xcc,.Loop0
+ srlx %g2,63,cy
+
+.Lret: mov cy,%i0
+ ret
+ restore
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm
new file mode 100644
index 0000000..48a9414
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm
@@ -0,0 +1,606 @@
+dnl SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright 1998, 2000-2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 14
+C UltraSPARC 3: 17.5
+
+C Algorithm: We use eight floating-point multiplies per limb product, with the
+C invariant v operand split into four 16-bit pieces, and the up operand split
+C into 32-bit pieces. We sum pairs of 48-bit partial products using
+C floating-point add, then convert the four 49-bit product-sums and transfer
+C them to the integer unit.
+
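+C A hedged C model of the decomposition for a single limb product (names
+C are illustrative; u and v are uint64_t limbs; assumes a compiler with
+C unsigned __int128; the real code additionally folds in rp[i] and the
+C carry, and pipelines the work across limbs):
+C
+C   double v00 = v & 0xffff, v16 = (v >> 16) & 0xffff;
+C   double v32 = (v >> 32) & 0xffff, v48 = v >> 48;
+C   double u00 = (uint32_t) u, u32 = u >> 32;
+C   /* eight products, each < 2^48; pair-sums < 2^49, so all are exact */
+C   uint64_t i00 = u00*v00, i16 = u00*v16;
+C   uint64_t i32 = u00*v32 + u32*v00, i48 = u00*v48 + u32*v16;
+C   uint64_t i64 = u32*v32, i80 = u32*v48;
+C   unsigned __int128 w = (unsigned __int128) i00
+C       + ((unsigned __int128) i16 << 16) + ((unsigned __int128) i32 << 32)
+C       + ((unsigned __int128) i48 << 48) + ((unsigned __int128) i64 << 64)
+C       + ((unsigned __int128) i80 << 80);  /* w == (unsigned __int128) u * v */
+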
+C Possible optimizations:
+C 0. Rewrite to use algorithm of mpn_addmul_2.
+C 1. Align the stack area where we transfer the four 49-bit product-sums
+C to a 32-byte boundary. That would minimize the cache collision.
+C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
+C be to align the area to map to the area immediately before up?)
+C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the
+C development version of mpn_addmul_2. This would save many integer
+C instructions.
+C 3. Unrolling. Questionable if it is worth the code expansion, given that
+C it could only save 1 cycle/limb.
+C 4. Specialize for particular v values. If the upper 32 bits of v are
+C zero, we could save many operations, in the FPU (fmuld) but more so in
+C the IEU, since we'll be summing 48-bit quantities, which might be
+C simpler.
+C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
+C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should
+C not be greater than needed for L2 cache latency, and also not so great
+C that i16 needs to be copied.
+C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
+C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU
+C ops.)
+
+C Instruction classification (as per UltraSPARC-1/2 functional units):
+C 8 FM
+C 10 FA
+C 12 MEM
+C 10 ISHIFT + 14 IADDLOG
+C 1 BRANCH
+C 55 insns in total (plus one mov insn that should be optimized out)
+
+C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
+C sustain the peak execution rate of 4 instructions/cycle.
+
+C INPUT PARAMETERS
+C rp i0
+C up i1
+C n i2
+C v i3
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+
+define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
+define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
+define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
+define(`u00',`%f32') define(`u32', `%f34')
+define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+PROLOGUE(mpn_addmul_1)
+
+C Initialization. (1) Split v operand into four 16-bit chunks and store them
+C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
+C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+
+ save %sp, -256, %sp
+ mov -1, %g4
+ srlx %g4, 48, xffff C store mask in register `xffff'
+ and %i3, xffff, %g2
+ stx %g2, [%sp+2223+0]
+ srlx %i3, 16, %g3
+ and %g3, xffff, %g3
+ stx %g3, [%sp+2223+8]
+ srlx %i3, 32, %g2
+ and %g2, xffff, %g2
+ stx %g2, [%sp+2223+16]
+ srlx %i3, 48, %g3
+ stx %g3, [%sp+2223+24]
+ srlx %g4, 32, xffffffff C store mask in register `xffffffff'
+
+ sllx %i2, 3, %i2
+ mov 0, cy C clear cy
+ add %i0, %i2, %i0
+ add %i1, %i2, %i1
+ neg %i2
+ add %i1, 4, %i5
+ add %i0, -32, %i4
+ add %i0, -16, %i0
+
+ ldd [%sp+2223+0], v00
+ ldd [%sp+2223+8], v16
+ ldd [%sp+2223+16], v32
+ ldd [%sp+2223+24], v48
+ ld [%sp+2223+0],%f2 C zero f2
+ ld [%sp+2223+0],%f4 C zero f4
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fxtod v00, v00
+ fxtod v16, v16
+ fxtod v32, v32
+ fxtod v48, v48
+
+C Start real work. (We sneakily read f3 and f5 above...)
+C The software pipeline is very deep, requiring 4 feed-in stages.
+
+ fxtod %f2, u00
+ fxtod %f4, u32
+ fmuld u00, v00, a00
+ fmuld u00, v16, a16
+ fmuld u00, v32, p32
+ fmuld u32, v00, r32
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_two_or_more
+ fmuld u32, v16, r48
+
+.L_one:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ fdtox a32, a32
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ std a32, [%sp+2223+16]
+ std a48, [%sp+2223+24]
+ add %i2, 8, %i2
+
+ fdtox r64, a00
+ ldx [%i0+%i2], rlimb C read rp[i]
+ fdtox r80, a16
+ ldx [%sp+2223+0], i00
+ ldx [%sp+2223+8], i16
+ ldx [%sp+2223+16], i32
+ ldx [%sp+2223+24], i48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ add %i2, 8, %i2
+
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ add i00, %g5, %g5 C i00+ now in g5
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ sllx i48, 32, %l6 C (i48 << 32)
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_1
+ add %i2, 8, %i2
+
+.L_two_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ fdtox a32, a32
+ fxtod %f2, u00
+ fxtod %f4, u32
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_three_or_more
+ fmuld u32, v16, r48
+
+.L_two:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ ldx [%sp+2223+8], i16
+ ldx [%sp+2223+16], i32
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ std a32, [%sp+2223+16]
+ std a48, [%sp+2223+24]
+ add %i2, 8, %i2
+
+ fdtox r64, a00
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ add i00, %g5, %g5 C i00+ now in g5
+ fdtox r80, a16
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_2
+ add %i2, 8, %i2
+
+.L_three_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_four_or_more
+ fmuld u32, v16, r48
+
+.L_three:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+ add i00, %g5, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_3
+ add %i2, 8, %i2
+
+.L_four_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+ add i00, %g5, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .Loop
+ fmuld u32, v16, r48
+
+.L_four:
+ b,a .L_out_4
+
+C BEGIN MAIN LOOP
+ .align 16
+.Loop:
+C 00
+ srlx %o4, 16, %o5 C (x >> 16)
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+C 01
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+C 02
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+C 03
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ add i00, %g5, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+C 04
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+C 05
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+C 06
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+C 07
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+C 08
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+C 09
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+C 10
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+C 11
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+C 12
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+C 13
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .Loop
+ fmuld u32, v16, r48
+C END MAIN LOOP
+
+.L_out_4:
+ srlx %o4, 16, %o5 C (x >> 16)
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ fdtox a00, a00
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ add i00, %g5, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_3:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ fdtox r64, a00
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ add i00, %g5, %g5 C i00+ now in g5
+ fdtox r80, a16
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_2:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ add i00, %g5, %g5 C i00+ now in g5
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_1:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ or %i3, %o5, %o5
+ stx %o5, [%i4+%i2]
+
+ sllx i00, 0, %g2
+ add %g2, cy, cy
+ sllx i16, 16, %g3
+ add %g3, cy, cy
+
+ return %i7+8
+ mov cy, %o0
+EPILOGUE(mpn_addmul_1)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm
new file mode 100644
index 0000000..37674d7
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm
@@ -0,0 +1,551 @@
+dnl SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number by a 2-limb
+dnl number and add the result to an n-limb vector.
+
+dnl Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 9
+C UltraSPARC 3: 10
+
+C Algorithm: We use 16 floating-point multiplies per limb product, with the
+C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
+C split into 32-bit pieces. We sum four 48-bit partial products using
+C floating-point add, then convert the resulting four 50-bit quantities and
+C transfer them to the integer unit.
+
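+C For reference, the semantics match this two-pass composition (a sketch
+C in GMP's mpn types; ref_addmul_2 is an illustrative name, and the FP
+C code below fuses both passes into a single loop):
+C
+C   mp_limb_t
+C   ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C   {
+C     rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);
+C     return mpn_addmul_1 (rp + 1, up, n, vp[1]);
+C   }
+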
+C Possible optimizations:
+C 1. Align the stack area where we transfer the four 50-bit product-sums
+C to a 32-byte boundary. That would minimize the cache collision.
+C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
+C be to align the area to map to the area immediately before up?)
+C 2. Perform two of the fp->int conversions with integer instructions. We
+C can get almost ten free IEU slots, if we clean up bookkeeping and the
+C silly carry-limb code.
+C 3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
+C code.
+
+C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
+C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
+C FI = 20
+C L = 9 x un * vn
+C WDFI = 10 x vn / 2
+C WD = 4
+
+C Instruction classification (as per UltraSPARC functional units).
+C Assuming silly carry code is fixed. Includes bookkeeping.
+C
+C               mpn_addmul_X    mpn_mul_X
+C                  1     2       1     2
+C               ===========    ==========
+C FM               8    16       8    16
+C FA              10    18      10    18
+C MEM             12    12      10    10
+C ISHIFT           6     6       6     6
+C IADDLOG         11    11      10    10
+C BRANCH           1     1       1     1
+C
+C TOTAL IEU       17    17      16    16
+C TOTAL           48    64      45    61
+C
+C IEU cycles     8.5   8.5       8     8
+C MEM cycles      12    12      10    10
+C ISSUE cycles    12    16   11.25 15.25
+C FPU cycles      10    18      10    18
+C cycles/loop     12    18      12    18
+C cycles/limb     12     9      12     9
+
+
+C INPUT PARAMETERS
+C rp[n + 1] i0
+C up[n] i1
+C n i2
+C vp[2] i3
+
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+
+C Combine registers:
+C u00_hi= u32_hi
+C u00_lo= u32_lo
+C a000 = out000
+C a016 = out016
+C Free: f52 f54
+
+
+define(`p000', `%f8') define(`p016',`%f10')
+define(`p032',`%f12') define(`p048',`%f14')
+define(`p064',`%f16') define(`p080',`%f18')
+define(`p096a',`%f20') define(`p112a',`%f22')
+define(`p096b',`%f56') define(`p112b',`%f58')
+
+define(`out000',`%f0') define(`out016',`%f6')
+
+define(`v000',`%f24') define(`v016',`%f26')
+define(`v032',`%f28') define(`v048',`%f30')
+define(`v064',`%f44') define(`v080',`%f46')
+define(`v096',`%f48') define(`v112',`%f50')
+
+define(`u00',`%f32') define(`u32', `%f34')
+
+define(`a000',`%f36') define(`a016',`%f38')
+define(`a032',`%f40') define(`a048',`%f42')
+define(`a064',`%f60') define(`a080',`%f62')
+
+define(`u00_hi',`%f2') define(`u32_hi',`%f4')
+define(`u00_lo',`%f3') define(`u32_lo',`%f5')
+
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0') define(`i16',`%l1')
+define(`r00',`%l2') define(`r32',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+
+PROLOGUE(mpn_addmul_2)
+
+C Initialization. (1) Split v operand into eight 16-bit chunks and store them
+C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
+C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+C This code could be better scheduled.
+
+ save %sp, -256, %sp
+
+ifdef(`HAVE_VIS',
+` mov -1, %g4
+ wr %g0, 0xD2, %asi
+ srlx %g4, 32, xffffffff C store mask in register `xffffffff'
+ ldda [%i3+6] %asi, v000
+ ldda [%i3+4] %asi, v016
+ ldda [%i3+2] %asi, v032
+ ldda [%i3+0] %asi, v048
+ fxtod v000, v000
+ ldda [%i3+14] %asi, v064
+ fxtod v016, v016
+ ldda [%i3+12] %asi, v080
+ fxtod v032, v032
+ ldda [%i3+10] %asi, v096
+ fxtod v048, v048
+ ldda [%i3+8] %asi, v112
+ fxtod v064, v064
+ fxtod v080, v080
+ fxtod v096, v096
+ fxtod v112, v112
+ fzero u00_hi
+ fzero u32_hi
+',
+` mov -1, %g4
+ ldx [%i3+0], %l0 C vp[0]
+ srlx %g4, 48, xffff C store mask in register `xffff'
+ ldx [%i3+8], %l1 C vp[1]
+
+ and %l0, xffff, %g2
+ stx %g2, [%sp+2223+0]
+ srlx %l0, 16, %g3
+ and %g3, xffff, %g3
+ stx %g3, [%sp+2223+8]
+ srlx %l0, 32, %g2
+ and %g2, xffff, %g2
+ stx %g2, [%sp+2223+16]
+ srlx %l0, 48, %g3
+ stx %g3, [%sp+2223+24]
+ and %l1, xffff, %g2
+ stx %g2, [%sp+2223+32]
+ srlx %l1, 16, %g3
+ and %g3, xffff, %g3
+ stx %g3, [%sp+2223+40]
+ srlx %l1, 32, %g2
+ and %g2, xffff, %g2
+ stx %g2, [%sp+2223+48]
+ srlx %l1, 48, %g3
+ stx %g3, [%sp+2223+56]
+
+ srlx %g4, 32, xffffffff C store mask in register `xffffffff'
+
+ ldd [%sp+2223+0], v000
+ ldd [%sp+2223+8], v016
+ ldd [%sp+2223+16], v032
+ ldd [%sp+2223+24], v048
+ fxtod v000, v000
+ ldd [%sp+2223+32], v064
+ fxtod v016, v016
+ ldd [%sp+2223+40], v080
+ fxtod v032, v032
+ ldd [%sp+2223+48], v096
+ fxtod v048, v048
+ ldd [%sp+2223+56], v112
+ fxtod v064, v064
+ ld [%sp+2223+0], u00_hi C zero u00_hi
+ fxtod v080, v080
+ ld [%sp+2223+0], u32_hi C zero u32_hi
+ fxtod v096, v096
+ fxtod v112, v112
+')
+C Initialization done.
+ mov 0, %g2
+ mov 0, rlimb
+ mov 0, %g4
+ add %i0, -8, %i0 C BOOKKEEPING
+
+C Start software pipeline.
+
+ ld [%i1+4], u00_lo C read low 32 bits of up[i]
+ fxtod u00_hi, u00
+C mid
+ ld [%i1+0], u32_lo C read high 32 bits of up[i]
+ fmuld u00, v000, a000
+ fmuld u00, v016, a016
+ fmuld u00, v032, a032
+ fmuld u00, v048, a048
+ add %i2, -1, %i2 C BOOKKEEPING
+ fmuld u00, v064, p064
+ add %i1, 8, %i1 C BOOKKEEPING
+ fxtod u32_hi, u32
+ fmuld u00, v080, p080
+ fmuld u00, v096, p096a
+ brnz,pt %i2, .L_2_or_more
+ fmuld u00, v112, p112a
+
+.L1: fdtox a000, out000
+ fmuld u32, v000, p000
+ fdtox a016, out016
+ fmuld u32, v016, p016
+ fmovd p064, a064
+ fmuld u32, v032, p032
+ fmovd p080, a080
+ fmuld u32, v048, p048
+ std out000, [%sp+2223+16]
+ faddd p000, a032, a000
+ fmuld u32, v064, p064
+ std out016, [%sp+2223+24]
+ fxtod u00_hi, u00
+ faddd p016, a048, a016
+ fmuld u32, v080, p080
+ faddd p032, a064, a032
+ fmuld u32, v096, p096b
+ faddd p048, a080, a048
+ fmuld u32, v112, p112b
+C mid
+ fdtox a000, out000
+ fdtox a016, out016
+ faddd p064, p096a, a064
+ faddd p080, p112a, a080
+ std out000, [%sp+2223+0]
+ b .L_wd2
+ std out016, [%sp+2223+8]
+
+.L_2_or_more:
+ ld [%i1+4], u00_lo C read low 32 bits of up[i]
+ fdtox a000, out000
+ fmuld u32, v000, p000
+ fdtox a016, out016
+ fmuld u32, v016, p016
+ fmovd p064, a064
+ fmuld u32, v032, p032
+ fmovd p080, a080
+ fmuld u32, v048, p048
+ std out000, [%sp+2223+16]
+ faddd p000, a032, a000
+ fmuld u32, v064, p064
+ std out016, [%sp+2223+24]
+ fxtod u00_hi, u00
+ faddd p016, a048, a016
+ fmuld u32, v080, p080
+ faddd p032, a064, a032
+ fmuld u32, v096, p096b
+ faddd p048, a080, a048
+ fmuld u32, v112, p112b
+C mid
+ ld [%i1+0], u32_lo C read high 32 bits of up[i]
+ fdtox a000, out000
+ fmuld u00, v000, p000
+ fdtox a016, out016
+ fmuld u00, v016, p016
+ faddd p064, p096a, a064
+ fmuld u00, v032, p032
+ faddd p080, p112a, a080
+ fmuld u00, v048, p048
+ add %i2, -1, %i2 C BOOKKEEPING
+ std out000, [%sp+2223+0]
+ faddd p000, a032, a000
+ fmuld u00, v064, p064
+ add %i1, 8, %i1 C BOOKKEEPING
+ std out016, [%sp+2223+8]
+ fxtod u32_hi, u32
+ faddd p016, a048, a016
+ fmuld u00, v080, p080
+ faddd p032, a064, a032
+ fmuld u00, v096, p096a
+ faddd p048, a080, a048
+ brnz,pt %i2, .L_3_or_more
+ fmuld u00, v112, p112a
+
+ b .Lend
+ nop
+
+C 64 32 0
+C . . .
+C . |__rXXX_| 32
+C . |___cy___| 34
+C . |_______i00__| 50
+C |_______i16__| . 50
+
+
+C BEGIN MAIN LOOP
+ .align 16
+.L_3_or_more:
+.Loop: ld [%i1+4], u00_lo C read low 32 bits of up[i]
+ and %g2, xffffffff, %g2
+ fdtox a000, out000
+ fmuld u32, v000, p000
+C
+ lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a016, out016
+ fmuld u32, v016, p016
+C
+ srlx %l5, 32, cy
+ ldx [%sp+2223+16], i00
+ faddd p064, p096b, a064
+ fmuld u32, v032, p032
+C
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+24], i16
+ faddd p080, p112b, a080
+ fmuld u32, v048, p048
+C
+ nop
+ std out000, [%sp+2223+16]
+ faddd p000, a032, a000
+ fmuld u32, v064, p064
+C
+ add i00, r00, rlimb
+ add %i0, 8, %i0 C BOOKKEEPING
+ std out016, [%sp+2223+24]
+ fxtod u00_hi, u00
+C
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ faddd p016, a048, a016
+ fmuld u32, v080, p080
+C
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ faddd p032, a064, a032
+ fmuld u32, v096, p096b
+C
+ stw %l5, [%i0+4]
+ nop
+ faddd p048, a080, a048
+ fmuld u32, v112, p112b
+C midloop
+ ld [%i1+0], u32_lo C read high 32 bits of up[i]
+ and %g2, xffffffff, %g2
+ fdtox a000, out000
+ fmuld u00, v000, p000
+C
+ lduw [%i0+0], r32 C read high 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a016, out016
+ fmuld u00, v016, p016
+C
+ srlx %l5, 32, cy
+ ldx [%sp+2223+0], i00
+ faddd p064, p096a, a064
+ fmuld u00, v032, p032
+C
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+8], i16
+ faddd p080, p112a, a080
+ fmuld u00, v048, p048
+C
+ add %i2, -1, %i2 C BOOKKEEPING
+ std out000, [%sp+2223+0]
+ faddd p000, a032, a000
+ fmuld u00, v064, p064
+C
+ add i00, r32, rlimb
+ add %i1, 8, %i1 C BOOKKEEPING
+ std out016, [%sp+2223+8]
+ fxtod u32_hi, u32
+C
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ faddd p016, a048, a016
+ fmuld u00, v080, p080
+C
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ faddd p032, a064, a032
+ fmuld u00, v096, p096a
+C
+ stw %l5, [%i0+0]
+ faddd p048, a080, a048
+ brnz,pt %i2, .Loop
+ fmuld u00, v112, p112a
+C END MAIN LOOP
+
+C WIND-DOWN PHASE 1
+.Lend: and %g2, xffffffff, %g2
+ fdtox a000, out000
+ fmuld u32, v000, p000
+ lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a016, out016
+ fmuld u32, v016, p016
+ srlx %l5, 32, cy
+ ldx [%sp+2223+16], i00
+ faddd p064, p096b, a064
+ fmuld u32, v032, p032
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+24], i16
+ faddd p080, p112b, a080
+ fmuld u32, v048, p048
+ std out000, [%sp+2223+16]
+ faddd p000, a032, a000
+ fmuld u32, v064, p064
+ add i00, r00, rlimb
+ add %i0, 8, %i0 C BOOKKEEPING
+ std out016, [%sp+2223+24]
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ faddd p016, a048, a016
+ fmuld u32, v080, p080
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ faddd p032, a064, a032
+ fmuld u32, v096, p096b
+ stw %l5, [%i0+4]
+ faddd p048, a080, a048
+ fmuld u32, v112, p112b
+C mid
+ and %g2, xffffffff, %g2
+ fdtox a000, out000
+ lduw [%i0+0], r32 C read high 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a016, out016
+ srlx %l5, 32, cy
+ ldx [%sp+2223+0], i00
+ faddd p064, p096a, a064
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+8], i16
+ faddd p080, p112a, a080
+ std out000, [%sp+2223+0]
+ add i00, r32, rlimb
+ std out016, [%sp+2223+8]
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ stw %l5, [%i0+0]
+
+C WIND-DOWN PHASE 2
+.L_wd2: and %g2, xffffffff, %g2
+ fdtox a032, out000
+ lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a048, out016
+ srlx %l5, 32, cy
+ ldx [%sp+2223+16], i00
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+24], i16
+ std out000, [%sp+2223+16]
+ add i00, r00, rlimb
+ add %i0, 8, %i0 C BOOKKEEPING
+ std out016, [%sp+2223+24]
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ stw %l5, [%i0+4]
+C mid
+ and %g2, xffffffff, %g2
+ fdtox a064, out000
+ lduw [%i0+0], r32 C read high 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a080, out016
+ srlx %l5, 32, cy
+ ldx [%sp+2223+0], i00
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+8], i16
+ std out000, [%sp+2223+0]
+ add i00, r32, rlimb
+ std out016, [%sp+2223+8]
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ stw %l5, [%i0+0]
+
+C WIND-DOWN PHASE 3
+.L_wd3: and %g2, xffffffff, %g2
+ fdtox p096b, out000
+ add %g2, rlimb, %l5
+ fdtox p112b, out016
+ srlx %l5, 32, cy
+ ldx [%sp+2223+16], rlimb
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+24], i16
+ std out000, [%sp+2223+16]
+ add %i0, 8, %i0 C BOOKKEEPING
+ std out016, [%sp+2223+24]
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ stw %l5, [%i0+4]
+C mid
+ and %g2, xffffffff, %g2
+ add %g2, rlimb, %l5
+ srlx %l5, 32, cy
+ ldx [%sp+2223+0], rlimb
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+8], i16
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ stw %l5, [%i0+0]
+
+ and %g2, xffffffff, %g2
+ add %g2, rlimb, %l5
+ srlx %l5, 32, cy
+ ldx [%sp+2223+16], i00
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+24], i16
+
+ sllx i16, 16, %g2
+ add i00, cy, cy
+ return %i7+8
+ add %g2, cy, %o0
+EPILOGUE(mpn_addmul_2)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm
new file mode 100644
index 0000000..47286d5
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm
@@ -0,0 +1,165 @@
+dnl SPARC v9 mpn_lshiftc
+
+dnl Copyright 1996, 2000-2003, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 3
+C UltraSPARC 3: 2.67
+
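+C The function stores the one's complement of {up,n} shifted left by cnt
+C bits, and returns the (uncomplemented) bits shifted out at the top. A
+C behavioral C sketch, assuming 0 < cnt < 64:
+C
+C   retval = up[n-1] >> (64 - cnt);
+C   for (i = n - 1; i > 0; i--)
+C     rp[i] = ~((up[i] << cnt) | (up[i-1] >> (64 - cnt)));
+C   rp[0] = ~(up[0] << cnt);
+C   return retval;
+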
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n', `%i2')
+define(`cnt',`%i3')
+
+define(`u0', `%l0')
+define(`u1', `%l2')
+define(`u2', `%l4')
+define(`u3', `%l6')
+
+define(`tnc',`%i4')
+
+define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(mpn_lshiftc)
+ save %sp,-160,%sp
+
+ sllx n,3,%g1
+ sub %g0,cnt,tnc C negate shift count
+ add up,%g1,up C make %o1 point at end of src
+ add rp,%g1,rp C make %o0 point at end of res
+ ldx [up-8],u3 C load first limb
+ subcc n,5,n
+ srlx u3,tnc,%i5 C compute function result
+ bl,pn %xcc,.Lend1234
+ sllx u3,cnt,%g3
+
+ subcc n,4,n
+ ldx [up-16],u0
+ ldx [up-24],u1
+ add up,-32,up
+ ldx [up-0],u2
+ ldx [up-8],u3
+ srlx u0,tnc,%g2
+ bl,pn %xcc,.Lend5678
+ not %g3, %g3
+
+ b,a .Loop
+ ALIGN(16)
+.Loop:
+ sllx u0,cnt,%g1
+ andn %g3,%g2,%g3
+ ldx [up-16],u0
+ fanop
+C --
+ srlx u1,tnc,%g2
+ subcc n,4,n
+ stx %g3,[rp-8]
+ not %g1, %g1
+C --
+ sllx u1,cnt,%g3
+ andn %g1,%g2,%g1
+ ldx [up-24],u1
+ fanop
+C --
+ srlx u2,tnc,%g2
+ stx %g1,[rp-16]
+ add up,-32,up
+ not %g3, %g3
+C --
+ sllx u2,cnt,%g1
+ andn %g3,%g2,%g3
+ ldx [up-0],u2
+ fanop
+C --
+ srlx u3,tnc,%g2
+ stx %g3,[rp-24]
+ add rp,-32,rp
+ not %g1, %g1
+C --
+ sllx u3,cnt,%g3
+ andn %g1,%g2,%g1
+ ldx [up-8],u3
+ fanop
+C --
+ srlx u0,tnc,%g2
+ stx %g1,[rp-0]
+ bge,pt %xcc,.Loop
+ not %g3, %g3
+C --
+.Lend5678:
+ sllx u0,cnt,%g1
+ andn %g3,%g2,%g3
+ srlx u1,tnc,%g2
+ stx %g3,[rp-8]
+ not %g1, %g1
+ sllx u1,cnt,%g3
+ andn %g1,%g2,%g1
+ srlx u2,tnc,%g2
+ stx %g1,[rp-16]
+ not %g3, %g3
+ sllx u2,cnt,%g1
+ andn %g3,%g2,%g3
+ srlx u3,tnc,%g2
+ stx %g3,[rp-24]
+ add rp,-32,rp
+ not %g1, %g1
+ sllx u3,cnt,%g3 C carry...
+ andn %g1,%g2,%g1
+ stx %g1,[rp-0]
+
+.Lend1234:
+ addcc n,4,n
+ bz,pn %xcc,.Lret
+ fanop
+.Loop0:
+ add rp,-8,rp
+ subcc n,1,n
+ ldx [up-16],u3
+ add up,-8,up
+ srlx u3,tnc,%g2
+ not %g3, %g3
+ andn %g3,%g2,%g3
+ stx %g3,[rp]
+ sllx u3,cnt,%g3
+ bnz,pt %xcc,.Loop0
+ fanop
+.Lret:
+ not %g3, %g3
+ stx %g3,[rp-8]
+ mov %i5,%i0
+ ret
+ restore
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm
new file mode 100644
index 0000000..871d562
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm
@@ -0,0 +1,580 @@
+dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright 1998, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 14
+C UltraSPARC 3: 18.5
+
+C Algorithm: We use eight floating-point multiplies per limb product, with the
+C invariant v operand split into four 16-bit pieces, and the s1 operand split
+C into 32-bit pieces. We sum pairs of 48-bit partial products using
+C floating-point add, then convert the four 49-bit product-sums and transfer
+C them to the integer unit.
+
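+C The per-limb decomposition is the same as in addmul_1 (see the C model
+C in that file). Overall the function behaves like this sketch (assumes
+C a compiler with unsigned __int128):
+C
+C   uint64_t cy = 0;
+C   for (size_t i = 0; i < n; i++) {
+C     unsigned __int128 t = (unsigned __int128) up[i] * v + cy;
+C     rp[i] = (uint64_t) t;
+C     cy = (uint64_t) (t >> 64);
+C   }
+C   return cy;
+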
+C Possible optimizations:
+C 1. Align the stack area where we transfer the four 49-bit product-sums
+C to a 32-byte boundary. That would minimize the cache collision.
+C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
+C be to align the area to map to the area immediately before s1?)
+C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the
+C development version of mpn_addmul_2. This would save many integer
+C instructions.
+C 3. Unrolling. Questionable if it is worth the code expansion, given that
+C it could only save 1 cycle/limb.
+C 4. Specialize for particular v values. If the upper 32 bits of v are
+C zero, we could save many operations, in the FPU (fmuld) but more so in
+C the IEU, since we'll be summing 48-bit quantities, which might be
+C simpler.
+C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
+C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should
+C not be greater than needed for L2 cache latency, and also not so great
+C that i16 needs to be copied.
+C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
+C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU
+C ops.)
+
+C Instruction classification (as per UltraSPARC-1/2 functional units):
+C 8 FM
+C 10 FA
+C 11 MEM
+C 9 ISHIFT + 10? IADDLOG
+C 1 BRANCH
+C 49 insns in total (plus three mov insns that should be optimized out)
+
+C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
+C sustain 3.79 instructions/cycle.
+
+C INPUT PARAMETERS
+C rp i0
+C up i1
+C n i2
+C v i3
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+
+define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
+define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
+define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
+define(`u00',`%f32') define(`u32', `%f34')
+define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+PROLOGUE(mpn_mul_1)
+
+C Initialization. (1) Split v operand into four 16-bit chunks and store them
+C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
+C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+
+ save %sp, -256, %sp
+ mov -1, %g4
+ srlx %g4, 48, xffff C store mask in register `xffff'
+ and %i3, xffff, %g2
+ stx %g2, [%sp+2223+0]
+ srlx %i3, 16, %g3
+ and %g3, xffff, %g3
+ stx %g3, [%sp+2223+8]
+ srlx %i3, 32, %g2
+ and %g2, xffff, %g2
+ stx %g2, [%sp+2223+16]
+ srlx %i3, 48, %g3
+ stx %g3, [%sp+2223+24]
+ srlx %g4, 32, xffffffff C store mask in register `xffffffff'
+
+ sllx %i2, 3, %i2
+ mov 0, cy C clear cy
+ add %i0, %i2, %i0
+ add %i1, %i2, %i1
+ neg %i2
+ add %i1, 4, %i5
+ add %i0, -32, %i4
+ add %i0, -16, %i0
+
+ ldd [%sp+2223+0], v00
+ ldd [%sp+2223+8], v16
+ ldd [%sp+2223+16], v32
+ ldd [%sp+2223+24], v48
+ ld [%sp+2223+0],%f2 C zero f2
+ ld [%sp+2223+0],%f4 C zero f4
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fxtod v00, v00
+ fxtod v16, v16
+ fxtod v32, v32
+ fxtod v48, v48
+
+C Start real work. (We sneakily read f3 and f5 above...)
+C The software pipeline is very deep, requiring 4 feed-in stages.
+
+ fxtod %f2, u00
+ fxtod %f4, u32
+ fmuld u00, v00, a00
+ fmuld u00, v16, a16
+ fmuld u00, v32, p32
+ fmuld u32, v00, r32
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_two_or_more
+ fmuld u32, v16, r48
+
+.L_one:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ fdtox a32, a32
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ std a32, [%sp+2223+16]
+ std a48, [%sp+2223+24]
+ add %i2, 8, %i2
+
+ fdtox r64, a00
+ fdtox r80, a16
+ ldx [%sp+2223+0], i00
+ ldx [%sp+2223+8], i16
+ ldx [%sp+2223+16], i32
+ ldx [%sp+2223+24], i48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ add %i2, 8, %i2
+
+ mov i00, %g5 C i00+ now in g5
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ sllx i48, 32, %l6 C (i48 << 32)
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_1
+ add %i2, 8, %i2
+
+.L_two_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ fdtox a32, a32
+ fxtod %f2, u00
+ fxtod %f4, u32
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_three_or_more
+ fmuld u32, v16, r48
+
+.L_two:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ ldx [%sp+2223+8], i16
+ ldx [%sp+2223+16], i32
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ std a32, [%sp+2223+16]
+ std a48, [%sp+2223+24]
+ add %i2, 8, %i2
+
+ fdtox r64, a00
+ mov i00, %g5 C i00+ now in g5
+ fdtox r80, a16
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_2
+ add %i2, 8, %i2
+
+.L_three_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_four_or_more
+ fmuld u32, v16, r48
+
+.L_three:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ faddd p48, r48, a48
+ mov i00, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_3
+ add %i2, 8, %i2
+
+.L_four_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ faddd p48, r48, a48
+ mov i00, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .Loop
+ fmuld u32, v16, r48
+
+.L_four:
+ b,a .L_out_4
+
+C BEGIN MAIN LOOP
+ .align 16
+.Loop:
+C 00
+ srlx %o4, 16, %o5 C (x >> 16)
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+C 01
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+C 02
+ faddd p48, r48, a48
+C 03
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ mov i00, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+C 04
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+C 05
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+C 06
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+C 07
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+C 08
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+C 09
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+C 10
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+C 11
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+C 12
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+C 13
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .Loop
+ fmuld u32, v16, r48
+C END MAIN LOOP
+
+.L_out_4:
+ srlx %o4, 16, %o5 C (x >> 16)
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ fdtox a00, a00
+ faddd p48, r48, a48
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ mov i00, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_3:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ fdtox r64, a00
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ mov i00, %g5 C i00+ now in g5
+ fdtox r80, a16
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_2:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ mov i00, %g5 C i00+ now in g5
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ sllx i48, 32, %l6 C (i48 << 32)
+	or	%i3, %o5, %o5	C assemble result limb: (mi64 << 16) | (x & 0xffff)
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_1:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+	or	%i3, %o5, %o5	C assemble result limb: (mi64 << 16) | (x & 0xffff)
+ stx %o5, [%i4+%i2]
+
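+C Fold the two outstanding partial products into the carry word that
+C becomes the return value: cy <- cy + i00 + (i16 << 16) (mod 2^64).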
+ sllx i00, 0, %g2
+ add %g2, cy, cy
+ sllx i16, 16, %g3
+ add %g3, cy, cy
+
+ return %i7+8
+ mov cy, %o0
+EPILOGUE(mpn_mul_1)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm
new file mode 100644
index 0000000..43c69d3
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm
@@ -0,0 +1,342 @@
+dnl SPARC v9 64-bit mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 22
+C UltraSPARC 3: 36
+
+C This code was generated by the Sun C compiler.  It runs at 22 cycles/limb
+C on the UltraSPARC-1/2, three cycles slower than theoretically possible for
+C optimal code using the same algorithm.  For 1-3 limbs, a special loop was
+C generated, which causes performance problems in particular for 2 and 3
+C limbs.  Ultimately, this should be replaced by hand-written code in the
+C same software-pipeline style as, e.g., addmul_1.asm.
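+C
+C In C terms the function computes, for each of the n input limbs, its
+C full 128-bit square (a sketch of the intended semantics only, not of
+C the floating-point algorithm used below; umul_ppmm is the double-limb
+C multiply from longlong.h):
+C
+C	for (i = 0; i < n; i++)
+C	  umul_ppmm (rp[2*i + 1], rp[2*i], up[i], up[i]);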
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sqr_diagonal)
+ save %sp, -240, %sp
+
+ sethi %hi(0x1ffc00), %o0
+ sethi %hi(0x3ffc00), %o1
+ add %o0, 1023, %o7
+ cmp %i2, 4
+ add %o1, 1023, %o4
+ or %g0, %i1, %g1
+ or %g0, %i0, %o0
+ bl,pn %xcc, .Lsmall
+ or %g0, 0, %g2
+
+ ldx [%i1], %o1
+ add %i1, 24, %g1
+ or %g0, 3, %g2
+ srlx %o1, 42, %g3
+ stx %g3, [%sp+2279]
+ and %o1, %o7, %o2
+ stx %o2, [%sp+2263]
+ srlx %o1, 21, %o1
+ ldd [%sp+2279], %f0
+ and %o1, %o7, %o1
+ stx %o1, [%sp+2271]
+ ldx [%i1+8], %o2
+ fxtod %f0, %f12
+ srlx %o2, 21, %o1
+ and %o2, %o7, %g3
+ ldd [%sp+2263], %f2
+ fmuld %f12, %f12, %f10
+ srlx %o2, 42, %o2
+ ldd [%sp+2271], %f0
+ and %o1, %o7, %o1
+ fxtod %f2, %f8
+ stx %o2, [%sp+2279]
+ stx %o1, [%sp+2271]
+ fxtod %f0, %f0
+ stx %g3, [%sp+2263]
+ fdtox %f10, %f14
+ fmuld %f12, %f8, %f6
+ ldx [%i1+16], %o2
+ std %f14, [%sp+2255]
+ fmuld %f0, %f0, %f2
+ fmuld %f8, %f8, %f10
+ srlx %o2, 42, %o1
+ faddd %f6, %f6, %f6
+ fmuld %f12, %f0, %f12
+ fmuld %f0, %f8, %f8
+ ldd [%sp+2279], %f0
+ ldd [%sp+2263], %f4
+ fdtox %f10, %f10
+ std %f10, [%sp+2239]
+ faddd %f2, %f6, %f6
+ ldd [%sp+2271], %f2
+ fdtox %f12, %f12
+ std %f12, [%sp+2247]
+ fdtox %f8, %f8
+ std %f8, [%sp+2231]
+ fdtox %f6, %f6
+ std %f6, [%sp+2223]
+
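+C Main loop: one source limb per iteration, software pipelined; the
+C integer unit recombines the partial products of the previous limb
+C while the FPU squares the 21/21/22-bit pieces of the current one.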
+.Loop: srlx %o2, 21, %g3
+ stx %o1, [%sp+2279]
+ add %g2, 1, %g2
+ and %g3, %o7, %o1
+ ldx [%sp+2255], %g4
+ cmp %g2, %i2
+ stx %o1, [%sp+2271]
+ add %g1, 8, %g1
+ add %o0, 16, %o0
+ ldx [%sp+2239], %o1
+ fxtod %f0, %f10
+ fxtod %f4, %f14
+ ldx [%sp+2231], %i0
+ ldx [%sp+2223], %g5
+ ldx [%sp+2247], %g3
+ and %o2, %o7, %o2
+ fxtod %f2, %f8
+ fmuld %f10, %f10, %f0
+ stx %o2, [%sp+2263]
+ fmuld %f10, %f14, %f6
+ ldx [%g1-8], %o2
+ fmuld %f10, %f8, %f12
+ fdtox %f0, %f2
+ ldd [%sp+2279], %f0
+ fmuld %f8, %f8, %f4
+ faddd %f6, %f6, %f6
+ fmuld %f14, %f14, %f10
+ std %f2, [%sp+2255]
+ sllx %g4, 20, %g4
+ ldd [%sp+2271], %f2
+ fmuld %f8, %f14, %f8
+ sllx %i0, 22, %i1
+ fdtox %f12, %f12
+ std %f12, [%sp+2247]
+ sllx %g5, 42, %i0
+ add %o1, %i1, %o1
+ faddd %f4, %f6, %f6
+ ldd [%sp+2263], %f4
+ add %o1, %i0, %o1
+ add %g3, %g4, %g3
+ fdtox %f10, %f10
+ std %f10, [%sp+2239]
+ srlx %o1, 42, %g4
+ and %g5, %o4, %i0
+ fdtox %f8, %f8
+ std %f8, [%sp+2231]
+ srlx %g5, 22, %g5
+ sub %g4, %i0, %g4
+ fdtox %f6, %f6
+ std %f6, [%sp+2223]
+ srlx %g4, 63, %g4
+ add %g3, %g5, %g3
+ add %g3, %g4, %g3
+ stx %o1, [%o0-16]
+ srlx %o2, 42, %o1
+ bl,pt %xcc, .Loop
+ stx %g3, [%o0-8]
+
+ stx %o1, [%sp+2279]
+ srlx %o2, 21, %o1
+ fxtod %f0, %f16
+ ldx [%sp+2223], %g3
+ fxtod %f4, %f6
+ and %o2, %o7, %o3
+ stx %o3, [%sp+2263]
+ fxtod %f2, %f4
+ and %o1, %o7, %o1
+ ldx [%sp+2231], %o2
+ sllx %g3, 42, %g4
+ fmuld %f16, %f16, %f14
+ stx %o1, [%sp+2271]
+ fmuld %f16, %f6, %f8
+ add %o0, 48, %o0
+ ldx [%sp+2239], %o1
+ sllx %o2, 22, %o2
+ fmuld %f4, %f4, %f10
+ ldx [%sp+2255], %o3
+ fdtox %f14, %f14
+ fmuld %f4, %f6, %f2
+ std %f14, [%sp+2255]
+ faddd %f8, %f8, %f12
+ add %o1, %o2, %o2
+ fmuld %f16, %f4, %f4
+ ldd [%sp+2279], %f0
+ sllx %o3, 20, %g5
+ add %o2, %g4, %o2
+ fmuld %f6, %f6, %f6
+ srlx %o2, 42, %o3
+ and %g3, %o4, %g4
+ srlx %g3, 22, %g3
+ faddd %f10, %f12, %f16
+ ldd [%sp+2271], %f12
+ ldd [%sp+2263], %f8
+ fxtod %f0, %f0
+ sub %o3, %g4, %o3
+ ldx [%sp+2247], %o1
+ srlx %o3, 63, %o3
+ fdtox %f2, %f10
+ fxtod %f8, %f8
+ std %f10, [%sp+2231]
+ fdtox %f6, %f6
+ std %f6, [%sp+2239]
+ add %o1, %g5, %o1
+ fmuld %f0, %f0, %f2
+ fdtox %f16, %f16
+ std %f16, [%sp+2223]
+ add %o1, %g3, %o1
+ fdtox %f4, %f4
+ std %f4, [%sp+2247]
+ fmuld %f0, %f8, %f10
+ fxtod %f12, %f12
+ add %o1, %o3, %o1
+ stx %o2, [%o0-48]
+ fmuld %f8, %f8, %f6
+ stx %o1, [%o0-40]
+ fdtox %f2, %f2
+ ldx [%sp+2231], %o2
+ faddd %f10, %f10, %f10
+ ldx [%sp+2223], %g3
+ fmuld %f12, %f12, %f4
+ fdtox %f6, %f6
+ ldx [%sp+2239], %o1
+ sllx %o2, 22, %o2
+ fmuld %f12, %f8, %f8
+ sllx %g3, 42, %g5
+ ldx [%sp+2255], %o3
+ fmuld %f0, %f12, %f0
+ add %o1, %o2, %o2
+ faddd %f4, %f10, %f4
+ ldx [%sp+2247], %o1
+ add %o2, %g5, %o2
+ and %g3, %o4, %g4
+ fdtox %f8, %f8
+ sllx %o3, 20, %g5
+ std %f8, [%sp+2231]
+ fdtox %f0, %f0
+ srlx %o2, 42, %o3
+ add %o1, %g5, %o1
+ fdtox %f4, %f4
+ srlx %g3, 22, %g3
+ sub %o3, %g4, %o3
+ std %f6, [%sp+2239]
+ std %f4, [%sp+2223]
+ srlx %o3, 63, %o3
+ add %o1, %g3, %o1
+ std %f2, [%sp+2255]
+ add %o1, %o3, %o1
+ std %f0, [%sp+2247]
+ stx %o2, [%o0-32]
+ stx %o1, [%o0-24]
+ ldx [%sp+2231], %o2
+ ldx [%sp+2223], %o3
+ ldx [%sp+2239], %o1
+ sllx %o2, 22, %o2
+ sllx %o3, 42, %g5
+ ldx [%sp+2255], %g4
+ and %o3, %o4, %g3
+ add %o1, %o2, %o2
+ ldx [%sp+2247], %o1
+ add %o2, %g5, %o2
+ stx %o2, [%o0-16]
+ sllx %g4, 20, %g4
+ srlx %o2, 42, %o2
+ add %o1, %g4, %o1
+ srlx %o3, 22, %o3
+ sub %o2, %g3, %o2
+ srlx %o2, 63, %o2
+ add %o1, %o3, %o1
+ add %o1, %o2, %o1
+ stx %o1, [%o0-8]
+ ret
+ restore %g0, %g0, %g0
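+
+C Special loop for fewer than 4 limbs (see the comment at the top of
+C the file): each iteration splits one limb, squares the pieces via the
+C FPU, and recombines them into two result limbs.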
+.Lsmall:
+ ldx [%g1], %o2
+.Loop0:
+ and %o2, %o7, %o1
+ stx %o1, [%sp+2263]
+ add %g2, 1, %g2
+ srlx %o2, 21, %o1
+ add %g1, 8, %g1
+ srlx %o2, 42, %o2
+ stx %o2, [%sp+2279]
+ and %o1, %o7, %o1
+ ldd [%sp+2263], %f0
+ cmp %g2, %i2
+ stx %o1, [%sp+2271]
+ fxtod %f0, %f6
+ ldd [%sp+2279], %f0
+ ldd [%sp+2271], %f4
+ fxtod %f0, %f2
+ fmuld %f6, %f6, %f0
+ fxtod %f4, %f10
+ fmuld %f2, %f6, %f4
+ fdtox %f0, %f0
+ std %f0, [%sp+2239]
+ fmuld %f10, %f6, %f8
+ fmuld %f10, %f10, %f0
+ faddd %f4, %f4, %f6
+ fmuld %f2, %f2, %f4
+ fdtox %f8, %f8
+ std %f8, [%sp+2231]
+ fmuld %f2, %f10, %f2
+ faddd %f0, %f6, %f0
+ fdtox %f4, %f4
+ std %f4, [%sp+2255]
+ fdtox %f2, %f2
+ std %f2, [%sp+2247]
+ fdtox %f0, %f0
+ std %f0, [%sp+2223]
+ ldx [%sp+2239], %o1
+ ldx [%sp+2255], %g4
+ ldx [%sp+2231], %o2
+ sllx %g4, 20, %g4
+ ldx [%sp+2223], %o3
+ sllx %o2, 22, %o2
+ sllx %o3, 42, %g5
+ add %o1, %o2, %o2
+ ldx [%sp+2247], %o1
+ add %o2, %g5, %o2
+ stx %o2, [%o0]
+ and %o3, %o4, %g3
+ srlx %o2, 42, %o2
+ add %o1, %g4, %o1
+ srlx %o3, 22, %o3
+ sub %o2, %g3, %o2
+ srlx %o2, 63, %o2
+ add %o1, %o3, %o1
+ add %o1, %o2, %o1
+ stx %o1, [%o0+8]
+ add %o0, 16, %o0
+ bl,a,pt %xcc, .Loop0
+ ldx [%g1], %o2
+ ret
+ restore %g0, %g0, %g0
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm
new file mode 100644
index 0000000..9fb7f70
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm
@@ -0,0 +1,241 @@
+dnl SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 4
+C UltraSPARC 3: 4.5
+
+C Compute carry-out from the most significant bits of u, v, and r, where
+C r = u - v - carry_in, using logic operations.
+
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4-insn
+C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
+C Therefore, it seems futile to try to optimize this any further.
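+C
+C A minimal C sketch of the per-limb step (64-bit limbs assumed; the
+C unrolled loop below computes an equivalent formula four limbs at a
+C time):
+C
+C	r = u - v - cy;                          /* candidate difference */
+C	cy = ((~u & v) | ((~u | v) & r)) >> 63;  /* borrow from the MSBs */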
+
+C INPUT PARAMETERS
+define(`rp',`%i0')
+define(`up',`%i1')
+define(`vp',`%i2')
+define(`n',`%i3')
+
+define(`u0',`%l0')
+define(`u1',`%l2')
+define(`u2',`%l4')
+define(`u3',`%l6')
+define(`v0',`%l1')
+define(`v1',`%l3')
+define(`v2',`%l5')
+define(`v3',`%l7')
+
+define(`cy',`%i4')
+
+define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sub_nc)
+ save %sp,-160,%sp
+
+ fitod %f0,%f0 C make sure f0 contains small, quiet number
+ subcc n,4,%g0
+ bl,pn %xcc,.Loop0
+ nop
+ b,a L(com)
+EPILOGUE()
+
+PROLOGUE(mpn_sub_n)
+ save %sp,-160,%sp
+
+ fitod %f0,%f0 C make sure f0 contains small, quiet number
+ subcc n,4,%g0
+ bl,pn %xcc,.Loop0
+ mov 0,cy
+L(com):
+ ldx [up+0],u0
+ ldx [vp+0],v0
+ add up,32,up
+ ldx [up-24],u1
+ ldx [vp+8],v1
+ add vp,32,vp
+ ldx [up-16],u2
+ ldx [vp-16],v2
+ ldx [up-8],u3
+ ldx [vp-8],v3
+ subcc n,8,n
+ sub u0,v0,%g1 C main sub
+ sub %g1,cy,%g5 C carry sub
+ orn u0,v0,%g2
+ bl,pn %xcc,.Lend4567
+ fanop
+ b,a .Loop
+
+ .align 16
+C START MAIN LOOP
+.Loop: orn %g5,%g2,%g2
+ andn u0,v0,%g3
+ ldx [up+0],u0
+ fanop
+C --
+ andn %g2,%g3,%g2
+ ldx [vp+0],v0
+ add up,32,up
+ fanop
+C --
+ srlx %g2,63,cy
+ sub u1,v1,%g1
+ stx %g5,[rp+0]
+ fanop
+C --
+ sub %g1,cy,%g5
+ orn u1,v1,%g2
+ fmnop
+ fanop
+C --
+ orn %g5,%g2,%g2
+ andn u1,v1,%g3
+ ldx [up-24],u1
+ fanop
+C --
+ andn %g2,%g3,%g2
+ ldx [vp+8],v1
+ add vp,32,vp
+ fanop
+C --
+ srlx %g2,63,cy
+ sub u2,v2,%g1
+ stx %g5,[rp+8]
+ fanop
+C --
+ sub %g1,cy,%g5
+ orn u2,v2,%g2
+ fmnop
+ fanop
+C --
+ orn %g5,%g2,%g2
+ andn u2,v2,%g3
+ ldx [up-16],u2
+ fanop
+C --
+ andn %g2,%g3,%g2
+ ldx [vp-16],v2
+ add rp,32,rp
+ fanop
+C --
+ srlx %g2,63,cy
+ sub u3,v3,%g1
+ stx %g5,[rp-16]
+ fanop
+C --
+ sub %g1,cy,%g5
+ orn u3,v3,%g2
+ fmnop
+ fanop
+C --
+ orn %g5,%g2,%g2
+ andn u3,v3,%g3
+ ldx [up-8],u3
+ fanop
+C --
+ andn %g2,%g3,%g2
+ subcc n,4,n
+ ldx [vp-8],v3
+ fanop
+C --
+ srlx %g2,63,cy
+ sub u0,v0,%g1
+ stx %g5,[rp-8]
+ fanop
+C --
+ sub %g1,cy,%g5
+ orn u0,v0,%g2
+ bge,pt %xcc,.Loop
+ fanop
+C END MAIN LOOP
+.Lend4567:
+ orn %g5,%g2,%g2
+ andn u0,v0,%g3
+ andn %g2,%g3,%g2
+ srlx %g2,63,cy
+ sub u1,v1,%g1
+ stx %g5,[rp+0]
+ sub %g1,cy,%g5
+ orn u1,v1,%g2
+ orn %g5,%g2,%g2
+ andn u1,v1,%g3
+ andn %g2,%g3,%g2
+ srlx %g2,63,cy
+ sub u2,v2,%g1
+ stx %g5,[rp+8]
+ sub %g1,cy,%g5
+ orn u2,v2,%g2
+ orn %g5,%g2,%g2
+ andn u2,v2,%g3
+ andn %g2,%g3,%g2
+ add rp,32,rp
+ srlx %g2,63,cy
+ sub u3,v3,%g1
+ stx %g5,[rp-16]
+ sub %g1,cy,%g5
+ orn u3,v3,%g2
+ orn %g5,%g2,%g2
+ andn u3,v3,%g3
+ andn %g2,%g3,%g2
+ srlx %g2,63,cy
+ stx %g5,[rp-8]
+
+ addcc n,4,n
+ bz,pn %xcc,.Lret
+ fanop
+
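+C Tail loop: one limb per iteration, used both when n < 4 on entry and
+C for the limbs left over by the 4-way main loop.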
+.Loop0: ldx [up],u0
+ add up,8,up
+ ldx [vp],v0
+ add vp,8,vp
+ add rp,8,rp
+ subcc n,1,n
+ sub u0,v0,%g1
+ orn u0,v0,%g2
+ sub %g1,cy,%g5
+ andn u0,v0,%g3
+ orn %g5,%g2,%g2
+ stx %g5,[rp-8]
+ andn %g2,%g3,%g2
+ bnz,pt %xcc,.Loop0
+ srlx %g2,63,cy
+
+.Lret: mov cy,%i0
+ ret
+ restore
+EPILOGUE(mpn_sub_n)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm
new file mode 100644
index 0000000..0bdb566
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm
@@ -0,0 +1,68 @@
+dnl SPARC v9 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 18
+C UltraSPARC 3: 23
+
+C INPUT PARAMETERS
+C rp i0
+C up i1
+C n i2
+C v i3
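+
+C This routine is a thin wrapper: v * {up,n} is first written to a stack
+C temporary with mpn_mul_1, then subtracted from {rp,n} with mpn_sub_n,
+C and the two carry words are summed into the return value.  A C-level
+C sketch of the same composition (tp stands for the stack temporary;
+C its allocation is elided):
+C
+C	mp_limb_t hi = mpn_mul_1 (tp, up, n, v);
+C	return hi + mpn_sub_n (rp, rp, tp, n);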
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+
+PROLOGUE(mpn_submul_1)
+ save %sp,-176,%sp
+
+ sllx %i2, 3, %g2
+ or %g0, %i1, %o1
+ add %g2, 15, %o0
+ or %g0, %i2, %o2
+ and %o0, -16, %o0
+ sub %sp, %o0, %sp
+ add %sp, 2223, %o0
+ or %g0, %o0, %l0
+ call mpn_mul_1
+ or %g0, %i3, %o3
+ or %g0, %o0, %l1 C preserve carry value from mpn_mul_1
+ or %g0, %i0, %o0
+ or %g0, %i0, %o1
+ or %g0, %l0, %o2
+ call mpn_sub_n
+ or %g0, %i2, %o3
+ ret
+ restore %l1, %o0, %o0 C sum carry values
+EPILOGUE(mpn_submul_1)