path: root/gmp-6.3.0/mpn/sparc64/ultrasparc1234
author     Duncan Wilkie <antigravityd@gmail.com>  2023-11-18 06:11:09 -0600
committer  Duncan Wilkie <antigravityd@gmail.com>  2023-11-18 06:11:09 -0600
commit     11da511c784eca003deb90c23570f0873954e0de (patch)
tree       e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/sparc64/ultrasparc1234
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/sparc64/ultrasparc1234')
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm          241
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm       606
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm       551
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm        165
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm          580
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm   342
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm          241
-rw-r--r--  gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm        68
8 files changed, 2794 insertions(+), 0 deletions(-)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm
new file mode 100644
index 0000000..92374d2
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm
@@ -0,0 +1,241 @@
+dnl SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 4
+C UltraSPARC 3: 4.5
+
+C Compute carry-out from the most significant bits of u, v, and r, where
+C r = u + v + carry_in, using logic operations.
+
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2. It has a 4-insn
+C recurrence, and on UltraSPARC 1 and 2 the integer execution units are
+C 100% saturated. Therefore, it seems futile to try to optimize this any
+C further...
+
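+C A minimal C sketch of the carry recurrence above (illustrative, not
+C part of the GMP sources; u, v, r, cy are 64-bit limbs):
+C
+C   r = u + v + cy;                          /* sum limb */
+C   cy = ((u & v) | ((u | v) & ~r)) >> 63;   /* carry out of bit 63 */
+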
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n', `%i3')
+
+define(`u0', `%l0')
+define(`u1', `%l2')
+define(`u2', `%l4')
+define(`u3', `%l6')
+define(`v0', `%l1')
+define(`v1', `%l3')
+define(`v2', `%l5')
+define(`v3', `%l7')
+
+define(`cy',`%i4')
+
+define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(mpn_add_nc)
+ save %sp,-160,%sp
+
+ fitod %f0,%f0 C make sure f0 contains a small, quiet number
+ subcc n,4,%g0
+ bl,pn %xcc,.Loop0
+ nop
+ b,a L(com)
+EPILOGUE()
+
+PROLOGUE(mpn_add_n)
+ save %sp,-160,%sp
+
+ fitod %f0,%f0 C make sure f0 contains a small, quiet number
+ subcc n,4,%g0
+ bl,pn %xcc,.Loop0
+ mov 0,cy
+L(com):
+ ldx [up+0],u0
+ ldx [vp+0],v0
+ add up,32,up
+ ldx [up-24],u1
+ ldx [vp+8],v1
+ add vp,32,vp
+ ldx [up-16],u2
+ ldx [vp-16],v2
+ ldx [up-8],u3
+ ldx [vp-8],v3
+ subcc n,8,n
+ add u0,v0,%g1 C main add
+ add %g1,cy,%g5 C carry add
+ or u0,v0,%g2
+ bl,pn %xcc,.Lend4567
+ fanop
+ b,a .Loop
+
+ .align 16
+C START MAIN LOOP
+.Loop: andn %g2,%g5,%g2
+ and u0,v0,%g3
+ ldx [up+0],u0
+ fanop
+C --
+ or %g3,%g2,%g2
+ ldx [vp+0],v0
+ add up,32,up
+ fanop
+C --
+ srlx %g2,63,cy
+ add u1,v1,%g1
+ stx %g5,[rp+0]
+ fanop
+C --
+ add %g1,cy,%g5
+ or u1,v1,%g2
+ fmnop
+ fanop
+C --
+ andn %g2,%g5,%g2
+ and u1,v1,%g3
+ ldx [up-24],u1
+ fanop
+C --
+ or %g3,%g2,%g2
+ ldx [vp+8],v1
+ add vp,32,vp
+ fanop
+C --
+ srlx %g2,63,cy
+ add u2,v2,%g1
+ stx %g5,[rp+8]
+ fanop
+C --
+ add %g1,cy,%g5
+ or u2,v2,%g2
+ fmnop
+ fanop
+C --
+ andn %g2,%g5,%g2
+ and u2,v2,%g3
+ ldx [up-16],u2
+ fanop
+C --
+ or %g3,%g2,%g2
+ ldx [vp-16],v2
+ add rp,32,rp
+ fanop
+C --
+ srlx %g2,63,cy
+ add u3,v3,%g1
+ stx %g5,[rp-16]
+ fanop
+C --
+ add %g1,cy,%g5
+ or u3,v3,%g2
+ fmnop
+ fanop
+C --
+ andn %g2,%g5,%g2
+ and u3,v3,%g3
+ ldx [up-8],u3
+ fanop
+C --
+ or %g3,%g2,%g2
+ subcc n,4,n
+ ldx [vp-8],v3
+ fanop
+C --
+ srlx %g2,63,cy
+ add u0,v0,%g1
+ stx %g5,[rp-8]
+ fanop
+C --
+ add %g1,cy,%g5
+ or u0,v0,%g2
+ bge,pt %xcc,.Loop
+ fanop
+C END MAIN LOOP
+.Lend4567:
+ andn %g2,%g5,%g2
+ and u0,v0,%g3
+ or %g3,%g2,%g2
+ srlx %g2,63,cy
+ add u1,v1,%g1
+ stx %g5,[rp+0]
+ add %g1,cy,%g5
+ or u1,v1,%g2
+ andn %g2,%g5,%g2
+ and u1,v1,%g3
+ or %g3,%g2,%g2
+ srlx %g2,63,cy
+ add u2,v2,%g1
+ stx %g5,[rp+8]
+ add %g1,cy,%g5
+ or u2,v2,%g2
+ andn %g2,%g5,%g2
+ and u2,v2,%g3
+ or %g3,%g2,%g2
+ add rp,32,rp
+ srlx %g2,63,cy
+ add u3,v3,%g1
+ stx %g5,[rp-16]
+ add %g1,cy,%g5
+ or u3,v3,%g2
+ andn %g2,%g5,%g2
+ and u3,v3,%g3
+ or %g3,%g2,%g2
+ srlx %g2,63,cy
+ stx %g5,[rp-8]
+
+ addcc n,4,n
+ bz,pn %xcc,.Lret
+ fanop
+
+.Loop0: ldx [up],u0
+ add up,8,up
+ ldx [vp],v0
+ add vp,8,vp
+ add rp,8,rp
+ subcc n,1,n
+ add u0,v0,%g1
+ or u0,v0,%g2
+ add %g1,cy,%g5
+ and u0,v0,%g3
+ andn %g2,%g5,%g2
+ stx %g5,[rp-8]
+ or %g3,%g2,%g2
+ bnz,pt %xcc,.Loop0
+ srlx %g2,63,cy
+
+.Lret: mov cy,%i0
+ ret
+ restore
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm
new file mode 100644
index 0000000..48a9414
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm
@@ -0,0 +1,606 @@
+dnl SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright 1998, 2000-2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 14
+C UltraSPARC 3: 17.5
+
+C Algorithm: We use eight floating-point multiplies per limb product, with the
+C invariant v operand split into four 16-bit pieces, and the up operand split
+C into 32-bit pieces. We sum pairs of 48-bit partial products using
+C floating-point add, then convert the four 49-bit product-sums and transfer
+C them to the integer unit.
+
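+C A hedged C model of the decomposition for a single limb product (names
+C are illustrative; u and v are uint64_t limbs; assumes a compiler with
+C unsigned __int128; the real code additionally folds in rp[i] and the
+C carry, and pipelines the work across limbs):
+C
+C   double v00 = v & 0xffff, v16 = (v >> 16) & 0xffff;
+C   double v32 = (v >> 32) & 0xffff, v48 = v >> 48;
+C   double u00 = (uint32_t) u, u32 = u >> 32;
+C   /* eight products, each < 2^48; pair-sums < 2^49, so all are exact */
+C   uint64_t i00 = u00*v00, i16 = u00*v16;
+C   uint64_t i32 = u00*v32 + u32*v00, i48 = u00*v48 + u32*v16;
+C   uint64_t i64 = u32*v32, i80 = u32*v48;
+C   unsigned __int128 w = (unsigned __int128) i00
+C       + ((unsigned __int128) i16 << 16) + ((unsigned __int128) i32 << 32)
+C       + ((unsigned __int128) i48 << 48) + ((unsigned __int128) i64 << 64)
+C       + ((unsigned __int128) i80 << 80);  /* w == (unsigned __int128) u * v */
+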
+C Possible optimizations:
+C 0. Rewrite to use algorithm of mpn_addmul_2.
+C 1. Align the stack area where we transfer the four 49-bit product-sums
+C to a 32-byte boundary. That would minimize the cache collision.
+C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
+C be to align the area to map to the area immediately before up?)
+C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the
+C development version of mpn_addmul_2. This would save many integer
+C instructions.
+C 3. Unrolling. Questionable if it is worth the code expansion, given that
+C it could only save 1 cycle/limb.
+C 4. Specialize for particular v values. If the upper 32 bits of v are
+C zero, we could save many operations, in the FPU (fmuld) but more so in
+C the IEU, since we'll be summing 48-bit quantities, which might be
+C simpler.
+C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
+C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should
+C not be greater than needed for L2 cache latency, and also not so great
+C that i16 needs to be copied.
+C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
+C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU
+C ops.)
+
+C Instruction classification (as per UltraSPARC-1/2 functional units):
+C 8 FM
+C 10 FA
+C 12 MEM
+C 10 ISHIFT + 14 IADDLOG
+C 1 BRANCH
+C 55 insns in total (plus one mov insn that should be optimized out)
+
+C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
+C sustain the peak execution rate of 4 instructions/cycle.
+
+C INPUT PARAMETERS
+C rp i0
+C up i1
+C n i2
+C v i3
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+
+define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
+define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
+define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
+define(`u00',`%f32') define(`u32', `%f34')
+define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+PROLOGUE(mpn_addmul_1)
+
+C Initialization. (1) Split v operand into four 16-bit chunks and store them
+C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
+C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+
+ save %sp, -256, %sp
+ mov -1, %g4
+ srlx %g4, 48, xffff C store mask in register `xffff'
+ and %i3, xffff, %g2
+ stx %g2, [%sp+2223+0]
+ srlx %i3, 16, %g3
+ and %g3, xffff, %g3
+ stx %g3, [%sp+2223+8]
+ srlx %i3, 32, %g2
+ and %g2, xffff, %g2
+ stx %g2, [%sp+2223+16]
+ srlx %i3, 48, %g3
+ stx %g3, [%sp+2223+24]
+ srlx %g4, 32, xffffffff C store mask in register `xffffffff'
+
+ sllx %i2, 3, %i2
+ mov 0, cy C clear cy
+ add %i0, %i2, %i0
+ add %i1, %i2, %i1
+ neg %i2
+ add %i1, 4, %i5
+ add %i0, -32, %i4
+ add %i0, -16, %i0
+
+ ldd [%sp+2223+0], v00
+ ldd [%sp+2223+8], v16
+ ldd [%sp+2223+16], v32
+ ldd [%sp+2223+24], v48
+ ld [%sp+2223+0],%f2 C zero f2
+ ld [%sp+2223+0],%f4 C zero f4
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fxtod v00, v00
+ fxtod v16, v16
+ fxtod v32, v32
+ fxtod v48, v48
+
+C Start real work. (We sneakily read f3 and f5 above...)
+C The software pipeline is very deep, requiring 4 feed-in stages.
+
+ fxtod %f2, u00
+ fxtod %f4, u32
+ fmuld u00, v00, a00
+ fmuld u00, v16, a16
+ fmuld u00, v32, p32
+ fmuld u32, v00, r32
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_two_or_more
+ fmuld u32, v16, r48
+
+.L_one:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ fdtox a32, a32
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ std a32, [%sp+2223+16]
+ std a48, [%sp+2223+24]
+ add %i2, 8, %i2
+
+ fdtox r64, a00
+ ldx [%i0+%i2], rlimb C read rp[i]
+ fdtox r80, a16
+ ldx [%sp+2223+0], i00
+ ldx [%sp+2223+8], i16
+ ldx [%sp+2223+16], i32
+ ldx [%sp+2223+24], i48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ add %i2, 8, %i2
+
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ add i00, %g5, %g5 C i00+ now in g5
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ sllx i48, 32, %l6 C (i48 << 32)
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_1
+ add %i2, 8, %i2
+
+.L_two_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ fdtox a32, a32
+ fxtod %f2, u00
+ fxtod %f4, u32
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_three_or_more
+ fmuld u32, v16, r48
+
+.L_two:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ ldx [%sp+2223+8], i16
+ ldx [%sp+2223+16], i32
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ std a32, [%sp+2223+16]
+ std a48, [%sp+2223+24]
+ add %i2, 8, %i2
+
+ fdtox r64, a00
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ add i00, %g5, %g5 C i00+ now in g5
+ fdtox r80, a16
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_2
+ add %i2, 8, %i2
+
+.L_three_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_four_or_more
+ fmuld u32, v16, r48
+
+.L_three:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+ add i00, %g5, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_3
+ add %i2, 8, %i2
+
+.L_four_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+ add i00, %g5, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .Loop
+ fmuld u32, v16, r48
+
+.L_four:
+ b,a .L_out_4
+
+C BEGIN MAIN LOOP
+ .align 16
+.Loop:
+C 00
+ srlx %o4, 16, %o5 C (x >> 16)
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+C 01
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+C 02
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+C 03
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ add i00, %g5, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+C 04
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+C 05
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+C 06
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+C 07
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+C 08
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+C 09
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+C 10
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+C 11
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+C 12
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+C 13
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .Loop
+ fmuld u32, v16, r48
+C END MAIN LOOP
+
+.L_out_4:
+ srlx %o4, 16, %o5 C (x >> 16)
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ fdtox a00, a00
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ faddd p48, r48, a48
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ add i00, %g5, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_3:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ fdtox r64, a00
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ ldx [%i0+%i2], rlimb C read rp[i]
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ add i00, %g5, %g5 C i00+ now in g5
+ fdtox r80, a16
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_2:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ srlx rlimb, 32, %g4 C HI(rlimb)
+ and rlimb, xffffffff, %g5 C LO(rlimb)
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ add i00, %g5, %g5 C i00+ now in g5
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ add i32, %g4, %g4 C i32+ now in g4
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_1:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ or %i3, %o5, %o5
+ stx %o5, [%i4+%i2]
+
+ sllx i00, 0, %g2
+ add %g2, cy, cy
+ sllx i16, 16, %g3
+ add %g3, cy, cy
+
+ return %i7+8
+ mov cy, %o0
+EPILOGUE(mpn_addmul_1)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm
new file mode 100644
index 0000000..37674d7
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm
@@ -0,0 +1,551 @@
+dnl SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number by a 2-limb
+dnl number and add the result to an n-limb vector.
+
+dnl Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 9
+C UltraSPARC 3: 10
+
+C Algorithm: We use 16 floating-point multiplies per limb product, with the
+C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
+C split into 32-bit pieces. We sum four 48-bit partial products using
+C floating-point add, then convert the resulting four 50-bit quantities and
+C transfer them to the integer unit.
+
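+C For reference, the semantics match this two-pass composition (a sketch
+C in GMP's mpn types; ref_addmul_2 is an illustrative name, and the FP
+C code below fuses both passes into a single loop):
+C
+C   mp_limb_t
+C   ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C   {
+C     rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);
+C     return mpn_addmul_1 (rp + 1, up, n, vp[1]);
+C   }
+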
+C Possible optimizations:
+C 1. Align the stack area where we transfer the four 50-bit product-sums
+C to a 32-byte boundary. That would minimize the cache collision.
+C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
+C be to align the area to map to the area immediately before up?)
+C 2. Perform two of the fp->int conversions with integer instructions. We
+C can get almost ten free IEU slots, if we clean up bookkeeping and the
+C silly carry-limb code.
+C 3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
+C code.
+
+C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
+C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
+C FI = 20
+C L = 9 x un * vn
+C WDFI = 10 x vn / 2
+C WD = 4
+
+C Instruction classification (as per UltraSPARC functional units).
+C Assuming silly carry code is fixed. Includes bookkeeping.
+C
+C               mpn_addmul_X    mpn_mul_X
+C                  1     2       1     2
+C               ===========    ==========
+C FM               8    16       8    16
+C FA              10    18      10    18
+C MEM             12    12      10    10
+C ISHIFT           6     6       6     6
+C IADDLOG         11    11      10    10
+C BRANCH           1     1       1     1
+C
+C TOTAL IEU       17    17      16    16
+C TOTAL           48    64      45    61
+C
+C IEU cycles     8.5   8.5       8     8
+C MEM cycles      12    12      10    10
+C ISSUE cycles    12    16   11.25 15.25
+C FPU cycles      10    18      10    18
+C cycles/loop     12    18      12    18
+C cycles/limb     12     9      12     9
+
+
+C INPUT PARAMETERS
+C rp[n + 1] i0
+C up[n] i1
+C n i2
+C vp[2] i3
+
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+
+C Combine registers:
+C u00_hi= u32_hi
+C u00_lo= u32_lo
+C a000 = out000
+C a016 = out016
+C Free: f52 f54
+
+
+define(`p000', `%f8') define(`p016',`%f10')
+define(`p032',`%f12') define(`p048',`%f14')
+define(`p064',`%f16') define(`p080',`%f18')
+define(`p096a',`%f20') define(`p112a',`%f22')
+define(`p096b',`%f56') define(`p112b',`%f58')
+
+define(`out000',`%f0') define(`out016',`%f6')
+
+define(`v000',`%f24') define(`v016',`%f26')
+define(`v032',`%f28') define(`v048',`%f30')
+define(`v064',`%f44') define(`v080',`%f46')
+define(`v096',`%f48') define(`v112',`%f50')
+
+define(`u00',`%f32') define(`u32', `%f34')
+
+define(`a000',`%f36') define(`a016',`%f38')
+define(`a032',`%f40') define(`a048',`%f42')
+define(`a064',`%f60') define(`a080',`%f62')
+
+define(`u00_hi',`%f2') define(`u32_hi',`%f4')
+define(`u00_lo',`%f3') define(`u32_lo',`%f5')
+
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0') define(`i16',`%l1')
+define(`r00',`%l2') define(`r32',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+
+PROLOGUE(mpn_addmul_2)
+
+C Initialization. (1) Split v operand into eight 16-bit chunks and store them
+C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
+C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+C This code could be better scheduled.
+
+ save %sp, -256, %sp
+
+ifdef(`HAVE_VIS',
+` mov -1, %g4
+ wr %g0, 0xD2, %asi
+ srlx %g4, 32, xffffffff C store mask in register `xffffffff'
+ ldda [%i3+6] %asi, v000
+ ldda [%i3+4] %asi, v016
+ ldda [%i3+2] %asi, v032
+ ldda [%i3+0] %asi, v048
+ fxtod v000, v000
+ ldda [%i3+14] %asi, v064
+ fxtod v016, v016
+ ldda [%i3+12] %asi, v080
+ fxtod v032, v032
+ ldda [%i3+10] %asi, v096
+ fxtod v048, v048
+ ldda [%i3+8] %asi, v112
+ fxtod v064, v064
+ fxtod v080, v080
+ fxtod v096, v096
+ fxtod v112, v112
+ fzero u00_hi
+ fzero u32_hi
+',
+` mov -1, %g4
+ ldx [%i3+0], %l0 C vp[0]
+ srlx %g4, 48, xffff C store mask in register `xffff'
+ ldx [%i3+8], %l1 C vp[1]
+
+ and %l0, xffff, %g2
+ stx %g2, [%sp+2223+0]
+ srlx %l0, 16, %g3
+ and %g3, xffff, %g3
+ stx %g3, [%sp+2223+8]
+ srlx %l0, 32, %g2
+ and %g2, xffff, %g2
+ stx %g2, [%sp+2223+16]
+ srlx %l0, 48, %g3
+ stx %g3, [%sp+2223+24]
+ and %l1, xffff, %g2
+ stx %g2, [%sp+2223+32]
+ srlx %l1, 16, %g3
+ and %g3, xffff, %g3
+ stx %g3, [%sp+2223+40]
+ srlx %l1, 32, %g2
+ and %g2, xffff, %g2
+ stx %g2, [%sp+2223+48]
+ srlx %l1, 48, %g3
+ stx %g3, [%sp+2223+56]
+
+ srlx %g4, 32, xffffffff C store mask in register `xffffffff'
+
+ ldd [%sp+2223+0], v000
+ ldd [%sp+2223+8], v016
+ ldd [%sp+2223+16], v032
+ ldd [%sp+2223+24], v048
+ fxtod v000, v000
+ ldd [%sp+2223+32], v064
+ fxtod v016, v016
+ ldd [%sp+2223+40], v080
+ fxtod v032, v032
+ ldd [%sp+2223+48], v096
+ fxtod v048, v048
+ ldd [%sp+2223+56], v112
+ fxtod v064, v064
+ ld [%sp+2223+0], u00_hi C zero u00_hi
+ fxtod v080, v080
+ ld [%sp+2223+0], u32_hi C zero u32_hi
+ fxtod v096, v096
+ fxtod v112, v112
+')
+C Initialization done.
+ mov 0, %g2
+ mov 0, rlimb
+ mov 0, %g4
+ add %i0, -8, %i0 C BOOKKEEPING
+
+C Start software pipeline.
+
+ ld [%i1+4], u00_lo C read low 32 bits of up[i]
+ fxtod u00_hi, u00
+C mid
+ ld [%i1+0], u32_lo C read high 32 bits of up[i]
+ fmuld u00, v000, a000
+ fmuld u00, v016, a016
+ fmuld u00, v032, a032
+ fmuld u00, v048, a048
+ add %i2, -1, %i2 C BOOKKEEPING
+ fmuld u00, v064, p064
+ add %i1, 8, %i1 C BOOKKEEPING
+ fxtod u32_hi, u32
+ fmuld u00, v080, p080
+ fmuld u00, v096, p096a
+ brnz,pt %i2, .L_2_or_more
+ fmuld u00, v112, p112a
+
+.L1: fdtox a000, out000
+ fmuld u32, v000, p000
+ fdtox a016, out016
+ fmuld u32, v016, p016
+ fmovd p064, a064
+ fmuld u32, v032, p032
+ fmovd p080, a080
+ fmuld u32, v048, p048
+ std out000, [%sp+2223+16]
+ faddd p000, a032, a000
+ fmuld u32, v064, p064
+ std out016, [%sp+2223+24]
+ fxtod u00_hi, u00
+ faddd p016, a048, a016
+ fmuld u32, v080, p080
+ faddd p032, a064, a032
+ fmuld u32, v096, p096b
+ faddd p048, a080, a048
+ fmuld u32, v112, p112b
+C mid
+ fdtox a000, out000
+ fdtox a016, out016
+ faddd p064, p096a, a064
+ faddd p080, p112a, a080
+ std out000, [%sp+2223+0]
+ b .L_wd2
+ std out016, [%sp+2223+8]
+
+.L_2_or_more:
+ ld [%i1+4], u00_lo C read low 32 bits of up[i]
+ fdtox a000, out000
+ fmuld u32, v000, p000
+ fdtox a016, out016
+ fmuld u32, v016, p016
+ fmovd p064, a064
+ fmuld u32, v032, p032
+ fmovd p080, a080
+ fmuld u32, v048, p048
+ std out000, [%sp+2223+16]
+ faddd p000, a032, a000
+ fmuld u32, v064, p064
+ std out016, [%sp+2223+24]
+ fxtod u00_hi, u00
+ faddd p016, a048, a016
+ fmuld u32, v080, p080
+ faddd p032, a064, a032
+ fmuld u32, v096, p096b
+ faddd p048, a080, a048
+ fmuld u32, v112, p112b
+C mid
+ ld [%i1+0], u32_lo C read high 32 bits of up[i]
+ fdtox a000, out000
+ fmuld u00, v000, p000
+ fdtox a016, out016
+ fmuld u00, v016, p016
+ faddd p064, p096a, a064
+ fmuld u00, v032, p032
+ faddd p080, p112a, a080
+ fmuld u00, v048, p048
+ add %i2, -1, %i2 C BOOKKEEPING
+ std out000, [%sp+2223+0]
+ faddd p000, a032, a000
+ fmuld u00, v064, p064
+ add %i1, 8, %i1 C BOOKKEEPING
+ std out016, [%sp+2223+8]
+ fxtod u32_hi, u32
+ faddd p016, a048, a016
+ fmuld u00, v080, p080
+ faddd p032, a064, a032
+ fmuld u00, v096, p096a
+ faddd p048, a080, a048
+ brnz,pt %i2, .L_3_or_more
+ fmuld u00, v112, p112a
+
+ b .Lend
+ nop
+
+C 64 32 0
+C . . .
+C . |__rXXX_| 32
+C . |___cy___| 34
+C . |_______i00__| 50
+C |_______i16__| . 50
+
+
+C BEGIN MAIN LOOP
+ .align 16
+.L_3_or_more:
+.Loop: ld [%i1+4], u00_lo C read low 32 bits of up[i]
+ and %g2, xffffffff, %g2
+ fdtox a000, out000
+ fmuld u32, v000, p000
+C
+ lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a016, out016
+ fmuld u32, v016, p016
+C
+ srlx %l5, 32, cy
+ ldx [%sp+2223+16], i00
+ faddd p064, p096b, a064
+ fmuld u32, v032, p032
+C
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+24], i16
+ faddd p080, p112b, a080
+ fmuld u32, v048, p048
+C
+ nop
+ std out000, [%sp+2223+16]
+ faddd p000, a032, a000
+ fmuld u32, v064, p064
+C
+ add i00, r00, rlimb
+ add %i0, 8, %i0 C BOOKKEEPING
+ std out016, [%sp+2223+24]
+ fxtod u00_hi, u00
+C
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ faddd p016, a048, a016
+ fmuld u32, v080, p080
+C
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ faddd p032, a064, a032
+ fmuld u32, v096, p096b
+C
+ stw %l5, [%i0+4]
+ nop
+ faddd p048, a080, a048
+ fmuld u32, v112, p112b
+C midloop
+ ld [%i1+0], u32_lo C read high 32 bits of up[i]
+ and %g2, xffffffff, %g2
+ fdtox a000, out000
+ fmuld u00, v000, p000
+C
+ lduw [%i0+0], r32 C read high 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a016, out016
+ fmuld u00, v016, p016
+C
+ srlx %l5, 32, cy
+ ldx [%sp+2223+0], i00
+ faddd p064, p096a, a064
+ fmuld u00, v032, p032
+C
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+8], i16
+ faddd p080, p112a, a080
+ fmuld u00, v048, p048
+C
+ add %i2, -1, %i2 C BOOKKEEPING
+ std out000, [%sp+2223+0]
+ faddd p000, a032, a000
+ fmuld u00, v064, p064
+C
+ add i00, r32, rlimb
+ add %i1, 8, %i1 C BOOKKEEPING
+ std out016, [%sp+2223+8]
+ fxtod u32_hi, u32
+C
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ faddd p016, a048, a016
+ fmuld u00, v080, p080
+C
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ faddd p032, a064, a032
+ fmuld u00, v096, p096a
+C
+ stw %l5, [%i0+0]
+ faddd p048, a080, a048
+ brnz,pt %i2, .Loop
+ fmuld u00, v112, p112a
+C END MAIN LOOP
+
+C WIND-DOWN PHASE 1
+.Lend: and %g2, xffffffff, %g2
+ fdtox a000, out000
+ fmuld u32, v000, p000
+ lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a016, out016
+ fmuld u32, v016, p016
+ srlx %l5, 32, cy
+ ldx [%sp+2223+16], i00
+ faddd p064, p096b, a064
+ fmuld u32, v032, p032
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+24], i16
+ faddd p080, p112b, a080
+ fmuld u32, v048, p048
+ std out000, [%sp+2223+16]
+ faddd p000, a032, a000
+ fmuld u32, v064, p064
+ add i00, r00, rlimb
+ add %i0, 8, %i0 C BOOKKEEPING
+ std out016, [%sp+2223+24]
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ faddd p016, a048, a016
+ fmuld u32, v080, p080
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ faddd p032, a064, a032
+ fmuld u32, v096, p096b
+ stw %l5, [%i0+4]
+ faddd p048, a080, a048
+ fmuld u32, v112, p112b
+C mid
+ and %g2, xffffffff, %g2
+ fdtox a000, out000
+ lduw [%i0+0], r32 C read high 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a016, out016
+ srlx %l5, 32, cy
+ ldx [%sp+2223+0], i00
+ faddd p064, p096a, a064
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+8], i16
+ faddd p080, p112a, a080
+ std out000, [%sp+2223+0]
+ add i00, r32, rlimb
+ std out016, [%sp+2223+8]
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ stw %l5, [%i0+0]
+
+C WIND-DOWN PHASE 2
+.L_wd2: and %g2, xffffffff, %g2
+ fdtox a032, out000
+ lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a048, out016
+ srlx %l5, 32, cy
+ ldx [%sp+2223+16], i00
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+24], i16
+ std out000, [%sp+2223+16]
+ add i00, r00, rlimb
+ add %i0, 8, %i0 C BOOKKEEPING
+ std out016, [%sp+2223+24]
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ stw %l5, [%i0+4]
+C mid
+ and %g2, xffffffff, %g2
+ fdtox a064, out000
+ lduw [%i0+0], r32 C read high 32 bits of rp[i]
+ add %g2, rlimb, %l5
+ fdtox a080, out016
+ srlx %l5, 32, cy
+ ldx [%sp+2223+0], i00
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+8], i16
+ std out000, [%sp+2223+0]
+ add i00, r32, rlimb
+ std out016, [%sp+2223+8]
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ stw %l5, [%i0+0]
+
+C WIND-DOWN PHASE 3
+.L_wd3: and %g2, xffffffff, %g2
+ fdtox p096b, out000
+ add %g2, rlimb, %l5
+ fdtox p112b, out016
+ srlx %l5, 32, cy
+ ldx [%sp+2223+16], rlimb
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+24], i16
+ std out000, [%sp+2223+16]
+ add %i0, 8, %i0 C BOOKKEEPING
+ std out016, [%sp+2223+24]
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ stw %l5, [%i0+4]
+C mid
+ and %g2, xffffffff, %g2
+ add %g2, rlimb, %l5
+ srlx %l5, 32, cy
+ ldx [%sp+2223+0], rlimb
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+8], i16
+ sllx i16, 16, %g2
+ add cy, rlimb, rlimb
+ srlx i16, 16, %g4
+ add %g2, rlimb, %l5
+ stw %l5, [%i0+0]
+
+ and %g2, xffffffff, %g2
+ add %g2, rlimb, %l5
+ srlx %l5, 32, cy
+ ldx [%sp+2223+16], i00
+ add %g4, cy, cy C new cy
+ ldx [%sp+2223+24], i16
+
+ sllx i16, 16, %g2
+ add i00, cy, cy
+ return %i7+8
+ add %g2, cy, %o0
+EPILOGUE(mpn_addmul_2)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm
new file mode 100644
index 0000000..47286d5
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm
@@ -0,0 +1,165 @@
+dnl SPARC v9 mpn_lshiftc
+
+dnl Copyright 1996, 2000-2003, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 3
+C UltraSPARC 3: 2.67
+
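+C The function stores the one's complement of {up,n} shifted left by cnt
+C bits, and returns the (uncomplemented) bits shifted out at the top. A
+C behavioral C sketch, assuming 0 < cnt < 64:
+C
+C   retval = up[n-1] >> (64 - cnt);
+C   for (i = n - 1; i > 0; i--)
+C     rp[i] = ~((up[i] << cnt) | (up[i-1] >> (64 - cnt)));
+C   rp[0] = ~(up[0] << cnt);
+C   return retval;
+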
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n', `%i2')
+define(`cnt',`%i3')
+
+define(`u0', `%l0')
+define(`u1', `%l2')
+define(`u2', `%l4')
+define(`u3', `%l6')
+
+define(`tnc',`%i4')
+
+define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(mpn_lshiftc)
+ save %sp,-160,%sp
+
+ sllx n,3,%g1
+ sub %g0,cnt,tnc C negate shift count
+ add up,%g1,up C make %o1 point at end of src
+ add rp,%g1,rp C make %o0 point at end of res
+ ldx [up-8],u3 C load first limb
+ subcc n,5,n
+ srlx u3,tnc,%i5 C compute function result
+ bl,pn %xcc,.Lend1234
+ sllx u3,cnt,%g3
+
+ subcc n,4,n
+ ldx [up-16],u0
+ ldx [up-24],u1
+ add up,-32,up
+ ldx [up-0],u2
+ ldx [up-8],u3
+ srlx u0,tnc,%g2
+ bl,pn %xcc,.Lend5678
+ not %g3, %g3
+
+ b,a .Loop
+ ALIGN(16)
+.Loop:
+ sllx u0,cnt,%g1
+ andn %g3,%g2,%g3
+ ldx [up-16],u0
+ fanop
+C --
+ srlx u1,tnc,%g2
+ subcc n,4,n
+ stx %g3,[rp-8]
+ not %g1, %g1
+C --
+ sllx u1,cnt,%g3
+ andn %g1,%g2,%g1
+ ldx [up-24],u1
+ fanop
+C --
+ srlx u2,tnc,%g2
+ stx %g1,[rp-16]
+ add up,-32,up
+ not %g3, %g3
+C --
+ sllx u2,cnt,%g1
+ andn %g3,%g2,%g3
+ ldx [up-0],u2
+ fanop
+C --
+ srlx u3,tnc,%g2
+ stx %g3,[rp-24]
+ add rp,-32,rp
+ not %g1, %g1
+C --
+ sllx u3,cnt,%g3
+ andn %g1,%g2,%g1
+ ldx [up-8],u3
+ fanop
+C --
+ srlx u0,tnc,%g2
+ stx %g1,[rp-0]
+ bge,pt %xcc,.Loop
+ not %g3, %g3
+C --
+.Lend5678:
+ sllx u0,cnt,%g1
+ andn %g3,%g2,%g3
+ srlx u1,tnc,%g2
+ stx %g3,[rp-8]
+ not %g1, %g1
+ sllx u1,cnt,%g3
+ andn %g1,%g2,%g1
+ srlx u2,tnc,%g2
+ stx %g1,[rp-16]
+ not %g3, %g3
+ sllx u2,cnt,%g1
+ andn %g3,%g2,%g3
+ srlx u3,tnc,%g2
+ stx %g3,[rp-24]
+ add rp,-32,rp
+ not %g1, %g1
+ sllx u3,cnt,%g3 C carry...
+ andn %g1,%g2,%g1
+ stx %g1,[rp-0]
+
+.Lend1234:
+ addcc n,4,n
+ bz,pn %xcc,.Lret
+ fanop
+.Loop0:
+ add rp,-8,rp
+ subcc n,1,n
+ ldx [up-16],u3
+ add up,-8,up
+ srlx u3,tnc,%g2
+ not %g3, %g3
+ andn %g3,%g2,%g3
+ stx %g3,[rp]
+ sllx u3,cnt,%g3
+ bnz,pt %xcc,.Loop0
+ fanop
+.Lret:
+ not %g3, %g3
+ stx %g3,[rp-8]
+ mov %i5,%i0
+ ret
+ restore
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm
new file mode 100644
index 0000000..871d562
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm
@@ -0,0 +1,580 @@
+dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright 1998, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 14
+C UltraSPARC 3: 18.5
+
+C Algorithm: We use eight floating-point multiplies per limb product, with the
+C invariant v operand split into four 16-bit pieces, and the s1 operand split
+C into 32-bit pieces. We sum pairs of 48-bit partial products using
+C floating-point add, then convert the four 49-bit product-sums and transfer
+C them to the integer unit.
+
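+C The per-limb decomposition is the same as in addmul_1 (see the C model
+C in that file). Overall the function behaves like this sketch (assumes
+C a compiler with unsigned __int128):
+C
+C   uint64_t cy = 0;
+C   for (size_t i = 0; i < n; i++) {
+C     unsigned __int128 t = (unsigned __int128) up[i] * v + cy;
+C     rp[i] = (uint64_t) t;
+C     cy = (uint64_t) (t >> 64);
+C   }
+C   return cy;
+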
+C Possible optimizations:
+C 1. Align the stack area where we transfer the four 49-bit product-sums
+C to a 32-byte boundary. That would minimize the cache collision.
+C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
+C be to align the area to map to the area immediately before s1?)
+C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the
+C development version of mpn_addmul_2. This would save many integer
+C instructions.
+C 3. Unrolling. Questionable if it is worth the code expansion, given that
+C it could only save 1 cycle/limb.
+C 4. Specialize for particular v values. If the upper 32 bits of v are
+C zero, we could save many operations, in the FPU (fmuld) but more so in
+C the IEU, since we'll be summing 48-bit quantities, which might be
+C simpler.
+C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
+C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should
+C not be greater than needed for L2 cache latency, and also not so great
+C that i16 needs to be copied.
+C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
+C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU
+C ops.)
+
+C Instruction classification (as per UltraSPARC-1/2 functional units):
+C 8 FM
+C 10 FA
+C 11 MEM
+C 9 ISHIFT + 10? IADDLOG
+C 1 BRANCH
+C 49 insns in total (plus three mov insns that should be optimized out)
+
+C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
+C sustain 3.79 instructions/cycle.
+
+C INPUT PARAMETERS
+C rp i0
+C up i1
+C n i2
+C v i3
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+
+define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
+define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
+define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
+define(`u00',`%f32') define(`u32', `%f34')
+define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+PROLOGUE(mpn_mul_1)
+
+C Initialization. (1) Split v operand into four 16-bit chunks and store them
+C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
+C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+
+ save %sp, -256, %sp
+ mov -1, %g4
+ srlx %g4, 48, xffff C store mask in register `xffff'
+ and %i3, xffff, %g2
+ stx %g2, [%sp+2223+0]
+ srlx %i3, 16, %g3
+ and %g3, xffff, %g3
+ stx %g3, [%sp+2223+8]
+ srlx %i3, 32, %g2
+ and %g2, xffff, %g2
+ stx %g2, [%sp+2223+16]
+ srlx %i3, 48, %g3
+ stx %g3, [%sp+2223+24]
+ srlx %g4, 32, xffffffff C store mask in register `xffffffff'
+
+ sllx %i2, 3, %i2
+ mov 0, cy C clear cy
+ add %i0, %i2, %i0
+ add %i1, %i2, %i1
+ neg %i2
+ add %i1, 4, %i5
+ add %i0, -32, %i4
+ add %i0, -16, %i0
+
+ ldd [%sp+2223+0], v00
+ ldd [%sp+2223+8], v16
+ ldd [%sp+2223+16], v32
+ ldd [%sp+2223+24], v48
+ ld [%sp+2223+0],%f2 C zero f2
+ ld [%sp+2223+0],%f4 C zero f4
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fxtod v00, v00
+ fxtod v16, v16
+ fxtod v32, v32
+ fxtod v48, v48
+
+C Start real work. (We sneakily read f3 and f5 above...)
+C The software pipeline is very deep, requiring 4 feed-in stages.
+
+ fxtod %f2, u00
+ fxtod %f4, u32
+ fmuld u00, v00, a00
+ fmuld u00, v16, a16
+ fmuld u00, v32, p32
+ fmuld u32, v00, r32
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_two_or_more
+ fmuld u32, v16, r48
+
+.L_one:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ fdtox a32, a32
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ std a32, [%sp+2223+16]
+ std a48, [%sp+2223+24]
+ add %i2, 8, %i2
+
+ fdtox r64, a00
+ fdtox r80, a16
+ ldx [%sp+2223+0], i00
+ ldx [%sp+2223+8], i16
+ ldx [%sp+2223+16], i32
+ ldx [%sp+2223+24], i48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ add %i2, 8, %i2
+
+ mov i00, %g5 C i00+ now in g5
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ sllx i48, 32, %l6 C (i48 << 32)
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_1
+ add %i2, 8, %i2
+
+.L_two_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ fdtox a32, a32
+ fxtod %f2, u00
+ fxtod %f4, u32
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_three_or_more
+ fmuld u32, v16, r48
+
+.L_two:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ ldx [%sp+2223+8], i16
+ ldx [%sp+2223+16], i32
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ std a16, [%sp+2223+8]
+ std a32, [%sp+2223+16]
+ std a48, [%sp+2223+24]
+ add %i2, 8, %i2
+
+ fdtox r64, a00
+ mov i00, %g5 C i00+ now in g5
+ fdtox r80, a16
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_2
+ add %i2, 8, %i2
+
+.L_three_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ faddd p48, r48, a48
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .L_four_or_more
+ fmuld u32, v16, r48
+
+.L_three:
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ fdtox a00, a00
+ faddd p48, r48, a48
+ mov i00, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ b .L_out_3
+ add %i2, 8, %i2
+
+.L_four_or_more:
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+ faddd p48, r48, a48
+ mov i00, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+ sllx i48, 32, %l6 C (i48 << 32)
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .Loop
+ fmuld u32, v16, r48
+
+.L_four:
+ b,a .L_out_4
+
+C BEGIN MAIN LOOP
+ .align 16
+.Loop:
+C 00
+ srlx %o4, 16, %o5 C (x >> 16)
+ ld [%i5+%i2], %f3 C read low 32 bits of up[i]
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+C 01
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ ld [%i1+%i2], %f5 C read high 32 bits of up[i]
+ fdtox a00, a00
+C 02
+ faddd p48, r48, a48
+C 03
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ mov i00, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+C 04
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+C 05
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ fxtod %f2, u00
+C 06
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ fxtod %f4, u32
+C 07
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+C 08
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ fmuld u00, v00, p00
+C 09
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ fmuld u00, v16, p16
+C 10
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ fmuld u00, v32, p32
+C 11
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ faddd p00, r64, a00
+ fmuld u32, v00, r32
+C 12
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ faddd p16, r80, a16
+ fmuld u00, v48, p48
+C 13
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ addcc %i2, 8, %i2
+ bnz,pt %xcc, .Loop
+ fmuld u32, v16, r48
+C END MAIN LOOP
+
+.L_out_4:
+ srlx %o4, 16, %o5 C (x >> 16)
+ fmuld u32, v32, r64 C FIXME not urgent
+ faddd p32, r32, a32
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ fdtox a00, a00
+ faddd p48, r48, a48
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ mov i00, %g5 C i00+ now in g5
+ fmuld u32, v48, r80 C FIXME not urgent
+ fdtox a16, a16
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ fdtox a32, a32
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ fdtox a48, a48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ std a32, [%sp+2223+16]
+ add %l6, %o2, %o2 C mi64- in %o2
+ std a48, [%sp+2223+24]
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_3:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ fdtox r64, a00
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ mov i00, %g5 C i00+ now in g5
+ fdtox r80, a16
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ ldx [%sp+2223+16], i32
+ sllx i48, 32, %l6 C (i48 << 32)
+ or %i3, %o5, %o5
+ ldx [%sp+2223+24], i48
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ std a00, [%sp+2223+0]
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ std a16, [%sp+2223+8]
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_2:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ mov i00, %g5 C i00+ now in g5
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+ ldx [%sp+2223+0], i00
+ srlx i16, 48, %l4 C (i16 >> 48)
+ mov i16, %g2
+ ldx [%sp+2223+8], i16
+ srlx i48, 16, %l5 C (i48 >> 16)
+ mov i32, %g4 C i32+ now in g4
+ sllx i48, 32, %l6 C (i48 << 32)
+	or	%i3, %o5, %o5	C assemble result limb: (mi64 << 16) | (x & 0xffff)
+ srlx %g4, 32, %o3 C (i32 >> 32)
+ add %l5, %l4, %o1 C hi64- in %o1
+ sllx %g4, 16, %o2 C (i32 << 16)
+ add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
+ sllx %o1, 48, %o3 C (hi64 << 48)
+ add %g2, %o2, %o2 C mi64- in %o2
+ add %l6, %o2, %o2 C mi64- in %o2
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
+ stx %o5, [%i4+%i2]
+ add cy, %g5, %o4 C x = prev(i00) + cy
+ add %i2, 8, %i2
+.L_out_1:
+ srlx %o4, 16, %o5 C (x >> 16)
+ add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
+ and %o4, xffff, %o5 C (x & 0xffff)
+ srlx %o2, 48, %o7 C (mi64 >> 48)
+ sllx %o2, 16, %i3 C (mi64 << 16)
+ add %o7, %o1, cy C new cy
+	or	%i3, %o5, %o5	C assemble result limb: (mi64 << 16) | (x & 0xffff)
+ stx %o5, [%i4+%i2]
+
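+C Fold the two outstanding partial products into the carry word that
+C becomes the return value: cy <- cy + i00 + (i16 << 16) (mod 2^64).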
+ sllx i00, 0, %g2
+ add %g2, cy, cy
+ sllx i16, 16, %g3
+ add %g3, cy, cy
+
+ return %i7+8
+ mov cy, %o0
+EPILOGUE(mpn_mul_1)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm
new file mode 100644
index 0000000..43c69d3
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm
@@ -0,0 +1,342 @@
+dnl SPARC v9 64-bit mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 22
+C UltraSPARC 3: 36
+
+C This code was generated by the Sun C compiler.  It runs at 22 cycles/limb
+C on the UltraSPARC-1/2, three cycles slower than theoretically possible for
+C optimal code using the same algorithm.  For 1-3 limbs, a special loop was
+C generated, which causes performance problems in particular for 2 and 3
+C limbs.  Ultimately, this should be replaced by hand-written code in the
+C same software-pipeline style as, e.g., addmul_1.asm.
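+C
+C In C terms the function computes, for each of the n input limbs, its
+C full 128-bit square (a sketch of the intended semantics only, not of
+C the floating-point algorithm used below; umul_ppmm is the double-limb
+C multiply from longlong.h):
+C
+C	for (i = 0; i < n; i++)
+C	  umul_ppmm (rp[2*i + 1], rp[2*i], up[i], up[i]);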
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sqr_diagonal)
+ save %sp, -240, %sp
+
+ sethi %hi(0x1ffc00), %o0
+ sethi %hi(0x3ffc00), %o1
+ add %o0, 1023, %o7
+ cmp %i2, 4
+ add %o1, 1023, %o4
+ or %g0, %i1, %g1
+ or %g0, %i0, %o0
+ bl,pn %xcc, .Lsmall
+ or %g0, 0, %g2
+
+ ldx [%i1], %o1
+ add %i1, 24, %g1
+ or %g0, 3, %g2
+ srlx %o1, 42, %g3
+ stx %g3, [%sp+2279]
+ and %o1, %o7, %o2
+ stx %o2, [%sp+2263]
+ srlx %o1, 21, %o1
+ ldd [%sp+2279], %f0
+ and %o1, %o7, %o1
+ stx %o1, [%sp+2271]
+ ldx [%i1+8], %o2
+ fxtod %f0, %f12
+ srlx %o2, 21, %o1
+ and %o2, %o7, %g3
+ ldd [%sp+2263], %f2
+ fmuld %f12, %f12, %f10
+ srlx %o2, 42, %o2
+ ldd [%sp+2271], %f0
+ and %o1, %o7, %o1
+ fxtod %f2, %f8
+ stx %o2, [%sp+2279]
+ stx %o1, [%sp+2271]
+ fxtod %f0, %f0
+ stx %g3, [%sp+2263]
+ fdtox %f10, %f14
+ fmuld %f12, %f8, %f6
+ ldx [%i1+16], %o2
+ std %f14, [%sp+2255]
+ fmuld %f0, %f0, %f2
+ fmuld %f8, %f8, %f10
+ srlx %o2, 42, %o1
+ faddd %f6, %f6, %f6
+ fmuld %f12, %f0, %f12
+ fmuld %f0, %f8, %f8
+ ldd [%sp+2279], %f0
+ ldd [%sp+2263], %f4
+ fdtox %f10, %f10
+ std %f10, [%sp+2239]
+ faddd %f2, %f6, %f6
+ ldd [%sp+2271], %f2
+ fdtox %f12, %f12
+ std %f12, [%sp+2247]
+ fdtox %f8, %f8
+ std %f8, [%sp+2231]
+ fdtox %f6, %f6
+ std %f6, [%sp+2223]
+
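+C Main loop: one source limb per iteration, software pipelined; the
+C integer unit recombines the partial products of the previous limb
+C while the FPU squares the 21/21/22-bit pieces of the current one.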
+.Loop: srlx %o2, 21, %g3
+ stx %o1, [%sp+2279]
+ add %g2, 1, %g2
+ and %g3, %o7, %o1
+ ldx [%sp+2255], %g4
+ cmp %g2, %i2
+ stx %o1, [%sp+2271]
+ add %g1, 8, %g1
+ add %o0, 16, %o0
+ ldx [%sp+2239], %o1
+ fxtod %f0, %f10
+ fxtod %f4, %f14
+ ldx [%sp+2231], %i0
+ ldx [%sp+2223], %g5
+ ldx [%sp+2247], %g3
+ and %o2, %o7, %o2
+ fxtod %f2, %f8
+ fmuld %f10, %f10, %f0
+ stx %o2, [%sp+2263]
+ fmuld %f10, %f14, %f6
+ ldx [%g1-8], %o2
+ fmuld %f10, %f8, %f12
+ fdtox %f0, %f2
+ ldd [%sp+2279], %f0
+ fmuld %f8, %f8, %f4
+ faddd %f6, %f6, %f6
+ fmuld %f14, %f14, %f10
+ std %f2, [%sp+2255]
+ sllx %g4, 20, %g4
+ ldd [%sp+2271], %f2
+ fmuld %f8, %f14, %f8
+ sllx %i0, 22, %i1
+ fdtox %f12, %f12
+ std %f12, [%sp+2247]
+ sllx %g5, 42, %i0
+ add %o1, %i1, %o1
+ faddd %f4, %f6, %f6
+ ldd [%sp+2263], %f4
+ add %o1, %i0, %o1
+ add %g3, %g4, %g3
+ fdtox %f10, %f10
+ std %f10, [%sp+2239]
+ srlx %o1, 42, %g4
+ and %g5, %o4, %i0
+ fdtox %f8, %f8
+ std %f8, [%sp+2231]
+ srlx %g5, 22, %g5
+ sub %g4, %i0, %g4
+ fdtox %f6, %f6
+ std %f6, [%sp+2223]
+ srlx %g4, 63, %g4
+ add %g3, %g5, %g3
+ add %g3, %g4, %g3
+ stx %o1, [%o0-16]
+ srlx %o2, 42, %o1
+ bl,pt %xcc, .Loop
+ stx %g3, [%o0-8]
+
+ stx %o1, [%sp+2279]
+ srlx %o2, 21, %o1
+ fxtod %f0, %f16
+ ldx [%sp+2223], %g3
+ fxtod %f4, %f6
+ and %o2, %o7, %o3
+ stx %o3, [%sp+2263]
+ fxtod %f2, %f4
+ and %o1, %o7, %o1
+ ldx [%sp+2231], %o2
+ sllx %g3, 42, %g4
+ fmuld %f16, %f16, %f14
+ stx %o1, [%sp+2271]
+ fmuld %f16, %f6, %f8
+ add %o0, 48, %o0
+ ldx [%sp+2239], %o1
+ sllx %o2, 22, %o2
+ fmuld %f4, %f4, %f10
+ ldx [%sp+2255], %o3
+ fdtox %f14, %f14
+ fmuld %f4, %f6, %f2
+ std %f14, [%sp+2255]
+ faddd %f8, %f8, %f12
+ add %o1, %o2, %o2
+ fmuld %f16, %f4, %f4
+ ldd [%sp+2279], %f0
+ sllx %o3, 20, %g5
+ add %o2, %g4, %o2
+ fmuld %f6, %f6, %f6
+ srlx %o2, 42, %o3
+ and %g3, %o4, %g4
+ srlx %g3, 22, %g3
+ faddd %f10, %f12, %f16
+ ldd [%sp+2271], %f12
+ ldd [%sp+2263], %f8
+ fxtod %f0, %f0
+ sub %o3, %g4, %o3
+ ldx [%sp+2247], %o1
+ srlx %o3, 63, %o3
+ fdtox %f2, %f10
+ fxtod %f8, %f8
+ std %f10, [%sp+2231]
+ fdtox %f6, %f6
+ std %f6, [%sp+2239]
+ add %o1, %g5, %o1
+ fmuld %f0, %f0, %f2
+ fdtox %f16, %f16
+ std %f16, [%sp+2223]
+ add %o1, %g3, %o1
+ fdtox %f4, %f4
+ std %f4, [%sp+2247]
+ fmuld %f0, %f8, %f10
+ fxtod %f12, %f12
+ add %o1, %o3, %o1
+ stx %o2, [%o0-48]
+ fmuld %f8, %f8, %f6
+ stx %o1, [%o0-40]
+ fdtox %f2, %f2
+ ldx [%sp+2231], %o2
+ faddd %f10, %f10, %f10
+ ldx [%sp+2223], %g3
+ fmuld %f12, %f12, %f4
+ fdtox %f6, %f6
+ ldx [%sp+2239], %o1
+ sllx %o2, 22, %o2
+ fmuld %f12, %f8, %f8
+ sllx %g3, 42, %g5
+ ldx [%sp+2255], %o3
+ fmuld %f0, %f12, %f0
+ add %o1, %o2, %o2
+ faddd %f4, %f10, %f4
+ ldx [%sp+2247], %o1
+ add %o2, %g5, %o2
+ and %g3, %o4, %g4
+ fdtox %f8, %f8
+ sllx %o3, 20, %g5
+ std %f8, [%sp+2231]
+ fdtox %f0, %f0
+ srlx %o2, 42, %o3
+ add %o1, %g5, %o1
+ fdtox %f4, %f4
+ srlx %g3, 22, %g3
+ sub %o3, %g4, %o3
+ std %f6, [%sp+2239]
+ std %f4, [%sp+2223]
+ srlx %o3, 63, %o3
+ add %o1, %g3, %o1
+ std %f2, [%sp+2255]
+ add %o1, %o3, %o1
+ std %f0, [%sp+2247]
+ stx %o2, [%o0-32]
+ stx %o1, [%o0-24]
+ ldx [%sp+2231], %o2
+ ldx [%sp+2223], %o3
+ ldx [%sp+2239], %o1
+ sllx %o2, 22, %o2
+ sllx %o3, 42, %g5
+ ldx [%sp+2255], %g4
+ and %o3, %o4, %g3
+ add %o1, %o2, %o2
+ ldx [%sp+2247], %o1
+ add %o2, %g5, %o2
+ stx %o2, [%o0-16]
+ sllx %g4, 20, %g4
+ srlx %o2, 42, %o2
+ add %o1, %g4, %o1
+ srlx %o3, 22, %o3
+ sub %o2, %g3, %o2
+ srlx %o2, 63, %o2
+ add %o1, %o3, %o1
+ add %o1, %o2, %o1
+ stx %o1, [%o0-8]
+ ret
+ restore %g0, %g0, %g0
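+
+C Special loop for fewer than 4 limbs (see the comment at the top of
+C the file): each iteration splits one limb, squares the pieces via the
+C FPU, and recombines them into two result limbs.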
+.Lsmall:
+ ldx [%g1], %o2
+.Loop0:
+ and %o2, %o7, %o1
+ stx %o1, [%sp+2263]
+ add %g2, 1, %g2
+ srlx %o2, 21, %o1
+ add %g1, 8, %g1
+ srlx %o2, 42, %o2
+ stx %o2, [%sp+2279]
+ and %o1, %o7, %o1
+ ldd [%sp+2263], %f0
+ cmp %g2, %i2
+ stx %o1, [%sp+2271]
+ fxtod %f0, %f6
+ ldd [%sp+2279], %f0
+ ldd [%sp+2271], %f4
+ fxtod %f0, %f2
+ fmuld %f6, %f6, %f0
+ fxtod %f4, %f10
+ fmuld %f2, %f6, %f4
+ fdtox %f0, %f0
+ std %f0, [%sp+2239]
+ fmuld %f10, %f6, %f8
+ fmuld %f10, %f10, %f0
+ faddd %f4, %f4, %f6
+ fmuld %f2, %f2, %f4
+ fdtox %f8, %f8
+ std %f8, [%sp+2231]
+ fmuld %f2, %f10, %f2
+ faddd %f0, %f6, %f0
+ fdtox %f4, %f4
+ std %f4, [%sp+2255]
+ fdtox %f2, %f2
+ std %f2, [%sp+2247]
+ fdtox %f0, %f0
+ std %f0, [%sp+2223]
+ ldx [%sp+2239], %o1
+ ldx [%sp+2255], %g4
+ ldx [%sp+2231], %o2
+ sllx %g4, 20, %g4
+ ldx [%sp+2223], %o3
+ sllx %o2, 22, %o2
+ sllx %o3, 42, %g5
+ add %o1, %o2, %o2
+ ldx [%sp+2247], %o1
+ add %o2, %g5, %o2
+ stx %o2, [%o0]
+ and %o3, %o4, %g3
+ srlx %o2, 42, %o2
+ add %o1, %g4, %o1
+ srlx %o3, 22, %o3
+ sub %o2, %g3, %o2
+ srlx %o2, 63, %o2
+ add %o1, %o3, %o1
+ add %o1, %o2, %o1
+ stx %o1, [%o0+8]
+ add %o0, 16, %o0
+ bl,a,pt %xcc, .Loop0
+ ldx [%g1], %o2
+ ret
+ restore %g0, %g0, %g0
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm
new file mode 100644
index 0000000..9fb7f70
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm
@@ -0,0 +1,241 @@
+dnl SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 4
+C UltraSPARC 3: 4.5
+
+C Compute carry-out from the most significant bits of u, v, and r, where
+C r = u - v - carry_in, using logic operations.
+
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4-insn
+C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
+C Therefore, it seems futile to try to optimize this any further.
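+C
+C A minimal C sketch of the per-limb step (64-bit limbs assumed; the
+C unrolled loop below computes an equivalent formula four limbs at a
+C time):
+C
+C	r = u - v - cy;                          /* candidate difference */
+C	cy = ((~u & v) | ((~u | v) & r)) >> 63;  /* borrow from the MSBs */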
+
+C INPUT PARAMETERS
+define(`rp',`%i0')
+define(`up',`%i1')
+define(`vp',`%i2')
+define(`n',`%i3')
+
+define(`u0',`%l0')
+define(`u1',`%l2')
+define(`u2',`%l4')
+define(`u3',`%l6')
+define(`v0',`%l1')
+define(`v1',`%l3')
+define(`v2',`%l5')
+define(`v3',`%l7')
+
+define(`cy',`%i4')
+
+define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sub_nc)
+ save %sp,-160,%sp
+
+ fitod %f0,%f0 C make sure f0 contains small, quiet number
+ subcc n,4,%g0
+ bl,pn %xcc,.Loop0
+ nop
+ b,a L(com)
+EPILOGUE()
+
+PROLOGUE(mpn_sub_n)
+ save %sp,-160,%sp
+
+ fitod %f0,%f0 C make sure f0 contains small, quiet number
+ subcc n,4,%g0
+ bl,pn %xcc,.Loop0
+ mov 0,cy
+L(com):
+ ldx [up+0],u0
+ ldx [vp+0],v0
+ add up,32,up
+ ldx [up-24],u1
+ ldx [vp+8],v1
+ add vp,32,vp
+ ldx [up-16],u2
+ ldx [vp-16],v2
+ ldx [up-8],u3
+ ldx [vp-8],v3
+ subcc n,8,n
+ sub u0,v0,%g1 C main sub
+ sub %g1,cy,%g5 C carry sub
+ orn u0,v0,%g2
+ bl,pn %xcc,.Lend4567
+ fanop
+ b,a .Loop
+
+ .align 16
+C START MAIN LOOP
+.Loop: orn %g5,%g2,%g2
+ andn u0,v0,%g3
+ ldx [up+0],u0
+ fanop
+C --
+ andn %g2,%g3,%g2
+ ldx [vp+0],v0
+ add up,32,up
+ fanop
+C --
+ srlx %g2,63,cy
+ sub u1,v1,%g1
+ stx %g5,[rp+0]
+ fanop
+C --
+ sub %g1,cy,%g5
+ orn u1,v1,%g2
+ fmnop
+ fanop
+C --
+ orn %g5,%g2,%g2
+ andn u1,v1,%g3
+ ldx [up-24],u1
+ fanop
+C --
+ andn %g2,%g3,%g2
+ ldx [vp+8],v1
+ add vp,32,vp
+ fanop
+C --
+ srlx %g2,63,cy
+ sub u2,v2,%g1
+ stx %g5,[rp+8]
+ fanop
+C --
+ sub %g1,cy,%g5
+ orn u2,v2,%g2
+ fmnop
+ fanop
+C --
+ orn %g5,%g2,%g2
+ andn u2,v2,%g3
+ ldx [up-16],u2
+ fanop
+C --
+ andn %g2,%g3,%g2
+ ldx [vp-16],v2
+ add rp,32,rp
+ fanop
+C --
+ srlx %g2,63,cy
+ sub u3,v3,%g1
+ stx %g5,[rp-16]
+ fanop
+C --
+ sub %g1,cy,%g5
+ orn u3,v3,%g2
+ fmnop
+ fanop
+C --
+ orn %g5,%g2,%g2
+ andn u3,v3,%g3
+ ldx [up-8],u3
+ fanop
+C --
+ andn %g2,%g3,%g2
+ subcc n,4,n
+ ldx [vp-8],v3
+ fanop
+C --
+ srlx %g2,63,cy
+ sub u0,v0,%g1
+ stx %g5,[rp-8]
+ fanop
+C --
+ sub %g1,cy,%g5
+ orn u0,v0,%g2
+ bge,pt %xcc,.Loop
+ fanop
+C END MAIN LOOP
+.Lend4567:
+ orn %g5,%g2,%g2
+ andn u0,v0,%g3
+ andn %g2,%g3,%g2
+ srlx %g2,63,cy
+ sub u1,v1,%g1
+ stx %g5,[rp+0]
+ sub %g1,cy,%g5
+ orn u1,v1,%g2
+ orn %g5,%g2,%g2
+ andn u1,v1,%g3
+ andn %g2,%g3,%g2
+ srlx %g2,63,cy
+ sub u2,v2,%g1
+ stx %g5,[rp+8]
+ sub %g1,cy,%g5
+ orn u2,v2,%g2
+ orn %g5,%g2,%g2
+ andn u2,v2,%g3
+ andn %g2,%g3,%g2
+ add rp,32,rp
+ srlx %g2,63,cy
+ sub u3,v3,%g1
+ stx %g5,[rp-16]
+ sub %g1,cy,%g5
+ orn u3,v3,%g2
+ orn %g5,%g2,%g2
+ andn u3,v3,%g3
+ andn %g2,%g3,%g2
+ srlx %g2,63,cy
+ stx %g5,[rp-8]
+
+ addcc n,4,n
+ bz,pn %xcc,.Lret
+ fanop
+
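+C Tail loop: one limb per iteration, used both when n < 4 on entry and
+C for the limbs left over by the 4-way main loop.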
+.Loop0: ldx [up],u0
+ add up,8,up
+ ldx [vp],v0
+ add vp,8,vp
+ add rp,8,rp
+ subcc n,1,n
+ sub u0,v0,%g1
+ orn u0,v0,%g2
+ sub %g1,cy,%g5
+ andn u0,v0,%g3
+ orn %g5,%g2,%g2
+ stx %g5,[rp-8]
+ andn %g2,%g3,%g2
+ bnz,pt %xcc,.Loop0
+ srlx %g2,63,cy
+
+.Lret: mov cy,%i0
+ ret
+ restore
+EPILOGUE(mpn_sub_n)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm
new file mode 100644
index 0000000..0bdb566
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm
@@ -0,0 +1,68 @@
+dnl SPARC v9 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 18
+C UltraSPARC 3: 23
+
+C INPUT PARAMETERS
+C rp i0
+C up i1
+C n i2
+C v i3
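+
+C This routine is a thin wrapper: v * {up,n} is first written to a stack
+C temporary with mpn_mul_1, then subtracted from {rp,n} with mpn_sub_n,
+C and the two carry words are summed into the return value.  A C-level
+C sketch of the same composition (tp stands for the stack temporary;
+C its allocation is elided):
+C
+C	mp_limb_t hi = mpn_mul_1 (tp, up, n, v);
+C	return hi + mpn_sub_n (rp, rp, tp, n);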
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+
+PROLOGUE(mpn_submul_1)
+ save %sp,-176,%sp
+
+ sllx %i2, 3, %g2
+ or %g0, %i1, %o1
+ add %g2, 15, %o0
+ or %g0, %i2, %o2
+ and %o0, -16, %o0
+ sub %sp, %o0, %sp
+ add %sp, 2223, %o0
+ or %g0, %o0, %l0
+ call mpn_mul_1
+ or %g0, %i3, %o3
+ or %g0, %o0, %l1 C preserve carry value from mpn_mul_1
+ or %g0, %i0, %o0
+ or %g0, %i0, %o1
+ or %g0, %l0, %o2
+ call mpn_sub_n
+ or %g0, %i2, %o3
+ ret
+ restore %l1, %o0, %o0 C sum carry values
+EPILOGUE(mpn_submul_1)