From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/sparc32/README | 71 ++++ gmp-6.3.0/mpn/sparc32/add_n.asm | 245 +++++++++++ gmp-6.3.0/mpn/sparc32/addmul_1.asm | 155 +++++++ gmp-6.3.0/mpn/sparc32/gmp-mparam.h | 67 +++ gmp-6.3.0/mpn/sparc32/lshift.asm | 105 +++++ gmp-6.3.0/mpn/sparc32/mul_1.asm | 146 +++++++ gmp-6.3.0/mpn/sparc32/rshift.asm | 102 +++++ gmp-6.3.0/mpn/sparc32/sparc-defs.m4 | 97 +++++ gmp-6.3.0/mpn/sparc32/sub_n.asm | 335 +++++++++++++++ gmp-6.3.0/mpn/sparc32/submul_1.asm | 155 +++++++ gmp-6.3.0/mpn/sparc32/udiv.asm | 147 +++++++ gmp-6.3.0/mpn/sparc32/udiv_nfp.asm | 202 +++++++++ gmp-6.3.0/mpn/sparc32/ultrasparct1/add_n.asm | 70 ++++ gmp-6.3.0/mpn/sparc32/ultrasparct1/addmul_1.asm | 90 ++++ gmp-6.3.0/mpn/sparc32/ultrasparct1/gmp-mparam.h | 153 +++++++ gmp-6.3.0/mpn/sparc32/ultrasparct1/mul_1.asm | 83 ++++ .../mpn/sparc32/ultrasparct1/sqr_diagonal.asm | 55 +++ gmp-6.3.0/mpn/sparc32/ultrasparct1/sub_n.asm | 70 ++++ gmp-6.3.0/mpn/sparc32/ultrasparct1/submul_1.asm | 91 ++++ gmp-6.3.0/mpn/sparc32/umul.asm | 77 ++++ gmp-6.3.0/mpn/sparc32/v8/addmul_1.asm | 109 +++++ gmp-6.3.0/mpn/sparc32/v8/gmp-mparam.h | 73 ++++ gmp-6.3.0/mpn/sparc32/v8/mul_1.asm | 93 +++++ gmp-6.3.0/mpn/sparc32/v8/submul_1.asm | 67 +++ gmp-6.3.0/mpn/sparc32/v8/supersparc/gmp-mparam.h | 73 ++++ gmp-6.3.0/mpn/sparc32/v8/supersparc/udiv.asm | 131 ++++++ gmp-6.3.0/mpn/sparc32/v8/udiv.asm | 131 ++++++ gmp-6.3.0/mpn/sparc32/v8/umul.asm | 40 ++ gmp-6.3.0/mpn/sparc32/v9/README | 4 + gmp-6.3.0/mpn/sparc32/v9/add_n.asm | 129 ++++++ gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm | 306 ++++++++++++++ gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h | 204 +++++++++ gmp-6.3.0/mpn/sparc32/v9/mul_1.asm | 287 +++++++++++++ gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm | 462 +++++++++++++++++++++ gmp-6.3.0/mpn/sparc32/v9/sub_n.asm | 129 ++++++ gmp-6.3.0/mpn/sparc32/v9/submul_1.asm | 316 ++++++++++++++ 
gmp-6.3.0/mpn/sparc32/v9/udiv.asm | 52 +++ 37 files changed, 5122 insertions(+) create mode 100644 gmp-6.3.0/mpn/sparc32/README create mode 100644 gmp-6.3.0/mpn/sparc32/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc32/lshift.asm create mode 100644 gmp-6.3.0/mpn/sparc32/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/rshift.asm create mode 100644 gmp-6.3.0/mpn/sparc32/sparc-defs.m4 create mode 100644 gmp-6.3.0/mpn/sparc32/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/udiv.asm create mode 100644 gmp-6.3.0/mpn/sparc32/udiv_nfp.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/umul.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc32/v8/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/supersparc/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc32/v8/supersparc/udiv.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/udiv.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/umul.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/README create mode 100644 gmp-6.3.0/mpn/sparc32/v9/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc32/v9/mul_1.asm create mode 100644 
gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/udiv.asm (limited to 'gmp-6.3.0/mpn/sparc32') diff --git a/gmp-6.3.0/mpn/sparc32/README b/gmp-6.3.0/mpn/sparc32/README new file mode 100644 index 0000000..f2dd116 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/README @@ -0,0 +1,71 @@ +Copyright 1996, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + +This directory contains mpn functions for various SPARC chips. Code that +runs only on version 8 SPARC implementations, is in the v8 subdirectory. + +RELEVANT OPTIMIZATION ISSUES + + Load and Store timing + +On most early SPARC implementations, the ST instructions takes multiple +cycles, while a STD takes just a single cycle more than an ST. For the CPUs +in SPARCstation I and II, the times are 3 and 4 cycles, respectively. +Therefore, combining two ST instructions into a STD when possible is a +significant optimization. 
+ +Later SPARC implementations have single cycle ST. + +For SuperSPARC, we can perform just one memory instruction per cycle, even +if up to two integer instructions can be executed in its pipeline. For +programs that perform so many memory operations that there are not enough +non-memory operations to issue in parallel with all memory operations, using +LDD and STD when possible helps. + +UltraSPARC-1/2 has very slow integer multiplication. In the v9 subdirectory, +we therefore use floating-point multiplication. + +STATUS + +1. On a SuperSPARC, mpn_lshift and mpn_rshift run at 3 cycles/limb, or 2.5 + cycles/limb asymptotically. We could optimize speed for special counts + by using ADDXCC. + +2. On a SuperSPARC, mpn_add_n and mpn_sub_n runs at 2.5 cycles/limb, or 2 + cycles/limb asymptotically. + +3. mpn_mul_1 runs at what is believed to be optimal speed. + +4. On SuperSPARC, mpn_addmul_1 and mpn_submul_1 could both be improved by a + cycle by avoiding one of the add instructions. See a29k/addmul_1. + +The speed of the code for other SPARC implementations is uncertain. diff --git a/gmp-6.3.0/mpn/sparc32/add_n.asm b/gmp-6.3.0/mpn/sparc32/add_n.asm new file mode 100644 index 0000000..8549195 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/add_n.asm @@ -0,0 +1,245 @@ +dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(res_ptr,%o0) +define(s1_ptr,%o1) +define(s2_ptr,%o2) +define(n,%o3) + +ASM_START() +PROLOGUE(mpn_add_n) + xor s2_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(1) C branch if alignment differs + nop +C ** V1a ** +L(0): andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + addcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... 
branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s1_ptr+0],%g4 + addcc n,-10,n + ld [s1_ptr+4],%g1 + ldd [s2_ptr+0],%g2 + blt L(fin1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1): + addxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addxcc %g4,%g2,%o4 + ld [s1_ptr+16],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+20],%g1 + ldd [s2_ptr+16],%g2 + std %o4,[res_ptr+8] + addxcc %g4,%g2,%o4 + ld [s1_ptr+24],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+28],%g1 + ldd [s2_ptr+24],%g2 + std %o4,[res_ptr+16] + addxcc %g4,%g2,%o4 + ld [s1_ptr+32],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+36],%g1 + ldd [s2_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1) + subcc %g0,%o4,%g0 C restore cy + +L(fin1): + addcc n,8-2,n + blt L(end1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1): + addxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1) + subcc %g0,%o4,%g0 C restore cy +L(end1): + addxcc %g4,%g2,%o4 + addxcc %g1,%g3,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s1_ptr+8],%g4 + ld [s2_ptr+8],%g2 + addxcc %g4,%g2,%o4 + st %o4,[res_ptr+8] + +L(ret1): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. 
limb + +L(1): xor s1_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(2) + nop +C ** V1b ** + mov s2_ptr,%g1 + mov s1_ptr,s2_ptr + b L(0) + mov %g1,s1_ptr + +C ** V2 ** +C If we come here, the alignment of s1_ptr and res_ptr as well as the +C alignment of s2_ptr and res_ptr differ. Since there are only two ways +C things can be aligned (that we care about) we now know that the alignment +C of s1_ptr and s2_ptr are the same. + +L(2): cmp n,1 + be L(jone) + nop + andcc s1_ptr,4,%g0 C s1_ptr unaligned? Side effect: cy=0 + be L(v2) C if no, branch + nop +C Add least significant limb separately to align s1_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + addcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr + +L(v2): addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + blt L(fin2) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + ldd [s1_ptr+8],%g2 + ldd [s2_ptr+8],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+8] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+12] + ldd [s1_ptr+16],%g2 + ldd [s2_ptr+16],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+16] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+20] + ldd [s1_ptr+24],%g2 + ldd [s2_ptr+24],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+24] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+28] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop2) + subcc %g0,%o4,%g0 C restore cy + +L(fin2): + addcc n,8-2,n + blt L(end2) + subcc %g0,%o4,%g0 C restore cy +L(loope2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope2) + subcc 
%g0,%o4,%g0 C restore cy +L(end2): + andcc n,1,%g0 + be L(ret2) + subcc %g0,%o4,%g0 C restore cy +C Add last limb +L(jone): + ld [s1_ptr],%g4 + ld [s2_ptr],%g2 + addxcc %g4,%g2,%o4 + st %o4,[res_ptr] + +L(ret2): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb +EPILOGUE(mpn_add_n) diff --git a/gmp-6.3.0/mpn/sparc32/addmul_1.asm b/gmp-6.3.0/mpn/sparc32/addmul_1.asm new file mode 100644 index 0000000..92d5d78 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/addmul_1.asm @@ -0,0 +1,155 @@ +dnl SPARC mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_addmul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + addcc %o5,%g1,%g1 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne L(loop0) + ld [%o4+%o2],%o5 + + addcc %o5,%g1,%g1 + addx %o0,%g0,%o0 + retl + st %g1,[%o4+%o2] + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + addcc %o5,%g3,%g3 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 + addcc %o2,4,%o2 + bne L(loop) + ld [%o4+%o2],%o5 + 
+ addcc %o5,%g3,%g3 + addx %o0,%g0,%o0 + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/sparc32/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/gmp-mparam.h new file mode 100644 index 0000000..a3bc612 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/gmp-mparam.h @@ -0,0 +1,67 @@ +/* SPARC v7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +/* Generated by tuneup.c, 2002-03-13, gcc 2.95, Weitek 8701 */ + +#define MUL_TOOM22_THRESHOLD 8 +#define MUL_TOOM33_THRESHOLD 466 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 16 +#define SQR_TOOM3_THRESHOLD 258 + +#define DIV_SB_PREINV_THRESHOLD 4 +#define DIV_DC_THRESHOLD 28 +#define POWM_THRESHOLD 28 + +#define GCD_ACCEL_THRESHOLD 3 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 120 +#define MODEXACT_1_ODD_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define GET_STR_DC_THRESHOLD 21 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_THRESHOLD 1012 + +#define MUL_FFT_TABLE { 272, 672, 1152, 3584, 10240, 24576, 0 } +#define MUL_FFT_MODF_THRESHOLD 264 +#define MUL_FFT_THRESHOLD 2304 + +#define SQR_FFT_TABLE { 304, 736, 1152, 3584, 10240, 24576, 0 } +#define SQR_FFT_MODF_THRESHOLD 248 +#define SQR_FFT_THRESHOLD 2304 diff --git a/gmp-6.3.0/mpn/sparc32/lshift.asm b/gmp-6.3.0/mpn/sparc32/lshift.asm new file mode 100644 index 0000000..8321343 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/lshift.asm @@ -0,0 +1,105 @@ +dnl SPARC mpn_lshift -- Shift a number left. + +dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr %o0 +C src_ptr %o1 +C size %o2 +C cnt %o3 + +ASM_START() +PROLOGUE(mpn_lshift) + sll %o2,2,%g1 + add %o1,%g1,%o1 C make %o1 point at end of src + ld [%o1-4],%g2 C load first limb + sub %g0,%o3,%o5 C negate shift count + add %o0,%g1,%o0 C make %o0 point at end of res + add %o2,-1,%o2 + andcc %o2,4-1,%g4 C number of limbs in first loop + srl %g2,%o5,%g1 C compute function result + be L(0) C if multiple of 4 limbs, skip first loop + st %g1,[%sp+80] + + sub %o2,%g4,%o2 C adjust count for main loop + +L(loop0): + ld [%o1-8],%g3 + add %o0,-4,%o0 + add %o1,-4,%o1 + addcc %g4,-1,%g4 + sll %g2,%o3,%o4 + srl %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + bne L(loop0) + st %o4,[%o0+0] + +L(0): tst %o2 + be L(end) + nop + +L(loop): + ld [%o1-8],%g3 + add %o0,-16,%o0 + addcc %o2,-4,%o2 + sll %g2,%o3,%o4 + srl %g3,%o5,%g1 + + ld [%o1-12],%g2 + sll %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0+12] + srl %g2,%o5,%g1 + + ld [%o1-16],%g3 + sll %g2,%o3,%o4 + or %g4,%g1,%g4 + st %g4,[%o0+8] + srl %g3,%o5,%g1 + + ld [%o1-20],%g2 + sll %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0+4] + srl %g2,%o5,%g1 + + add %o1,-16,%o1 + or %g4,%g1,%g4 + bne L(loop) + st %g4,[%o0+0] + +L(end): sll %g2,%o3,%g2 + st %g2,[%o0-4] + retl + ld [%sp+80],%o0 +EPILOGUE(mpn_lshift) diff --git a/gmp-6.3.0/mpn/sparc32/mul_1.asm b/gmp-6.3.0/mpn/sparc32/mul_1.asm new file mode 100644 index 0000000..42b4168 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/mul_1.asm @@ -0,0 +1,146 @@ +dnl SPARC mpn_mul_1 -- 
Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_mul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne,a L(loop0) + ld [%o1+%o2],%o5 + + retl + st %g1,[%o4+%o2] + + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 C g2 = S1_LIMB iff S2_LIMB < 0, else 0 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne,a L(loop) + ld [%o1+%o2],%o5 + + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_mul_1) diff 
--git a/gmp-6.3.0/mpn/sparc32/rshift.asm b/gmp-6.3.0/mpn/sparc32/rshift.asm new file mode 100644 index 0000000..e155476 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/rshift.asm @@ -0,0 +1,102 @@ +dnl SPARC mpn_rshift -- Shift a number right. + +dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr %o0 +C src_ptr %o1 +C size %o2 +C cnt %o3 + +ASM_START() +PROLOGUE(mpn_rshift) + ld [%o1],%g2 C load first limb + sub %g0,%o3,%o5 C negate shift count + add %o2,-1,%o2 + andcc %o2,4-1,%g4 C number of limbs in first loop + sll %g2,%o5,%g1 C compute function result + be L(0) C if multiple of 4 limbs, skip first loop + st %g1,[%sp+80] + + sub %o2,%g4,%o2 C adjust count for main loop + +L(loop0): + ld [%o1+4],%g3 + add %o0,4,%o0 + add %o1,4,%o1 + addcc %g4,-1,%g4 + srl %g2,%o3,%o4 + sll %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + bne L(loop0) + st %o4,[%o0-4] + +L(0): tst %o2 + be L(end) + nop + +L(loop): + ld [%o1+4],%g3 + add %o0,16,%o0 + addcc %o2,-4,%o2 + srl %g2,%o3,%o4 + sll %g3,%o5,%g1 + + ld [%o1+8],%g2 + srl %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0-16] + sll %g2,%o5,%g1 + + ld [%o1+12],%g3 + srl %g2,%o3,%o4 + or %g4,%g1,%g4 + st %g4,[%o0-12] + sll %g3,%o5,%g1 + + ld [%o1+16],%g2 + srl %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0-8] + sll %g2,%o5,%g1 + + add %o1,16,%o1 + or %g4,%g1,%g4 + bne L(loop) + st %g4,[%o0-4] + +L(end): srl %g2,%o3,%g2 + st %g2,[%o0-0] + retl + ld [%sp+80],%o0 +EPILOGUE(mpn_rshift) diff --git a/gmp-6.3.0/mpn/sparc32/sparc-defs.m4 b/gmp-6.3.0/mpn/sparc32/sparc-defs.m4 new file mode 100644 index 0000000..fff0ff8 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/sparc-defs.m4 @@ -0,0 +1,97 @@ +divert(-1) + +dnl m4 macros for SPARC assembler (32 and 64 bit). + + +dnl Copyright 2002, 2011, 2013, 2017, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +changecom(;) dnl cannot use default # since that's used in REGISTER decls + + +dnl Usage: REGISTER(reg,attr) +dnl +dnl Give a ".register reg,attr" directive, if the assembler supports it. +dnl HAVE_REGISTER comes from the GMP_ASM_SPARC_REGISTER configure test. + +define(REGISTER, +m4_assert_numargs(2) +m4_assert_defined(`HAVE_REGISTER') +`ifelse(HAVE_REGISTER,yes, +`.register `$1',`$2'')') + + +C Testing mechanism for running newer code on older processors +ifdef(`FAKE_T3',` + include_mpn(`sparc64/ultrasparct3/missing.m4') +',` + define(`addxccc', ``addxccc' $1, $2, $3') + define(`addxc', ``addxc' $1, $2, $3') + define(`umulxhi', ``umulxhi' $1, $2, $3') + define(`lzcnt', ``lzd' $1, $2') +') + +dnl Usage: LEA64(symbol,reg,pic_reg) +dnl +dnl Use whatever 64-bit code sequence is appropriate to load "symbol" into +dnl register "reg", potentially using register "pic_reg" to perform the +dnl calculations. +dnl +dnl Caveat: We used to use the setx pseudo insn here, but some GNU/Linux +dnl releases causes invalid code or relocs for that. +dnl +dnl Optimisation 1: Use thunk call instead of RDPC which causes pipeline +dnl replay for some sparcs. +dnl +dnl Optimisation 2: Do the two symbol building sequences in parallel instead +dnl of one after the other. 
That might need one more scratch register. + +define(LEA64, +m4_assert_numargs(3) +`ifdef(`PIC',` + rd %pc, %`$2' + sethi %hi(_GLOBAL_OFFSET_TABLE_+4), %`$3' + add %`$3', %lo(_GLOBAL_OFFSET_TABLE_+8), %`$3' + add %`$2', %`$3', %`$3' +ifelse(HAVE_GDOP,yes,` + sethi %gdop_hix22(`$1'), %`$2' + xor %`$2', %gdop_lox10(`$1'), %`$2' + ldx [%`$3' + %`$2'], %`$2', %gdop(`$1') +',` + sethi %hi(`$1'), %`$2' + or %`$2', %lo(`$1'), %`$2' + ldx [%`$3' + %`$2'], %`$2' +')',` + sethi %h44(`$1'), %`$2' + or %`$2', %m44(`$1'), %`$2' + sllx %`$2', 12, %`$2' + or %`$2', %l44(`$1'), %$2 +')') + +divert diff --git a/gmp-6.3.0/mpn/sparc32/sub_n.asm b/gmp-6.3.0/mpn/sparc32/sub_n.asm new file mode 100644 index 0000000..24a576d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/sub_n.asm @@ -0,0 +1,335 @@ +dnl SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(res_ptr,%o0) +define(s1_ptr,%o1) +define(s2_ptr,%o2) +define(n,%o3) + +ASM_START() +PROLOGUE(mpn_sub_n) + xor s2_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(1) C branch if alignment differs + nop +C ** V1a ** + andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + subcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s1_ptr+0],%g4 + addcc n,-10,n + ld [s1_ptr+4],%g1 + ldd [s2_ptr+0],%g2 + blt L(fin1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1): + subxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + subxcc %g4,%g2,%o4 + ld [s1_ptr+16],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+20],%g1 + ldd [s2_ptr+16],%g2 + std %o4,[res_ptr+8] + subxcc %g4,%g2,%o4 + ld [s1_ptr+24],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+28],%g1 + ldd [s2_ptr+24],%g2 + std %o4,[res_ptr+16] + subxcc %g4,%g2,%o4 + ld [s1_ptr+32],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+36],%g1 + ldd [s2_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1) + subcc %g0,%o4,%g0 C restore cy + +L(fin1): + addcc n,8-2,n + blt L(end1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1): + subxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add 
s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1) + subcc %g0,%o4,%g0 C restore cy +L(end1): + subxcc %g4,%g2,%o4 + subxcc %g1,%g3,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s1_ptr+8],%g4 + ld [s2_ptr+8],%g2 + subxcc %g4,%g2,%o4 + st %o4,[res_ptr+8] + +L(ret1): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +L(1): xor s1_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(2) + nop +C ** V1b ** + andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1b) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s1_ptr + ld [s2_ptr],%g4 + add s2_ptr,4,s2_ptr + ld [s1_ptr],%g2 + add s1_ptr,4,s1_ptr + add n,-1,n + subcc %g2,%g4,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1b): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s2_ptr+0],%g4 + addcc n,-10,n + ld [s2_ptr+4],%g1 + ldd [s1_ptr+0],%g2 + blt L(fin1b) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1b): + subxcc %g2,%g4,%o4 + ld [s2_ptr+8],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+12],%g1 + ldd [s1_ptr+8],%g2 + std %o4,[res_ptr+0] + subxcc %g2,%g4,%o4 + ld [s2_ptr+16],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+20],%g1 + ldd [s1_ptr+16],%g2 + std %o4,[res_ptr+8] + subxcc %g2,%g4,%o4 + ld [s2_ptr+24],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+28],%g1 + ldd [s1_ptr+24],%g2 + std %o4,[res_ptr+16] + subxcc %g2,%g4,%o4 + ld [s2_ptr+32],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+36],%g1 + ldd [s1_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1b) + subcc %g0,%o4,%g0 C restore cy + +L(fin1b): + addcc n,8-2,n + blt L(end1b) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs 
until less than 2 limbs remain +L(loope1b): + subxcc %g2,%g4,%o4 + ld [s2_ptr+8],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+12],%g1 + ldd [s1_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1b) + subcc %g0,%o4,%g0 C restore cy +L(end1b): + subxcc %g2,%g4,%o4 + subxcc %g3,%g1,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1b) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s2_ptr+8],%g4 + ld [s1_ptr+8],%g2 + subxcc %g2,%g4,%o4 + st %o4,[res_ptr+8] + +L(ret1b): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +C ** V2 ** +C If we come here, the alignment of s1_ptr and res_ptr as well as the +C alignment of s2_ptr and res_ptr differ. Since there are only two ways +C things can be aligned (that we care about) we now know that the alignment +C of s1_ptr and s2_ptr are the same. + +L(2): cmp n,1 + be L(jone) + nop + andcc s1_ptr,4,%g0 C s1_ptr unaligned? 
Side effect: cy=0 + be L(v2) C if no, branch + nop +C Add least significant limb separately to align s1_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + subcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr + +L(v2): addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + blt L(fin2) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + ldd [s1_ptr+8],%g2 + ldd [s2_ptr+8],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+8] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+12] + ldd [s1_ptr+16],%g2 + ldd [s2_ptr+16],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+16] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+20] + ldd [s1_ptr+24],%g2 + ldd [s2_ptr+24],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+24] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+28] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop2) + subcc %g0,%o4,%g0 C restore cy + +L(fin2): + addcc n,8-2,n + blt L(end2) + subcc %g0,%o4,%g0 C restore cy +L(loope2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope2) + subcc %g0,%o4,%g0 C restore cy +L(end2): + andcc n,1,%g0 + be L(ret2) + subcc %g0,%o4,%g0 C restore cy +C Add last limb +L(jone): + ld [s1_ptr],%g4 + ld [s2_ptr],%g2 + subxcc %g4,%g2,%o4 + st %o4,[res_ptr] + +L(ret2): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. 
limb +EPILOGUE(mpn_sub_n) diff --git a/gmp-6.3.0/mpn/sparc32/submul_1.asm b/gmp-6.3.0/mpn/sparc32/submul_1.asm new file mode 100644 index 0000000..73f9377 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/submul_1.asm @@ -0,0 +1,155 @@ +dnl SPARC mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_submul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + subcc %o5,%g1,%g1 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne L(loop0) + ld [%o4+%o2],%o5 + + subcc %o5,%g1,%g1 + addx %o0,%g0,%o0 + retl + st %g1,[%o4+%o2] + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + subcc %o5,%g3,%g3 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 + addcc %o2,4,%o2 + bne L(loop) + ld [%o4+%o2],%o5 + 
+ subcc %o5,%g3,%g3 + addx %o0,%g0,%o0 + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/sparc32/udiv.asm b/gmp-6.3.0/mpn/sparc32/udiv.asm new file mode 100644 index 0000000..cbc24b1 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/udiv.asm @@ -0,0 +1,147 @@ +dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h. +dnl This is for v7 CPUs with a floating-point unit. + +dnl Copyright 1993, 1994, 1996, 2000, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp + sethi %hi(0x80000000),%g1 + + sethi %hi(0x41e00000),%i4 + mov 0,%i5 + std %i4,[%fp-8] + ldd [%fp-8],%f12 C 0r2147483648 + faddd %f12,%f12,%f8 C 0r4294967296 + + mov %i0,%i5 + + sub %i1,%g1,%l0 + sub %i2,%g1,%l1 + std %l0,[%fp-8] + ldd [%fp-8],%f10 + + fitod %f10,%f4 + faddd %f4,%f12,%f4 + + fitod %f11,%f2 + faddd %f2,%f12,%f2 + + fmuld %f4,%f8,%f6 + + sub %i3,%g1,%l2 + st %l2,[%fp-8] + faddd %f6,%f2,%f2 + ld [%fp-8],%f10 + fitod %f10,%f4 + faddd %f4,%f12,%f4 + + fdivd %f2,%f4,%f2 + fcmped %f2,%f12 + nop + fbge,a L(1) + fsubd %f2,%f12,%f2 + fdtoi %f2,%f2 + st %f2,[%fp-8] + b L(2) + ld [%fp-8],%i4 +L(1): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + add %i4,%g1,%i4 +L(2): + wr %g0,%i4,%y + sra %i3,31,%g2 + and %i4,%g2,%g2 + andcc %g0,0,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,0,%g1 + add %g1,%g2,%i0 + rd %y,%g3 + subcc %i2,%g3,%o7 + subxcc %i1,%i0,%g0 + be L(3) + cmp %o7,%i3 + + add %i4,-1,%i0 + add %o7,%i3,%o7 + st %o7,[%i5] + ret + restore +L(3): + blu L(4) + mov %i4,%i0 + add %i4,1,%i0 + sub %o7,%i3,%o7 +L(4): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/sparc32/udiv_nfp.asm b/gmp-6.3.0/mpn/sparc32/udiv_nfp.asm new file mode 100644 
index 0000000..ebbb820 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/udiv_nfp.asm @@ -0,0 +1,202 @@ +dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h. +dnl This is for v7 CPUs without a floating-point unit. + +dnl Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr o0 +C n1 o1 +C n0 o2 +C d o3 + +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + tst %o3 + bneg L(largedivisor) + mov 8,%g1 + + b L(p1) + addxcc %o2,%o2,%o2 + +L(plop): + bcc L(n1) + addxcc %o2,%o2,%o2 +L(p1): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n2) + addxcc %o2,%o2,%o2 +L(p2): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n3) + addxcc %o2,%o2,%o2 +L(p3): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n4) + addxcc %o2,%o2,%o2 +L(p4): addx %o1,%o1,%o1 + addcc %g1,-1,%g1 + bne L(plop) + subcc %o1,%o3,%o4 + bcc L(n5) + addxcc %o2,%o2,%o2 +L(p5): st %o1,[%o0] + retl + xnor %g0,%o2,%o0 + +L(nlop): + bcc L(p1) + addxcc %o2,%o2,%o2 +L(n1): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p2) + addxcc %o2,%o2,%o2 +L(n2): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p3) + addxcc %o2,%o2,%o2 +L(n3): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p4) + addxcc %o2,%o2,%o2 +L(n4): addx %o4,%o4,%o4 + addcc %g1,-1,%g1 + bne L(nlop) + subcc %o4,%o3,%o1 + bcc L(p5) + addxcc %o2,%o2,%o2 +L(n5): st %o4,[%o0] + retl + xnor %g0,%o2,%o0 + +L(largedivisor): + and %o2,1,%o5 C %o5 = n0 & 1 + + srl %o2,1,%o2 + sll %o1,31,%g2 + or %g2,%o2,%o2 C %o2 = lo(n1n0 >> 1) + srl %o1,1,%o1 C %o1 = hi(n1n0 >> 1) + + and %o3,1,%g2 + srl %o3,1,%g3 C %g3 = floor(d / 2) + add %g3,%g2,%g3 C %g3 = ceil(d / 2) + + b L(Lp1) + addxcc %o2,%o2,%o2 + +L(Lplop): + bcc L(Ln1) + addxcc %o2,%o2,%o2 +L(Lp1): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln2) + addxcc %o2,%o2,%o2 +L(Lp2): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln3) + addxcc %o2,%o2,%o2 +L(Lp3): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln4) + addxcc %o2,%o2,%o2 +L(Lp4): addx %o1,%o1,%o1 + addcc %g1,-1,%g1 + bne L(Lplop) + subcc %o1,%g3,%o4 + bcc L(Ln5) + addxcc %o2,%o2,%o2 +L(Lp5): add %o1,%o1,%o1 C << 1 + tst %g2 + bne L(oddp) + add %o5,%o1,%o1 + st %o1,[%o0] + retl + xnor %g0,%o2,%o0 + +L(Lnlop): + bcc L(Lp1) + addxcc %o2,%o2,%o2 +L(Ln1): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp2) + 
addxcc %o2,%o2,%o2 +L(Ln2): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp3) + addxcc %o2,%o2,%o2 +L(Ln3): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp4) + addxcc %o2,%o2,%o2 +L(Ln4): addx %o4,%o4,%o4 + addcc %g1,-1,%g1 + bne L(Lnlop) + subcc %o4,%g3,%o1 + bcc L(Lp5) + addxcc %o2,%o2,%o2 +L(Ln5): add %o4,%o4,%o4 C << 1 + tst %g2 + bne L(oddn) + add %o5,%o4,%o4 + st %o4,[%o0] + retl + xnor %g0,%o2,%o0 + +L(oddp): + xnor %g0,%o2,%o2 + C q' in %o2. r' in %o1 + addcc %o1,%o2,%o1 + bcc L(Lp6) + addx %o2,0,%o2 + sub %o1,%o3,%o1 +L(Lp6): subcc %o1,%o3,%g0 + bcs L(Lp7) + subx %o2,-1,%o2 + sub %o1,%o3,%o1 +L(Lp7): st %o1,[%o0] + retl + mov %o2,%o0 + +L(oddn): + xnor %g0,%o2,%o2 + C q' in %o2. r' in %o4 + addcc %o4,%o2,%o4 + bcc L(Ln6) + addx %o2,0,%o2 + sub %o4,%o3,%o4 +L(Ln6): subcc %o4,%o3,%g0 + bcs L(Ln7) + subx %o2,-1,%o2 + sub %o4,%o3,%o4 +L(Ln7): st %o4,[%o0] + retl + mov %o2,%o0 +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/add_n.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/add_n.asm new file mode 100644 index 0000000..c781596 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/add_n.asm @@ -0,0 +1,70 @@ +dnl SPARC T1 32-bit mpn_add_n. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', %o0) +define(`ap', %o1) +define(`bp', %o2) +define(`n', %o3) +define(`cy', %o4) + +define(`i', %o3) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc) + +ASM_START() +PROLOGUE(mpn_add_nc) + b L(ent) + srl cy, 0, cy C strip any bogus high bits +EPILOGUE() + +PROLOGUE(mpn_add_n) + mov 0, cy +L(ent): srl n, 0, n C strip any bogus high bits + sll n, 2, n + add ap, n, ap + add bp, n, bp + add rp, n, rp + neg n, i + +L(top): lduw [ap+i], %g1 + lduw [bp+i], %g2 + add %g1, %g2, %g3 + add %g3, cy, %g3 + stw %g3, [rp+i] + add i, 4, i + brnz i, L(top) + srlx %g3, 32, cy + + retl + mov cy, %o0 C return value +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/addmul_1.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/addmul_1.asm new file mode 100644 index 0000000..89da186 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/addmul_1.asm @@ -0,0 +1,90 @@ +dnl SPARC T1 32-bit mpn_addmul_1. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 24 +C UltraSPARC T2: 19 +C UltraSPARC T3: 19 +C UltraSPARC T4: 5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() +PROLOGUE(mpn_addmul_1) + save %sp, -96, %sp + srl n, 0, %o4 + srl v0, 0, %g1 + subcc %o4, 1, %o4 + be L(final_one) + clr %o5 + +L(top): lduw [up+0], %l0 + lduw [rp+0], %l2 + lduw [up+4], %l1 + lduw [rp+4], %l3 + mulx %l0, %g1, %g3 + add up, 8, up + mulx %l1, %g1, %o3 + sub %o4, 2, %o4 + add rp, 8, rp + add %l2, %g3, %g3 + add %o5, %g3, %g3 + stw %g3, [rp-8] + srlx %g3, 32, %o5 + add %l3, %o3, %o3 + add %o5, %o3, %o3 + stw %o3, [rp-4] + brgz %o4, L(top) + srlx %o3, 32, %o5 + + brlz,pt %o4, L(done) + nop + +L(final_one): + lduw [up+0], %l0 + lduw [rp+0], %l2 + mulx %l0, %g1, %g3 + add %l2, %g3, %g3 + add %o5, %g3, %g3 + stw %g3, [rp+0] + srlx %g3, 32, %o5 + +L(done): + ret + restore %o5, 0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/ultrasparct1/gmp-mparam.h new file mode 100644 index 0000000..6f9d5a4 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/gmp-mparam.h @@ -0,0 +1,153 @@ +/* UltraSPARC T 32-bit gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 35 + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 98 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 226 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 139 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 98 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 120 + +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_TOOM2_THRESHOLD 34 +#define SQR_TOOM3_THRESHOLD 110 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 240 +#define SQR_TOOM8_THRESHOLD 333 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 280 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 280, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 13, 7}, { 7, 6}, \ + { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 21, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ + { 33, 8}, { 19, 7}, { 41, 8}, { 23, 7}, \ + { 49, 8}, { 27, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 79, 9}, { 159, 8}, { 319,10}, \ + { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 
287,10}, \ + { 159, 9}, { 319,10}, { 175,11}, { 95,10}, \ + { 191, 9}, { 383,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 66 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 240, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \ + { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \ + { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ + { 39, 8}, { 23, 7}, { 47, 8}, { 27, 9}, \ + { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 143,10}, { 79, 9}, { 159, 8}, \ + { 319, 9}, { 175,10}, { 95, 9}, { 191, 8}, \ + { 383, 9}, { 207,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287,10}, { 159, 9}, \ + { 319,10}, { 175,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 70 +#define SQR_FFT_THRESHOLD 2624 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 51 +#define MULLO_MUL_N_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 51 +#define DC_DIVAPPR_Q_THRESHOLD 202 +#define DC_BDIV_QR_THRESHOLD 47 +#define DC_BDIV_Q_THRESHOLD 124 + +#define INV_MULMOD_BNM1_THRESHOLD 26 +#define INV_NEWTON_THRESHOLD 266 +#define INV_APPR_THRESHOLD 222 + +#define BINV_NEWTON_THRESHOLD 296 +#define REDC_1_TO_REDC_N_THRESHOLD 59 + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1499 +#define MUPI_DIV_QR_THRESHOLD 116 +#define MU_BDIV_QR_THRESHOLD 1057 +#define MU_BDIV_Q_THRESHOLD 1334 + +#define POWM_SEC_TABLE 6,35,213,724,2618 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 84 +#define HGCD_APPR_THRESHOLD 101 +#define HGCD_REDUCE_THRESHOLD 1437 +#define GCD_DC_THRESHOLD 372 +#define GCDEXT_DC_THRESHOLD 253 
+#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 399 +#define SET_STR_PRECOMPUTE_THRESHOLD 885 + +#define FAC_DSC_THRESHOLD 179 +#define FAC_ODD_THRESHOLD 29 diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/mul_1.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/mul_1.asm new file mode 100644 index 0000000..0239cd2 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/mul_1.asm @@ -0,0 +1,83 @@ +dnl SPARC T1 32-bit mpn_mul_1. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 20 +C UltraSPARC T2: 18 +C UltraSPARC T3: 18 +C UltraSPARC T4: 4 + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`n', `%o2') +define(`v0', `%o3') + +ASM_START() +PROLOGUE(mpn_mul_1) + srl n, 0, n + srl v0, 0, v0 + subcc n, 1, n + be L(final_one) + clr %o5 + +L(top): lduw [up+0], %g1 + lduw [up+4], %g2 + mulx %g1, v0, %g3 + add up, 8, up + mulx %g2, v0, %o4 + sub n, 2, n + add rp, 8, rp + add %o5, %g3, %g3 + stw %g3, [rp-8] + srlx %g3, 32, %o5 + add %o5, %o4, %o4 + stw %o4, [rp-4] + brgz n, L(top) + srlx %o4, 32, %o5 + + brlz,pt n, L(done) + nop + +L(final_one): + lduw [up+0], %g1 + mulx %g1, v0, %g3 + add %o5, %g3, %g3 + stw %g3, [rp+0] + srlx %g3, 32, %o5 + +L(done): + retl + mov %o5, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/sqr_diagonal.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/sqr_diagonal.asm new file mode 100644 index 0000000..3b906ef --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/sqr_diagonal.asm @@ -0,0 +1,55 @@ +dnl SPARC T1 32-bit mpn_sqr_diagonal. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`n', `%o2') + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) + deccc n C n-- + nop + +L(top): lduw [up+0], %g1 + add up, 4, up C up++ + mulx %g1, %g1, %g3 + stw %g3, [rp+0] + srlx %g3, 32, %g4 + stw %g4, [rp+4] + add rp, 8, rp C rp += 2 + bnz %icc, L(top) + deccc n C n-- + + retl + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/sub_n.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/sub_n.asm new file mode 100644 index 0000000..946bc3f --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/sub_n.asm @@ -0,0 +1,70 @@ +dnl SPARC T1 32-bit mpn_sub_n. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', %o0) +define(`ap', %o1) +define(`bp', %o2) +define(`n', %o3) +define(`cy', %o4) + +define(`i', %o3) + +MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(mpn_sub_nc) + b L(ent) + srl cy, 0, cy C strip any bogus high bits +EPILOGUE() + +PROLOGUE(mpn_sub_n) + mov 0, cy +L(ent): srl n, 0, n C strip any bogus high bits + sll n, 2, n + add ap, n, ap + add bp, n, bp + add rp, n, rp + neg n, i + +L(top): lduw [ap+i], %g1 + lduw [bp+i], %g2 + sub %g1, %g2, %g3 + sub %g3, cy, %g3 + stw %g3, [rp+i] + add i, 4, i + brnz i, L(top) + srlx %g3, 63, cy + + retl + mov cy, %o0 C return value +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/submul_1.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/submul_1.asm new file mode 100644 index 0000000..8920070 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/submul_1.asm @@ -0,0 +1,91 @@ +dnl SPARC T1 32-bit mpn_submul_1. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 24 +C UltraSPARC T2: 19 +C UltraSPARC T3: 19 +C UltraSPARC T4: 5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() +PROLOGUE(mpn_submul_1) + save %sp, -96, %sp + srl n, 0, %o4 + srl v0, 0, %g1 + subcc %o4, 1, %o4 + be L(final_one) + subcc %g0, 0, %o5 + +L(top): lduw [up+0], %l0 + lduw [rp+0], %l2 + lduw [up+4], %l1 + lduw [rp+4], %l3 + mulx %l0, %g1, %g3 + add up, 8, up + mulx %l1, %g1, %o3 + sub %o4, 2, %o4 + add rp, 8, rp + addx %o5, %g3, %g3 + srlx %g3, 32, %o5 + subcc %l2, %g3, %g3 + stw %g3, [rp-8] + addx %o5, %o3, %o3 + srlx %o3, 32, %o5 + subcc %l3, %o3, %o3 + brgz %o4, L(top) + stw %o3, [rp-4] + + brlz,pt %o4, L(done) + nop + +L(final_one): + lduw [up+0], %l0 + lduw [rp+0], %l2 + mulx %l0, %g1, %g3 + addx %o5, %g3, %g3 + srlx %g3, 32, %o5 + subcc %l2, %g3, %g3 + stw %g3, [rp+0] + +L(done): + addx %o5, 0, %o5 + ret + restore %o5, 0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/umul.asm b/gmp-6.3.0/mpn/sparc32/umul.asm new file mode 100644 index 0000000..3a20b95 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/umul.asm @@ -0,0 +1,77 @@ +dnl SPARC mpn_umul_ppmm -- support for longlong.h for non-gcc. + +dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + wr %g0,%o1,%y + sra %o2,31,%g2 C Don't move this insn + and %o1,%g2,%g2 C Don't move this insn + andcc %g0,0,%g1 C Don't move this insn + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,0,%g1 + rd %y,%g3 + st %g3,[%o0] + retl + add %g1,%g2,%o0 +EPILOGUE(mpn_umul_ppmm) diff --git a/gmp-6.3.0/mpn/sparc32/v8/addmul_1.asm b/gmp-6.3.0/mpn/sparc32/v8/addmul_1.asm new file mode 100644 index 0000000..0bf1b24 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/addmul_1.asm @@ -0,0 +1,109 @@ +dnl SPARC v8 mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. 
+ +dnl Copyright 1992-1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_addmul_1) + ld [%o1+0],%o4 + andcc %o2,1,%g0 + be L(bx0) + andcc %o2,2,%g0 +L(bx1): be L(01) + orcc %g0,%g0,%g2 +L(b11): add %o0,-8,%o0 + b L(11) + add %o1,-8,%o1 +L(bx0): be L(b00) + orcc %g0,%g0,%g2 +L(b10): add %o0,-12,%o0 + b L(10) + add %o1,4,%o1 +L(b00): add %o0,-4,%o0 + b L(00) + add %o1,-4,%o1 + +L(top): addcc %g3,%g2,%g3 C 1 + ld [%o1+4],%o4 C 2 + rd %y,%g2 C 1 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] C 1 +L(00): umul %o4,%o3,%g3 C 2 + ld [%o0+4],%g1 C 2 + addxcc %g3,%g2,%g3 C 2 + ld [%o1+8],%o4 C 3 + rd %y,%g2 C 2 + addx %g0,%g2,%g2 + nop + addcc %g1,%g3,%g3 + st %g3,[%o0+4] C 2 +L(11): umul %o4,%o3,%g3 C 3 + addxcc %g3,%g2,%g3 C 3 + ld [%o1+12],%o4 C 4 + rd %y,%g2 C 3 + add %o1,16,%o1 + addx %g0,%g2,%g2 + ld [%o0+8],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+8] C 3 +L(10): umul %o4,%o3,%g3 C 4 + addxcc %g3,%g2,%g3 C 4 + ld [%o1+0],%o4 C 1 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 + ld [%o0+12],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+12] C 4 + add %o0,16,%o0 + addx %g0,%g2,%g2 +L(01): addcc %o2,-4,%o2 + bg L(top) + umul %o4,%o3,%g3 C 1 + + addcc %g3,%g2,%g3 C 4 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] C 4 + + retl + addx %g0,%g2,%o0 +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/sparc32/v8/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/v8/gmp-mparam.h new file mode 100644 index 0000000..e57897b --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/gmp-mparam.h @@ -0,0 +1,73 @@ +/* SPARC v8 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* Generated by tuneup.c, 2004-02-07, gcc 2.95 */ + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 65 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 65 + +#define DIV_SB_PREINV_THRESHOLD 5 +#define DIV_DC_THRESHOLD 24 +#define POWM_THRESHOLD 38 + +#define HGCD_THRESHOLD 69 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 498 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 6 +#define DIVREM_1_UNNORM_THRESHOLD 11 +#define MOD_1_NORM_THRESHOLD 5 +#define MOD_1_UNNORM_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 4 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 23 +#define SET_STR_THRESHOLD 1679 + +#define MUL_FFT_TABLE { 272, 672, 1152, 2560, 10240, 24576, 0 } +#define MUL_FFT_MODF_THRESHOLD 264 +#define MUL_FFT_THRESHOLD 1792 + +#define SQR_FFT_TABLE { 304, 
672, 1152, 3584, 10240, 24576, 0 } +#define SQR_FFT_MODF_THRESHOLD 264 +#define SQR_FFT_THRESHOLD 1728 diff --git a/gmp-6.3.0/mpn/sparc32/v8/mul_1.asm b/gmp-6.3.0/mpn/sparc32/v8/mul_1.asm new file mode 100644 index 0000000..d03a0e6 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/mul_1.asm @@ -0,0 +1,93 @@ +dnl SPARC v8 mpn_mul_1 -- Multiply a limb vector with a single limb and +dnl store the product in a second limb vector. + +dnl Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_mul_1) + ld [%o1+0],%o4 + andcc %o2,1,%g0 + be L(bx0) + andcc %o2,2,%g0 +L(bx1): be L(01) + orcc %g0,%g0,%g2 +L(b11): add %o0,-8,%o0 + b L(11) + add %o1,-8,%o1 +L(bx0): be L(b00) + orcc %g0,%g0,%g2 +L(b10): add %o0,-12,%o0 + b L(10) + add %o1,4,%o1 +L(b00): add %o0,-4,%o0 + b L(00) + add %o1,-4,%o1 + +L(top): addcc %g3,%g2,%g3 C 1 + ld [%o1+4],%o4 C 2 + st %g3,[%o0+0] C 1 + rd %y,%g2 C 1 +L(00): umul %o4,%o3,%g3 C 2 + addxcc %g3,%g2,%g3 C 2 + ld [%o1+8],%o4 C 3 + st %g3,[%o0+4] C 2 + rd %y,%g2 C 2 +L(11): umul %o4,%o3,%g3 C 3 + addxcc %g3,%g2,%g3 C 3 + ld [%o1+12],%o4 C 4 + add %o1,16,%o1 + st %g3,[%o0+8] C 3 + rd %y,%g2 C 3 +L(10): umul %o4,%o3,%g3 C 4 + addxcc %g3,%g2,%g3 C 4 + ld [%o1+0],%o4 C 1 + st %g3,[%o0+12] C 4 + add %o0,16,%o0 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 +L(01): addcc %o2,-4,%o2 + bg L(top) + umul %o4,%o3,%g3 C 1 + + addcc %g3,%g2,%g3 C 4 + st %g3,[%o0+0] C 4 + rd %y,%g2 C 4 + + retl + addx %g0,%g2,%o0 +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/sparc32/v8/submul_1.asm b/gmp-6.3.0/mpn/sparc32/v8/submul_1.asm new file mode 100644 index 0000000..187314e --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/submul_1.asm @@ -0,0 +1,67 @@ +dnl SPARC v8 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_submul_1) + sub %g0,%o2,%o2 C negate ... + sll %o2,2,%o2 C ... and scale size + sub %o1,%o2,%o1 C o1 is offset s1_ptr + sub %o0,%o2,%g1 C g1 is offset res_ptr + + mov 0,%o0 C clear cy_limb + +L(loop): + ld [%o1+%o2],%o4 + ld [%g1+%o2],%g2 + umul %o4,%o3,%o5 + rd %y,%g3 + addcc %o5,%o0,%o5 + addx %g3,0,%o0 + subcc %g2,%o5,%g2 + addx %o0,0,%o0 + st %g2,[%g1+%o2] + + addcc %o2,4,%o2 + bne L(loop) + nop + + retl + nop +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/sparc32/v8/supersparc/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/v8/supersparc/gmp-mparam.h new file mode 100644 index 0000000..1ac9239 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/supersparc/gmp-mparam.h @@ -0,0 +1,73 @@ +/* SuperSPARC gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* Generated by tuneup.c, 2004-02-10, gcc 3.3 */ + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 81 + +#define SQR_BASECASE_THRESHOLD 5 +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 86 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 26 +#define POWM_THRESHOLD 79 + +#define HGCD_THRESHOLD 97 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 470 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 19 +#define GET_STR_PRECOMPUTE_THRESHOLD 34 +#define SET_STR_THRESHOLD 3524 + +#define MUL_FFT_TABLE { 304, 800, 1408, 3584, 10240, 24576, 0 } +#define MUL_FFT_MODF_THRESHOLD 264 +#define MUL_FFT_THRESHOLD 2304 + +#define SQR_FFT_TABLE { 336, 800, 1408, 3584, 10240, 24576, 0 } +#define SQR_FFT_MODF_THRESHOLD 280 +#define SQR_FFT_THRESHOLD 2304 diff --git a/gmp-6.3.0/mpn/sparc32/v8/supersparc/udiv.asm b/gmp-6.3.0/mpn/sparc32/v8/supersparc/udiv.asm new file mode 
100644 index 0000000..12f66ce --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/supersparc/udiv.asm @@ -0,0 +1,131 @@ +dnl SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h. +dnl This is for SuperSPARC only, to compensate for its semi-functional +dnl udiv instruction. + +dnl Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() + +ifdef(`PIC', +` TEXT +L(getpc): + retl + nop') + + TEXT + ALIGN(8) +L(C0): .double 0r4294967296 +L(C1): .double 0r2147483648 + +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp + st %i1,[%fp-8] + ld [%fp-8],%f10 + +ifdef(`PIC', +`L(pc): call L(getpc) C put address of this insn in %o7 + ldd [%o7+L(C0)-L(pc)],%f8', +` sethi %hi(L(C0)),%o7 + ldd [%o7+%lo(L(C0))],%f8') + + fitod %f10,%f4 + cmp %i1,0 + bge L(248) + mov %i0,%i5 + faddd %f4,%f8,%f4 +L(248): + st %i2,[%fp-8] + ld [%fp-8],%f10 + fmuld %f4,%f8,%f6 + cmp %i2,0 + bge L(249) + fitod %f10,%f2 + faddd %f2,%f8,%f2 +L(249): + st %i3,[%fp-8] + faddd %f6,%f2,%f2 + ld [%fp-8],%f10 + cmp %i3,0 + bge L(250) + fitod %f10,%f4 + faddd %f4,%f8,%f4 +L(250): + fdivd %f2,%f4,%f2 + +ifdef(`PIC', +` ldd [%o7+L(C1)-L(pc)],%f4', +` sethi %hi(L(C1)),%o7 + ldd [%o7+%lo(L(C1))],%f4') + + fcmped %f2,%f4 + nop + fbge,a L(251) + fsubd %f2,%f4,%f2 + fdtoi %f2,%f2 + st %f2,[%fp-8] + b L(252) + ld [%fp-8],%i4 +L(251): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + sethi %hi(-2147483648),%g2 + xor %i4,%g2,%i4 +L(252): + umul %i3,%i4,%g3 + rd %y,%i0 + subcc %i2,%g3,%o7 + subxcc %i1,%i0,%g0 + be L(253) + cmp %o7,%i3 + + add %i4,-1,%i0 + add %o7,%i3,%o7 + st %o7,[%i5] + ret + restore +L(253): + blu L(246) + mov %i4,%i0 + add %i4,1,%i0 + sub %o7,%i3,%o7 +L(246): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/sparc32/v8/udiv.asm b/gmp-6.3.0/mpn/sparc32/v8/udiv.asm new file mode 100644 index 0000000..12f66ce --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/udiv.asm @@ -0,0 +1,131 @@ +dnl SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h. +dnl This is for SuperSPARC only, to compensate for its semi-functional +dnl udiv instruction. + +dnl Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() + +ifdef(`PIC', +` TEXT +L(getpc): + retl + nop') + + TEXT + ALIGN(8) +L(C0): .double 0r4294967296 +L(C1): .double 0r2147483648 + +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp + st %i1,[%fp-8] + ld [%fp-8],%f10 + +ifdef(`PIC', +`L(pc): call L(getpc) C put address of this insn in %o7 + ldd [%o7+L(C0)-L(pc)],%f8', +` sethi %hi(L(C0)),%o7 + ldd [%o7+%lo(L(C0))],%f8') + + fitod %f10,%f4 + cmp %i1,0 + bge L(248) + mov %i0,%i5 + faddd %f4,%f8,%f4 +L(248): + st %i2,[%fp-8] + ld [%fp-8],%f10 + fmuld %f4,%f8,%f6 + cmp %i2,0 + bge L(249) + fitod %f10,%f2 + faddd %f2,%f8,%f2 +L(249): + st %i3,[%fp-8] + faddd %f6,%f2,%f2 + ld [%fp-8],%f10 + cmp %i3,0 + bge L(250) + fitod %f10,%f4 + faddd %f4,%f8,%f4 +L(250): + fdivd %f2,%f4,%f2 + +ifdef(`PIC', +` ldd [%o7+L(C1)-L(pc)],%f4', +` sethi %hi(L(C1)),%o7 + ldd [%o7+%lo(L(C1))],%f4') + + fcmped %f2,%f4 + nop + fbge,a L(251) + fsubd %f2,%f4,%f2 + fdtoi %f2,%f2 + 
st %f2,[%fp-8] + b L(252) + ld [%fp-8],%i4 +L(251): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + sethi %hi(-2147483648),%g2 + xor %i4,%g2,%i4 +L(252): + umul %i3,%i4,%g3 + rd %y,%i0 + subcc %i2,%g3,%o7 + subxcc %i1,%i0,%g0 + be L(253) + cmp %o7,%i3 + + add %i4,-1,%i0 + add %o7,%i3,%o7 + st %o7,[%i5] + ret + restore +L(253): + blu L(246) + mov %i4,%i0 + add %i4,1,%i0 + sub %o7,%i3,%o7 +L(246): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/sparc32/v8/umul.asm b/gmp-6.3.0/mpn/sparc32/v8/umul.asm new file mode 100644 index 0000000..1a2e84b --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/umul.asm @@ -0,0 +1,40 @@ +dnl SPARC v8 mpn_umul_ppmm -- support for longlong.h for non-gcc. + +dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + umul %o1,%o2,%g2 + st %g2,[%o0] + retl + rd %y,%o0 +EPILOGUE(mpn_umul_ppmm) diff --git a/gmp-6.3.0/mpn/sparc32/v9/README b/gmp-6.3.0/mpn/sparc32/v9/README new file mode 100644 index 0000000..9b39713 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/README @@ -0,0 +1,4 @@ +Code for SPARC processors implementing version 9 of the SPARC architecture. +This code is for systems that doesn't preserve the full 64-bit contents of +integer register at context switch. For other systems (such as Solaris 7 or +later) use the code in ../../sparc64. diff --git a/gmp-6.3.0/mpn/sparc32/v9/add_n.asm b/gmp-6.3.0/mpn/sparc32/v9/add_n.asm new file mode 100644 index 0000000..7bd5974 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/add_n.asm @@ -0,0 +1,129 @@ +dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(rp,%o0) +define(s1p,%o1) +define(s2p,%o2) +define(n,%o3) +define(cy,%g1) + +C This code uses 64-bit operations on `o' and `g' registers. It doesn't +C require that `o' registers' upper 32 bits are preserved by the operating +C system, but if they are not, they must be zeroed. That is indeed what +C happens at least on Slowaris 2.5 and 2.6. + +C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at +C about 10 cycles/limb from the Ecache. + +ASM_START() +PROLOGUE(mpn_add_n) + lduw [s1p+0],%o4 + lduw [s2p+0],%o5 + addcc n,-2,n + bl,pn %icc,L(end1) + lduw [s1p+4],%g2 + lduw [s2p+4],%g3 + be,pn %icc,L(end2) + mov 0,cy + + .align 16 +L(loop): + add %o4,%o5,%g4 + add rp,8,rp + lduw [s1p+8],%o4 + fitod %f0,%f2 +C --- + add cy,%g4,%g4 + addcc n,-1,n + lduw [s2p+8],%o5 + fitod %f0,%f2 +C --- + srlx %g4,32,cy + add s2p,8,s2p + stw %g4,[rp-8] + be,pn %icc,L(exito)+4 +C --- + add %g2,%g3,%g4 + addcc n,-1,n + lduw [s1p+12],%g2 + fitod %f0,%f2 +C --- + add cy,%g4,%g4 + add s1p,8,s1p + lduw [s2p+4],%g3 + fitod %f0,%f2 +C --- + srlx %g4,32,cy + bne,pt %icc,L(loop) + stw %g4,[rp-4] +C --- +L(exite): + add %o4,%o5,%g4 + add cy,%g4,%g4 + srlx %g4,32,cy + stw %g4,[rp+0] + add %g2,%g3,%g4 + add cy,%g4,%g4 + stw %g4,[rp+4] + retl + srlx %g4,32,%o0 + +L(exito): + add %g2,%g3,%g4 + add cy,%g4,%g4 + srlx %g4,32,cy + stw %g4,[rp-4] + add %o4,%o5,%g4 + add cy,%g4,%g4 + stw %g4,[rp+0] + retl + srlx %g4,32,%o0 + +L(end1): + add %o4,%o5,%g4 + stw %g4,[rp+0] + retl + srlx %g4,32,%o0 + +L(end2): + add %o4,%o5,%g4 + srlx %g4,32,cy + stw %g4,[rp+0] + add %g2,%g3,%g4 + add cy,%g4,%g4 + stw %g4,[rp+4] + retl + srlx %g4,32,%o0 +EPILOGUE(mpn_add_n) diff --git a/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm new file mode 100644 index 0000000..2adf7a8 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm @@ -0,0 +1,306 @@ +dnl SPARC v9 
32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Algorithm: We use two floating-point multiplies per limb product, with the +C invariant v operand split into two 16-bit pieces, and the u operand split +C into 32-bit pieces. We convert the two 48-bit products and transfer them to +C the integer unit. + +C cycles/limb +C UltraSPARC 1&2: 6.5 +C UltraSPARC 3: ? + +C Possible optimizations: +C 1. Combine 32-bit memory operations into 64-bit operations. Since we're +C memory bandwidth limited, this could save 1.5 cycles/limb. +C 2. Unroll the inner loop. Since we already use alternate temporary areas, +C it is very straightforward to unroll, using an exit branch midways. +C Unrolling would allow deeper scheduling which could improve speed for L2 +C cache case. +C 3. 
For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es +C aren't sufficiently apart-scheduled with just two temp areas. +C 4. Specialize for particular v values. If its upper 16 bits are zero, we +C could save many operations. + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +define(`FSIZE',224) + +ASM_START() +PROLOGUE(mpn_addmul_1) + add %sp, -FSIZE, %sp + sethi %hi(0xffff), %g1 + srl %o3, 16, %g2 + or %g1, %lo(0xffff), %g1 + and %o3, %g1, %g1 + stx %g1, [%sp+104] + stx %g2, [%sp+112] + ldd [%sp+104], %f6 + ldd [%sp+112], %f8 + fxtod %f6, %f6 + fxtod %f8, %f8 + ld [%sp+104], %f10 C zero f10 + + mov 0, %g3 C cy = 0 + +define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe + + add %sp, 160, %o5 C point in scratch area + and %o5, -32, %o5 C align at 0 (mod 32) in scratch area + + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_two_or_more + fxtod %f10, %f2 + + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + std %f12, [%o5+24] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + lduw [%o0], %g5 C read rp[i] + b .L1 + add %o0, -16, %o0 + + .align 16 +.L_two_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_three_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + std %f12, [%o5+8] + lduw [%o0], %g5 C read rp[i] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + b .L2 + add %o0, -12, %o0 + + .align 16 +.L_three_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_four_or_more + fxtod %f10, %f2 + + fdtox %f16, 
%f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + std %f12, [%o5+24] + lduw [%o0], %g5 C read rp[i] + b .L3 + add %o0, -8, %o0 + + .align 16 +.L_four_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_five_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + lduw [%o0], %g5 C read rp[i] + b .L4 + add %o0, -4, %o0 + + .align 16 +.L_five_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + lduw [%o0], %g5 C read rp[i] + bne,pt %icc, .Loop + fxtod %f10, %f2 + b,a .L5 + +C BEGIN MAIN LOOP + .align 16 +C -- 0 +.Loop: nop + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 +C -- 1 + sllx %g2, 16, %g4 C (p16 << 16) + add %o0, 4, %o0 C rp++ + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 +C -- 2 + nop + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + fanop +C -- 3 + nop + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 +C -- 4 + nop + add %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 +C -- 5 + xor %o5, 16, %o5 C alternate scratch variables + add %o1, 4, %o1 C up++ + stw %g4, [%o0-4] + fanop +C -- 6 + srlx %g4, 32, %g3 C new cy + lduw [%o0], %g5 C read rp[i] + bne,pt %icc, .Loop + fxtod %f10, %f2 +C END MAIN LOOP + +.L5: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], 
%g2 C p16 + fdtox %f4, %f12 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g4, %g3, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + add %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + xor %o5, 16, %o5 + stw %g4, [%o0+0] + srlx %g4, 32, %g3 C new cy + lduw [%o0+4], %g5 C read rp[i] + +.L4: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + add %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + xor %o5, 16, %o5 + stw %g4, [%o0+4] + srlx %g4, 32, %g3 C new cy + lduw [%o0+8], %g5 C read rp[i] + +.L3: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + add %g5, %g4, %g4 C p += rp[i] + xor %o5, 16, %o5 + stw %g4, [%o0+8] + srlx %g4, 32, %g3 C new cy + lduw [%o0+12], %g5 C read rp[i] + +.L2: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + add %g5, %g4, %g4 C p += rp[i] + stw %g4, [%o0+12] + srlx %g4, 32, %g3 C new cy + lduw [%o0+16], %g5 C read rp[i] + +.L1: sllx %g2, 16, %g4 C (p16 << 16) + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + add %g3, %g4, %g4 C p += cy + add %g5, %g4, %g4 C p += rp[i] + stw %g4, [%o0+16] + srlx %g4, 32, %g3 C new cy + + mov %g3, %o0 + retl + sub %sp, -FSIZE, %sp +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h new file mode 100644 index 0000000..f909e2c --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h @@ -0,0 +1,204 @@ +/* SPARC v9 32-bit gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009-2011, 2014 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */ +/* FFT tuning limit = 25000000 */ +/* Generated by tuneup.c, 2014-03-16, gcc 3.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 13 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 12 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 32 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 43 +#define MUL_TOOM44_THRESHOLD 126 +#define MUL_TOOM6H_THRESHOLD 161 +#define 
MUL_TOOM8H_THRESHOLD 208 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 80 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 55 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 72 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 64 +#define SQR_TOOM3_THRESHOLD 85 +#define SQR_TOOM4_THRESHOLD 152 +#define SQR_TOOM6_THRESHOLD 185 +#define SQR_TOOM8_THRESHOLD 324 + +#define MULMID_TOOM42_THRESHOLD 64 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 288 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 288, 5}, { 9, 4}, { 19, 5}, { 11, 6}, \ + { 6, 5}, { 14, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 20, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 15, 6}, \ + { 31, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \ + { 7, 8}, { 15, 7}, { 31, 8}, { 19, 7}, \ + { 39, 8}, { 27, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 71, 8}, \ + { 143, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135, 8}, { 271, 9}, \ + { 143, 8}, { 287,10}, { 79, 9}, { 175,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \ + { 63,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 175,11}, { 95,10}, { 191, 9}, { 415, 8}, \ + { 831,12}, { 63,11}, { 127,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895, 8}, { 1791,12}, { 127,11}, \ + { 287,10}, { 607, 9}, { 1215, 8}, { 2431,11}, \ + { 319, 9}, { 1279,11}, { 351,12}, { 191,11}, \ + { 415,10}, { 831,11}, { 447,10}, { 895, 9}, \ + { 1791,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 703,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 895,10}, { 1791,11}, { 959,13}, { 255,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 703,13}, \ + { 383,12}, { 959,14}, { 
255,13}, { 511,12}, \ + { 1087,11}, { 2175,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1407,11}, { 2943,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1151,12}, { 2431,13}, \ + { 1407,14}, { 767,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 143 +#define MUL_FFT_THRESHOLD 2240 + +#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 244, 5}, { 8, 4}, { 17, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \ + { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ + { 39, 8}, { 23, 9}, { 15, 8}, { 39, 9}, \ + { 23,10}, { 15, 9}, { 31, 8}, { 63, 9}, \ + { 47,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ + { 71, 8}, { 143, 7}, { 287, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 143, 8}, { 287,10}, { 79, 9}, \ + { 159, 8}, { 319, 9}, { 175, 8}, { 351, 7}, \ + { 703,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207, 8}, { 415, 9}, { 223,11}, { 63,10}, \ + { 127, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575,10}, { 159, 9}, { 319,10}, { 175, 9}, \ + { 351, 8}, { 703,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415, 8}, { 831,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 351, 9}, { 703, 8}, \ + { 1407,11}, { 191,10}, { 415, 9}, { 831,11}, \ + { 223,10}, { 447, 9}, { 895,10}, { 479,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 415,10}, { 831,11}, \ + { 447,10}, { 895, 9}, { 1791,13}, { 127,12}, \ + { 255,11}, { 575,12}, { 319,11}, { 703,10}, \ + { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 959,10}, { 1919, 9}, { 3839,13}, { 255,12}, \ + { 575,11}, { 1151,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 
1407,13}, \ + { 767,12}, { 1599,13}, { 895,12}, { 1919,14}, \ + { 511,13}, { 1151,12}, { 2431,13}, { 1407,12}, \ + { 2815,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 153 +#define SQR_FFT_THRESHOLD 2112 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 144 +#define MULLO_MUL_N_THRESHOLD 4292 + +#define DC_DIV_QR_THRESHOLD 74 +#define DC_DIVAPPR_Q_THRESHOLD 406 +#define DC_BDIV_QR_THRESHOLD 63 +#define DC_BDIV_Q_THRESHOLD 363 + +#define INV_MULMOD_BNM1_THRESHOLD 108 +#define INV_NEWTON_THRESHOLD 351 +#define INV_APPR_THRESHOLD 303 + +#define BINV_NEWTON_THRESHOLD 354 +#define REDC_1_TO_REDC_N_THRESHOLD 61 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 1099 +#define MUPI_DIV_QR_THRESHOLD 118 +#define MU_BDIV_QR_THRESHOLD 807 +#define MU_BDIV_Q_THRESHOLD 979 + +#define POWM_SEC_TABLE 3,22,127,624,779,2351 + +#define MATRIX22_STRASSEN_THRESHOLD 7 +#define HGCD_THRESHOLD 90 +#define HGCD_APPR_THRESHOLD 123 +#define HGCD_REDUCE_THRESHOLD 1494 +#define GCD_DC_THRESHOLD 283 +#define GCDEXT_DC_THRESHOLD 192 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 290 +#define SET_STR_PRECOMPUTE_THRESHOLD 634 + +#define FAC_DSC_THRESHOLD 156 +#define FAC_ODD_THRESHOLD 25 diff --git a/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm new file mode 100644 index 0000000..40aeffa --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm @@ -0,0 +1,287 @@ +dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Algorithm: We use two floating-point multiplies per limb product, with the +C invariant v operand split into two 16-bit pieces, and the u operand split +C into 32-bit pieces. We convert the two 48-bit products and transfer them to +C the integer unit. + +C cycles/limb +C UltraSPARC 1&2: 6.5 +C UltraSPARC 3: ? + +C Possible optimizations: +C 1. Combine 32-bit memory operations into 64-bit operations. Since we're +C memory bandwidth limited, this could save 1.5 cycles/limb. +C 2. Unroll the inner loop. Since we already use alternate temporary areas, +C it is very straightforward to unroll, using an exit branch midways. +C Unrolling would allow deeper scheduling which could improve speed for L2 +C cache case. +C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es +C aren't sufficiently apart-scheduled with just two temp areas. +C 4. Specialize for particular v values. If its upper 16 bits are zero, we +C could save many operations. 
C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3

define(`FSIZE',224)

ASM_START()
C-----------------------------------------------------------------------------
C mp_limb_t mpn_mul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
C                      mp_limb_t v)
C Multiply {up,n} by the single limb v, store the low limbs of the products
C at {rp,n}, and return the carry-out limb in %o0.
C
C Identical structure to mpn_addmul_1 in this directory, minus the rp[i]
C read/add: v is split into 16-bit halves (%f6 low, %f8 high), each up[i]
C is converted to double (%f2), two fmuld's produce 48-bit partial products
C that are fdtox'ed, stored, and read back into %g2 (p16) / %g1 (p0) for the
C integer recombination p = p0 + (p16 << 16) + cy.  %g3 holds the carry; %o5
C points at one of two 16-byte scratch areas, alternated via xor 16 so the
C std's and ldx's of consecutive iterations do not collide.
C NOTE(review): delay-slot scheduled throughout; do not reorder.
C-----------------------------------------------------------------------------
PROLOGUE(mpn_mul_1)
	add	%sp, -FSIZE, %sp
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C g1 = low 16 bits of v
	stx	%g1, [%sp+104]
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2

C n == 1: single product, no pipelining needed.
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L1
	add	%o0, -16, %o0

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	b	.L3
	add	%o0, -8, %o0

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	b	.L4
	add	%o0, -4, %o0

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5

C BEGIN MAIN LOOP
C Six 4-instruction issue groups per iteration (C -- n marks each cycle);
C one limb retired per trip, ~6.5 cycles/limb on UltraSPARC 1&2.
	.align 16
C --  0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
C --  1
	sllx	%g2, 16, %g4	C (p16 << 16)
	add	%o0, 4, %o0	C rp++
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
C --  2
	nop
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	fanop
C --  3
	nop
	add	%g3, %g4, %g4	C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C --  4
	srlx	%g4, 32, %g3	C new cy
	add	%o1, 4, %o1	C up++
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C --  5
	xor	%o5, 16, %o5	C alternate scratch variables
	stw	%g4, [%o0-4]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP

C Pipeline drain: .L5..

C .L1 retire the in-flight limbs one by one.
.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g4, %g3, %g4	C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3	C new cy

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3	C new cy

.L3:	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3	C new cy

.L2:	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3	C new cy

.L1:	sllx	%g2, 16, %g4	C (p16 << 16)
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4	C p += cy
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3	C new cy

	mov	%g3, %o0	C return the final carry
	retl
	sub	%sp, -FSIZE, %sp	C restore stack (in the delay slot)
EPILOGUE(mpn_mul_1)
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 + +C This code uses a very deep software pipeline, due to the need for moving data +C forth and back between the integer registers and floating-point registers. +C +C A VIS variant of this code would make the pipeline less deep, since the +C masking now done in the integer unit could take place in the floating-point +C unit using the FAND instruction. It would be possible to save several cycles +C too. +C +C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and +C not much slower from the Ecache. It would perhaps be possible to shave off +C one cycle, but not easily. We cannot do better than 10 cycles/limb with the +C used instructions, since we have 10 memory operations per limb. But a VIS +C variant could run three cycles faster than the corresponding non-VIS code. 
C This is non-pipelined code showing the algorithm:
C
C .Loop:
C	lduw	[up+0],%g4		C 00000000hhhhllll
C	sllx	%g4,16,%g3		C 0000hhhhllll0000
C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
C	stx	%g2,[%fp+80]
C	ldd	[%fp+80],%f0
C	fitod	%f0,%f4			C hi16
C	fitod	%f1,%f6			C lo16
C	ld	[up+0],%f9
C	fxtod	%f8,%f2
C	fmuld	%f2,%f4,%f4
C	fmuld	%f2,%f6,%f6
C	fdtox	%f4,%f4
C	fdtox	%f6,%f6
C	std	%f4,[%fp-24]
C	std	%f6,[%fp-16]
C	ldx	[%fp-24],%g2
C	ldx	[%fp-16],%g1
C	sllx	%g2,16,%g2
C	add	%g2,%g1,%g1
C	stw	%g1,[rp+0]
C	srlx	%g1,32,%l0
C	stw	%l0,[rp+4]
C	add	up,4,up
C	subcc	n,1,n
C	bne,pt	%icc,.Loop
C	add	rp,8,rp

define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe

ASM_START()

	TEXT
	ALIGN(4)
.Lnoll:
	.word	0			C a zero word, loaded into %f8 below

C-----------------------------------------------------------------------------
C mpn_sqr_diagonal (rp=%i0, up=%i1, n=%i2): for each limb up[i], compute the
C 64-bit square up[i]^2 and store it as two 32-bit words at rp[2i], rp[2i+1].
C Deep software pipeline of the reference loop shown above.  The head code
C (.L_grt_1 .. .L_grt_4) fills the pipeline for n = 1..4 and for the general
C case; %l3..%l6 are then loaded with the pair of alternating %fp-relative
C scratch-area addresses so the common tail (.Ltail/.L4..

C .L1) can drain the
C pipeline regardless of which phase the loop stopped in.
C NOTE(review): delay-slot and issue-group scheduled; preserve order exactly.
C-----------------------------------------------------------------------------
PROLOGUE(mpn_sqr_diagonal)
	save	%sp,-256,%sp

ifdef(`PIC',
`.Lpc:	rd	%pc,%o7
	ld	[%o7+.Lnoll-.Lpc],%f8',
`	sethi	%hi(.Lnoll),%g1
	ld	[%g1+%lo(.Lnoll)],%f8')

	sethi	%hi(0xffff0000),%g5	C mask for the andn splitting step
	add	%i1,-8,%i1

	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_1
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n == 1
	add	%i1,4,%i1		C s1_ptr++
	stx	%g2,[%fp+80]
	ld	[%i1],%f9
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	std	%f6,[%fp-16]

	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.L1
	add	%fp, -40, %l6

.L_grt_1:
	stx	%g2,[%fp+80]
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_2
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n == 2
	stx	%g2,[%fp+72]
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4

	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	b	.L2
	add	%fp, -24, %l6

.L_grt_2:
	stx	%g2,[%fp+72]
	lduw	[%i1+8],%g4
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	fxtod	%f8,%f2
	bne,pt	%icc,.L_grt_3
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n == 3
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	add	%fp, 80, %l3
	fmuld	%f2,%f6,%f6
	add	%fp, -24, %l4
	ldd	[%fp+80],%f0
	add	%fp, 72, %l5
	fdtox	%f4,%f4
	b	.L3
	add	%fp, -40, %l6

.L_grt_3:
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	bne,pt	%icc,.L_grt_4
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

C n == 4
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	fitod	%f1,%f6
	add	%fp, 72, %l3
	fmuld	%f2,%f4,%f4
	add	%fp, -40, %l4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	add	%fp, 80, %l5
	fdtox	%f4,%f4
	b	.L4
	add	%fp, -24, %l6

.L_grt_4:
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
	std	%f6,[%fp-32]
	be,pn	%icc,.L5
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	b,a	.Loop

C The loop body is unrolled twice (LOOP BEGIN .. MIDDLE .. END) so the two
C scratch areas ([%fp+80]/[%fp-24..] and [%fp+72]/[%fp-40..]) alternate
C between halves; each 4-instruction group is one issue cycle.
	.align 16
C ---  LOOP BEGIN
.Loop:	nop
	nop
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-24],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-16],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-16]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	be,pn	%icc,.Lend
	fanop
C ---  LOOP MIDDLE
	nop
	nop
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-40],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-32],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-32]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	bne,pt	%icc,.Loop
	fanop
C ---  LOOP END

C .L5/.Lend select which scratch area the drain code reads first, depending
C on whether the loop exited from its first or second half.
.L5:	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.Ltail
	add	%fp, -40, %l6

.Lend:	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	add	%fp, -24, %l6
.Ltail:	stx	%g2,[%l3]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%l5],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L4:	fdtox	%f6,%f6
	std	%f4,[%l4]
	fxtod	%f8,%f2
	std	%f6,[%l4+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	ldd	[%l3],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L3:	fdtox	%f6,%f6
	std	%f4,[%l6]
	fxtod	%f8,%f2
	std	%f6,[%l6+8]

	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]
.L2:	fdtox	%f6,%f6
	std	%f4,[%l4]
	std	%f6,[%l4+8]

	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

.L1:	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

	ret
	restore	%g0,%g0,%o0

EPILOGUE(mpn_sqr_diagonal)
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(rp,%o0) +define(s1p,%o1) +define(s2p,%o2) +define(n,%o3) +define(cy,%g1) + +C This code uses 64-bit operations on `o' and `g' registers. It doesn't +C require that `o' registers' upper 32 bits are preserved by the operating +C system, but if they are not, they must be zeroed. That is indeed what +C happens at least on Slowaris 2.5 and 2.6. + +C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at +C about 10 cycles/limb from the Ecache. 

C-----------------------------------------------------------------------------
C mp_limb_t mpn_sub_n (mp_limb_t *rp, const mp_limb_t *s1p,
C                      const mp_limb_t *s2p, mp_size_t n)
C Subtract {s2p,n} from {s1p,n}, store the difference at {rp,n}, and return
C the final borrow (0 or 1) in %o0.
C
C Technique (see the file-header note): limbs are 32-bit, but the arithmetic
C is done with 64-bit register operations, so after `sub' the borrow sits in
C bit 63 and is extracted with `srlx ...,63'.  The loop handles two limbs per
C iteration; L(end1)/L(end2) handle n==1 / n==2, and the two epilogues
C L(exite)/L(exito) drain the even/odd exit phases.  NOTE(review): the
C `be ... L(exito)+4' branch deliberately enters L(exito) one instruction in;
C the `fitod %f0,%f2' instructions appear to be FA-pipe fillers for issue-
C group shaping (same device as `fanop' in the sibling files) -- confirm
C before touching.  Delay slots are live throughout; do not reorder.
C-----------------------------------------------------------------------------
ASM_START()
PROLOGUE(mpn_sub_n)
	lduw	[s1p+0],%o4
	lduw	[s2p+0],%o5
	addcc	n,-2,n
	bl,pn	%icc,L(end1)		C n == 1
	lduw	[s1p+4],%g2
	lduw	[s2p+4],%g3
	be,pn	%icc,L(end2)		C n == 2
	mov	0,cy

	.align	16
L(loop):
	sub	%o4,%o5,%g4
	add	rp,8,rp
	lduw	[s1p+8],%o4
	fitod	%f0,%f2
C ---
	sub	%g4,cy,%g4
	addcc	n,-1,n
	lduw	[s2p+8],%o5
	fitod	%f0,%f2
C ---
	srlx	%g4,63,cy		C borrow = bit 63 of 64-bit difference
	add	s2p,8,s2p
	stw	%g4,[rp-8]
	be,pn	%icc,L(exito)+4		C branch into L(exito), skipping 1 insn
C ---
	sub	%g2,%g3,%g4		C (delay slot, executed either way)
	addcc	n,-1,n
	lduw	[s1p+12],%g2
	fitod	%f0,%f2
C ---
	sub	%g4,cy,%g4
	add	s1p,8,s1p
	lduw	[s2p+4],%g3
	fitod	%f0,%f2
C ---
	srlx	%g4,63,cy
	bne,pt	%icc,L(loop)
	stw	%g4,[rp-4]
C ---
L(exite):
	sub	%o4,%o5,%g4
	sub	%g4,cy,%g4
	srlx	%g4,63,cy
	stw	%g4,[rp+0]
	sub	%g2,%g3,%g4
	sub	%g4,cy,%g4
	stw	%g4,[rp+4]
	retl
	srlx	%g4,63,%o0		C return final borrow

L(exito):
	sub	%g2,%g3,%g4
	sub	%g4,cy,%g4
	srlx	%g4,63,cy
	stw	%g4,[rp-4]
	sub	%o4,%o5,%g4
	sub	%g4,cy,%g4
	stw	%g4,[rp+0]
	retl
	srlx	%g4,63,%o0		C return final borrow

L(end1):
	sub	%o4,%o5,%g4
	stw	%g4,[rp+0]
	retl
	srlx	%g4,63,%o0		C return final borrow

L(end2):
	sub	%o4,%o5,%g4
	srlx	%g4,63,cy
	stw	%g4,[rp+0]
	sub	%g2,%g3,%g4
	sub	%g4,cy,%g4
	stw	%g4,[rp+4]
	retl
	srlx	%g4,63,%o0		C return final borrow
EPILOGUE(mpn_sub_n)
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Algorithm: We use two floating-point multiplies per limb product, with the +C invariant v operand split into two 16-bit pieces, and the u operand split +C into 32-bit pieces. We convert the two 48-bit products and transfer them to +C the integer unit. + +C cycles/limb +C UltraSPARC 1&2: 6.5 +C UltraSPARC 3: ? + +C Possible optimizations: +C 1. Combine 32-bit memory operations into 64-bit operations. Since we're +C memory bandwidth limited, this could save 1.5 cycles/limb. +C 2. Unroll the inner loop. Since we already use alternate temporary areas, +C it is very straightforward to unroll, using an exit branch midways. +C Unrolling would allow deeper scheduling which could improve speed for L2 +C cache case. +C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es +C aren't sufficiently apart-scheduled with just two temp areas. +C 4. Specialize for particular v values. If its upper 16 bits are zero, we +C could save many operations. 
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+C  mpn_submul_1 -- multiply {up, n} by the limb v and subtract the product
+C  from {rp, n}; return the final borrow limb in %o0.
+C  v is split into two 16-bit halves (in %f6/%f8 as doubles); each up[i] is
+C  converted to double and multiplied by both halves, the 48-bit partial
+C  products are converted back with fdtox and recombined in the integer unit.
+C  Between limbs, %g3 carries the high word of (rp[i] - p), i.e. the borrow
+C  in negated form; each section entry negates it back with sub %g0,%g3,%g3.
+
+define(`FSIZE',224)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	add	%sp, -FSIZE, %sp
+	sethi	%hi(0xffff), %g1
+	srl	%o3, 16, %g2		C v >> 16 (high half)
+	or	%g1, %lo(0xffff), %g1
+	and	%o3, %g1, %g1		C v & 0xffff (low half)
+	stx	%g1, [%sp+104]
+	stx	%g2, [%sp+112]
+	ldd	[%sp+104], %f6
+	ldd	[%sp+112], %f8
+	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
+	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
+	ld	[%sp+104], %f10		C zero f10 (high word of stored v-low is 0)
+
+	mov	0, %g3			C cy = 0
+
+define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
+
+	add	%sp, 160, %o5		C point in scratch area
+	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
+
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_two_or_more
+	fxtod	%f10, %f2		C (delay slot) f2 = (double) up[i]
+
+	C n == 1: compute the single product and fall into .L1
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L1
+	add	%o0, -16, %o0		C bias rp so .L1's [%o0+16] addresses rp[0]
+
+	.align	16
+.L_two_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_three_or_more
+	fxtod	%f10, %f2
+
+	C n == 2: finish both products, enter tail at .L2
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	std	%f12, [%o5+8]
+	lduw	[%o0], %g5		C read rp[i]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	b	.L2
+	add	%o0, -12, %o0		C bias rp for the .L2 tail offsets
+
+	.align	16
+.L_three_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_four_or_more
+	fxtod	%f10, %f2
+
+	C n == 3: enter tail at .L3
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L3
+	add	%o0, -8, %o0		C bias rp for the .L3 tail offsets
+
+	.align	16
+.L_four_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_five_or_more
+	fxtod	%f10, %f2
+
+	C n == 4: enter tail at .L4
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L4
+	add	%o0, -4, %o0		C bias rp for the .L4 tail offsets
+
+	.align	16
+.L_five_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+	b,a	.L5
+
+C BEGIN MAIN LOOP  (software pipelined; 7 dispatch groups per limb)
+	.align 16
+C --  0
+.Loop:	sub	%g0, %g3, %g3		C cy = -cy (%g3 holds negated borrow)
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+C --  1
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%o0, 4, %o0		C rp++
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+C --  2
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	fanop
+C --  3
+	nop
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+C --  4
+	nop
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+C --  5
+	xor	%o5, 16, %o5		C alternate scratch variables
+	add	%o1, 4, %o1		C up++
+	stw	%g4, [%o0-4]
+	fanop
+C --  6
+	srlx	%g4, 32, %g3		C new cy (negated borrow in high word)
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+C END MAIN LOOP
+
+C Pipeline drain: .L5 through .L1 retire the remaining in-flight limbs.
+.L5:	sub	%g0, %g3, %g3		C cy = -cy
+	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g4, %g3, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+0]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+4], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3		C cy = -cy
+.L4:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	std	%f12, [%o5+8]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+4]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+8], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3		C cy = -cy
+.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+8]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+12], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3		C cy = -cy
+.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	stw	%g4, [%o0+12]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+16], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3		C cy = -cy
+.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	stw	%g4, [%o0+16]
+	srlx	%g4, 32, %g3		C new cy
+
+	sub	%g0, %g3, %o0		C return borrow = -cy (undo negation)
+	retl
+	sub	%sp, -FSIZE, %sp	C (delay slot) restore stack pointer
+EPILOGUE(mpn_submul_1)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/udiv.asm b/gmp-6.3.0/mpn/sparc32/v9/udiv.asm
new file mode 100644
index 0000000..61dde97
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/udiv.asm
@@ -0,0 +1,52 @@
+dnl  SPARC v9 32-bit mpn_udiv_qrnnd - division support for longlong.h.
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	o0
+C n1		o1
+C n0		o2
+C d		o3
+
+C  mpn_udiv_qrnnd -- divide the two-limb value (n1:n0) by d using the v9
+C  64-bit udivx.  Quotient is returned in %o0, remainder stored at *rem_ptr.
+C  NOTE(review): per the longlong.h udiv_qrnnd contract the quotient is
+C  presumably assumed to fit in 32 bits (n1 < d) -- confirm with callers.
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+	sllx	%o1, 32, %g1		C shift upper dividend limb
+	srl	%o2, 0, %g2		C zero extend lower dividend limb
+	srl	%o3, 0, %g3		C zero extend divisor
+	or	%g2, %g1, %g1		C assemble 64-bit dividend
+	udivx	%g1, %g3, %g1		C q = dividend / d
+	mulx	%g1, %g3, %g4		C q * d
+	sub	%g2, %g4, %g2		C n0 - q*d: low 32 bits equal the
+					C remainder (r < d < 2^32)
+	st	%g2, [%o0]		C store remainder (32-bit store truncates
+					C the meaningless high bits)
+	retl
+	mov	%g1, %o0		C (delay slot) return quotient
+EPILOGUE(mpn_udiv_qrnnd)
--
cgit v1.2.3