From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/sparc64/README | 125 +++++ gmp-6.3.0/mpn/sparc64/copyd.asm | 89 +++ gmp-6.3.0/mpn/sparc64/copyi.asm | 86 +++ gmp-6.3.0/mpn/sparc64/dive_1.c | 161 ++++++ gmp-6.3.0/mpn/sparc64/divrem_1.c | 242 ++++++++ gmp-6.3.0/mpn/sparc64/gcd_11.asm | 87 +++ gmp-6.3.0/mpn/sparc64/gmp-mparam.h | 139 +++++ gmp-6.3.0/mpn/sparc64/lshift.asm | 140 +++++ gmp-6.3.0/mpn/sparc64/lshiftc.asm | 147 +++++ gmp-6.3.0/mpn/sparc64/mod_1.c | 238 ++++++++ gmp-6.3.0/mpn/sparc64/mod_1_4.c | 235 ++++++++ gmp-6.3.0/mpn/sparc64/mode1o.c | 196 +++++++ gmp-6.3.0/mpn/sparc64/rshift.asm | 142 +++++ gmp-6.3.0/mpn/sparc64/sec_tabselect.asm | 162 ++++++ gmp-6.3.0/mpn/sparc64/sparc64.h | 217 ++++++++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm | 241 ++++++++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm | 606 +++++++++++++++++++++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm | 551 +++++++++++++++++++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm | 165 ++++++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm | 580 ++++++++++++++++++++ .../mpn/sparc64/ultrasparc1234/sqr_diagonal.asm | 342 ++++++++++++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm | 241 ++++++++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm | 68 +++ gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h | 222 ++++++++ gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm | 68 +++ gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm | 41 ++ gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm | 41 ++ gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm | 69 +++ gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm | 86 +++ gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h | 154 ++++++ gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm | 82 +++ gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm | 41 ++ gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm | 41 ++ gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm 
| 69 +++ gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm | 68 +++ gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm | 41 ++ gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm | 41 ++ gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm | 69 +++ gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm | 86 +++ gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm | 126 +++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm | 182 +++++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm | 228 ++++++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm | 219 ++++++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm | 147 +++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm | 147 +++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm | 137 +++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm | 145 +++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm | 129 +++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm | 78 +++ gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm | 92 ++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm | 77 +++ gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 | 88 +++ gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm | 233 ++++++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm | 117 ++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm | 82 +++ gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm | 174 ++++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm | 70 +++ .../mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm | 93 ++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm | 144 +++++ gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm | 170 ++++++ gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h | 174 ++++++ 61 files changed, 9471 insertions(+) create mode 100644 gmp-6.3.0/mpn/sparc64/README create mode 100644 gmp-6.3.0/mpn/sparc64/copyd.asm create mode 100644 gmp-6.3.0/mpn/sparc64/copyi.asm create mode 100644 gmp-6.3.0/mpn/sparc64/dive_1.c create mode 100644 gmp-6.3.0/mpn/sparc64/divrem_1.c create mode 100644 gmp-6.3.0/mpn/sparc64/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/sparc64/gmp-mparam.h 
create mode 100644 gmp-6.3.0/mpn/sparc64/lshift.asm create mode 100644 gmp-6.3.0/mpn/sparc64/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/sparc64/mod_1.c create mode 100644 gmp-6.3.0/mpn/sparc64/mod_1_4.c create mode 100644 gmp-6.3.0/mpn/sparc64/mode1o.c create mode 100644 gmp-6.3.0/mpn/sparc64/rshift.asm create mode 100644 gmp-6.3.0/mpn/sparc64/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/sparc64/sparc64.h create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm create mode 100644 
gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h (limited to 'gmp-6.3.0/mpn/sparc64') diff --git a/gmp-6.3.0/mpn/sparc64/README b/gmp-6.3.0/mpn/sparc64/README new file mode 100644 index 0000000..e2c051a --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/README @@ -0,0 +1,125 @@ +Copyright 1997, 1999-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + +This directory contains mpn functions for 64-bit V9 SPARC + +RELEVANT OPTIMIZATION ISSUES + +Notation: + IANY = shift/add/sub/logical/sethi + IADDLOG = add/sub/logical/sethi + MEM = ld*/st* + FA = fadd*/fsub*/f*to*/fmov* + FM = fmul* + +UltraSPARC can issue four instructions per cycle, with these restrictions: +* Two IANY instructions, but only one of these may be a shift. If there is a + shift and an IANY instruction, the shift must precede the IANY instruction. +* One FA. +* One FM. +* One branch. +* One MEM. +* IANY/IADDLOG/MEM must be insn 1, 2, or 3 in an issue bundle. Taken branches + should not be in slot 4, since that makes the delay insn come from separate + bundle. +* If two IANY/IADDLOG instructions are to be executed in the same cycle and one + of these is setting the condition codes, that instruction must be the second + one. 
+ +To summarize, ignoring branches, these are the bundles that can reach the peak +execution speed: + +insn1 iany iany mem iany iany mem iany iany mem +insn2 iaddlog mem iany mem iaddlog iany mem iaddlog iany +insn3 mem iaddlog iaddlog fa fa fa fm fm fm +insn4 fa/fm fa/fm fa/fm fm fm fm fa fa fa + +The 64-bit integer multiply instruction mulx takes from 5 cycles to 35 cycles, +depending on the position of the most significant bit of the first source +operand. When used for 32x32->64 multiplication, it needs 20 cycles. +Furthermore, it stalls the processor while executing. We stay away from that +instruction, and instead use floating-point operations. + +Floating-point add and multiply units are fully pipelined. The latency for +UltraSPARC-1/2 is 3 cycles and for UltraSPARC-3 it is 4 cycles. + +Integer conditional move instructions cannot dual-issue with other integer +instructions. No conditional move can issue 1-5 cycles after a load. (This +might have been fixed for UltraSPARC-3.) + +The UltraSPARC-3 pipeline is very similar to the one of UltraSPARC-1/2, but is +somewhat slower. Branches execute slower, and there may be other new stalls. +But integer multiply doesn't stall the entire CPU and also has a much lower +latency. But it's still not pipelined, and thus useless for our needs. + +STATUS + +* mpn_lshift, mpn_rshift: The current code runs at 2.0 cycles/limb on + UltraSPARC-1/2 and 2.65 on UltraSPARC-3. For UltraSPARC-1/2, the IEU0 + functional unit is saturated with shifts. + +* mpn_add_n, mpn_sub_n: The current code runs at 4 cycles/limb on + UltraSPARC-1/2 and 4.5 cycles/limb on UltraSPARC-3. The 4 instruction + recurrence is the speed limiter. + +* mpn_addmul_1: The current code runs at 14 cycles/limb asymptotically on + UltraSPARC-1/2 and 17.5 cycles/limb on UltraSPARC-3. On UltraSPARC-1/2, the + code sustains 4 instructions/cycle. 
It might be possible to invent a better + way of summing the intermediate 49-bit operands, but it is unlikely that it + will save enough instructions to save an entire cycle. + + The load-use of the u operand is not sufficiently scheduled for good L2 cache + performance. The UltraSPARC-1/2 L1 cache is direct mapped, and since we use + temporary stack slots that will conflict with the u and r operands, we miss + to L2 very often. The load-use of the std/ldx pairs via the stack is + perhaps over-scheduled. + + It would be possible to save two instructions: (1) The mov could be avoided + if the std/ldx were less scheduled. (2) The ldx of the r operand could be + split into two ld instructions, saving the shifts/masks. + + It should be possible to reach 14 cycles/limb for UltraSPARC-3 if the fp + operations were rescheduled for this processor's 4-cycle latency. + +* mpn_mul_1: The current code is a straightforward edit of the mpn_addmul_1 + code. It would be possible to shave one or two cycles from it, with some + labour. + +* mpn_submul_1: Simpleminded code just calling mpn_mul_1 + mpn_sub_n. This + means that it runs at 18 cycles/limb on UltraSPARC-1/2 and 23 cycles/limb on + UltraSPARC-3. It would be possible to either match the mpn_addmul_1 + performance, or in the worst case use one more instruction group. + +* US1/US2 cache conflict resolving. The direct mapped L1 data cache of US1/US2 + is a problem for mul_1, addmul_1 (and a prospective submul_1). We should + allocate a larger cache area, and put the stack temp area in a place that + doesn't cause cache conflicts. diff --git a/gmp-6.3.0/mpn/sparc64/copyd.asm b/gmp-6.3.0/mpn/sparc64/copyd.asm new file mode 100644 index 0000000..ab105d3 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/copyd.asm @@ -0,0 +1,89 @@ +dnl SPARC v9 mpn_copyd -- Copy a limb vector, decrementing. + +dnl Copyright 1999-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 2 +C UltraSPARC 3: 2.5 +C UltraSPARC T1: 17 +C UltraSPARC T3: 6 +C UltraSPARC T4/T5: 2 + +C INPUT PARAMETERS +C rptr %o0 +C sptr %o1 +C n %o2 + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_copyd) + sllx %o2,3,%g1 + add %g1,%o0,%o0 + add %g1,%o1,%o1 + addcc %o2,-8,%o2 + bl,pt %xcc,L(end01234567) + nop +L(loop1): + ldx [%o1-8],%g1 + ldx [%o1-16],%g2 + ldx [%o1-24],%g3 + ldx [%o1-32],%g4 + ldx [%o1-40],%g5 + ldx [%o1-48],%o3 + ldx [%o1-56],%o4 + ldx [%o1-64],%o5 + add %o1,-64,%o1 + stx %g1,[%o0-8] + stx %g2,[%o0-16] + stx %g3,[%o0-24] + stx %g4,[%o0-32] + stx %g5,[%o0-40] + stx %o3,[%o0-48] + stx %o4,[%o0-56] + stx %o5,[%o0-64] + addcc %o2,-8,%o2 + bge,pt %xcc,L(loop1) + add %o0,-64,%o0 +L(end01234567): + addcc %o2,8,%o2 + bz,pn %xcc,L(end) + nop +L(loop2): + ldx [%o1-8],%g1 + add %o1,-8,%o1 + addcc %o2,-1,%o2 + stx %g1,[%o0-8] + bg,pt %xcc,L(loop2) + add %o0,-8,%o0 +L(end): retl + nop +EPILOGUE(mpn_copyd) 
diff --git a/gmp-6.3.0/mpn/sparc64/copyi.asm b/gmp-6.3.0/mpn/sparc64/copyi.asm new file mode 100644 index 0000000..45663dc --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/copyi.asm @@ -0,0 +1,86 @@ +dnl SPARC v9 mpn_copyi -- Copy a limb vector, incrementing. + +dnl Copyright 1999-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 2 +C UltraSPARC 3: 2.5 +C UltraSPARC T1: 17 +C UltraSPARC T3: 6 +C UltraSPARC T4/T5: 2 + +C INPUT PARAMETERS +C rptr %o0 +C sptr %o1 +C n %o2 + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_copyi) + addcc %o2,-8,%o2 + bl,pt %xcc,L(end01234567) + nop +L(loop1): + ldx [%o1+0],%g1 + ldx [%o1+8],%g2 + ldx [%o1+16],%g3 + ldx [%o1+24],%g4 + ldx [%o1+32],%g5 + ldx [%o1+40],%o3 + ldx [%o1+48],%o4 + ldx [%o1+56],%o5 + add %o1,64,%o1 + stx %g1,[%o0+0] + stx %g2,[%o0+8] + stx %g3,[%o0+16] + stx %g4,[%o0+24] + stx %g5,[%o0+32] + stx %o3,[%o0+40] + stx %o4,[%o0+48] + stx %o5,[%o0+56] + addcc %o2,-8,%o2 + bge,pt %xcc,L(loop1) + add %o0,64,%o0 +L(end01234567): + addcc %o2,8,%o2 + bz,pn %xcc,L(end) + nop +L(loop2): + ldx [%o1+0],%g1 + add %o1,8,%o1 + addcc %o2,-1,%o2 + stx %g1,[%o0+0] + bg,pt %xcc,L(loop2) + add %o0,8,%o0 +L(end): retl + nop +EPILOGUE(mpn_copyi) diff --git a/gmp-6.3.0/mpn/sparc64/dive_1.c b/gmp-6.3.0/mpn/sparc64/dive_1.c new file mode 100644 index 0000000..4264f29 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/dive_1.c @@ -0,0 +1,161 @@ +/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + + +/* 64-bit divisor 32-bit divisor + cycles/limb cycles/limb + (approx) (approx) + Ultrasparc 2i: 110 70 +*/ + + +/* There are two key ideas here to reduce mulx's. Firstly when the divisor + is 32-bits the high of q*d can be calculated without the two 32x32->64 + cross-products involving the high 32-bits of the divisor, that being zero + of course. Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save + one mulx (each) knowing the low of q*d is equal to the input limb l. + + For size==1, a simple udivx is used. This is faster than calculating an + inverse. + + For a 32-bit divisor and small sizes, an attempt was made at a simple + udivx loop (two per 64-bit limb), but it turned out to be slower than + mul-by-inverse. At size==2 the inverse is about 260 cycles total + compared to a udivx at 291. Perhaps the latter would suit when size==2 + but the high 32-bits of the second limb is zero (saving one udivx), but + it doesn't seem worth a special case just for that. 
*/ + +void +mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor) +{ + mp_limb_t inverse, s, s_next, c, l, ls, q; + unsigned rshift, lshift; + mp_limb_t lshift_mask; + mp_limb_t divisor_h; + + ASSERT (size >= 1); + ASSERT (divisor != 0); + ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size)); + ASSERT_MPN (src, size); + ASSERT_LIMB (divisor); + + s = *src++; /* src low limb */ + size--; + if (size == 0) + { + *dst = s / divisor; + return; + } + + if ((divisor & 1) == 0) + { + count_trailing_zeros (rshift, divisor); + divisor >>= rshift; + lshift = 64 - rshift; + + lshift_mask = MP_LIMB_T_MAX; + } + else + { + rshift = 0; + + /* rshift==0 means no shift, so must mask out other part in this case */ + lshift = 0; + lshift_mask = 0; + } + + binvert_limb (inverse, divisor); + + c = 0; + divisor_h = HIGH32 (divisor); + + if (divisor_h == 0) + { + /* 32-bit divisor */ + do + { + s_next = *src++; + ls = (s >> rshift) | ((s_next << lshift) & lshift_mask); + s = s_next; + + SUBC_LIMB (c, l, ls, c); + + q = l * inverse; + *dst++ = q; + + umul_ppmm_half_lowequal (l, q, divisor, l); + c += l; + + size--; + } + while (size != 0); + + ls = s >> rshift; + l = ls - c; + q = l * inverse; + *dst = q; + } + else + { + /* 64-bit divisor */ + mp_limb_t divisor_l = LOW32 (divisor); + do + { + s_next = *src++; + ls = (s >> rshift) | ((s_next << lshift) & lshift_mask); + s = s_next; + + SUBC_LIMB (c, l, ls, c); + + q = l * inverse; + *dst++ = q; + + umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l); + c += l; + + size--; + } + while (size != 0); + + ls = s >> rshift; + l = ls - c; + q = l * inverse; + *dst = q; + } +} diff --git a/gmp-6.3.0/mpn/sparc64/divrem_1.c b/gmp-6.3.0/mpn/sparc64/divrem_1.c new file mode 100644 index 0000000..ac94565 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/divrem_1.c @@ -0,0 +1,242 @@ +/* UltraSparc 64 mpn_divrem_1 -- mpn by limb division. + +Copyright 1991, 1993, 1994, 1996, 1998-2001, 2003 Free Software Foundation, +Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + + +/* 64-bit divisor 32-bit divisor + cycles/limb cycles/limb + (approx) (approx) + integer fraction integer fraction + Ultrasparc 2i: 160 160 122 96 +*/ + + +/* 32-bit divisors are treated in special case code. This requires 4 mulx + per limb instead of 8 in the general case. + + For big endian systems we need HALF_ENDIAN_ADJ included in the src[i] + addressing, to get the two halves of each limb read in the correct order. + This is kept in an adj variable. Doing that measures about 4 c/l faster + than just writing HALF_ENDIAN_ADJ(i) in the integer loop. The latter + shouldn't be 6 cycles worth of work, but perhaps it doesn't schedule well + (on gcc 3.2.1 at least). The fraction loop doesn't seem affected, but we + still use a variable since that ought to work out best. 
*/ + +mp_limb_t +mpn_divrem_1 (mp_ptr qp_limbptr, mp_size_t xsize_limbs, + mp_srcptr ap_limbptr, mp_size_t size_limbs, mp_limb_t d_limb) +{ + mp_size_t total_size_limbs; + mp_size_t i; + + ASSERT (xsize_limbs >= 0); + ASSERT (size_limbs >= 0); + ASSERT (d_limb != 0); + /* FIXME: What's the correct overlap rule when xsize!=0? */ + ASSERT (MPN_SAME_OR_SEPARATE_P (qp_limbptr + xsize_limbs, + ap_limbptr, size_limbs)); + + total_size_limbs = size_limbs + xsize_limbs; + if (UNLIKELY (total_size_limbs == 0)) + return 0; + + /* udivx is good for total_size==1, and no need to bother checking + limb 0); /* because always even */ + qp[size + HALF_ENDIAN_ADJ(1)] = 0; + } + + /* Skip a division if high < divisor (high quotient 0). Testing + here before before normalizing will still skip as often as + possible. */ + if (n1 < d_limb) + { + r = n1; + size--; + qp[size + HALF_ENDIAN_ADJ(size)] = 0; + total_size--; + if (total_size == 0) + return r; + } + } + + count_leading_zeros_32 (norm, d_limb); + norm -= 32; + d_limb <<= norm; + r <<= norm; + + norm_rshift = 32 - norm; + norm_rmask = (norm == 0 ? 
0 : 0xFFFFFFFF); + + invert_half_limb (dinv_limb, d_limb); + + if (LIKELY (size != 0)) + { + i = size - 1; + adj = HALF_ENDIAN_ADJ (i); + n1 = ap[i + adj]; + adj = -adj; + r |= ((n1 >> norm_rshift) & norm_rmask); + for ( ; i > 0; i--) + { + n0 = ap[i-1 + adj]; + adj = -adj; + nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); + udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb); + qp[i + adj] = q; + n1 = n0; + } + nshift = n1 << norm; + udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb); + qp[0 + HALF_ENDIAN_ADJ(0)] = q; + } + qp -= xsize; + adj = HALF_ENDIAN_ADJ (0); + for (i = xsize-1; i >= 0; i--) + { + udiv_qrnnd_half_preinv (q, r, r, 0, d_limb, dinv_limb); + adj = -adj; + qp[i + adj] = q; + } + + return r >> norm; + } + else + { + mp_srcptr ap; + mp_ptr qp; + mp_size_t size, xsize, total_size; + mp_limb_t d, n1, n0, q, r, dinv, nshift, norm_rmask; + int norm, norm_rshift; + + ap = ap_limbptr; + qp = qp_limbptr; + size = size_limbs; + xsize = xsize_limbs; + total_size = total_size_limbs; + d = d_limb; + + qp += total_size; /* above high limb */ + r = 0; /* initial remainder */ + + if (LIKELY (size != 0)) + { + /* Skip a division if high < divisor (high quotient 0). Testing + here before before normalizing will still skip as often as + possible. */ + n1 = ap[size-1]; + if (n1 < d) + { + r = n1; + *--qp = 0; + total_size--; + if (total_size == 0) + return r; + size--; + } + } + + count_leading_zeros (norm, d); + d <<= norm; + r <<= norm; + + norm_rshift = GMP_LIMB_BITS - norm; + norm_rmask = (norm == 0 ? 
0 : ~CNST_LIMB(0)); + + invert_limb (dinv, d); + + if (LIKELY (size != 0)) + { + n1 = ap[size-1]; + r |= ((n1 >> norm_rshift) & norm_rmask); + for (i = size-2; i >= 0; i--) + { + n0 = ap[i]; + nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); + udiv_qrnnd_preinv (q, r, r, nshift, d, dinv); + *--qp = q; + n1 = n0; + } + nshift = n1 << norm; + udiv_qrnnd_preinv (q, r, r, nshift, d, dinv); + *--qp = q; + } + for (i = 0; i < xsize; i++) + { + udiv_qrnnd_preinv (q, r, r, CNST_LIMB(0), d, dinv); + *--qp = q; + } + return r >> norm; + } +} diff --git a/gmp-6.3.0/mpn/sparc64/gcd_11.asm b/gmp-6.3.0/mpn/sparc64/gcd_11.asm new file mode 100644 index 0000000..2dd200d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/gcd_11.asm @@ -0,0 +1,87 @@ +dnl SPARC64 mpn_gcd_11. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for SPARC by Torbjörn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2021 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C cycles/bit (approx) +C UltraSPARC 1&2: 5.1 +C UltraSPARC 3: 5.0 +C UltraSPARC T1: 11.4 +C UltraSPARC T3: 10 +C UltraSPARC T4: 6 +C Numbers measured with: speed -CD -s32-64 -t32 mpn_gcd_1 + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. + +deflit(MAXSHIFT, 7) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + + RODATA + TYPE(ctz_table,object) +ctz_table: + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') + SIZE(ctz_table,.-ctz_table) + +define(`u0', `%o0') +define(`v0', `%o1') + +ASM_START() +PROLOGUE(mpn_gcd_11) + LEA64(ctz_table, o5, g4) + b L(odd) + mov u0, %o4 + + ALIGN(16) +L(top): movcc %xcc, %o4, v0 C v = min(u,v) + movcc %xcc, %o2, %o0 C u = |v - u] +L(mid): ldub [%o5+%g1], %g5 C + brz,pn %g1, L(shift_alot) C + srlx %o0, %g5, %o4 C new u, odd +L(odd): subcc v0, %o4, %o2 C v - u, set flags for branch and movcc + sub %o4, v0, %o0 C u - v + bnz,pt %xcc, L(top) C + and %o2, MASK, %g1 C extract low MAXSHIFT bits from (v-u) + + retl + mov v0, %o0 + +L(shift_alot): + mov %o4, %o0 + b L(mid) + and %o4, MASK, %g1 C +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/gmp-mparam.h b/gmp-6.3.0/mpn/sparc64/gmp-mparam.h new file mode 100644 index 0000000..5ac2c46 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/gmp-mparam.h @@ -0,0 +1,139 @@ +/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 500 MHz ultrasparc2 running GNU/Linux */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1U_TO_MOD_1_1_THRESHOLD 22 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 27 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define USE_PREINV_DIVREM_1 1 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 30 +#define MUL_TOOM33_THRESHOLD 187 +#define MUL_TOOM44_THRESHOLD 278 +#define MUL_TOOM6H_THRESHOLD 278 +#define MUL_TOOM8H_THRESHOLD 357 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 201 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 199 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 107 + +#define SQR_BASECASE_THRESHOLD 13 +#define SQR_TOOM2_THRESHOLD 69 +#define SQR_TOOM3_THRESHOLD 116 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 336 +#define SQR_TOOM8_THRESHOLD 454 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 248 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 248, 5}, { 9, 4}, { 19, 6}, { 5, 5}, \ + { 15, 6}, { 8, 5}, { 17, 6}, { 21, 7}, \ + { 19, 8}, { 11, 7}, { 25, 8}, { 15, 7}, \ + { 31, 8}, { 27, 9}, { 15, 8}, { 33, 9}, \ + { 
19, 8}, { 39, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \ + { 79,11}, { 47,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 50 +#define MUL_FFT_THRESHOLD 1984 + +#define SQR_FFT_MODF_THRESHOLD 236 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 236, 5}, { 8, 4}, { 17, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 19, 7}, { 10, 6}, \ + { 21, 7}, { 21, 8}, { 21, 9}, { 11, 8}, \ + { 23, 9}, { 19, 8}, { 43, 9}, { 23,10}, \ + { 15, 9}, { 43,10}, { 23,11}, { 15,10}, \ + { 31, 9}, { 63,10}, { 47, 8}, { 191,11}, \ + { 31,10}, { 63, 8}, { 255, 7}, { 511, 9}, \ + { 135, 8}, { 271,10}, { 71, 9}, { 143, 8}, \ + { 287, 7}, { 575,11}, { 47, 9}, { 191, 8}, \ + { 383,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 49 +#define SQR_FFT_THRESHOLD 1120 + +#define MULLO_BASECASE_THRESHOLD 16 +#define MULLO_DC_THRESHOLD 41 +#define MULLO_MUL_N_THRESHOLD 3791 + +#define DC_DIV_QR_THRESHOLD 27 +#define DC_DIVAPPR_Q_THRESHOLD 100 +#define DC_BDIV_QR_THRESHOLD 47 +#define DC_BDIV_Q_THRESHOLD 174 + +#define INV_MULMOD_BNM1_THRESHOLD 58 +#define INV_NEWTON_THRESHOLD 13 +#define INV_APPR_THRESHOLD 9 + +#define BINV_NEWTON_THRESHOLD 187 +#define REDC_1_TO_REDC_2_THRESHOLD 10 +#define REDC_2_TO_REDC_N_THRESHOLD 115 + +#define MU_DIV_QR_THRESHOLD 680 +#define MU_DIVAPPR_Q_THRESHOLD 618 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 748 +#define MU_BDIV_Q_THRESHOLD 889 + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 53 +#define GCD_DC_THRESHOLD 283 +#define GCDEXT_DC_THRESHOLD 186 +#define 
JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 390 +#define SET_STR_PRECOMPUTE_THRESHOLD 1665 diff --git a/gmp-6.3.0/mpn/sparc64/lshift.asm b/gmp-6.3.0/mpn/sparc64/lshift.asm new file mode 100644 index 0000000..90bbb45 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/lshift.asm @@ -0,0 +1,140 @@ +dnl SPARC v9 mpn_lshift + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 2 +C UltraSPARC 3: 2.5 +C UltraSPARC T1: 17.5 +C UltraSPARC T3: 8 +C UltraSPARC T4: 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`cnt', `%i3') + +define(`tcnt', `%i4') +define(`retval', `%i5') +define(`u0', `%l0') +define(`u1', `%l1') +define(`r0', `%l6') +define(`r1', `%l7') +define(`u0_off', `%o0') +define(`u1_off', `%o1') +define(`r0_off', `%o2') +define(`r1_off', `%o3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_lshift) + save %sp, -176, %sp + + sllx n, 3, n + sub %g0, cnt, tcnt + + sub up, 8, u1_off + add rp, (5 * 8), r1_off + + ldx [n + u1_off], u1 C WAS: up - 8 + add u1_off, (3 * 8), u1_off + + sub r1_off, 8, r0_off + sub u1_off, 8, u0_off + + subcc n, (3 * 8), n + srlx u1, tcnt, retval + + bl,pn %xcc, L(end12) + sllx u1, cnt, %l3 + + ldx [n + u0_off], u0 C WAS: up - 16 + subcc n, (2 * 8), n + + ldx [n + u1_off], u1 C WAS: up - 24 + + bl,pn %xcc, L(end34) + srlx u0, tcnt, %l4 + + b,a L(top) + ALIGN(16) +L(top): + sllx u0, cnt, %l2 + or %l4, %l3, r0 + + ldx [n + u0_off], u0 C WAS: up - 16 + srlx u1, tcnt, %l5 + + stx r0, [n + r0_off] C WAS: rp - 8 + subcc n, (2 * 8), n + + sllx u1, cnt, %l3 + or %l2, %l5, r1 + + ldx [n + u1_off], u1 C WAS: up - 24 + srlx u0, tcnt, %l4 + + bge,pt %xcc, L(top) + stx r1, [n + r1_off] C WAS: rp - 16 + +L(end34): + sllx u0, cnt, %l2 + or %l4, %l3, r0 + + srlx u1, tcnt, %l5 + stx r0, [n + r0_off] C WAS: rp - 8 + + or %l2, %l5, r1 + sub n, (2 * 8), %o5 + + sllx u1, cnt, %l3 + stx r1, [%o5 + r1_off] C WAS: rp - 16 + +L(end12): + andcc n, 8, %g0 + bz,pn %xcc, L(done) + nop + + ldx [n + u0_off], u1 + srlx u1, tcnt, %l4 + or %l4, %l3, r0 + stx r0, [r0_off - 24] + sllx u1, cnt, %l3 +L(done): + stx %l3, [r0_off - 32] + + ret + restore retval, 0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/lshiftc.asm b/gmp-6.3.0/mpn/sparc64/lshiftc.asm new file mode 100644 index 0000000..4a0f0a3 --- 
/dev/null +++ b/gmp-6.3.0/mpn/sparc64/lshiftc.asm @@ -0,0 +1,147 @@ +dnl SPARC v9 mpn_lshiftc + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 3 +C UltraSPARC 3: 3 +C UltraSPARC T1: 17 +C UltraSPARC T3: 10 +C UltraSPARC T4: 3.5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`cnt', `%i3') + +define(`tcnt', `%i4') +define(`retval', `%i5') +define(`u0', `%l0') +define(`u1', `%l1') +define(`r0', `%l6') +define(`r1', `%l7') +define(`u0_off', `%o0') +define(`u1_off', `%o1') +define(`r0_off', `%o2') +define(`r1_off', `%o3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_lshiftc) + save %sp, -176, %sp + + sllx n, 3, n + sub %g0, cnt, tcnt + + sub up, 8, u1_off + add rp, (5 * 8), r1_off + + ldx [n + u1_off], u1 C WAS: up - 8 + add u1_off, (3 * 8), u1_off + + sub r1_off, 8, r0_off + sub u1_off, 8, u0_off + + subcc n, (3 * 8), n + srlx u1, tcnt, retval + + bl,pn %xcc, L(end12) + sllx u1, cnt, %l3 + + ldx [n + u0_off], u0 C WAS: up - 16 + subcc n, (2 * 8), n + + ldx [n + u1_off], u1 C WAS: up - 24 + + bl,pn %xcc, L(end34) + srlx u0, tcnt, %l4 + + b,a L(top) + ALIGN(16) +L(top): + not %l3, %l3 + sllx u0, cnt, %l2 + + andn %l3, %l4, r0 + ldx [n + u0_off], u0 C WAS: up - 16 + + srlx u1, tcnt, %l5 + stx r0, [n + r0_off] C WAS: rp - 8 + + subcc n, (2 * 8), n + not %l2, %l2 + + sllx u1, cnt, %l3 + andn %l2, %l5, r1 + + ldx [n + u1_off], u1 C WAS: up - 24 + srlx u0, tcnt, %l4 + + bge,pt %xcc, L(top) + stx r1, [n + r1_off] C WAS: rp - 16 + +L(end34): + not %l3, %l3 + sllx u0, cnt, %l2 + + andn %l3, %l4, r0 + srlx u1, tcnt, %l5 + + stx r0, [n + r0_off] C WAS: rp - 8 + not %l2, %l2 + + andn %l2, %l5, r1 + sub n, (2 * 8), %o5 + + sllx u1, cnt, %l3 + stx r1, [%o5 + r1_off] C WAS: rp - 16 + +L(end12): + andcc n, 8, %g0 + bz %xcc, L(done)+4 + not %l3, %l3 + + ldx [n + u0_off], u1 + srlx u1, tcnt, %l4 + andn %l3, %l4, r0 + stx r0, [r0_off - 24] + sllx u1, cnt, %l3 +L(done): + not %l3, %l3 + stx %l3, [r0_off - 32] + + ret + restore retval, 0, %o0 +EPILOGUE() diff --git 
a/gmp-6.3.0/mpn/sparc64/mod_1.c b/gmp-6.3.0/mpn/sparc64/mod_1.c new file mode 100644 index 0000000..ab53f9d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/mod_1.c @@ -0,0 +1,238 @@ +/* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder. + +Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + + +/* 64-bit divisor 32-bit divisor + cycles/limb cycles/limb + (approx) (approx) + Ultrasparc 2i: 160 120 +*/ + + +/* 32-bit divisors are treated in special case code. This requires 4 mulx + per limb instead of 8 in the general case. + + For big endian systems we need HALF_ENDIAN_ADJ included in the src[i] + addressing, to get the two halves of each limb read in the correct order. + This is kept in an adj variable. Doing that measures about 6 c/l faster + than just writing HALF_ENDIAN_ADJ(i) in the loop. The latter shouldn't + be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc + 3.2.1 at least). 
+ + A simple udivx/umulx loop for the 32-bit case was attempted for small + sizes, but at size==2 it was only about the same speed and at size==3 was + slower. */ + +static mp_limb_t +mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb) +{ + int norm, norm_rshift; + mp_limb_t src_high_limb; + mp_size_t i; + + ASSERT (size_limbs >= 0); + ASSERT (d_limb != 0); + + if (UNLIKELY (size_limbs == 0)) + return 0; + + src_high_limb = src_limbptr[size_limbs-1]; + + /* udivx is good for size==1, and no need to bother checking limb < divisor, + since if that's likely the caller should check */ + if (UNLIKELY (size_limbs == 1)) + return src_high_limb % d_limb; + + if (d_limb <= CNST_LIMB(0xFFFFFFFF)) + { + unsigned *src, n1, n0, r, dummy_q, nshift, norm_rmask; + mp_size_t size, adj; + mp_limb_t dinv_limb; + + size = 2 * size_limbs; /* halfwords */ + src = (unsigned *) src_limbptr; + + /* prospective initial remainder, if < d */ + r = src_high_limb >> 32; + + /* If the length of the source is uniformly distributed, then there's + a 50% chance of the high 32-bits being zero, which we can skip. */ + if (r == 0) + { + r = (unsigned) src_high_limb; + size--; + ASSERT (size > 0); /* because always even */ + } + + /* Skip a division if high < divisor. Having the test here before + normalizing will still skip as often as possible. */ + if (r < d_limb) + { + size--; + ASSERT (size > 0); /* because size==1 handled above */ + } + else + r = 0; + + count_leading_zeros_32 (norm, d_limb); + norm -= 32; + d_limb <<= norm; + + norm_rshift = 32 - norm; + norm_rmask = (norm == 0 ? 
0 : 0xFFFFFFFF); + i = size-1; + adj = HALF_ENDIAN_ADJ (i); + n1 = src [i + adj]; + r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask); + + invert_half_limb (dinv_limb, d_limb); + adj = -adj; + + for (i--; i >= 0; i--) + { + n0 = src [i + adj]; + adj = -adj; + nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); + udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb); + n1 = n0; + } + + /* same as loop, but without n0 */ + nshift = n1 << norm; + udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb); + + ASSERT ((r & ((1 << norm) - 1)) == 0); + return r >> norm; + } + else + { + mp_srcptr src; + mp_size_t size; + mp_limb_t n1, n0, r, dinv, dummy_q, nshift, norm_rmask; + + src = src_limbptr; + size = size_limbs; + r = src_high_limb; /* initial remainder */ + + /* Skip a division if high < divisor. Having the test here before + normalizing will still skip as often as possible. */ + if (r < d_limb) + { + size--; + ASSERT (size > 0); /* because size==1 handled above */ + } + else + r = 0; + + count_leading_zeros (norm, d_limb); + d_limb <<= norm; + + norm_rshift = GMP_LIMB_BITS - norm; + norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF); + + src += size; + n1 = *--src; + r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask); + + invert_limb (dinv, d_limb); + + for (i = size-2; i >= 0; i--) + { + n0 = *--src; + nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); + udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv); + n1 = n0; + } + + /* same as loop, but without n0 */ + nshift = n1 << norm; + udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv); + + ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0); + return r >> norm; + } +} + +mp_limb_t +mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b) +{ + ASSERT (n >= 0); + ASSERT (b != 0); + + /* Should this be handled at all? Rely on callers? Note un==0 is currently + required by mpz/fdiv_r_ui.c and possibly other places. 
*/ + if (n == 0) + return 0; + + if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0)) + { + if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD)) + { + return mpn_mod_1_anynorm (ap, n, b); + } + else + { + mp_limb_t pre[4]; + mpn_mod_1_1p_cps (pre, b); + return mpn_mod_1_1p (ap, n, b, pre); + } + } + else + { + if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD)) + { + return mpn_mod_1_anynorm (ap, n, b); + } + else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD)) + { + mp_limb_t pre[4]; + mpn_mod_1_1p_cps (pre, b); + return mpn_mod_1_1p (ap, n, b << pre[1], pre); + } + else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4)) + { + mp_limb_t pre[5]; + mpn_mod_1s_2p_cps (pre, b); + return mpn_mod_1s_2p (ap, n, b << pre[1], pre); + } + else + { + mp_limb_t pre[7]; + mpn_mod_1s_4p_cps (pre, b); + return mpn_mod_1s_4p (ap, n, b << pre[1], pre); + } + } +} diff --git a/gmp-6.3.0/mpn/sparc64/mod_1_4.c b/gmp-6.3.0/mpn/sparc64/mod_1_4.c new file mode 100644 index 0000000..735a402 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/mod_1_4.c @@ -0,0 +1,235 @@ +/* mpn_mod_1s_4p (ap, n, b, cps) + Divide (ap,,n) by b. Return the single-limb remainder. + Requires that d < B / 4. + + Contributed to the GNU project by Torbjorn Granlund. + Based on a suggestion by Peter L. Montgomery. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + +void +mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb; + int cnt; + + ASSERT (b <= (~(mp_limb_t) 0) / 4); + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + + udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); + cps[3] = B2modb >> cnt; + + udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi); + cps[4] = B3modb >> cnt; + + udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi); + cps[5] = B4modb >> cnt; + + udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi); + cps[6] = B5modb >> cnt; + +#if WANT_ASSERT + { + int i; + b = cps[2]; + for (i = 3; i <= 6; i++) + { + b += cps[i]; + ASSERT (b >= cps[i]); + } + } +#endif +} + +mp_limb_t +mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7]) +{ + mp_limb_t rh, rl, bi, ph, pl, ch, cl, r; + mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb; + mp_size_t i; + int cnt; + + ASSERT (n >= 1); + + B1modb = cps[2]; + B2modb = cps[3]; + B3modb = cps[4]; + B4modb = cps[5]; + B5modb = cps[6]; + + if ((b >> 32) == 0) + 
{ + switch (n & 3) + { + case 0: + umul_ppmm_s (ph, pl, ap[n - 3], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]); + umul_ppmm_s (ch, cl, ap[n - 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + umul_ppmm_s (rh, rl, ap[n - 1], B3modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 4; + break; + case 1: + rh = 0; + rl = ap[n - 1]; + n -= 1; + break; + case 2: + rh = ap[n - 1]; + rl = ap[n - 2]; + n -= 2; + break; + case 3: + umul_ppmm_s (ph, pl, ap[n - 2], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]); + umul_ppmm_s (rh, rl, ap[n - 1], B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 3; + break; + } + + for (i = n - 4; i >= 0; i -= 4) + { + /* rr = ap[i] < B + + ap[i+1] * (B mod b) <= (B-1)(b-1) + + ap[i+2] * (B^2 mod b) <= (B-1)(b-1) + + ap[i+3] * (B^3 mod b) <= (B-1)(b-1) + + LO(rr) * (B^4 mod b) <= (B-1)(b-1) + + HI(rr) * (B^5 mod b) <= (B-1)(b-1) + */ + umul_ppmm_s (ph, pl, ap[i + 1], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]); + + umul_ppmm_s (ch, cl, ap[i + 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm_s (ch, cl, ap[i + 3], B3modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm_s (ch, cl, rl, B4modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm_s (rh, rl, rh, B5modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + umul_ppmm_s (rh, cl, rh, B1modb); + add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl); + } + else + { + switch (n & 3) + { + case 0: + umul_ppmm (ph, pl, ap[n - 3], B1modb); + add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 4]); + umul_ppmm (ch, cl, ap[n - 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + umul_ppmm (rh, rl, ap[n - 1], B3modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 4; + break; + case 1: + rh = 0; + rl = ap[n - 1]; + n -= 1; + break; + case 2: + rh = ap[n - 1]; + rl = ap[n - 2]; + n -= 2; + break; + case 3: + umul_ppmm (ph, pl, ap[n - 2], B1modb); + add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]); + umul_ppmm (rh, rl, ap[n - 1], B2modb); + 
add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 3; + break; + } + + for (i = n - 4; i >= 0; i -= 4) + { + /* rr = ap[i] < B + + ap[i+1] * (B mod b) <= (B-1)(b-1) + + ap[i+2] * (B^2 mod b) <= (B-1)(b-1) + + ap[i+3] * (B^3 mod b) <= (B-1)(b-1) + + LO(rr) * (B^4 mod b) <= (B-1)(b-1) + + HI(rr) * (B^5 mod b) <= (B-1)(b-1) + */ + umul_ppmm (ph, pl, ap[i + 1], B1modb); + add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]); + + umul_ppmm (ch, cl, ap[i + 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (ch, cl, ap[i + 3], B3modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (ch, cl, rl, B4modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (rh, rl, rh, B5modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + umul_ppmm (rh, cl, rh, B1modb); + add_ssaaaa (rh, rl, rh, rl, 0, cl); + } + + bi = cps[0]; + cnt = cps[1]; + + r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); + udiv_rnnd_preinv (r, r, rl << cnt, b, bi); + + return r >> cnt; +} diff --git a/gmp-6.3.0/mpn/sparc64/mode1o.c b/gmp-6.3.0/mpn/sparc64/mode1o.c new file mode 100644 index 0000000..771c999 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/mode1o.c @@ -0,0 +1,196 @@ +/* UltraSPARC 64 mpn_modexact_1c_odd -- mpn by limb exact style remainder. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + + +/* 64-bit divisor 32-bit divisor + cycles/limb cycles/limb + (approx) (approx) + Ultrasparc 2i: ? ? +*/ + + +/* This implementation reduces the number of multiplies done, knowing that + on ultrasparc 1 and 2 the mulx instruction stalls the whole chip. + + The key idea is to use the fact that the low limb of q*d equals l, this + being the whole purpose of the q calculated. It means there's no need to + calculate the lowest 32x32->64 part of the q*d, instead it can be + inferred from l and the other three 32x32->64 parts. See sparc64.h for + details. + + When d is 32-bits, the same applies, but in this case there's only one + other 32x32->64 part (ie. HIGH(q)*d). + + The net effect is that for 64-bit divisor each limb is 4 mulx, or for + 32-bit divisor each is 2 mulx. + + Enhancements: + + No doubt this could be done in assembler, if that helped the scheduling, + or perhaps guaranteed good code irrespective of the compiler. + + Alternatives: + + It might be possible to use floating point. The loop is dominated by + multiply latency, so not sure if floats would improve that. One + possibility would be to take two limbs at a time, with a 128 bit inverse, + if there are enough registers, which could effectively use float throughput + to reduce total latency across two limbs. */ + +#define ASSERT_RETVAL(r) \ + ASSERT (orig_c < d ? 
r < d : r <= d) + +mp_limb_t +mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c) +{ + mp_limb_t c = orig_c; + mp_limb_t s, l, q, h, inverse; + + ASSERT (size >= 1); + ASSERT (d & 1); + ASSERT_MPN (src, size); + ASSERT_LIMB (d); + ASSERT_LIMB (c); + + /* udivx is faster than 10 or 12 mulx's for one limb via an inverse */ + if (size == 1) + { + s = src[0]; + if (s > c) + { + l = s-c; + h = l % d; + if (h != 0) + h = d - h; + } + else + { + l = c-s; + h = l % d; + } + return h; + } + + binvert_limb (inverse, d); + + if (d <= 0xFFFFFFFF) + { + s = *src++; + size--; + do + { + SUBC_LIMB (c, l, s, c); + s = *src++; + q = l * inverse; + umul_ppmm_half_lowequal (h, q, d, l); + c += h; + size--; + } + while (size != 0); + + if (s <= d) + { + /* With high s <= d the final step can be a subtract and addback. + If c==0 then the addback will restore to l>=0. If c==d then + will get l==d if s==0, but that's ok per the function + definition. */ + + l = c - s; + l += (l > c ? d : 0); + + ASSERT_RETVAL (l); + return l; + } + else + { + /* Can't skip a divide, just do the loop code once more. */ + SUBC_LIMB (c, l, s, c); + q = l * inverse; + umul_ppmm_half_lowequal (h, q, d, l); + c += h; + + ASSERT_RETVAL (c); + return c; + } + } + else + { + mp_limb_t dl = LOW32 (d); + mp_limb_t dh = HIGH32 (d); + long i; + + s = *src++; + size--; + do + { + SUBC_LIMB (c, l, s, c); + s = *src++; + q = l * inverse; + umul_ppmm_lowequal (h, q, d, dh, dl, l); + c += h; + size--; + } + while (size != 0); + + if (s <= d) + { + /* With high s <= d the final step can be a subtract and addback. + If c==0 then the addback will restore to l>=0. If c==d then + will get l==d if s==0, but that's ok per the function + definition. */ + + l = c - s; + l += (l > c ? d : 0); + + ASSERT_RETVAL (l); + return l; + } + else + { + /* Can't skip a divide, just do the loop code once more. 
*/ + SUBC_LIMB (c, l, s, c); + q = l * inverse; + umul_ppmm_lowequal (h, q, d, dh, dl, l); + c += h; + + ASSERT_RETVAL (c); + return c; + } + } +} diff --git a/gmp-6.3.0/mpn/sparc64/rshift.asm b/gmp-6.3.0/mpn/sparc64/rshift.asm new file mode 100644 index 0000000..3f8e11f --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/rshift.asm @@ -0,0 +1,142 @@ +dnl SPARC v9 mpn_rshift + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 2 +C UltraSPARC 3: 2.5 +C UltraSPARC T1: 17.5 +C UltraSPARC T3: 8 +C UltraSPARC T4: 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`cnt', `%i3') + +define(`tcnt', `%i4') +define(`retval', `%i5') +define(`u0', `%l0') +define(`u1', `%l1') +define(`r0', `%l6') +define(`r1', `%l7') +define(`u0_off', `%o0') +define(`u1_off', `%o1') +define(`r0_off', `%o2') +define(`r1_off', `%o3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_rshift) + save %sp, -176, %sp + + sllx n, 3, n + sub %g0, cnt, tcnt + + add up, n, up + add rp, n, rp + + neg n, n + sub up, (2 * 8), u0_off + sub rp, (5 * 8), r0_off + + ldx [n + up], u1 C WAS: up + 0 + sub u0_off, (1 * 8), u1_off + sub r0_off, (1 * 8), r1_off + + subcc n, -(3 * 8), n + sllx u1, tcnt, retval + + bg,pn %xcc, L(end12) + srlx u1, cnt, %l3 + + ldx [n + u0_off], u0 C WAS: up + 0 + subcc n, -(2 * 8), n + + ldx [n + u1_off], u1 C WAS: up + 8 + + bg,pn %xcc, L(end34) + sllx u0, tcnt, %l4 + + b,a L(top) + ALIGN(16) +L(top): + srlx u0, cnt, %l2 + or %l3, %l4, r0 + + ldx [n + u0_off], u0 C WAS: up + 0 + sllx u1, tcnt, %l5 + + stx r0, [n + r0_off] C WAS: rp + 0 + subcc n, -(2 * 8), n + + srlx u1, cnt, %l3 + or %l2, %l5, r1 + + ldx [n + u1_off], u1 C WAS: up + 8 + sllx u0, tcnt, %l4 + + ble,pt %xcc, L(top) + stx r1, [n + r1_off] C WAS: rp + 8 + +L(end34): + srlx u0, cnt, %l2 + or %l3, %l4, r0 + + sllx u1, tcnt, %l5 + stx r0, [n + r0_off] C WAS: rp + 0 + + or %l2, %l5, r1 + sub n, -(2 * 8), %o5 + + srlx u1, cnt, %l3 + stx r1, [%o5 + r1_off] C WAS: rp + 8 + +L(end12): + andcc n, 8, %g0 + bz,pn %xcc, L(done) + nop + + ldx [n + u0_off], u1 + sllx u1, tcnt, %l4 + or %l3, %l4, r0 + stx r0, [r0_off + 24] + srlx u1, cnt, %l3 +L(done): + stx %l3, [r0_off + 32] + + ret + restore retval, 0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/sec_tabselect.asm b/gmp-6.3.0/mpn/sparc64/sec_tabselect.asm new file mode 
100644 index 0000000..22e0dc5 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/sec_tabselect.asm @@ -0,0 +1,162 @@ +dnl SPARC v9 mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund and David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 2 hopefully +C UltraSPARC 3: 3 +C UltraSPARC T1: 17 +C UltraSPARC T3: ? 
+C UltraSPARC T4/T5: 2.25 hopefully + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`tp', `%i1') +define(`n', `%i2') +define(`nents', `%i3') +define(`which', `%i4') + +define(`i', `%g1') +define(`j', `%g3') +define(`stride', `%g4') +define(`tporig', `%g5') +define(`mask', `%o0') + +define(`data0', `%l0') +define(`data1', `%l1') +define(`data2', `%l2') +define(`data3', `%l3') +define(`t0', `%l4') +define(`t1', `%l5') +define(`t2', `%l6') +define(`t3', `%l7') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sec_tabselect) + save %sp, -176, %sp + + sllx n, 3, stride + sub n, 4, j + brlz j, L(outer_end) + mov tp, tporig + +L(outer_loop): + clr data0 + clr data1 + clr data2 + clr data3 + mov tporig, tp + mov nents, i + mov which, %o1 + +L(top): subcc %o1, 1, %o1 C set carry iff o1 = 0 + ldx [tp + 0], t0 + subc %g0, %g0, mask + ldx [tp + 8], t1 + sub i, 1, i + ldx [tp + 16], t2 + ldx [tp + 24], t3 + add tp, stride, tp + and t0, mask, t0 + and t1, mask, t1 + or t0, data0, data0 + and t2, mask, t2 + or t1, data1, data1 + and t3, mask, t3 + or t2, data2, data2 + brnz i, L(top) + or t3, data3, data3 + + stx data0, [rp + 0] + subcc j, 4, j + stx data1, [rp + 8] + stx data2, [rp + 16] + stx data3, [rp + 24] + add tporig, (4 * 8), tporig + + brgez j, L(outer_loop) + add rp, (4 * 8), rp +L(outer_end): + + + andcc n, 2, %g0 + be L(b0x) + nop +L(b1x): clr data0 + clr data1 + mov tporig, tp + mov nents, i + mov which, %o1 + +L(tp2): subcc %o1, 1, %o1 + ldx [tp + 0], t0 + subc %g0, %g0, mask + ldx [tp + 8], t1 + sub i, 1, i + add tp, stride, tp + and t0, mask, t0 + and t1, mask, t1 + or t0, data0, data0 + brnz i, L(tp2) + or t1, data1, data1 + + stx data0, [rp + 0] + stx data1, [rp + 8] + add tporig, (2 * 8), tporig + add rp, (2 * 8), rp + + +L(b0x): andcc n, 1, %g0 + be L(b00) + nop +L(b01): clr data0 + mov tporig, tp + mov nents, i + mov which, %o1 + +L(tp1): subcc %o1, 1, %o1 + ldx [tp + 0], t0 + subc %g0, %g0, mask + sub i, 1, i + add tp, stride, 
tp
+	and	t0, mask, t0
+	brnz	i, L(tp1)
+	or	t0, data0, data0
+
+	stx	data0, [rp + 0]
+
+L(b00):	ret
+	restore
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/sparc64/sparc64.h b/gmp-6.3.0/mpn/sparc64/sparc64.h
new file mode 100644
index 0000000..8698a82
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/sparc64.h
@@ -0,0 +1,217 @@
+/* UltraSPARC 64 support macros.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define LOW32(x)   ((x) & 0xFFFFFFFF)
+#define HIGH32(x)  ((x) >> 32)
+
+
+/* Halfword number i in src is accessed as src[i+HALF_ENDIAN_ADJ(i)].
+   Plain src[i] would be incorrect in big endian, HALF_ENDIAN_ADJ has the
+   effect of swapping the two halves in this case.  */
+#if HAVE_LIMB_BIG_ENDIAN
+#define HALF_ENDIAN_ADJ(i)  (1 - (((i) & 1) << 1))   /* +1 even, -1 odd */
+#endif
+#if HAVE_LIMB_LITTLE_ENDIAN
+#define HALF_ENDIAN_ADJ(i)  0                        /* no adjust */
+#endif
+#ifndef HALF_ENDIAN_ADJ
+#error unknown limb endianness
+#endif
+
+
+/* umul_ppmm_lowequal sets h to the high limb of q*d, assuming the low limb
+   of that product is equal to l.  dh and dl are the 32-bit halves of d.
+
+   |-----high----||----low-----|
+   +------+------+
+   |             |                 ph = qh * dh
+   +------+------+
+          +------+------+
+          |             |          pm1 = ql * dh
+          +------+------+
+          +------+------+
+          |             |          pm2 = qh * dl
+          +------+------+
+                 +------+------+
+                 |             |   pl = ql * dl (not calculated)
+                 +------+------+
+
+   Knowing that the low 64 bits are equal to l means that LOW(pm1) + LOW(pm2)
+   + HIGH(pl) == HIGH(l).  The only thing we need from those product parts
+   is whether they produce a carry into the high limb.
+
+   pm_l = LOW(pm1)+LOW(pm2) is done to contribute its carry, then the only
+   time there's a further carry from LOW(pm_l)+HIGH(pl) is if LOW(pm_l) >
+   HIGH(l).  pl is never actually calculated.  */
+
+#define umul_ppmm_lowequal(h, q, d, dh, dl, l)  \
+  do {                                          \
+    mp_limb_t  ql, qh, ph, pm1, pm2, pm_l;      \
+    ASSERT ((dh) == HIGH32(d));                 \
+    ASSERT ((dl) == LOW32(d));                  \
+    ASSERT ((q)*(d) == (l));                    \
+                                                \
+    ql = LOW32 (q);                             \
+    qh = HIGH32 (q);                            \
+                                                \
+    pm1 = ql * (dh);                            \
+    pm2 = qh * (dl);                            \
+    ph  = qh * (dh);                            \
+                                                \
+    pm_l = LOW32 (pm1) + LOW32 (pm2);           \
+                                                \
+    (h) = ph + HIGH32 (pm1) + HIGH32 (pm2)      \
+      + HIGH32 (pm_l) + ((pm_l << 32) > (l));   \
+                                                \
+    ASSERT_HIGH_PRODUCT (h, q, d);              \
+  } while (0)
+
+
+/* Set h to the high of q*d, assuming the low limb of that product is equal
+   to l, and that d fits in 32-bits.
+
+   |-----high----||----low-----|
+          +------+------+
+          |             |          pm = qh * dl
+          +------+------+
+                 +------+------+
+                 |             |   pl = ql * dl (not calculated)
+                 +------+------+
+
+   Knowing that LOW(pm) + HIGH(pl) == HIGH(l) (mod 2^32) means that the only
+   time there's a carry from that sum is when LOW(pm) > HIGH(l).
There's no
+   need to calculate pl to determine this.  */
+
+#define umul_ppmm_half_lowequal(h, q, d, l)     \
+  do {                                          \
+    mp_limb_t pm;                               \
+    ASSERT ((q)*(d) == (l));                    \
+    ASSERT (HIGH32(d) == 0);                    \
+                                                \
+    pm = HIGH32(q) * (d);                       \
+    (h) = HIGH32(pm) + ((pm << 32) > (l));      \
+    ASSERT_HIGH_PRODUCT (h, q, d);              \
+  } while (0)
+
+
+/* check that h is the high limb of x*y */
+#if WANT_ASSERT
+#define ASSERT_HIGH_PRODUCT(h, x, y)    \
+  do {                                  \
+    mp_limb_t  want_h, dummy;           \
+    umul_ppmm (want_h, dummy, x, y);    \
+    ASSERT ((h) == want_h);             \
+  } while (0)
+#else
+#define ASSERT_HIGH_PRODUCT(h, q, d)    \
+  do { } while (0)
+#endif
+
+
+/* Multiply u and v, where v < 2^32.  */
+#define umul_ppmm_s(w1, w0, u, v)                                       \
+  do {                                                                  \
+    UWtype __x0, __x2;                                                  \
+    UWtype __ul, __vl, __uh;                                            \
+    UWtype __u = (u), __v = (v);                                        \
+                                                                        \
+    __ul = __ll_lowpart (__u);                                          \
+    __uh = __ll_highpart (__u);                                         \
+    __vl = __ll_lowpart (__v);                                          \
+                                                                        \
+    __x0 = (UWtype) __ul * __vl;                                        \
+    __x2 = (UWtype) __uh * __vl;                                        \
+                                                                        \
+    (w1) = (__x2 + (__x0 >> W_TYPE_SIZE/2)) >> W_TYPE_SIZE/2;           \
+    (w0) = (__x2 << W_TYPE_SIZE/2) + __x0;                              \
+  } while (0)
+
+/* Count the leading zeros on a limb, but assuming it fits in 32 bits.
+   The count returned will be in the range 32 to 63.
+   This is the 32-bit generic C count_leading_zeros from longlong.h. */
+#define count_leading_zeros_32(count, x)                                      \
+  do {                                                                        \
+    mp_limb_t  __xr = (x);                                                    \
+    unsigned   __a;                                                           \
+    ASSERT (__xr != 0);                                                       \
+    ASSERT (__xr <= CNST_LIMB(0xFFFFFFFF));                                   \
+    __a = __xr < ((UWtype) 1 << 16) ? (__xr < ((UWtype) 1 << 8) ? 1 : 8 + 1)  \
+      : (__xr < ((UWtype) 1 << 24) ? 16 + 1 : 24 + 1);                        \
+                                                                              \
+    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];                 \
+  } while (0)
+
+
+/* Set inv to a 32-bit inverse floor((b*(b-d)-1) / d), b = 2^32, knowing that
+   d fits in 32 bits and is normalized (high bit set).
*/
+#define invert_half_limb(inv, d)                \
+  do {                                          \
+    mp_limb_t  _n;                              \
+    ASSERT ((d) <= 0xFFFFFFFF);                 \
+    ASSERT ((d) & 0x80000000);                  \
+    _n = (((mp_limb_t) -(d)) << 32) - 1;        \
+    (inv) = (mp_limb_t) (unsigned) (_n / (d));  \
+  } while (0)
+
+
+/* Divide nh:nl by d, setting q to the quotient and r to the remainder.
+   q, r, nh and nl are 32-bits each, d_limb is 32-bits but in an mp_limb_t,
+   dinv_limb is similarly a 32-bit inverse but in an mp_limb_t.  */
+
+#define udiv_qrnnd_half_preinv(q, r, nh, nl, d_limb, dinv_limb)         \
+  do {                                                                  \
+    unsigned   _n2, _n10, _n1, _nadj, _q11n, _xh, _r, _q;               \
+    mp_limb_t  _n, _x;                                                  \
+    ASSERT ((d_limb) <= 0xFFFFFFFF);                                    \
+    ASSERT ((dinv_limb) <= 0xFFFFFFFF);                                 \
+    ASSERT ((d_limb) & 0x80000000);                                     \
+    ASSERT ((nh) < (d_limb));                                           \
+    _n10 = (nl);                                                        \
+    _n2 = (nh);                                                         \
+    _n1 = (int) _n10 >> 31;                                             \
+    _nadj = _n10 + (_n1 & (d_limb));                                    \
+    _x = (dinv_limb) * (_n2 - _n1) + _nadj;                             \
+    _q11n = ~(_n2 + HIGH32 (_x));             /* -q1-1 */               \
+    _n = ((mp_limb_t) _n2 << 32) + _n10;                                \
+    _x = _n + (d_limb) * _q11n;               /* n-q1*d-d */            \
+    _xh = HIGH32 (_x) - (d_limb);             /* high(n-q1*d-d) */      \
+    ASSERT (_xh == 0 || _xh == ~0);                                     \
+    _r = _x + ((d_limb) & _xh);               /* addback */             \
+    _q = _xh - _q11n;                         /* q1+1-addback */        \
+    ASSERT (_r < (d_limb));                                             \
+    ASSERT ((d_limb) * _q + _r == _n);                                  \
+    (r) = _r;                                                           \
+    (q) = _q;                                                           \
+  } while (0)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm
new file mode 100644
index 0000000..92374d2
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm
@@ -0,0 +1,241 @@
+dnl  SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl  store sum in a third limb vector.
+
+dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     4
+C UltraSPARC 3:	      4.5
+
+C Compute carry-out from the most significant bits of u,v, and r, where
+C r=u+v+carry_in, using logic operations.
+
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
+C recurrency, and on the UltraSPARC 1 and 2 the IE units are 100% saturated.
+C Therefore, it seems futile to try to optimize this any further...
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n', `%i3')
+
+define(`u0', `%l0')
+define(`u1', `%l2')
+define(`u2', `%l4')
+define(`u3', `%l6')
+define(`v0', `%l1')
+define(`v1', `%l3')
+define(`v2', `%l5')
+define(`v3', `%l7')
+
+define(`cy',`%i4')
+
+define(`fanop',`fitod %f0,%f2')		dnl A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4')	dnl A quasi nop running in the FM pipe
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_add_nc)
+	save	%sp,-160,%sp
+
+	fitod	%f0,%f0		C make sure f0 contains small, quiet number
+	subcc	n,4,%g0		C compare n with 4
+	bl,pn	%xcc,.Loop0	C n < 4: use the one-limb-per-iteration loop
+	nop			C delay slot; cy keeps the value passed in %i4
+	b,a	L(com)		C n >= 4: share the unrolled code of mpn_add_n
+EPILOGUE()
+
+PROLOGUE(mpn_add_n)
+	save	%sp,-160,%sp
+
+	fitod	%f0,%f0		C make sure f0 contains small, quiet number
+	subcc	n,4,%g0		C compare n with 4
+	bl,pn	%xcc,.Loop0	C n < 4: use the one-limb-per-iteration loop
+	mov	0,cy		C delay slot: start with carry 0
+L(com):
+	ldx	[up+0],u0	C preload four limbs from each of up and vp
+	ldx	[vp+0],v0
+	add	up,32,up
+	ldx	[up-24],u1
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	ldx	[up-16],u2
+	ldx	[vp-16],v2
+	ldx	[up-8],u3
+	ldx	[vp-8],v3
+	subcc	n,8,n		C n -= 8, flags tested by the bl below
+	add	u0,v0,%g1	C main add
+	add	%g1,cy,%g5	C carry add
+	or	u0,v0,%g2	C u|v, first step of the carry-out computation
+	bl,pn	%xcc,.Lend4567	C fewer than 8 limbs: skip the main loop
+	fanop
+	b,a	.Loop
+
+	.align	16
+C START MAIN LOOP
+.Loop:	andn	%g2,%g5,%g2	C (u|v) & ~r
+	and	u0,v0,%g3	C u & v
+	ldx	[up+0],u0
+	fanop
+C --
+	or	%g3,%g2,%g2	C (u & v) | ((u|v) & ~r), carry-out is bit 63
+	ldx	[vp+0],v0
+	add	up,32,up
+	fanop
+C --
+	srlx	%g2,63,cy	C cy = carry-out of the limb just added
+	add	u1,v1,%g1
+	stx	%g5,[rp+0]
+	fanop
+C --
+	add	%g1,cy,%g5
+	or	u1,v1,%g2
+	fmnop
+	fanop
+C --
+	andn	%g2,%g5,%g2
+	and	u1,v1,%g3
+	ldx	[up-24],u1
+	fanop
+C --
+	or	%g3,%g2,%g2
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	fanop
+C --
+	srlx	%g2,63,cy
+	add	u2,v2,%g1
+	stx	%g5,[rp+8]
+	fanop
+C --
+	add	%g1,cy,%g5
+	or	u2,v2,%g2
+	fmnop
+	fanop
+C --
+	andn	%g2,%g5,%g2
+	and	u2,v2,%g3
+	ldx	[up-16],u2
+	fanop
+C --
+	or	%g3,%g2,%g2
+	ldx	[vp-16],v2
+	add	rp,32,rp
+	fanop
+C --
+	srlx	%g2,63,cy
+	add	u3,v3,%g1
+	stx	%g5,[rp-16]
+	fanop
+C --
+	add	%g1,cy,%g5
+	or	u3,v3,%g2
+	fmnop
+	fanop
+C --
+	andn	%g2,%g5,%g2
+	and	u3,v3,%g3
+	ldx	[up-8],u3
+	fanop
+C --
+	or	%g3,%g2,%g2
+	subcc	n,4,n		C n -= 4, flags tested by the bge below
+	ldx	[vp-8],v3
+	fanop
+C --
+	srlx	%g2,63,cy
+	add	u0,v0,%g1
+	stx	%g5,[rp-8]
+	fanop
+C --
+	add	%g1,cy,%g5
+	or	u0,v0,%g2
+	bge,pt	%xcc,.Loop	C loop while n >= 0
+	fanop
+C END MAIN LOOP
+.Lend4567:
+	andn	%g2,%g5,%g2	C finish the four limbs that are already loaded
+	and	u0,v0,%g3
+	or	%g3,%g2,%g2
+	srlx	%g2,63,cy
+	add	u1,v1,%g1
+	stx	%g5,[rp+0]
+	add	%g1,cy,%g5
+	or	u1,v1,%g2
+	andn	%g2,%g5,%g2
+	and	u1,v1,%g3
+	or	%g3,%g2,%g2
+	srlx	%g2,63,cy
+	add	u2,v2,%g1
+	stx	%g5,[rp+8]
+	add	%g1,cy,%g5
+	or	u2,v2,%g2
+	andn	%g2,%g5,%g2
+	and	u2,v2,%g3
+	or	%g3,%g2,%g2
+	add	rp,32,rp
+	srlx	%g2,63,cy
+	add	u3,v3,%g1
+	stx	%g5,[rp-16]
+	add	%g1,cy,%g5
+	or	u3,v3,%g2
+	andn	%g2,%g5,%g2
+	and	u3,v3,%g3
+	or	%g3,%g2,%g2
+	srlx	%g2,63,cy
+	stx	%g5,[rp-8]
+
+	addcc	n,4,n		C n += 4: number of limbs still to process
+	bz,pn	%xcc,.Lret	C none left: return
+	fanop
+
+.Loop0:	ldx	[up],u0		C one limb per iteration
+	add	up,8,up
+	ldx	[vp],v0
+	add	vp,8,vp
+	add	rp,8,rp
+	subcc	n,1,n
+	add	u0,v0,%g1
+	or	u0,v0,%g2
+	add	%g1,cy,%g5
+	and	u0,v0,%g3
+	andn	%g2,%g5,%g2
+	stx	%g5,[rp-8]
+	or	%g3,%g2,%g2
+	bnz,pt	%xcc,.Loop0
+	srlx	%g2,63,cy	C delay slot: cy = carry-out of this limb
+
+.Lret:	mov	cy,%i0		C final carry goes to %i0 before ret/restore
+	ret
+	restore
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm
new file mode 100644
index 0000000..48a9414
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm
@@ -0,0 +1,606 @@
+dnl  SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl  the result to a second limb vector.
+
+dnl  Copyright 1998, 2000-2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 14 +C UltraSPARC 3: 17.5 + +C Algorithm: We use eight floating-point multiplies per limb product, with the +C invariant v operand split into four 16-bit pieces, and the up operand split +C into 32-bit pieces. We sum pairs of 48-bit partial products using +C floating-point add, then convert the four 49-bit product-sums and transfer +C them to the integer unit. + +C Possible optimizations: +C 0. Rewrite to use algorithm of mpn_addmul_2. +C 1. Align the stack area where we transfer the four 49-bit product-sums +C to a 32-byte boundary. That would minimize the cache collision. +C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would +C be to align the area to map to the area immediately before up?) +C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the +C develop mpn_addmul_2. This would save many integer instructions. +C 3. Unrolling. Questionable if it is worth the code expansion, given that +C it could only save 1 cycle/limb. +C 4. Specialize for particular v values. If its upper 32 bits are zero, we +C could save many operations, in the FPU (fmuld), but more so in the IEU +C since we'll be summing 48-bit quantities, which might be simpler. +C 5. 
Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
+C      the i00,i16,i32,i48 RAW less apart.  The latter apart-scheduling should
+C      not be greater than needed for L2 cache latency, and also not so great
+C      that i16 needs to be copied.
+C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
+C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
+C      ops.)
+
+C Instruction classification (as per UltraSPARC-1/2 functional units):
+C    8 FM
+C   10 FA
+C   12 MEM
+C   10 ISHIFT + 14 IADDLOG
+C    1 BRANCH
+C   55 insns in total (plus one mov insn that should be optimized out)
+
+C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e. we
+C sustain the peak execution rate of 4 instructions/cycle.
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+
+define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
+define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
+define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
+define(`u00',`%f32') define(`u32', `%f34')
+define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+PROLOGUE(mpn_addmul_1)
+
+C Initialization.  (1) Split v operand into four 16-bit chunks and store them
+C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
+C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+
+	save	%sp, -256, %sp
+	mov	-1, %g4
+	srlx	%g4, 48, xffff		C store mask in register `xffff'
+	and	%i3, xffff, %g2
+	stx	%g2, [%sp+2223+0]
+	srlx	%i3, 16, %g3
+	and	%g3, xffff, %g3
+	stx	%g3, [%sp+2223+8]
+	srlx	%i3, 32, %g2
+	and	%g2, xffff, %g2
+	stx	%g2, [%sp+2223+16]
+	srlx	%i3, 48, %g3
+	stx	%g3, [%sp+2223+24]
+	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
+
+	sllx	%i2, 3, %i2		C n * 8: limb count to byte count
+	mov	0, cy			C clear cy
+	add	%i0, %i2, %i0		C rp + 8n
+	add	%i1, %i2, %i1		C up + 8n
+	neg	%i2			C index = -8n, stepped by 8 up to zero
+	add	%i1, 4, %i5		C base for the low-word loads of up[i]
+	add	%i0, -32, %i4		C base for the stx ..., [%i4+%i2] stores
+	add	%i0, -16, %i0		C base for the ldx [%i0+%i2] loads of rp[i]
+
+	ldd	[%sp+2223+0], v00
+	ldd	[%sp+2223+8], v16
+	ldd	[%sp+2223+16], v32
+	ldd	[%sp+2223+24], v48
+	ld	[%sp+2223+0],%f2	C zero f2
+	ld	[%sp+2223+0],%f4	C zero f4
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fxtod	v00, v00
+	fxtod	v16, v16
+	fxtod	v32, v32
+	fxtod	v48, v48
+
+C Start real work.  (We sneakingly read f3 and f5 above...)
+C The software pipeline is very deep, requiring 4 feed-in stages.
+
+	fxtod	%f2, u00
+	fxtod	%f4, u32
+	fmuld	u00, v00, a00
+	fmuld	u00, v16, a16
+	fmuld	u00, v32, p32
+	fmuld	u32, v00, r32
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .L_two_or_more
+	fmuld	u32, v16, r48
+
+.L_one:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	fdtox	a32, a32
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	std	a32, [%sp+2223+16]
+	std	a48, [%sp+2223+24]
+	add	%i2, 8, %i2
+
+	fdtox	r64, a00
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	fdtox	r80, a16
+	ldx	[%sp+2223+0], i00
+	ldx	[%sp+2223+8], i16
+	ldx	[%sp+2223+16], i32
+	ldx	[%sp+2223+24], i48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	add	%i2, 8, %i2
+
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	add	i00, %g5, %g5		C i00+ now in g5
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	sllx	i48, 32, %l6		C (i48 << 32)
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1 1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2 1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	b	.L_out_1
+	add	%i2, 8, %i2
+
+.L_two_or_more:
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	fdtox	a32, a32
+	fxtod	%f2, u00
+	fxtod	%f4, u32
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .L_three_or_more
+	fmuld	u32, v16, r48
+
+.L_two:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	ldx	[%sp+2223+8], i16
+	ldx	[%sp+2223+16], i32
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	std	a32, [%sp+2223+16]
+	std	a48, [%sp+2223+24]
+	add	%i2, 8, %i2
+
+	fdtox	r64, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	add	i00, %g5, %g5		C i00+ now in g5
+	fdtox	r80, a16
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1 1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2 1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	b	.L_out_2
+	add	%i2, 8, %i2
+
+.L_three_or_more:
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .L_four_or_more
+	fmuld	u32, v16, r48
+
+.L_three:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1 1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	sub	%o2, %o3, %o2		C mi64 in %o2 1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	b	.L_out_3
+	add	%i2, 8, %i2
+
+.L_four_or_more:
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1 1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	sub	%o2, %o3, %o2		C mi64 in %o2 1st ASSIGNMENT
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .Loop
+	fmuld	u32, v16, r48
+
+.L_four:
+	b,a	.L_out_4
+
+C BEGIN MAIN LOOP
+	.align	16
+.Loop:
+C 00
+	srlx	%o4, 16, %o5		C (x >> 16)
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+C 01
+	add	%o5, %o2, %o2		C mi64 in %o2 2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+C 02
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+C 03
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+C 04
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+C 05
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+C 06
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+C 07
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+C 08
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+C 09
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1 1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+C 10
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+C 11
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+C 12
+	sub	%o2, %o3, %o2		C mi64 in %o2 1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+C 13
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .Loop
+	fmuld	u32, v16, r48
+C END MAIN LOOP
+
+.L_out_4:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	add	%o5, %o2, %o2		C mi64 in %o2 2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	fdtox	a00, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1 1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	sub	%o2, %o3, %o2		C mi64 in %o2 1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_3:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2 2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	fdtox	r64, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	fdtox	r80, a16
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1 1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2 1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_2:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2 2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1 1st ASSIGNMENT
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2 1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_1:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2 2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	or	%i3, %o5, %o5
+	stx	%o5, [%i4+%i2]
+
+	sllx	i00, 0, %g2
+	add	%g2, cy, cy
+	sllx	i16, 16, %g3
+	add	%g3, cy, cy
+
+	return	%i7+8
+	mov	cy, %o0		C delay slot: hand the carry limb back in %o0
+EPILOGUE(mpn_addmul_1)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm
new file mode 100644
index 0000000..37674d7
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm
@@ -0,0 +1,551 @@
+dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n limb number with 2-limb
+dnl  number and add the result to a n limb vector.
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 9 +C UltraSPARC 3: 10 + +C Algorithm: We use 16 floating-point multiplies per limb product, with the +C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand +C split into 32-bit pieces. We sum four 48-bit partial products using +C floating-point add, then convert the resulting four 50-bit quantities and +C transfer them to the integer unit. + +C Possible optimizations: +C 1. Align the stack area where we transfer the four 50-bit product-sums +C to a 32-byte boundary. That would minimize the cache collision. +C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would +C be to align the area to map to the area immediately before up?) +C 2. Perform two of the fp->int conversions with integer instructions. We +C can get almost ten free IEU slots, if we clean up bookkeeping and the +C silly carry-limb code. +C 3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb +C code. 
+ +C OSP (Overlapping software pipeline) version of mpn_mul_basecase: +C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles. +C FI = 20 +C L = 9 x un * vn +C WDFI = 10 x vn / 2 +C WD = 4 + +C Instruction classification (as per UltraSPARC functional units). +C Assuming silly carry code is fixed. Includes bookkeeping. +C +C mpn_addmul_X mpn_mul_X +C 1 2 1 2 +C ========== ========== +C FM 8 16 8 16 +C FA 10 18 10 18 +C MEM 12 12 10 10 +C ISHIFT 6 6 6 6 +C IADDLOG 11 11 10 10 +C BRANCH 1 1 1 1 +C +C TOTAL IEU 17 17 16 16 +C TOTAL 48 64 45 61 +C +C IEU cycles 8.5 8.5 8 8 +C MEM cycles 12 12 10 10 +C ISSUE cycles 12 16 11.25 15.25 +C FPU cycles 10 18 10 18 +C cycles/loop 12 18 12 18 +C cycles/limb 12 9 12 9 + + +C INPUT PARAMETERS +C rp[n + 1] i0 +C up[n] i1 +C n i2 +C vp[2] i3 + + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) + +C Combine registers: +C u00_hi= u32_hi +C u00_lo= u32_lo +C a000 = out000 +C a016 = out016 +C Free: f52 f54 + + +define(`p000', `%f8') define(`p016',`%f10') +define(`p032',`%f12') define(`p048',`%f14') +define(`p064',`%f16') define(`p080',`%f18') +define(`p096a',`%f20') define(`p112a',`%f22') +define(`p096b',`%f56') define(`p112b',`%f58') + +define(`out000',`%f0') define(`out016',`%f6') + +define(`v000',`%f24') define(`v016',`%f26') +define(`v032',`%f28') define(`v048',`%f30') +define(`v064',`%f44') define(`v080',`%f46') +define(`v096',`%f48') define(`v112',`%f50') + +define(`u00',`%f32') define(`u32', `%f34') + +define(`a000',`%f36') define(`a016',`%f38') +define(`a032',`%f40') define(`a048',`%f42') +define(`a064',`%f60') define(`a080',`%f62') + +define(`u00_hi',`%f2') define(`u32_hi',`%f4') +define(`u00_lo',`%f3') define(`u32_lo',`%f5') + +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0') define(`i16',`%l1') +define(`r00',`%l2') define(`r32',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + + +PROLOGUE(mpn_addmul_2) + +C Initialization. 
(1) Split v operand into eight 16-bit chunks and store them +C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs +C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'. +C This code could be better scheduled. + + save %sp, -256, %sp + +ifdef(`HAVE_VIS', +` mov -1, %g4 + wr %g0, 0xD2, %asi C ASI for the ldda loads below; they step through vp in 2-byte units + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + ldda [%i3+6] %asi, v000 + ldda [%i3+4] %asi, v016 + ldda [%i3+2] %asi, v032 + ldda [%i3+0] %asi, v048 + fxtod v000, v000 + ldda [%i3+14] %asi, v064 + fxtod v016, v016 + ldda [%i3+12] %asi, v080 + fxtod v032, v032 + ldda [%i3+10] %asi, v096 + fxtod v048, v048 + ldda [%i3+8] %asi, v112 + fxtod v064, v064 + fxtod v080, v080 + fxtod v096, v096 + fxtod v112, v112 + fzero u00_hi + fzero u32_hi +', +` mov -1, %g4 + ldx [%i3+0], %l0 C vp[0] + srlx %g4, 48, xffff C store mask in register `xffff' + ldx [%i3+8], %l1 C vp[1] + + and %l0, xffff, %g2 + stx %g2, [%sp+2223+0] C vp[0] bits 0-15 + srlx %l0, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+8] C vp[0] bits 16-31 + srlx %l0, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+16] C vp[0] bits 32-47 + srlx %l0, 48, %g3 + stx %g3, [%sp+2223+24] C vp[0] bits 48-63 + and %l1, xffff, %g2 + stx %g2, [%sp+2223+32] C vp[1] bits 0-15 + srlx %l1, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+40] C vp[1] bits 16-31 + srlx %l1, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+48] C vp[1] bits 32-47 + srlx %l1, 48, %g3 + stx %g3, [%sp+2223+56] C vp[1] bits 48-63 + + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + + ldd [%sp+2223+0], v000 + ldd [%sp+2223+8], v016 + ldd [%sp+2223+16], v032 + ldd [%sp+2223+24], v048 + fxtod v000, v000 + ldd [%sp+2223+32], v064 + fxtod v016, v016 + ldd [%sp+2223+40], v080 + fxtod v032, v032 + ldd [%sp+2223+48], v096 + fxtod v048, v048 + ldd [%sp+2223+56], v112 + fxtod v064, v064 + ld [%sp+2223+0], u00_hi C zero u00_hi + fxtod v080, v080 + ld [%sp+2223+0], u32_hi C zero u32_hi + fxtod v096, v096 + fxtod v112, v112 +') +C Initialization done.
+ mov 0, %g2 + mov 0, rlimb + mov 0, %g4 C %g2 rlimb %g4 are inputs to the first carry computation + add %i0, -8, %i0 C BOOKKEEPING + +C Start software pipeline. + + ld [%i1+4], u00_lo C read low 32 bits of up[i] + fxtod u00_hi, u00 +C mid + ld [%i1+0], u32_lo C read high 32 bits of up[i] + fmuld u00, v000, a000 + fmuld u00, v016, a016 + fmuld u00, v032, a032 + fmuld u00, v048, a048 + add %i2, -1, %i2 C BOOKKEEPING + fmuld u00, v064, p064 + add %i1, 8, %i1 C BOOKKEEPING + fxtod u32_hi, u32 + fmuld u00, v080, p080 + fmuld u00, v096, p096a + brnz,pt %i2, .L_2_or_more C taken unless n = 1 + fmuld u00, v112, p112a + +.L1: fdtox a000, out000 + fmuld u32, v000, p000 + fdtox a016, out016 + fmuld u32, v016, p016 + fmovd p064, a064 + fmuld u32, v032, p032 + fmovd p080, a080 + fmuld u32, v048, p048 + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 + std out016, [%sp+2223+24] + fxtod u00_hi, u00 + faddd p016, a048, a016 + fmuld u32, v080, p080 + faddd p032, a064, a032 + fmuld u32, v096, p096b + faddd p048, a080, a048 + fmuld u32, v112, p112b +C mid + fdtox a000, out000 + fdtox a016, out016 + faddd p064, p096a, a064 + faddd p080, p112a, a080 + std out000, [%sp+2223+0] + b .L_wd2 C n = 1: go straight to wind-down phase 2 + std out016, [%sp+2223+8] + +.L_2_or_more: + ld [%i1+4], u00_lo C read low 32 bits of up[i] + fdtox a000, out000 + fmuld u32, v000, p000 + fdtox a016, out016 + fmuld u32, v016, p016 + fmovd p064, a064 + fmuld u32, v032, p032 + fmovd p080, a080 + fmuld u32, v048, p048 + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 + std out016, [%sp+2223+24] + fxtod u00_hi, u00 + faddd p016, a048, a016 + fmuld u32, v080, p080 + faddd p032, a064, a032 + fmuld u32, v096, p096b + faddd p048, a080, a048 + fmuld u32, v112, p112b +C mid + ld [%i1+0], u32_lo C read high 32 bits of up[i] + fdtox a000, out000 + fmuld u00, v000, p000 + fdtox a016, out016 + fmuld u00, v016, p016 + faddd p064, p096a, a064 + fmuld u00, v032, p032 + faddd p080, p112a, a080 + fmuld u00, v048, p048 + add %i2, -1, %i2 C BOOKKEEPING + std out000, [%sp+2223+0] + faddd p000,
a032, a000 + fmuld u00, v064, p064 + add %i1, 8, %i1 C BOOKKEEPING + std out016, [%sp+2223+8] + fxtod u32_hi, u32 + faddd p016, a048, a016 + fmuld u00, v080, p080 + faddd p032, a064, a032 + fmuld u00, v096, p096a + faddd p048, a080, a048 + brnz,pt %i2, .L_3_or_more C taken unless n = 2 + fmuld u00, v112, p112a + + b .Lend C n = 2: skip the main loop + nop + +C 64 32 0 +C . . . +C . |__rXXX_| 32 +C . |___cy___| 34 +C . |_______i00__| 50 +C |_______i16__| . 50 + + +C BEGIN MAIN LOOP + .align 16 +.L_3_or_more: +.Loop: ld [%i1+4], u00_lo C read low 32 bits of up[i] + and %g2, xffffffff, %g2 + fdtox a000, out000 + fmuld u32, v000, p000 +C + lduw [%i0+4+8], r00 C read low 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + fmuld u32, v016, p016 +C + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + faddd p064, p096b, a064 + fmuld u32, v032, p032 +C + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + faddd p080, p112b, a080 + fmuld u32, v048, p048 +C + nop + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 +C + add i00, r00, rlimb + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + fxtod u00_hi, u00 +C + sllx i16, 16, %g2 + add cy, rlimb, rlimb + faddd p016, a048, a016 + fmuld u32, v080, p080 +C + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + faddd p032, a064, a032 + fmuld u32, v096, p096b +C + stw %l5, [%i0+4] C write back low 32 bits of rp[i] + nop + faddd p048, a080, a048 + fmuld u32, v112, p112b +C midloop + ld [%i1+0], u32_lo C read high 32 bits of up[i] + and %g2, xffffffff, %g2 + fdtox a000, out000 + fmuld u00, v000, p000 +C + lduw [%i0+0], r32 C read high 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + fmuld u00, v016, p016 +C + srlx %l5, 32, cy + ldx [%sp+2223+0], i00 + faddd p064, p096a, a064 + fmuld u00, v032, p032 +C + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + faddd p080, p112a, a080 + fmuld u00, v048, p048 +C + add %i2, -1, %i2 C BOOKKEEPING + std out000, [%sp+2223+0] + faddd p000, a032, a000 + fmuld u00, v064, p064 +C + add i00, r32, rlimb + add %i1, 8, %i1 C BOOKKEEPING + std
out016, [%sp+2223+8] + fxtod u32_hi, u32 +C + sllx i16, 16, %g2 + add cy, rlimb, rlimb + faddd p016, a048, a016 + fmuld u00, v080, p080 +C + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + faddd p032, a064, a032 + fmuld u00, v096, p096a +C + stw %l5, [%i0+0] C write back high 32 bits of rp[i] + faddd p048, a080, a048 + brnz,pt %i2, .Loop C loop until the limb count in %i2 reaches zero + fmuld u00, v112, p112a +C END MAIN LOOP + +C WIND-DOWN PHASE 1 +.Lend: and %g2, xffffffff, %g2 + fdtox a000, out000 + fmuld u32, v000, p000 + lduw [%i0+4+8], r00 C read low 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + fmuld u32, v016, p016 + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + faddd p064, p096b, a064 + fmuld u32, v032, p032 + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + faddd p080, p112b, a080 + fmuld u32, v048, p048 + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 + add i00, r00, rlimb + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + faddd p016, a048, a016 + fmuld u32, v080, p080 + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + faddd p032, a064, a032 + fmuld u32, v096, p096b + stw %l5, [%i0+4] + faddd p048, a080, a048 + fmuld u32, v112, p112b +C mid + and %g2, xffffffff, %g2 + fdtox a000, out000 + lduw [%i0+0], r32 C read high 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + srlx %l5, 32, cy + ldx [%sp+2223+0], i00 + faddd p064, p096a, a064 + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + faddd p080, p112a, a080 + std out000, [%sp+2223+0] + add i00, r32, rlimb + std out016, [%sp+2223+8] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+0] + +C WIND-DOWN PHASE 2 +.L_wd2: and %g2, xffffffff, %g2 + fdtox a032, out000 + lduw [%i0+4+8], r00 C read low 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a048, out016 + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + std out000, [%sp+2223+16] + add i00, r00, rlimb + add %i0, 8, %i0 C BOOKKEEPING + std out016,
[%sp+2223+24] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+4] +C mid + and %g2, xffffffff, %g2 + fdtox a064, out000 + lduw [%i0+0], r32 C read high 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a080, out016 + srlx %l5, 32, cy + ldx [%sp+2223+0], i00 + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + std out000, [%sp+2223+0] + add i00, r32, rlimb + std out016, [%sp+2223+8] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+0] + +C WIND-DOWN PHASE 3 +.L_wd3: and %g2, xffffffff, %g2 + fdtox p096b, out000 + add %g2, rlimb, %l5 + fdtox p112b, out016 + srlx %l5, 32, cy + ldx [%sp+2223+16], rlimb + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + std out000, [%sp+2223+16] + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+4] +C mid + and %g2, xffffffff, %g2 + add %g2, rlimb, %l5 + srlx %l5, 32, cy + ldx [%sp+2223+0], rlimb + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+0] + + and %g2, xffffffff, %g2 + add %g2, rlimb, %l5 + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + + sllx i16, 16, %g2 + add i00, cy, cy + return %i7+8 + add %g2, cy, %o0 C delay slot of return: form the return value +EPILOGUE(mpn_addmul_2) diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm new file mode 100644 index 0000000..47286d5 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm @@ -0,0 +1,165 @@ +dnl SPARC v9 mpn_lshiftc -- left shift of an n-limb vector by cnt bits, result stored complemented + +dnl Copyright 1996, 2000-2003, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 3 +C UltraSPARC 3: 2.67 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`cnt',`%i3') + +define(`u0', `%l0') +define(`u1', `%l2') +define(`u2', `%l4') +define(`u3', `%l6') + +define(`tnc',`%i4') + +define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_lshiftc) + save %sp,-160,%sp + + sllx n,3,%g1 C operand size in bytes + sub %g0,cnt,tnc C negate shift count; srlx uses only its low 6 bits, i.e. 64-cnt + add up,%g1,up C make up (%i1) point at end of src + add rp,%g1,rp C make rp (%i0) point at end of res + ldx [up-8],u3 C load first limb + subcc n,5,n + srlx u3,tnc,%i5 C compute function result + bl,pn %xcc,.Lend1234 C fewer than 5 limbs: handle them one at a time + sllx u3,cnt,%g3 C delay slot: high part of the top result limb + + subcc n,4,n + ldx [up-16],u0 + ldx [up-24],u1 + add up,-32,up + ldx [up-0],u2 + ldx [up-8],u3 + srlx u0,tnc,%g2 + bl,pn %xcc,.Lend5678 + not %g3, %g3 + + b,a .Loop + ALIGN(16) +.Loop: + sllx u0,cnt,%g1 + andn %g3,%g2,%g3 C ~hi & ~lo = ~(hi | lo), the complemented result limb + ldx [up-16],u0 +
fanop +C -- + srlx u1,tnc,%g2 + subcc n,4,n + stx %g3,[rp-8] + not %g1, %g1 +C -- + sllx u1,cnt,%g3 + andn %g1,%g2,%g1 + ldx [up-24],u1 + fanop +C -- + srlx u2,tnc,%g2 + stx %g1,[rp-16] + add up,-32,up + not %g3, %g3 +C -- + sllx u2,cnt,%g1 + andn %g3,%g2,%g3 + ldx [up-0],u2 + fanop +C -- + srlx u3,tnc,%g2 + stx %g3,[rp-24] + add rp,-32,rp + not %g1, %g1 +C -- + sllx u3,cnt,%g3 + andn %g1,%g2,%g1 + ldx [up-8],u3 + fanop +C -- + srlx u0,tnc,%g2 + stx %g1,[rp-0] + bge,pt %xcc,.Loop C tests the flags set by subcc n,4,n earlier in this iteration + not %g3, %g3 +C -- +.Lend5678: + sllx u0,cnt,%g1 + andn %g3,%g2,%g3 + srlx u1,tnc,%g2 + stx %g3,[rp-8] + not %g1, %g1 + sllx u1,cnt,%g3 + andn %g1,%g2,%g1 + srlx u2,tnc,%g2 + stx %g1,[rp-16] + not %g3, %g3 + sllx u2,cnt,%g1 + andn %g3,%g2,%g3 + srlx u3,tnc,%g2 + stx %g3,[rp-24] + add rp,-32,rp + not %g1, %g1 + sllx u3,cnt,%g3 C carry... + andn %g1,%g2,%g1 + stx %g1,[rp-0] + +.Lend1234: + addcc n,4,n C limbs left for Loop0; zero means only the final store remains + bz,pn %xcc,.Lret + fanop +.Loop0: + add rp,-8,rp + subcc n,1,n + ldx [up-16],u3 + add up,-8,up + srlx u3,tnc,%g2 + not %g3, %g3 + andn %g3,%g2,%g3 + stx %g3,[rp] + sllx u3,cnt,%g3 + bnz,pt %xcc,.Loop0 + fanop +.Lret: + not %g3, %g3 C final limb; the vacated low cnt bits come out as ones + stx %g3,[rp-8] + mov %i5,%i0 C return the value computed from the first limb at entry + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm new file mode 100644 index 0000000..871d562 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm @@ -0,0 +1,580 @@ +dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1998, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 14 +C UltraSPARC 3: 18.5 + +C Algorithm: We use eight floating-point multiplies per limb product, with the +C invariant v operand split into four 16-bit pieces, and the up operand split +C into 32-bit pieces. We sum pairs of 48-bit partial products using +C floating-point add, then convert the four 49-bit product-sums and transfer +C them to the integer unit. + +C Possible optimizations: +C 1. Align the stack area where we transfer the four 49-bit product-sums +C to a 32-byte boundary. That would minimize cache collisions. +C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would +C be to align the area to map to the area immediately before up?) +C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the +C development version of mpn_addmul_2. This would save many integer instructions. +C 3. Unrolling. Questionable if it is worth the code expansion, given that +C it could only save 1 cycle/limb. +C 4. Specialize for particular v values. If its upper 32 bits are zero, we +C could save many operations, in the FPU (fmuld), but more so in the IEU +C since we'll be summing 48-bit quantities, which might be simpler. +C 5.
Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and +C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should +C not be greater than needed for L2 cache latency, and also not so great +C that i16 needs to be copied. +C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want +C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU +C ops.) + +C Instruction classification (as per UltraSPARC-1/2 functional units): +C 8 FM +C 10 FA +C 11 MEM +C 9 ISHIFT + 10? IADDLOG +C 1 BRANCH +C 49 insns in total (plus three mov insns that should be optimized out) + +C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e., we +C sustain 3.79 instructions/cycle. + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) + +define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') +define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') +define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') +define(`u00',`%f32') define(`u32', `%f34') +define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + +PROLOGUE(mpn_mul_1) + +C Initialization. (1) Split v operand into four 16-bit chunks and store them +C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs +C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+ + save %sp, -256, %sp + mov -1, %g4 + srlx %g4, 48, xffff C store mask in register `xffff' + and %i3, xffff, %g2 + stx %g2, [%sp+2223+0] C v bits 0-15 + srlx %i3, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+8] C v bits 16-31 + srlx %i3, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+16] C v bits 32-47 + srlx %i3, 48, %g3 + stx %g3, [%sp+2223+24] C v bits 48-63 + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + + sllx %i2, 3, %i2 C n limbs as a byte count + mov 0, cy C clear cy + add %i0, %i2, %i0 + add %i1, %i2, %i1 + neg %i2 C negative byte index that counts up to zero + add %i1, 4, %i5 C %i5 addresses the low 32-bit halves of the up limbs + add %i0, -32, %i4 C %i4 is the base used by the result stores + add %i0, -16, %i0 + + ldd [%sp+2223+0], v00 + ldd [%sp+2223+8], v16 + ldd [%sp+2223+16], v32 + ldd [%sp+2223+24], v48 + ld [%sp+2223+0],%f2 C zero f2 + ld [%sp+2223+0],%f4 C zero f4 + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fxtod v00, v00 + fxtod v16, v16 + fxtod v32, v32 + fxtod v48, v48 + +C Start real work. (We sneakingly read f3 and f5 above...) +C The software pipeline is very deep, requiring 4 feed-in stages. + + fxtod %f2, u00 + fxtod %f4, u32 + fmuld u00, v00, a00 + fmuld u00, v16, a16 + fmuld u00, v32, p32 + fmuld u32, v00, r32 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_two_or_more C taken unless n = 1 + fmuld u32, v16, r48 + +.L_one: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + fdtox r80, a16 + ldx [%sp+2223+0], i00 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + add %i2, 8, %i2 + + mov i00, %g5 C i00+ now in g5 + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + srlx %g4, 32, %o3 C (i32 >> 32) + add
%l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_1 C n = 1: join the last wind-down stage + add %i2, 8, %i2 + +.L_two_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fxtod %f2, u00 + fxtod %f4, u32 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_three_or_more C taken unless n = 2 + fmuld u32, v16, r48 + +.L_two: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + mov i00, %g5 C i00+ now in g5 + fdtox r80, a16 + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx
%o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_2 C n = 2: join the wind-down two stages from the end + add %i2, 8, %i2 + +.L_three_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_four_or_more C taken unless n = 3 + fmuld u32, v16, r48 + +.L_three: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_3 C n = 3: join the wind-down three stages from the end + add %i2, 8, %i2 + +.L_four_or_more: + ld [%i5+%i2], %f3 C read low
32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + faddd p16, r80, a16 + fmuld u00, v48, p48 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %xcc, .Loop C taken unless n = 4 + fmuld u32, v16, r48 + +.L_four: + b,a .L_out_4 C n = 4: skip the main loop + +C BEGIN MAIN LOOP + .align 16 +.Loop: +C 00 + srlx %o4, 16, %o5 C (x >> 16) + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 +C 01 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 +C 02 + faddd p48, r48, a48 +C 03 + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 +C 04 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 +C 05 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 +C 06 + srlx i48, 16, %l5
C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 +C 07 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 C (mi64 << 16) | (x & 0xffff) + ldx [%sp+2223+24], i48 + fdtox a48, a48 +C 08 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 +C 09 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 +C 10 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 +C 11 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 +C 12 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] C store a finished result limb + faddd p16, r80, a16 + fmuld u00, v48, p48 +C 13 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %xcc, .Loop C loop until the byte index in %i2 reaches zero + fmuld u32, v16, r48 +C END MAIN LOOP + +.L_out_4: + srlx %o4, 16, %o5 C (x >> 16) + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox a00, a00 + faddd p48, r48, a48 + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24]
+ sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] C store a finished result limb + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_3: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox r64, a00 + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + fdtox r80, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] C store a finished result limb + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_2: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] C store a finished result limb + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_1: + srlx
%o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx %o2, 48, %o7 C (mi64 >> 48) + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + or %i3, %o5, %o5 + stx %o5, [%i4+%i2] + + sllx i00, 0, %g2 + add %g2, cy, cy + sllx i16, 16, %g3 + add %g3, cy, cy + + return %i7+8 + mov cy, %o0 +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm new file mode 100644 index 0000000..43c69d3 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm @@ -0,0 +1,342 @@ +dnl SPARC v9 64-bit mpn_sqr_diagonal. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 22 +C UltraSPARC 3: 36 + +C This was generated by the Sun C compiler. 
It runs at 22 cycles/limb on the
+C UltraSPARC-1/2, three cycles slower than theoretically possible for optimal
+C code using the same algorithm. For 1-3 limbs, a special loop was generated,
+C which causes performance problems in particular for 2 and 3 limbs.
+C Ultimately, this should be replaced by hand-written code in the same software
+C pipeline style as e.g., addmul_1.asm.
+C Registers read at entry by the code below: rp in %i0, up in %i1, n in %i2.
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sqr_diagonal)
+	save	%sp, -240, %sp
+
+	sethi	%hi(0x1ffc00), %o0
+	sethi	%hi(0x3ffc00), %o1
+	add	%o0, 1023, %o7		C %o7 = 0x1fffff, 21-bit mask
+	cmp	%i2, 4			C n below 4 is handled at .Lsmall
+	add	%o1, 1023, %o4		C %o4 = 0x3fffff, 22-bit mask
+	or	%g0, %i1, %g1		C %g1 = up
+	or	%g0, %i0, %o0		C %o0 = rp
+	bl,pn	%xcc, .Lsmall
+	or	%g0, 0, %g2		C %g2 = limb counter (delay slot)
+
+	ldx	[%i1], %o1
+	add	%i1, 24, %g1
+	or	%g0, 3, %g2		C three limbs are fetched before the loop
+	srlx	%o1, 42, %g3
+	stx	%g3, [%sp+2279]
+	and	%o1, %o7, %o2
+	stx	%o2, [%sp+2263]
+	srlx	%o1, 21, %o1
+	ldd	[%sp+2279], %f0
+	and	%o1, %o7, %o1
+	stx	%o1, [%sp+2271]
+	ldx	[%i1+8], %o2
+	fxtod	%f0, %f12
+	srlx	%o2, 21, %o1
+	and	%o2, %o7, %g3
+	ldd	[%sp+2263], %f2
+	fmuld	%f12, %f12, %f10
+	srlx	%o2, 42, %o2
+	ldd	[%sp+2271], %f0
+	and	%o1, %o7, %o1
+	fxtod	%f2, %f8
+	stx	%o2, [%sp+2279]
+	stx	%o1, [%sp+2271]
+	fxtod	%f0, %f0
+	stx	%g3, [%sp+2263]
+	fdtox	%f10, %f14
+	fmuld	%f12, %f8, %f6
+	ldx	[%i1+16], %o2
+	std	%f14, [%sp+2255]
+	fmuld	%f0, %f0, %f2
+	fmuld	%f8, %f8, %f10
+	srlx	%o2, 42, %o1
+	faddd	%f6, %f6, %f6
+	fmuld	%f12, %f0, %f12
+	fmuld	%f0, %f8, %f8
+	ldd	[%sp+2279], %f0
+	ldd	[%sp+2263], %f4
+	fdtox	%f10, %f10
+	std	%f10, [%sp+2239]
+	faddd	%f2, %f6, %f6
+	ldd	[%sp+2271], %f2
+	fdtox	%f12, %f12
+	std	%f12, [%sp+2247]
+	fdtox	%f8, %f8
+	std	%f8, [%sp+2231]
+	fdtox	%f6, %f6
+	std	%f6, [%sp+2223]
+
+.Loop:	srlx	%o2, 21, %g3		C main loop, stores one square (two limbs) per pass
+	stx	%o1, [%sp+2279]
+	add	%g2, 1, %g2
+	and	%g3, %o7, %o1
+	ldx	[%sp+2255], %g4
+	cmp	%g2, %i2		C loop while %g2 is below n
+	stx	%o1, [%sp+2271]
+	add	%g1, 8, %g1
+	add	%o0, 16, %o0
+	ldx	[%sp+2239], %o1
+	fxtod	%f0, %f10
+	fxtod	%f4, %f14
+	ldx	[%sp+2231], %i0
+	ldx	[%sp+2223], %g5
+	ldx	[%sp+2247], %g3
+	and	%o2, %o7, %o2
+	fxtod	%f2, %f8
+	fmuld	%f10, %f10, %f0
+	stx	%o2, [%sp+2263]
+	fmuld	%f10, %f14, %f6
+	ldx	[%g1-8], %o2		C fetch the next source limb
+	fmuld	%f10, %f8, %f12
+	fdtox	%f0, %f2
+	ldd	[%sp+2279], %f0
+	fmuld	%f8, %f8, %f4
+	faddd	%f6, %f6, %f6
+	fmuld	%f14, %f14, %f10
+	std	%f2, [%sp+2255]
+	sllx	%g4, 20, %g4
+	ldd	[%sp+2271], %f2
+	fmuld	%f8, %f14, %f8
+	sllx	%i0, 22, %i1
+	fdtox	%f12, %f12
+	std	%f12, [%sp+2247]
+	sllx	%g5, 42, %i0
+	add	%o1, %i1, %o1
+	faddd	%f4, %f6, %f6
+	ldd	[%sp+2263], %f4
+	add	%o1, %i0, %o1
+	add	%g3, %g4, %g3
+	fdtox	%f10, %f10
+	std	%f10, [%sp+2239]
+	srlx	%o1, 42, %g4
+	and	%g5, %o4, %i0
+	fdtox	%f8, %f8
+	std	%f8, [%sp+2231]
+	srlx	%g5, 22, %g5
+	sub	%g4, %i0, %g4
+	fdtox	%f6, %f6
+	std	%f6, [%sp+2223]
+	srlx	%g4, 63, %g4
+	add	%g3, %g5, %g3
+	add	%g3, %g4, %g3
+	stx	%o1, [%o0-16]		C low limb of a square
+	srlx	%o2, 42, %o1
+	bl,pt	%xcc, .Loop
+	stx	%g3, [%o0-8]		C high limb of a square (delay slot)
+C Wind-down: the stores below write six more limbs, i.e. the last three squares.
+	stx	%o1, [%sp+2279]
+	srlx	%o2, 21, %o1
+	fxtod	%f0, %f16
+	ldx	[%sp+2223], %g3
+	fxtod	%f4, %f6
+	and	%o2, %o7, %o3
+	stx	%o3, [%sp+2263]
+	fxtod	%f2, %f4
+	and	%o1, %o7, %o1
+	ldx	[%sp+2231], %o2
+	sllx	%g3, 42, %g4
+	fmuld	%f16, %f16, %f14
+	stx	%o1, [%sp+2271]
+	fmuld	%f16, %f6, %f8
+	add	%o0, 48, %o0
+	ldx	[%sp+2239], %o1
+	sllx	%o2, 22, %o2
+	fmuld	%f4, %f4, %f10
+	ldx	[%sp+2255], %o3
+	fdtox	%f14, %f14
+	fmuld	%f4, %f6, %f2
+	std	%f14, [%sp+2255]
+	faddd	%f8, %f8, %f12
+	add	%o1, %o2, %o2
+	fmuld	%f16, %f4, %f4
+	ldd	[%sp+2279], %f0
+	sllx	%o3, 20, %g5
+	add	%o2, %g4, %o2
+	fmuld	%f6, %f6, %f6
+	srlx	%o2, 42, %o3
+	and	%g3, %o4, %g4
+	srlx	%g3, 22, %g3
+	faddd	%f10, %f12, %f16
+	ldd	[%sp+2271], %f12
+	ldd	[%sp+2263], %f8
+	fxtod	%f0, %f0
+	sub	%o3, %g4, %o3
+	ldx	[%sp+2247], %o1
+	srlx	%o3, 63, %o3
+	fdtox	%f2, %f10
+	fxtod	%f8, %f8
+	std	%f10, [%sp+2231]
+	fdtox	%f6, %f6
+	std	%f6, [%sp+2239]
+	add	%o1, %g5, %o1
+	fmuld	%f0, %f0, %f2
+	fdtox	%f16, %f16
+	std	%f16, [%sp+2223]
+	add	%o1, %g3, %o1
+	fdtox	%f4, %f4
+	std	%f4, [%sp+2247]
+	fmuld	%f0, %f8, %f10
+	fxtod	%f12, %f12
+	add	%o1, %o3, %o1
+	stx	%o2, [%o0-48]
+	fmuld	%f8, %f8, %f6
+	stx	%o1, [%o0-40]
+	fdtox	%f2, %f2
+	ldx	[%sp+2231], %o2
+	faddd	%f10, %f10, %f10
+	ldx	[%sp+2223], %g3
+	fmuld	%f12, %f12, %f4
+	fdtox	%f6, %f6
+	ldx	[%sp+2239], %o1
+	sllx	%o2, 22, %o2
+	fmuld	%f12, %f8, %f8
+	sllx	%g3, 42, %g5
+	ldx	[%sp+2255], %o3
+	fmuld	%f0, %f12, %f0
+	add	%o1, %o2, %o2
+	faddd	%f4, %f10, %f4
+	ldx	[%sp+2247], %o1
+	add	%o2, %g5, %o2
+	and	%g3, %o4, %g4
+	fdtox	%f8, %f8
+	sllx	%o3, 20, %g5
+	std	%f8, [%sp+2231]
+	fdtox	%f0, %f0
+	srlx	%o2, 42, %o3
+	add	%o1, %g5, %o1
+	fdtox	%f4, %f4
+	srlx	%g3, 22, %g3
+	sub	%o3, %g4, %o3
+	std	%f6, [%sp+2239]
+	std	%f4, [%sp+2223]
+	srlx	%o3, 63, %o3
+	add	%o1, %g3, %o1
+	std	%f2, [%sp+2255]
+	add	%o1, %o3, %o1
+	std	%f0, [%sp+2247]
+	stx	%o2, [%o0-32]
+	stx	%o1, [%o0-24]
+	ldx	[%sp+2231], %o2
+	ldx	[%sp+2223], %o3
+	ldx	[%sp+2239], %o1
+	sllx	%o2, 22, %o2
+	sllx	%o3, 42, %g5
+	ldx	[%sp+2255], %g4
+	and	%o3, %o4, %g3
+	add	%o1, %o2, %o2
+	ldx	[%sp+2247], %o1
+	add	%o2, %g5, %o2
+	stx	%o2, [%o0-16]
+	sllx	%g4, 20, %g4
+	srlx	%o2, 42, %o2
+	add	%o1, %g4, %o1
+	srlx	%o3, 22, %o3
+	sub	%o2, %g3, %o2
+	srlx	%o2, 63, %o2
+	add	%o1, %o3, %o1
+	add	%o1, %o2, %o1
+	stx	%o1, [%o0-8]
+	ret
+	restore	%g0, %g0, %g0
+.Lsmall:				C n below 4: one limb per pass, not pipelined
+	ldx	[%g1], %o2
+.Loop0:
+	and	%o2, %o7, %o1		C a = low 21 bits of the limb
+	stx	%o1, [%sp+2263]
+	add	%g2, 1, %g2
+	srlx	%o2, 21, %o1
+	add	%g1, 8, %g1
+	srlx	%o2, 42, %o2		C c = high 22 bits
+	stx	%o2, [%sp+2279]
+	and	%o1, %o7, %o1		C b = middle 21 bits
+	ldd	[%sp+2263], %f0
+	cmp	%g2, %i2		C tested by the bl at the bottom of the loop
+	stx	%o1, [%sp+2271]
+	fxtod	%f0, %f6		C %f6 = a
+	ldd	[%sp+2279], %f0
+	ldd	[%sp+2271], %f4
+	fxtod	%f0, %f2		C %f2 = c
+	fmuld	%f6, %f6, %f0		C a*a
+	fxtod	%f4, %f10		C %f10 = b
+	fmuld	%f2, %f6, %f4		C c*a
+	fdtox	%f0, %f0
+	std	%f0, [%sp+2239]
+	fmuld	%f10, %f6, %f8		C b*a
+	fmuld	%f10, %f10, %f0		C b*b
+	faddd	%f4, %f4, %f6		C 2*c*a
+	fmuld	%f2, %f2, %f4		C c*c
+	fdtox	%f8, %f8
+	std	%f8, [%sp+2231]
+	fmuld	%f2, %f10, %f2		C c*b
+	faddd	%f0, %f6, %f0		C b*b + 2*c*a
+	fdtox	%f4, %f4
+	std	%f4, [%sp+2255]
+	fdtox	%f2, %f2
+	std	%f2, [%sp+2247]
+	fdtox	%f0, %f0
+	std	%f0, [%sp+2223]
+	ldx	[%sp+2239], %o1
+	ldx	[%sp+2255], %g4
+	ldx	[%sp+2231], %o2
+	sllx	%g4, 20, %g4		C c*c * 2^20, part of the high limb
+	ldx	[%sp+2223], %o3
+	sllx	%o2, 22, %o2
+	sllx	%o3, 42, %g5
+	add	%o1, %o2, %o2
+	ldx	[%sp+2247], %o1
+	add	%o2, %g5, %o2
+	stx	%o2, [%o0]		C low limb = a*a + b*a*2^22 + (b*b+2*c*a)*2^42
+	and	%o3, %o4, %g3
+	srlx	%o2, 42, %o2
+	add	%o1, %g4, %o1
+	srlx	%o3, 22, %o3
+	sub	%o2, %g3, %o2
+	srlx	%o2, 63, %o2		C sign bit of the difference = carry out of the low limb
+	add	%o1, %o3, %o1
+	add	%o1, %o2, %o1
+	stx	%o1, [%o0+8]		C high limb
+	add	%o0, 16, %o0
+	bl,a,pt	%xcc, .Loop0		C loop while %g2 is below n
+	ldx	[%g1], %o2
+	ret
+	restore	%g0, %g0, %g0
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm
new file mode 100644
index 0000000..9fb7f70
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm
@@ -0,0 +1,241 @@
+dnl SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 4
+C UltraSPARC 3: 4.5
+
+C Compute carry-out from the most significant bits of u,v, and r, where
+C r=u-v-carry_in, using logic operations.
+
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2. It has a 4-insn
+C recurrence, and on the UltraSPARC 1 and 2 the IE units are 100% saturated.
+C Therefore, it seems futile to try to optimize this any further...
+
+C INPUT PARAMETERS
+define(`rp',`%i0')
+define(`up',`%i1')
+define(`vp',`%i2')
+define(`n',`%i3')
+
+define(`u0',`%l0')
+define(`u1',`%l2')
+define(`u2',`%l4')
+define(`u3',`%l6')
+define(`v0',`%l1')
+define(`v1',`%l3')
+define(`v2',`%l5')
+define(`v3',`%l7')
+
+define(`cy',`%i4') dnl Borrow: carry-in argument of mpn_sub_nc, zeroed by mpn_sub_n
+
+define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sub_nc)
+	save	%sp,-160,%sp
+
+	fitod	%f0,%f0		C make sure f0 contains small, quiet number
+	subcc	n,4,%g0
+	bl,pn	%xcc,.Loop0	C fewer than 4 limbs: one-limb-per-pass loop
+	nop
+	b,a	L(com)		C join mpn_sub_n with cy as passed by the caller
+EPILOGUE()
+
+PROLOGUE(mpn_sub_n)
+	save	%sp,-160,%sp
+
+	fitod	%f0,%f0		C make sure f0 contains small, quiet number
+	subcc	n,4,%g0
+	bl,pn	%xcc,.Loop0	C fewer than 4 limbs: one-limb-per-pass loop
+	mov	0,cy		C no carry-in (delay slot)
+L(com):
+	ldx	[up+0],u0
+	ldx	[vp+0],v0
+	add	up,32,up
+	ldx	[up-24],u1
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	ldx	[up-16],u2
+	ldx	[vp-16],v2
+	ldx	[up-8],u3
+	ldx	[vp-8],v3
+	subcc	n,8,n
+	sub	u0,v0,%g1	C main sub
+	sub	%g1,cy,%g5	C carry sub
+	orn	u0,v0,%g2	C u | ~v
+	bl,pn	%xcc,.Lend4567	C 4..7 limbs: skip the main loop
+	fanop
+	b,a	.Loop
+
+	.align	16
+C START MAIN LOOP
+.Loop:	orn	%g5,%g2,%g2	C r | (~u & v)
+	andn	u0,v0,%g3	C u & ~v
+	ldx	[up+0],u0
+	fanop
+C --
+	andn	%g2,%g3,%g2	C most significant bit is now the borrow out of this limb
+	ldx	[vp+0],v0
+	add	up,32,up
+	fanop
+C --
+	srlx	%g2,63,cy	C cy = borrow, 0 or 1
+	sub	u1,v1,%g1
+	stx	%g5,[rp+0]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u1,v1,%g2
+	fmnop
+	fanop
+C --
+	orn	%g5,%g2,%g2
+	andn	u1,v1,%g3
+	ldx	[up-24],u1
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u2,v2,%g1
+	stx	%g5,[rp+8]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u2,v2,%g2
+	fmnop
+	fanop
+C --
+	orn	%g5,%g2,%g2
+	andn	u2,v2,%g3
+	ldx	[up-16],u2
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp-16],v2
+	add	rp,32,rp
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u3,v3,%g1
+	stx	%g5,[rp-16]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u3,v3,%g2
+	fmnop
+	fanop
+C --
+	orn	%g5,%g2,%g2
+	andn	u3,v3,%g3
+	ldx	[up-8],u3
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	subcc	n,4,n
+	ldx	[vp-8],v3
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u0,v0,%g1
+	stx	%g5,[rp-8]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u0,v0,%g2
+	bge,pt	%xcc,.Loop
+	fanop
+C END MAIN LOOP
+.Lend4567:			C finish the four limbs that are already loaded
+	orn	%g5,%g2,%g2
+	andn	u0,v0,%g3
+	andn	%g2,%g3,%g2
+	srlx	%g2,63,cy
+	sub	u1,v1,%g1
+	stx	%g5,[rp+0]
+	sub	%g1,cy,%g5
+	orn	u1,v1,%g2
+	orn	%g5,%g2,%g2
+	andn	u1,v1,%g3
+	andn	%g2,%g3,%g2
+	srlx	%g2,63,cy
+	sub	u2,v2,%g1
+	stx	%g5,[rp+8]
+	sub	%g1,cy,%g5
+	orn	u2,v2,%g2
+	orn	%g5,%g2,%g2
+	andn	u2,v2,%g3
+	andn	%g2,%g3,%g2
+	add	rp,32,rp
+	srlx	%g2,63,cy
+	sub	u3,v3,%g1
+	stx	%g5,[rp-16]
+	sub	%g1,cy,%g5
+	orn	u3,v3,%g2
+	orn	%g5,%g2,%g2
+	andn	u3,v3,%g3
+	andn	%g2,%g3,%g2
+	srlx	%g2,63,cy
+	stx	%g5,[rp-8]
+
+	addcc	n,4,n		C n = limbs still to do, 0..3
+	bz,pn	%xcc,.Lret
+	fanop
+
+.Loop0:	ldx	[up],u0		C one limb per pass
+	add	up,8,up
+	ldx	[vp],v0
+	add	vp,8,vp
+	add	rp,8,rp
+	subcc	n,1,n
+	sub	u0,v0,%g1
+	orn	u0,v0,%g2
+	sub	%g1,cy,%g5
+	andn	u0,v0,%g3
+	orn	%g5,%g2,%g2
+	stx	%g5,[rp-8]
+	andn	%g2,%g3,%g2
+	bnz,pt	%xcc,.Loop0
+	srlx	%g2,63,cy	C borrow for the next limb (delay slot)
+
+.Lret:	mov	cy,%i0		C return the final borrow
+	ret
+	restore
+EPILOGUE(mpn_sub_n)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm
new file mode 100644
index 0000000..0bdb566
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm
@@ -0,0 +1,68 @@
+dnl SPARC v9 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 18
+C UltraSPARC 3: 23
+
+C INPUT PARAMETERS
+C rp i0
+C up i1
+C n i2
+C v i3
+C Implemented as mpn_mul_1 into a stack temporary followed by mpn_sub_n.
+ASM_START()
+	REGISTER(%g2,#scratch)
+
+PROLOGUE(mpn_submul_1)
+	save	%sp,-176,%sp
+
+	sllx	%i2, 3, %g2		C n limbs = n*8 bytes
+	or	%g0, %i1, %o1		C second argument: up
+	add	%g2, 15, %o0
+	or	%g0, %i2, %o2		C third argument: n
+	and	%o0, -16, %o0		C byte count rounded up to a multiple of 16
+	sub	%sp, %o0, %sp		C carve the temporary area off the stack
+	add	%sp, 2223, %o0		C first argument: pointer to the temporary
+	or	%g0, %o0, %l0		C keep that pointer across the call
+	call	mpn_mul_1		C tmp = up * v
+	or	%g0, %i3, %o3		C fourth argument: v (delay slot)
+	or	%g0, %o0, %l1		C preserve carry value from mpn_mul_1
+	or	%g0, %i0, %o0
+	or	%g0, %i0, %o1
+	or	%g0, %l0, %o2
+	call	mpn_sub_n		C rp = rp - tmp
+	or	%g0, %i2, %o3		C fourth argument: n (delay slot)
+	ret
+	restore	%l1, %o0, %o0		C sum carry values
+EPILOGUE(mpn_submul_1)
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h b/gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h
new file mode 100644
index 0000000..c88e680
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* ultrasparc3/4 gmp-mparam.h -- Compiler/machine parameter header file.
+ +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010, 2014, 2015 Free +Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */ +/* FFT tuning limit = 100 M */ +/* Generated by tuneup.c, 2015-10-09, gcc 3.4 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 22 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 29 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 93 +#define MUL_TOOM44_THRESHOLD 142 +#define MUL_TOOM6H_THRESHOLD 165 +#define MUL_TOOM8H_THRESHOLD 278 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 50 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 67 + +#define SQR_BASECASE_THRESHOLD 7 +#define SQR_TOOM2_THRESHOLD 70 +#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM4_THRESHOLD 184 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 339 + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 212, 5}, { 13, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23, 8}, { 47, 9}, { 27,10}, { 15, 9}, \ 
+ { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \ + { 31, 9}, { 63, 8}, { 127, 7}, { 255, 9}, \ + { 67,10}, { 39, 9}, { 79, 8}, { 159, 7}, \ + { 319, 9}, { 83,10}, { 47, 9}, { 95, 8}, \ + { 191, 7}, { 383,10}, { 55,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 7}, { 511,10}, \ + { 71, 9}, { 143, 8}, { 287,10}, { 79, 9}, \ + { 159, 8}, { 319, 9}, { 175, 8}, { 351,11}, \ + { 47,10}, { 95, 9}, { 191, 8}, { 383, 7}, \ + { 767,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95,10}, { 207, 9}, { 415,11}, \ + { 111,10}, { 223, 9}, { 479,12}, { 63,11}, \ + { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 319, 9}, { 639,11}, { 175,10}, \ + { 351,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,11}, { 223,10}, { 447,13}, { 63,12}, \ + { 127,11}, { 287,10}, { 575,11}, { 319,10}, \ + { 703,12}, { 191,11}, { 383,12}, { 223,11}, \ + { 447,13}, { 127,12}, { 287,11}, { 575,12}, \ + { 351,13}, { 191,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 575,13}, { 319,12}, { 703,13}, \ + { 383,12}, { 767,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,13}, { 703,14}, { 383,13}, { 831,12}, \ + { 1663,13}, { 895,15}, { 255,14}, { 511,13}, \ + { 1151,14}, { 639,13}, { 1407,12}, { 2815,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1791,15}, \ + { 511,14}, { 1023,13}, { 2047,14}, { 1151,13}, \ + { 2303,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1791,16}, { 511,15}, { 1023,14}, { 2303,15}, \ + { 1279,14}, { 2815,15}, { 1535,14}, { 3199,15}, \ + { 1791,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2815,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 171 +#define MUL_FFT_THRESHOLD 2240 + +#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 244, 5}, { 8, 4}, { 17, 5}, { 15, 6}, \ + { 8, 5}, { 17, 6}, { 
17, 7}, { 9, 6}, \ + { 19, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 31, 9}, { 19, 8}, { 39, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \ + { 287, 7}, { 575,10}, { 79, 9}, { 159,11}, \ + { 47, 9}, { 191, 8}, { 383, 7}, { 767, 9}, \ + { 207,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 511,10}, { 135, 9}, { 271,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703, 7}, { 1407,11}, { 95,10}, { 191, 9}, \ + { 383, 8}, { 767,10}, { 207, 9}, { 415,10}, \ + { 223, 9}, { 447,12}, { 63,11}, { 127,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575, 8}, \ + { 1151,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 351, 9}, { 703, 8}, { 1407, 7}, { 2815,11}, \ + { 207,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895,13}, { 63,11}, { 271,10}, \ + { 543,11}, { 287,12}, { 159,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 415,10}, { 831,12}, \ + { 223,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 415,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 703,10}, { 2815,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 895,15}, { 255,14}, { 511,13}, { 1215,14}, \ + { 639,13}, { 1279,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,14}, { 1407,15}, \ + { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \ + { 2303,15}, { 1279,14}, { 2815,15}, { 1535,14}, \ + { 3199,15}, { 1791,16}, { 1023,15}, { 2047,14}, \ + { 4351,15}, { 
2303,14}, { 4863,15}, { 2815,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 184 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 29 +#define MULLO_MUL_N_THRESHOLD 4392 +#define SQRLO_BASECASE_THRESHOLD 2 +#define SQRLO_DC_THRESHOLD 63 +#define SQRLO_SQR_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 16 +#define DC_DIVAPPR_Q_THRESHOLD 64 +#define DC_BDIV_QR_THRESHOLD 30 +#define DC_BDIV_Q_THRESHOLD 86 + +#define INV_MULMOD_BNM1_THRESHOLD 58 +#define INV_NEWTON_THRESHOLD 17 +#define INV_APPR_THRESHOLD 15 + +#define BINV_NEWTON_THRESHOLD 109 +#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */ +#define REDC_2_TO_REDC_N_THRESHOLD 117 + +#define MU_DIV_QR_THRESHOLD 618 +#define MU_DIVAPPR_Q_THRESHOLD 618 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 680 +#define MU_BDIV_Q_THRESHOLD 807 + +#define POWM_SEC_TABLE 3,22,102,579,1555 + +#define GET_STR_DC_THRESHOLD 20 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1042 + +#define FAC_DSC_THRESHOLD 462 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 12 +#define HGCD_THRESHOLD 45 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 1094 +#define GCD_DC_THRESHOLD 126 +#define GCDEXT_DC_THRESHOLD 132 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm new file mode 100644 index 0000000..954c7f6 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm @@ -0,0 +1,68 @@ +dnl SPARC v9 mpn_add_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC T1: ?
+C UltraSPARC T2: ?
+
+C INPUT PARAMETERS
+define(`rp', `%o0')
+define(`up', `%o1')
+define(`vp', `%o2')
+define(`n', `%o3')
+define(`cy', `%o4')
+C The running carry lives in the condition codes; %o4 is reused for data in the loop.
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_add_nc)
+	b,a	L(ent)			C carry-in is already in cy
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+	mov	0, cy			C mpn_add_n has no carry-in
+L(ent):	cmp	%g0, cy			C 0 - cy: sets the carry flag when cy is nonzero
+L(top):	ldx	[up+0], %o4
+	add	up, 8, up
+	ldx	[vp+0], %o5
+	add	vp, 8, vp
+	add	rp, 8, rp
+	add	n, -1, n
+	srlx	%o4, 32, %g1		C high half of the up limb
+	srlx	%o5, 32, %g2		C high half of the vp limb
+	addccc	%o4, %o5, %g3		C 64-bit sum including the incoming carry
+	addccc	%g1, %g2, %g0		C addccc carries via the 32-bit flag, so redo the high halves to get the carry out of bit 63
+	brgz	n, L(top)		C loop while n is above 0
+	stx	%g3, [rp-8]		C store the sum limb (delay slot)
+
+	retl
+	addc	%g0, %g0, %o0		C return the final carry, 0 or 1
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm
new file mode 100644
index 0000000..3134797
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm
@@ -0,0 +1,41 @@
+dnl SPARC v9 mpn_addlsh1_n for T1/T2.
+
+dnl Copyright 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Parameters for the shared code in addlshC_n.asm, included at the end of this file.
+define(LSH, 1) dnl Left-shift count used by the included code
+define(RSH, 63) dnl Right-shift count used by the included code, 64 minus LSH
+
+define(func, mpn_addlsh1_n) dnl Name the included code is assembled under
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n)
+
+include_mpn(`sparc64/ultrasparct1/addlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm
new file mode 100644
index 0000000..ee1afd0
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm
@@ -0,0 +1,41 @@
+dnl SPARC v9 mpn_addlsh2_n for T1/T2.
+
+dnl Copyright 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Parameters for the shared code in addlshC_n.asm, included at the end of this file.
+define(LSH, 2) dnl Left-shift count used by the included code
+define(RSH, 62) dnl Right-shift count used by the included code, 64 minus LSH
+
+define(func, mpn_addlsh2_n) dnl Name the included code is assembled under
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n)
+
+include_mpn(`sparc64/ultrasparct1/addlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm
new file mode 100644
index 0000000..5be9a0d
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm
@@ -0,0 +1,69 @@
+dnl SPARC v9 mpn_addlshC_n for T1/T2.
+
+dnl Copyright 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+C cycles/limb
+C UltraSPARC T1: 21
+C UltraSPARC T2: ?
+
+C INPUT PARAMETERS
+define(`rp', `%o0')
+define(`up', `%o1')
+define(`vp', `%o2')
+define(`n', `%o3')
+define(`cy', `%o4')
+C Stores up + (vp shifted left by LSH) at rp; returns the carry plus the bits shifted out of the top limb.
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(func)
+	mov	0, cy
+	mov	0, %g5			C bits shifted out of the previous vp limb, none yet
+	cmp	%g0, cy			C 0 - 0: starts with the carry flag clear
+L(top):	ldx	[up+0], %o4
+	add	up, 8, up
+	ldx	[vp+0], %o5
+	add	vp, 8, vp
+	add	rp, 8, rp
+
+	sllx	%o5, LSH, %g4		C vp limb shifted left
+	add	n, -1, n
+	or	%g5, %g4, %g4		C merge the bits shifted out of the previous limb
+	srlx	%o5, RSH, %g5		C bits shifted out of this limb, kept for the next one
+
+	srlx	%o4, 32, %g1		C high half of the up limb
+	srlx	%g4, 32, %g2		C high half of the shifted limb
+	addccc	%o4, %g4, %g3		C 64-bit sum including the incoming carry
+	addccc	%g1, %g2, %g0		C addccc carries via the 32-bit flag, so redo the high halves to get the carry out of bit 63
+	brgz	n, L(top)		C loop while n is above 0
+	stx	%g3, [rp-8]		C store the sum limb (delay slot)
+
+	retl
+	addc	%g5, %g0, %o0		C return shifted-out bits plus the final carry
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm
new file mode 100644
index 0000000..29dba96
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm
@@ -0,0 +1,86 @@
+dnl SPARC v9 mpn_addmul_1 for T1/T2.
+
+dnl Copyright 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC T1: 74
+C UltraSPARC T2: ?
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n', `%i2')
+define(`v0', `%i3')
+C Each 64x64 product is built from four 32x32 mulx products; the code names registers directly.
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_addmul_1)
+	save	%sp, -176, %sp
+	mov	1, %o2
+	mov	%i0, %g2		C %g2 = rp, freeing %i0 for the carry limb
+	srlx	%i3, 32, %o4		C %o4 = high 32 bits of v
+	sllx	%o2, 32, %o2		C %o2 = 2^32
+	srl	%i3, 0, %i3		C %i3 = low 32 bits of v
+	mov	0, %g3			C %g3 = byte offset into up and rp
+	mov	0, %i0			C %i0 = carry limb, also the return value
+
+L(top):	ldx	[%i1+%g3], %g1		C u = next up limb
+	srl	%g1, 0, %g4		C low 32 bits of u
+	mulx	%g4, %i3, %o5		C ulo * vlo
+	srlx	%g1, 32, %g1		C high 32 bits of u
+	mulx	%g1, %i3, %g5		C uhi * vlo
+	mulx	%g4, %o4, %g4		C ulo * vhi
+	mulx	%g1, %o4, %g1		C uhi * vhi
+	srlx	%o5, 32, %o1
+	add	%g5, %o1, %o1		C uhi*vlo + high half of ulo*vlo
+	addcc	%o1, %g4, %g4		C middle sum; carry set if it overflows 64 bits
+	srl	%o5, 0, %o0		C low 32 bits of ulo*vlo
+	ldx	[%g2+%g3], %o5		C rp limb to add to
+	sllx	%g4, 32, %o1
+	add	%g1, %o2, %l1
+	movlu	%xcc, %l1, %g1		C middle sum overflowed: add 2^32 to uhi*vhi
+	add	%o1, %o0, %l0		C low limb of the product
+	addcc	%l0, %i0, %g5		C add the carry limb from the previous pass
+	srlx	%g4, 32, %i0		C high half of the middle sum
+	add	%i0, 1, %g4
+	movlu	%xcc, %g4, %i0		C plus 1 if that addition carried
+	addcc	%o5, %g5, %g5		C add the rp limb
+	stx	%g5, [%g2+%g3]
+	add	%i0, 1, %g4
+	movlu	%xcc, %g4, %i0		C plus 1 if that addition carried
+	add	%i2, -1, %i2
+	add	%i0, %g1, %i0		C carry limb for the next pass
+	brnz,pt	%i2, L(top)
+	add	%g3, 8, %g3		C advance the offset (delay slot)
+	return	%i7+8			C the carry limb in %i0 is what the caller receives
+	nop
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h b/gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h
new file mode 100644
index 0000000..99db78a
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h
@@ -0,0 +1,154 @@
+/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1000 MHz ultrasparc t1 running GNU/Linux */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 13 +#define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 34 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 8 +#define MUL_TOOM33_THRESHOLD 50 +#define MUL_TOOM44_THRESHOLD 99 +#define MUL_TOOM6H_THRESHOLD 125 +#define MUL_TOOM8H_THRESHOLD 187 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65 +#define 
MUL_TOOM42_TO_TOOM63_THRESHOLD 50 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 34 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 14 +#define SQR_TOOM3_THRESHOLD 57 +#define SQR_TOOM4_THRESHOLD 133 +#define SQR_TOOM6_THRESHOLD 156 +#define SQR_TOOM8_THRESHOLD 260 + +#define MULMID_TOOM42_THRESHOLD 12 + +#define MULMOD_BNM1_THRESHOLD 7 +#define SQRMOD_BNM1_THRESHOLD 7 + +#define MUL_FFT_MODF_THRESHOLD 176 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 176, 5}, { 7, 6}, { 4, 5}, { 9, 6}, \ + { 5, 5}, { 11, 6}, { 11, 7}, { 6, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 9, 8}, \ + { 5, 7}, { 13, 8}, { 7, 7}, { 15, 6}, \ + { 32, 7}, { 24, 8}, { 21, 9}, { 11, 8}, \ + { 23,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 23,10}, { 15, 9}, \ + { 43,10}, { 23,11}, { 15,10}, { 31, 9}, \ + { 63, 8}, { 127, 9}, { 67,10}, { 39, 9}, \ + { 79, 8}, { 159,10}, { 47, 9}, { 95,11}, \ + { 2048,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 53 +#define MUL_FFT_THRESHOLD 1728 + + +#define SQR_FFT_MODF_THRESHOLD 148 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 148, 5}, { 7, 6}, { 4, 5}, { 9, 6}, \ + { 5, 5}, { 11, 6}, { 11, 7}, { 6, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \ + { 7, 7}, { 16, 8}, { 9, 6}, { 38, 7}, \ + { 20, 8}, { 11, 7}, { 24, 8}, { 13, 9}, \ + { 7, 7}, { 30, 8}, { 19, 9}, { 11, 8}, \ + { 25,10}, { 7, 9}, { 15, 8}, { 31, 9}, \ + { 19, 8}, { 39, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 47, 8}, { 95, 9}, \ + { 51,11}, { 15,10}, { 31, 8}, { 127,10}, \ + { 39, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95,11}, { 2048,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 58 +#define SQR_FFT_THRESHOLD 1344 + +#define 
MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 28 +#define MULLO_MUL_N_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 27 +#define DC_DIVAPPR_Q_THRESHOLD 106 +#define DC_BDIV_QR_THRESHOLD 27 +#define DC_BDIV_Q_THRESHOLD 62 + +#define INV_MULMOD_BNM1_THRESHOLD 14 +#define INV_NEWTON_THRESHOLD 163 +#define INV_APPR_THRESHOLD 117 + +#define BINV_NEWTON_THRESHOLD 166 +#define REDC_1_TO_REDC_N_THRESHOLD 31 + +#define MU_DIV_QR_THRESHOLD 734 +#define MU_DIVAPPR_Q_THRESHOLD 748 +#define MUPI_DIV_QR_THRESHOLD 67 +#define MU_BDIV_QR_THRESHOLD 562 +#define MU_BDIV_Q_THRESHOLD 734 + +#define POWM_SEC_TABLE 4,29,188,643,2741 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD_THRESHOLD 58 +#define HGCD_APPR_THRESHOLD 55 +#define HGCD_REDUCE_THRESHOLD 637 +#define GCD_DC_THRESHOLD 186 +#define GCDEXT_DC_THRESHOLD 140 +#define JACOBI_BASE_METHOD 3 + +#define GET_STR_DC_THRESHOLD 20 +#define GET_STR_PRECOMPUTE_THRESHOLD 33 +#define SET_STR_DC_THRESHOLD 268 +#define SET_STR_PRECOMPUTE_THRESHOLD 960 + +#define FAC_DSC_THRESHOLD 268 +#define FAC_ODD_THRESHOLD 0 /* always */ diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm new file mode 100644 index 0000000..1fea2a1 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm @@ -0,0 +1,82 @@ +dnl SPARC v9 mpn_mul_1 for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 68 +C UltraSPARC T2: ? + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_mul_1) + save %sp, -176, %sp + mov 1, %o2 + mov %i0, %g2 + srlx %i3, 32, %o4 + sllx %o2, 32, %o2 + srl %i3, 0, %i3 + mov 0, %g3 + mov 0, %i0 + +L(top): ldx [%i1+%g3], %g1 + srl %g1, 0, %g4 + mulx %g4, %i3, %o5 + srlx %g1, 32, %g1 + mulx %g1, %i3, %g5 + mulx %g4, %o4, %g4 + mulx %g1, %o4, %g1 + srlx %o5, 32, %o1 + add %g5, %o1, %o1 + addcc %o1, %g4, %g4 + srl %o5, 0, %o0 + sllx %g4, 32, %o1 + add %g1, %o2, %l1 + movlu %xcc, %l1, %g1 + add %o1, %o0, %l0 + addcc %l0, %i0, %g5 + srlx %g4, 32, %i0 + add %i0, 1, %g4 + movlu %xcc, %g4, %i0 + stx %g5, [%g2+%g3] + add %i2, -1, %i2 + add %i0, %g1, %i0 + brnz,pt %i2, L(top) + add %g3, 8, %g3 + return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm new file mode 100644 index 0000000..51bd4ab --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm @@ -0,0 +1,41 @@ +dnl SPARC v9 mpn_rsblsh1_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +define(func, mpn_rsblsh1_n) + +MULFUNC_PROLOGUE(mpn_rsblsh1_n) + +include_mpn(`sparc64/ultrasparct1/rsblshC_n.asm') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm new file mode 100644 index 0000000..f0d208e --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm @@ -0,0 +1,41 @@ +dnl SPARC v9 mpn_rsblsh2_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +define(func, mpn_rsblsh2_n) + +MULFUNC_PROLOGUE(mpn_rsblsh2_n) + +include_mpn(`sparc64/ultrasparct1/rsblshC_n.asm') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm new file mode 100644 index 0000000..7c03e9f --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm @@ -0,0 +1,69 @@ +dnl SPARC v9 mpn_rsblshC_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C UltraSPARC T1: 21 +C UltraSPARC T2: ? + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n', `%o3') +define(`cy', `%o4') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + mov 0, cy + mov 0, %g5 + cmp %g0, cy +L(top): ldx [up+0], %o4 + add up, 8, up + ldx [vp+0], %o5 + add vp, 8, vp + add rp, 8, rp + + sllx %o5, LSH, %g4 + add n, -1, n + or %g5, %g4, %g4 + srlx %o5, RSH, %g5 + + srlx %o4, 32, %g1 + srlx %g4, 32, %g2 + subccc %g4, %o4, %g3 + subccc %g2, %g1, %g0 + brgz n, L(top) + stx %g3, [rp-8] + + retl + subc %g5, %g0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm new file mode 100644 index 0000000..c2af89f --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm @@ -0,0 +1,68 @@ +dnl SPARC v9 mpn_sub_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: ? +C UltraSPARC T2: ? + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n', `%o3') +define(`cy', `%o4') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sub_nc) + b,a L(ent) +EPILOGUE() +PROLOGUE(mpn_sub_n) + mov 0, cy +L(ent): cmp %g0, cy +L(top): ldx [up+0], %o4 + add up, 8, up + ldx [vp+0], %o5 + add vp, 8, vp + add rp, 8, rp + add n, -1, n + srlx %o4, 32, %g1 + srlx %o5, 32, %g2 + subccc %o4, %o5, %g3 + subccc %g1, %g2, %g0 + brgz n, L(top) + stx %g3, [rp-8] + + retl + addc %g0, %g0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm new file mode 100644 index 0000000..8c8fa80 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm @@ -0,0 +1,41 @@ +dnl SPARC v9 mpn_sublsh1_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +define(func, mpn_sublsh1_n) + +MULFUNC_PROLOGUE(mpn_sublsh1_n) + +include_mpn(`sparc64/ultrasparct1/sublshC_n.asm') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm new file mode 100644 index 0000000..2fd5eee --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm @@ -0,0 +1,41 @@ +dnl SPARC v9 mpn_sublsh2_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +define(func, mpn_sublsh2_n) + +MULFUNC_PROLOGUE(mpn_sublsh2_n) + +include_mpn(`sparc64/ultrasparct1/sublshC_n.asm') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm new file mode 100644 index 0000000..01eafef --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm @@ -0,0 +1,69 @@ +dnl SPARC v9 mpn_sublshC_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C UltraSPARC T1: 21 +C UltraSPARC T2: ? 
+ +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n', `%o3') +define(`cy', `%o4') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + mov 0, cy + mov 0, %g5 + cmp %g0, cy +L(top): ldx [up+0], %o4 + add up, 8, up + ldx [vp+0], %o5 + add vp, 8, vp + add rp, 8, rp + + sllx %o5, LSH, %g4 + add n, -1, n + or %g5, %g4, %g4 + srlx %o5, RSH, %g5 + + srlx %o4, 32, %g1 + srlx %g4, 32, %g2 + subccc %o4, %g4, %g3 + subccc %g1, %g2, %g0 + brgz n, L(top) + stx %g3, [rp-8] + + retl + addc %g5, %g0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm new file mode 100644 index 0000000..4f553a8 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm @@ -0,0 +1,86 @@ +dnl SPARC v9 mpn_submul_1 for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 74 +C UltraSPARC T2: ? + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_submul_1) + save %sp, -176, %sp + mov 1, %o2 + mov %i0, %g2 + srlx %i3, 32, %o4 + sllx %o2, 32, %o2 + srl %i3, 0, %i3 + mov 0, %g3 + mov 0, %i0 + +L(top): ldx [%i1+%g3], %g1 + srl %g1, 0, %g4 + mulx %g4, %i3, %o5 + srlx %g1, 32, %g1 + mulx %g1, %i3, %g5 + mulx %g4, %o4, %g4 + mulx %g1, %o4, %g1 + srlx %o5, 32, %o1 + add %g5, %o1, %o1 + addcc %o1, %g4, %g4 + srl %o5, 0, %o0 + ldx [%g2+%g3], %o5 + sllx %g4, 32, %o1 + add %g1, %o2, %l1 + movlu %xcc, %l1, %g1 + add %o1, %o0, %l0 + addcc %l0, %i0, %g5 + srlx %g4, 32, %i0 + add %i0, 1, %g4 + movlu %xcc, %g4, %i0 + subcc %o5, %g5, %g5 + stx %g5, [%g2+%g3] + add %i0, 1, %g4 + movlu %xcc, %g4, %i0 + add %i2, -1, %i2 + add %i0, %g1, %i0 + brnz,pt %i2, L(top) + add %g3, 8, %g3 + return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm new file mode 100644 index 0000000..0170746 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm @@ -0,0 +1,126 @@ +dnl SPARC v9 mpn_add_n for T3/T4. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 8 +C UltraSPARC T4: 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n', `%i3') +define(`cy', `%i4') + +define(`u0_off', `%l2') +define(`u1_off', `%l3') +define(`loop_n', `%l6') +define(`tmp', `%l7') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_add_nc) + save %sp, -176, %sp + b,a L(ent) +EPILOGUE() +PROLOGUE(mpn_add_n) + save %sp, -176, %sp + + mov 0, cy +L(ent): + subcc n, 1, n + be L(final_one) + cmp %g0, cy + + ldx [up + 0], %o4 + sllx n, 3, tmp + + ldx [vp + 0], %o5 + add up, tmp, u0_off + + ldx [up + 8], %g5 + neg tmp, loop_n + + ldx [vp + 8], %g1 + add u0_off, 8, u1_off + + sub loop_n, -(2 * 8), loop_n + + brgez,pn loop_n, L(loop_tail) + add vp, (2 * 8), vp + + b,a L(top) + ALIGN(16) +L(top): + addxccc(%o4, %o5, tmp) + ldx [vp + 0], %o5 + + add rp, (2 * 8), rp + ldx [loop_n + u0_off], %o4 + + add vp, (2 * 8), vp + stx tmp, [rp - 16] + + addxccc(%g1, %g5, tmp) + ldx [vp - 8], %g1 + + ldx [loop_n + u1_off], %g5 + sub loop_n, -(2 * 8), loop_n + + brlz loop_n, L(top) + stx tmp, [rp - 8] + +L(loop_tail): + addxccc(%o4, %o5, %g3) + add loop_n, u0_off, up + + addxccc(%g1, %g5, %g5) + stx %g3, [rp + 0] + + brgz,pt loop_n, L(done) + stx %g5, [rp + 8] + + add rp, (2 * 8), rp +L(final_one): + ldx [up+0], %o4 + ldx [vp+0], %o5 + addxccc(%o4, %o5, %g3) + stx %g3, [rp+0] + +L(done): + addxc(%g0, %g0, %i0) + ret + restore +EPILOGUE() diff --git 
a/gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm new file mode 100644 index 0000000..939811e --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm @@ -0,0 +1,182 @@ +dnl SPARC v9 mpn_addmul_1 for T3/T4/T5. + +dnl Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 26 +C UltraSPARC T4: 4.5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +define(`u0', `%l0') +define(`u1', `%l1') +define(`u2', `%l2') +define(`u3', `%l3') +define(`r0', `%l4') +define(`r1', `%l5') +define(`r2', `%l6') +define(`r3', `%l7') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_addmul_1) + save %sp, -176, %sp + ldx [up+0], %g1 + + and n, 3, %g3 + brz %g3, L(b0) + addcc %g0, %g0, %g5 C clear carry limb, flag + cmp %g3, 2 + bcs %xcc, L(b01) + nop + be %xcc, L(b10) + ldx [up+8], %g5 + +L(b11): ldx [up+16], u3 + mulx %g1, v0, %o2 + umulxhi(%g1, v0, %o3) + ldx [rp+0], r1 + mulx %g5, v0, %o4 + ldx [rp+8], r2 + umulxhi(%g5, v0, %o5) + ldx [rp+16], r3 + mulx u3, v0, %g4 + umulxhi(u3, v0, %g5) + addcc %o3, %o4, %o4 + addxccc(%o5, %g4, %g4) + addxc( %g0, %g5, %g5) + addcc r1, %o2, r1 + stx r1, [rp+0] + addxccc(r2, %o4, r2) + stx r2, [rp+8] + addxccc(r3, %g4, r3) + stx r3, [rp+16] + add n, -3, n + add up, 24, up + brz n, L(xit) + add rp, 24, rp + b L(com) + nop + +L(b10): mulx %g1, v0, %o4 + ldx [rp+0], r2 + umulxhi(%g1, v0, %o5) + ldx [rp+8], r3 + mulx %g5, v0, %g4 + umulxhi(%g5, v0, %g5) + addcc %o5, %g4, %g4 + addxc( %g0, %g5, %g5) + addcc r2, %o4, r2 + stx r2, [rp+0] + addxccc(r3, %g4, r3) + stx r3, [rp+8] + add n, -2, n + add up, 16, up + brz n, L(xit) + add rp, 16, rp + b L(com) + nop + +L(b01): ldx [rp+0], r3 + mulx %g1, v0, %g4 + umulxhi(%g1, v0, %g5) + addcc r3, %g4, r3 + stx r3, [rp+0] + add n, -1, n + add up, 8, up + brz n, L(xit) + add rp, 8, rp + +L(com): ldx [up+0], %g1 +L(b0): ldx [up+8], u1 + ldx [up+16], u2 + ldx [up+24], u3 + mulx %g1, v0, %o0 + umulxhi(%g1, v0, %o1) + b L(lo0) + nop + + ALIGN(16) +L(top): ldx [up+0], u0 + addxc( %g0, %g5, %g5) C propagate carry into carry limb + ldx [up+8], u1 + addcc r0, %o0, r0 + ldx [up+16], u2 + addxccc(r1, %o2, r1) + ldx [up+24], u3 + addxccc(r2, %o4, r2) + 
stx r0, [rp-32] + addxccc(r3, %g4, r3) + stx r1, [rp-24] + mulx u0, v0, %o0 + stx r2, [rp-16] + umulxhi(u0, v0, %o1) + stx r3, [rp-8] +L(lo0): mulx u1, v0, %o2 + ldx [rp+0], r0 + umulxhi(u1, v0, %o3) + ldx [rp+8], r1 + mulx u2, v0, %o4 + ldx [rp+16], r2 + umulxhi(u2, v0, %o5) + ldx [rp+24], r3 + mulx u3, v0, %g4 + addxccc(%g5, %o0, %o0) + umulxhi(u3, v0, %g5) + add up, 32, up + addxccc(%o1, %o2, %o2) + add rp, 32, rp + addxccc(%o3, %o4, %o4) + add n, -4, n + addxccc(%o5, %g4, %g4) + brgz n, L(top) + nop + + addxc( %g0, %g5, %g5) + addcc r0, %o0, r0 + stx r0, [rp-32] + addxccc(r1, %o2, r1) + stx r1, [rp-24] + addxccc(r2, %o4, r2) + stx r2, [rp-16] + addxccc(r3, %g4, r3) + stx r3, [rp-8] +L(xit): addxc( %g0, %g5, %i0) + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm new file mode 100644 index 0000000..ccc6a44 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm @@ -0,0 +1,228 @@ +dnl SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb +C mul_2 addmul_2 +C UltraSPARC T3: 22.5 23.5 +C UltraSPARC T4: 3.25 3.75 + + +C The code is reasonably scheduled but also relies on OoO. There was hope that +C this could run at around 3.0 and 3.5 c/l respectively, on T4. Two cycles per +C iteration needs to be removed. +C +C We could almost use 2-way unrolling, but currently the wN registers live too +C long. By changing add x,w1,w1 to add x,w1,w0, i.e. migrate the values down- +C wards, 2-way unrolling should become possible. With n-indexed addressing it +C should run no slower. +C +C The rp loads to g1/g3 are very much over-scheduled. Presumably, they could +C be postponed a full way, and then just one register could be used. + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`vp', `%i3') + +define(`v0', `%o0') +define(`v1', `%o1') + +define(`w0', `%o2') +define(`w1', `%o3') +define(`w2', `%o4') +define(`w3', `%o5') + +ifdef(`OPERATION_mul_2',` + define(`AM2', `') + define(`ADDX', `addcc`'$1') + define(`func', `mpn_mul_2') +') +ifdef(`OPERATION_addmul_2',` + define(`AM2', `$1') + define(`ADDX', `addxccc($1,$2,$3)') + define(`func', `mpn_addmul_2') +') + + +MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2) + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + save %sp, -176, %sp + + ldx [vp+0], v0 C load v0 + and n, 3, %g5 + ldx [vp+8], v1 C load v1 + add n, -6, n + ldx [up+0], %g4 + brz %g5, L(b0) + cmp %g5, 2 + bcs L(b1) + nop + be L(b2) + nop + +L(b3): +AM2(` ldx [rp+0], %g1') + mulx %g4, v0, w2 + umulxhi(%g4, v0, w3) + ldx [up+8], %i5 + mulx %g4, v1, %l3 + umulxhi(%g4, v1, %l7) +AM2(` ldx [rp+8], %g3') + add up, -8, up + add rp, -8, rp + b L(lo3) + mov 0, w0 + +L(b2): +AM2(` ldx [rp+0], %g3') 
+ mulx %g4, v0, w3 + umulxhi(%g4, v0, w0) + ldx [up+8], %i4 + mulx %g4, v1, %l1 + umulxhi(%g4, v1, %l5) +AM2(` ldx [rp+8], %g1') + add rp, 16, rp + brlz n, L(end) + mov 0, w1 + ba L(top) + add up, 16, up + +L(b1): +AM2(` ldx [rp+0], %g1') + mulx %g4, v0, w0 + umulxhi(%g4, v0, w1) + ldx [up+8], %i5 + mulx %g4, v1, %l3 + umulxhi(%g4, v1, %l7) +AM2(` ldx [rp+8], %g3') + add up, 8, up + add rp, 8, rp + b L(lo1) + mov 0, w2 + +L(b0): +AM2(` ldx [rp+0], %g3') + mulx %g4, v0, w1 + umulxhi(%g4, v0, w2) + ldx [up+8], %i4 + mulx %g4, v1, %l1 + umulxhi(%g4, v1, %l5) +AM2(` ldx [rp+8], %g1') + b L(lo0) + mov 0, w3 + + ALIGN(16) C cycle +L(top): mulx %i4, v0, %l2 C 0->5 + umulxhi(%i4, v0, %l6) C 0->5 + ldx [up+0], %i5 C 1->6 +AM2(` addcc w3, %g3, w3') C 1 + stx w3, [rp-16] C 2 + ADDX(` %l1, w0, w0') C 2 + addxccc(%l5, w1, w1) C 3 + mulx %i4, v1, %l3 C 3->9 + umulxhi(%i4, v1, %l7) C 4->9 +AM2(` ldx [rp+0], %g3') C 4 + addcc %l2, w0, w0 C 5 + addxccc(%l6, w1, w1) C 5 + addxc( %g0, %g0, w2) C 6 +L(lo1): mulx %i5, v0, %l0 C 6 + umulxhi(%i5, v0, %l4) C 7 + ldx [up+8], %i4 C 7 +AM2(` addcc w0, %g1, w0') C 8 + stx w0, [rp-8] C 8 + ADDX(` %l3, w1, w1') C 9 + addxccc(%l7, w2, w2) C 9 + mulx %i5, v1, %l1 C 10 + umulxhi(%i5, v1, %l5) C 10 +AM2(` ldx [rp+8], %g1') C 11 + addcc %l0, w1, w1 C 11 + addxccc(%l4, w2, w2) C 12 + addxc( %g0, %g0, w3) C 12 +L(lo0): mulx %i4, v0, %l2 C 13 + umulxhi(%i4, v0, %l6) C 13 + ldx [up+16], %i5 C 14 +AM2(` addcc w1, %g3, w1') C 14 + stx w1, [rp+0] C 15 + ADDX(` %l1, w2, w2') C 15 + addxccc(%l5, w3, w3) C 16 + mulx %i4, v1, %l3 C 16 + umulxhi(%i4, v1, %l7) C 17 +AM2(` ldx [rp+16], %g3') C 17 + addcc %l2, w2, w2 C 18 + addxccc(%l6, w3, w3) C 18 + addxc( %g0, %g0, w0) C 19 +L(lo3): mulx %i5, v0, %l0 C 19 + umulxhi(%i5, v0, %l4) C 20 + ldx [up+24], %i4 C 20 +AM2(` addcc w2, %g1, w2') C 21 + stx w2, [rp+8] C 21 + ADDX(` %l3, w3, w3') C 22 + addxccc(%l7, w0, w0) C 22 + mulx %i5, v1, %l1 C 23 + umulxhi(%i5, v1, %l5) C 23 +AM2(` ldx [rp+24], %g1') C 24 + addcc %l0, 
w3, w3 C 24 + addxccc(%l4, w0, w0) C 25 + addxc( %g0, %g0, w1) C 25 + add up, 32, up + add rp, 32, rp + brgz n, L(top) + add n, -4, n + +L(end): mulx %i4, v0, %l2 + umulxhi(%i4, v0, %l6) +AM2(` addcc w3, %g3, w3') + stx w3, [rp-16] + ADDX(` %l1, w0, w0') + addxccc(%l5, w1, w1) + mulx %i4, v1, %l3 + umulxhi(%i4, v1, %l7) + addcc %l2, w0, w0 + addxccc(%l6, w1, w1) + addxc( %g0, %g0, w2) +AM2(` addcc w0, %g1, w0') + stx w0, [rp-8] + ADDX(` %l3, w1, w1') + stx w1, [rp+0] + addxc(%l7, w2, %i0) + + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm new file mode 100644 index 0000000..845f6d6 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm @@ -0,0 +1,219 @@ +dnl SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C cycles/limb cycles/limb +C mul_4 addmul_4 +C UltraSPARC T3: 21.5 22.0 +C UltraSPARC T4: 2.625 2.75 + + +C The code is well-scheduled and relies on OoO very little. There is hope that +C this will run at around 2.5 and 2.75 c/l respectively, on T4. + +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`vp', `%i3') + +define(`v0', `%g1') +define(`v1', `%o7') +define(`v2', `%g2') +define(`v3', `%i3') + +define(`w0', `%o0') +define(`w1', `%o1') +define(`w2', `%o2') +define(`w3', `%o3') +define(`w4', `%o4') + +define(`r0', `%o5') + +define(`u0', `%i4') +define(`u1', `%i5') + +define(`rp0', `rp') +define(`rp1', `%g3') +define(`rp2', `%g4') +define(`up0', `up') +define(`up1', `%g5') + +ifdef(`OPERATION_mul_4',` + define(`AM4', `') + define(`ADDX', `addcc`'$1') + define(`func', `mpn_mul_4') +') +ifdef(`OPERATION_addmul_4',` + define(`AM4', `$1') + define(`ADDX', `addxccc($1,$2,$3)') + define(`func', `mpn_addmul_4') +') + + +MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4) + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + save %sp, -176, %sp + + ldx [up + 0], u1 C load up[0] early + andcc n, 1, %g0 C is n odd? 
+ ldx [vp + 0], v0 + sllx n, 3, n + ldx [vp + 8], v1 + add n, -28, n + ldx [vp + 16], v2 + add rp, -16, rp + ldx [vp + 24], v3 + add up, n, up0 + add rp, n, rp0 + add up0, 8, up1 + add rp0, 8, rp1 + add rp0, 16, rp2 + mulx u1, v0, %l0 + mov 0, w0 + mulx u1, v1, %l1 + mov 0, w1 + mulx u1, v2, %l2 + mov 0, w2 + mulx u1, v3, %l3 + mov 0, w3 + + be L(evn) + neg n, n + +L(odd): mov u1, u0 + ldx [up1 + n], u1 +AM4(` ldx [rp2 + n], r0') + umulxhi(u0, v0, %l4) + umulxhi(u0, v1, %l5) + umulxhi(u0, v2, %l6) + umulxhi(u0, v3, %l7) + b L(mid) + add n, 8, n + +L(evn): ldx [up1 + n], u0 +AM4(` ldx [rp2 + n], r0') + umulxhi(u1, v0, %l4) + umulxhi(u1, v1, %l5) + umulxhi(u1, v2, %l6) + umulxhi(u1, v3, %l7) + add n, 16, n + + ALIGN(16) +L(top): addcc %l0, w0, w0 + mulx u0, v0, %l0 C w 0 + addxccc(%l1, w1, w1) + mulx u0, v1, %l1 C w 1 + addxccc(%l2, w2, w2) + mulx u0, v2, %l2 C w 2 + addxccc(%l3, w3, w3) + mulx u0, v3, %l3 C w 3 + ldx [up0 + n], u1 + addxc( %g0, %g0, w4) +AM4(` addcc r0, w0, w0') + stx w0, [rp0 + n] + ADDX(` %l4, w1, w0') + umulxhi(u0, v0, %l4) C w 1 +AM4(` ldx [rp1 + n], r0') + addxccc(%l5, w2, w1) + umulxhi(u0, v1, %l5) C w 2 + addxccc(%l6, w3, w2) + umulxhi(u0, v2, %l6) C w 3 + addxc( %l7, w4, w3) + umulxhi(u0, v3, %l7) C w 4 +L(mid): addcc %l0, w0, w0 + mulx u1, v0, %l0 C w 1 + addxccc(%l1, w1, w1) + mulx u1, v1, %l1 C w 2 + addxccc(%l2, w2, w2) + mulx u1, v2, %l2 C w 3 + addxccc(%l3, w3, w3) + mulx u1, v3, %l3 C w 4 + ldx [up1 + n], u0 + addxc( %g0, %g0, w4) +AM4(` addcc r0, w0, w0') + stx w0, [rp1 + n] + ADDX(` %l4, w1, w0') + umulxhi(u1, v0, %l4) C w 2 +AM4(` ldx [rp2 + n], r0') + addxccc(%l5, w2, w1) + umulxhi(u1, v1, %l5) C w 3 + addxccc(%l6, w3, w2) + umulxhi(u1, v2, %l6) C w 4 + addxc( %l7, w4, w3) + umulxhi(u1, v3, %l7) C w 5 + brlz n, L(top) + add n, 16, n + +L(end): addcc %l0, w0, w0 + mulx u0, v0, %l0 + addxccc(%l1, w1, w1) + mulx u0, v1, %l1 + addxccc(%l2, w2, w2) + mulx u0, v2, %l2 + addxccc(%l3, w3, w3) + mulx u0, v3, %l3 + addxc( %g0, %g0, w4) 
+AM4(` addcc r0, w0, w0') + stx w0, [rp0 + n] + ADDX(` %l4, w1, w0') + umulxhi(u0, v0, %l4) +AM4(` ldx [rp1 + n], r0') + addxccc(%l5, w2, w1) + umulxhi(u0, v1, %l5) + addxccc(%l6, w3, w2) + umulxhi(u0, v2, %l6) + addxc( %l7, w4, w3) + umulxhi(u0, v3, %l7) + addcc %l0, w0, w0 + addxccc(%l1, w1, w1) + addxccc(%l2, w2, w2) + addxccc(%l3, w3, w3) + addxc( %g0, %g0, w4) +AM4(` addcc r0, w0, w0') + stx w0, [rp1 + n] + ADDX(` %l4, w1, w0') + addxccc(%l5, w2, w1) + addxccc(%l6, w3, w2) + stx w0, [rp2 + n] + add n, 16, n + stx w1, [rp1 + n] + stx w2, [rp2 + n] + addxc( %l7, w4, %i0) + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm new file mode 100644 index 0000000..1014b1b --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm @@ -0,0 +1,147 @@ +dnl SPARC v9 mpn_addlsh_n and mpn_sublsh_n for T3/T4/T5. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 11 +C UltraSPARC T4: 4 + +C For sublsh_n we combine the two shifted limbs using xnor, using the identity +C (a xor not b) = (not (a xor b)) which equals (not (a or b)) when (a and b) = +C 0 as it is in our usage. This gives us the ones complement for free. +C Unfortunately, the same trick will not work for rsblsh_n, which will instead +C require a separate negation. +C +C FIXME: Add rsblsh_n to this file. + +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n', `%i3') +define(`cnt',`%i4') + +define(`tnc',`%o5') + +ifdef(`OPERATION_addlsh_n',` + define(`INITCY', `subcc %g0, 0, %g0') + define(`MERGE', `or') + define(`func', `mpn_addlsh_n') +') +ifdef(`OPERATION_sublsh_n',` + define(`INITCY', `subcc %g0, 1, %g0') + define(`MERGE', `xnor') + define(`func', `mpn_sublsh_n') +') + +define(`rp0', `rp') +define(`rp1', `%o2') +define(`up0', `up') +define(`up1', `%o3') +define(`vp0', `vp') +define(`vp1', `%o4') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_sublsh_n) +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + save %sp, -176, %sp + mov 64, tnc + sub tnc, cnt, tnc + + andcc n, 1, %g0 + sllx n, 3, n + add n, -16, n + add up, n, up0 + add vp, n, vp0 + add rp, n, rp0 + add up0, 8, up1 + add vp0, 8, vp1 + add rp0, -8, rp1 + add rp0, -16, rp0 + neg n, n + be L(evn) + INITCY + +L(odd): ldx [vp0 + n], %l1 + mov 0, %l2 + ldx [up0 + n], %l5 + sllx %l1, cnt, %g3 + brgez n, L(wd1) + add n, 8, n + ldx [vp0 + n], %l0 + b L(lo1) + sllx %l1, cnt, %g3 + +L(evn): ldx [vp0 + n], %l0 + mov 0, %l3 + ldx [up0 + n], %l4 + ldx [vp1 + n], %l1 + b L(lo0) + sllx %l0, cnt, %g1 + +L(top): addxccc(%l6, %l4, %o0) + ldx [vp0 + n], %l0 + sllx %l1, cnt, %g3 + stx %o0, [rp0 + n] +L(lo1): srlx %l1, tnc, 
%l3 + MERGE %l2, %g3, %l7 + ldx [up0 + n], %l4 + addxccc(%l7, %l5, %o1) + ldx [vp1 + n], %l1 + sllx %l0, cnt, %g1 + stx %o1, [rp1 + n] +L(lo0): srlx %l0, tnc, %l2 + MERGE %l3, %g1, %l6 + ldx [up1 + n], %l5 + brlz,pt n, L(top) + add n, 16, n + + addxccc(%l6, %l4, %o0) + sllx %l1, cnt, %g3 + stx %o0, [rp0 + n] +L(wd1): srlx %l1, tnc, %l3 + MERGE %l2, %g3, %l7 + addxccc(%l7, %l5, %o1) + stx %o1, [rp1 + n] + +ifdef(`OPERATION_addlsh_n', +` addxc( %l3, %g0, %i0)') +ifdef(`OPERATION_sublsh_n', +` addxc( %g0, %g0, %g1) + add %g1, -1, %g1 + sub %l3, %g1, %i0') + + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm new file mode 100644 index 0000000..550860d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm @@ -0,0 +1,147 @@ +dnl SPARC T3/T4/T5 mpn_bdiv_dbm1c. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 25 +C UltraSPARC T4/T5: 4 + +C INPUT PARAMETERS +define(`qp', `%i0') +define(`ap', `%i1') +define(`n', `%i2') +define(`bd', `%i3') +define(`h', `%i4') + +define(`plo0',`%g4') define(`plo1',`%g5') +define(`phi0',`%l0') define(`phi1',`%l1') +define(`a0', `%g1') define(`a1', `%g3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_bdiv_dbm1c) + save %sp, -176, %sp + + and n, 3, %g5 + ldx [ap + 0], %g2 + add n, -5, n + brz %g5, L(b0) + cmp %g5, 2 + bcs %xcc, L(b1) + nop + be %xcc, L(b2) + nop + +L(b3): ldx [ap + 8], a0 + mulx bd, %g2, plo1 + umulxhi(bd, %g2, phi1) + ldx [ap + 16], a1 + add qp, -24, qp + b L(lo3) + add ap, -8, ap + +L(b2): ldx [ap + 8], a1 + mulx bd, %g2, plo0 + umulxhi(bd, %g2, phi0) + brlz,pt n, L(wd2) + nop +L(gt2): ldx [ap + 16], a0 + add ap, 16, ap + b L(lo2) + add n, -1, n + +L(b1): mulx bd, %g2, plo1 + umulxhi(bd, %g2, phi1) + brlz,pn n, L(wd1) + add qp, -8, qp +L(gt1): ldx [ap + 8], a0 + ldx [ap + 16], a1 + b L(lo1) + add ap, 8, ap + +L(b0): ldx [ap + 8], a1 + mulx bd, %g2, plo0 + umulxhi(bd, %g2, phi0) + ldx [ap + 16], a0 + b L(lo0) + add qp, -16, qp + +L(top): ldx [ap + 0], a0 + sub h, phi1, h +L(lo2): mulx bd, a1, plo1 + umulxhi(bd, a1, phi1) + subcc h, plo0, h + addxc( phi0, %g0, phi0) + stx h, [qp + 0] + ldx [ap + 8], a1 + sub h, phi0, h +L(lo1): mulx bd, a0, plo0 + umulxhi(bd, a0, phi0) + subcc h, plo1, h + addxc( phi1, %g0, phi1) + stx h, [qp + 8] + ldx [ap + 16], a0 + sub h, phi1, h +L(lo0): mulx bd, a1, plo1 + umulxhi(bd, a1, phi1) + subcc h, plo0, h + addxc( phi0, %g0, phi0) + stx h, [qp + 16] + ldx [ap + 24], a1 + sub h, phi0, h +L(lo3): mulx bd, a0, plo0 + umulxhi(bd, a0, phi0) + subcc h, plo1, h + addxc( phi1, %g0, phi1) + stx h, [qp + 24] + add ap, 32, ap + add qp, 32, qp + brgz,pt n, L(top) + add n, -4, n + +L(end): sub h, phi1, h +L(wd2): mulx bd, a1, plo1 + umulxhi(bd, a1, phi1) + subcc h, 
plo0, h + addxc( phi0, %g0, phi0) + stx h, [qp + 0] + sub h, phi0, h +L(wd1): subcc h, plo1, h + addxc( phi1, %g0, phi1) + stx h, [qp + 8] + sub h, phi1, %i0 + + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm new file mode 100644 index 0000000..9847047 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm @@ -0,0 +1,137 @@ +dnl SPARC T3/T4/T5 mpn_bdiv_q_1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 31 +C UltraSPARC T4/T5: 20-26 hits 20 early, then sharply drops + +C INPUT PARAMETERS +define(`qp', `%i0') +define(`ap', `%i1') +define(`n', `%i2') +define(`d', `%i3') +define(`dinv',`%i4') +define(`cnt', `%i5') + +define(`tnc', `%o2') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_bdiv_q_1) + save %sp, -176, %sp + ldx [ap], %o5 C o5 = first source limb + add d, -1, %g1 + andn %g1, d, %g1 C (d-1) & ~d = mask of the trailing zero bits of d + popc %g1, cnt C cnt = number of trailing zero bits of d + + srlx d, cnt, d C shift the trailing zeros out of d + srlx d, 1, %g1 + and %g1, 127, %g1 C table index = (d >> 1) & 127 + LEA64(binvert_limb_table, g2, g4) + ldub [%g2+%g1], %g1 C one-byte seed x from binvert_limb_table + add %g1, %g1, %g2 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + sub %g2, %g1, %g2 C x = 2*x - x*x*d, first refinement + add %g2, %g2, %g1 + mulx %g2, %g2, %g2 + mulx %g2, d, %g2 + sub %g1, %g2, %g1 C second refinement + add %g1, %g1, %o7 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + add n, -2, n C n = limb count - 2, the loop counter + brz,pt cnt, L(norm) C cnt == 0: no limb shifting needed + sub %o7, %g1, dinv C delay slot: third refinement gives dinv + + brlz,pt n, L(edu) C single limb: only the final multiply remains + srlx %o5, cnt, %o5 C delay slot: shift the first limb right by cnt + b L(eee) C join the loop set-up emitted under mpn_pi1_bdiv_q_1 + mov 0, %g4 C delay slot: running carry limb = 0 +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) C entry with dinv and cnt supplied by the caller + save %sp, -176, %sp + ldx [ap], %o5 C o5 = first source limb + + brz,pt cnt, L(norm) + add n, -2, n C delay slot: n = limb count - 2 + +L(unorm): + brlz,pt n, L(edu) + srlx %o5, cnt, %o5 C delay slot: shift the first limb right by cnt + mov 0, %g4 C running carry limb = 0 +L(eee): sub %g0, cnt, tnc C tnc = -cnt, the left-shift count for the next limb + +L(tpu): ldx [ap+8], %g3 + add ap, 8, ap + sllx %g3, tnc, %g5 + or %g5, %o5, %g5 C merge with the bits kept from the previous limb + srlx %g3, cnt, %o5 C bits kept for the next iteration + subcc %g5, %g4, %g4 C subtract the carry limb; borrow lands in the carry flag + mulx %g4, dinv, %g1 C quotient limb + stx %g1, [qp] + add qp, 8, qp + umulxhi(d, %g1, %g1) C high half of d * quotient limb + addxc( %g1, %g0, %g4) C plus the borrow = next carry limb + brgz,pt n, L(tpu) + add n, -1, n + + sub %o5, %g4, %o5 +L(edu): mulx %o5, dinv, %g1 C last quotient limb + return %i7+8 + stx %g1, [%o0] C delay slot: return has restored the caller window, so qp is o0 here + +L(norm): + mulx dinv, %o5, %g1 C first quotient limb + brlz,pt n, L(edn) C single limb: done after the store below + stx %g1, [qp] C delay slot + add qp, 8, qp + addcc %g0, 0, %g4 C clear the carry flag before the loop + +L(tpn): umulxhi(d, %g1, %g1) C high half of d * previous quotient limb + ldx [ap+8], %g5 + add ap, 8, ap + addxc( %g1, %g0, %g1) C plus the borrow from the previous subtract + subcc %g5, %g1, %g1 + mulx %g1, dinv, %g1 C quotient limb + stx %g1, [qp] + add qp, 8, qp + brgz,pt n, L(tpn) + add n, -1, n + +L(edn): return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm new file mode 100644 index
0000000..49ccaec --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm @@ -0,0 +1,145 @@ +dnl SPARC v9 mpn_cnd_add_n and mpn_cnd_sub_n for T3/T4/T5. + +dnl Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl Copyright 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 8.5 +C UltraSPARC T4: 3 + +C We use a double-pointer trick to allow indexed addressing. Its setup +C cost might be a problem in these functions, since we don't expect huge n +C arguments. +C +C For sub we need ~(a & mask) = (~a | ~mask) but by complementing mask we can +C instead do ~(a & ~mask) = (~a | mask), allowing us to use the orn insn. 
+ +C INPUT PARAMETERS +define(`cnd', `%i0') +define(`rp', `%i1') +define(`up', `%i2') +define(`vp', `%i3') +define(`n', `%i4') + +define(`mask', `cnd') +define(`up0', `%l0') define(`up1', `%l1') +define(`vp0', `%l2') define(`vp1', `%l3') +define(`rp0', `%g4') define(`rp1', `%g5') +define(`u0', `%l4') define(`u1', `%l5') +define(`v0', `%l6') define(`v1', `%l7') +define(`x0', `%g1') define(`x1', `%g3') +define(`w0', `%g1') define(`w1', `%g3') + +ifdef(`OPERATION_cnd_add_n',` + define(`LOGOP', `and $1, $2, $3') + define(`MAKEMASK',`cmp %g0, $1 + addxc( %g0, %g0, $2) + neg $2, $2') + define(`INITCY', `addcc %g0, 0, %g0') + define(`RETVAL', `addxc( %g0, %g0, %i0)') + define(`func', `mpn_cnd_add_n') +') +ifdef(`OPERATION_cnd_sub_n',` + define(`LOGOP', `orn $2, $1, $3') + define(`MAKEMASK',`cmp $1, 1 + addxc( %g0, %g0, $2) + neg $2, $2') + define(`INITCY', `subcc %g0, 1, %g0') + define(`RETVAL', `addxc( %g0, %g0, %i0) + xor %i0, 1, %i0') + define(`func', `mpn_cnd_sub_n') +') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + save %sp, -176, %sp + + MAKEMASK(cnd,mask) + + andcc n, 1, %g0 + sllx n, 3, n + add n, -16, n + add vp, n, vp0 + add up, n, up0 + add rp, n, rp0 + neg n, n + be L(evn) + INITCY + +L(odd): ldx [vp0 + n], v1 + ldx [up0 + n], u1 + LOGOP( v1, mask, x1) + addxccc(u1, x1, w1) + stx w1, [rp0 + n] + add n, 8, n + brgz n, L(rtn) + nop + +L(evn): add vp0, 8, vp1 + add up0, 8, up1 + add rp0, -24, rp1 + ldx [vp0 + n], v0 + ldx [vp1 + n], v1 + ldx [up0 + n], u0 + ldx [up1 + n], u1 + add n, 16, n + brgz n, L(end) + add rp0, -16, rp0 + +L(top): LOGOP( v0, mask, x0) + ldx [vp0 + n], v0 + LOGOP( v1, mask, x1) + ldx [vp1 + n], v1 + addxccc(u0, x0, w0) + ldx [up0 + n], u0 + addxccc(u1, x1, w1) + ldx [up1 + n], u1 + stx w0, [rp0 + n] + add n, 16, n + brlez n, L(top) + stx w1, [rp1 + n] + +L(end): LOGOP( v0, mask, x0) + LOGOP( v1, mask, x1) + addxccc(u0, x0, w0) + addxccc(u1, x1, w1) + stx 
w0, [rp0 + n] + stx w1, [rp1 + 32] + +L(rtn): RETVAL + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm new file mode 100644 index 0000000..d7dbdf9 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm @@ -0,0 +1,129 @@ +dnl SPARC T3/T4/T5 mpn_divexact_1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 31 +C UltraSPARC T4/T5: 20-26 hits 20 early, then sharply drops + +C INPUT PARAMETERS +define(`qp', `%i0') +define(`ap', `%i1') +define(`n', `%i2') +define(`d', `%i3') + +define(`dinv',`%o4') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_divexact_1) + save %sp, -176, %sp + cmp n, 1 + bne,pt %xcc, L(gt1) C n != 1: use the inverse-based loops + ldx [ap], %o5 C delay slot: o5 = first source limb on both paths + udivx %o5, d, %g1 C n == 1: one hardware divide + stx %g1, [qp] + return %i7+8 + nop + +L(gt1): add d, -1, %g1 + andn %g1, d, %g1 C (d-1) & ~d = mask of the trailing zero bits of d + popc %g1, %i4 C i4 = count_trailing_zeros(d) + + srlx d, %i4, d C shift the trailing zeros out of d + srlx d, 1, %g1 + and %g1, 127, %g1 C table index = (d >> 1) & 127 + + LEA64(binvert_limb_table, g2, g4) + ldub [%g2+%g1], %g1 C one-byte seed x from binvert_limb_table + add %g1, %g1, %g2 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + sub %g2, %g1, %g2 C x = 2*x - x*x*d, first refinement + add %g2, %g2, %g1 + mulx %g2, %g2, %g2 + mulx %g2, d, %g2 + sub %g1, %g2, %g1 C second refinement + add %g1, %g1, %o7 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + add n, -2, n C n = limb count - 2, the loop counter + brz,pt %i4, L(norm) C no trailing zeros: no limb shifting needed + sub %o7, %g1, dinv C delay slot: third refinement gives dinv + +L(unnorm): + mov 0, %g4 C running carry limb = 0 + sub %g0, %i4, %o2 C o2 = -i4, the left-shift count for the next limb + srlx %o5, %i4, %o5 C shift the first limb right by i4 +L(top_unnorm): + ldx [ap+8], %g3 + add ap, 8, ap + sllx %g3, %o2, %g5 + or %g5, %o5, %g5 C merge with the bits kept from the previous limb + srlx %g3, %i4, %o5 C bits kept for the next iteration + subcc %g5, %g4, %g4 C subtract the carry limb; borrow lands in the carry flag + mulx %g4, dinv, %g1 C quotient limb + stx %g1, [qp] + add qp, 8, qp + umulxhi(d, %g1, %g1) C high half of d * quotient limb + addxc( %g1, %g0, %g4) C plus the borrow = next carry limb + brgz,pt n, L(top_unnorm) + add n, -1, n + + sub %o5, %g4, %g4 + mulx %g4, dinv, %g1 C last quotient limb + stx %g1, [qp] + return %i7+8 + nop + +L(norm): + mulx dinv, %o5, %g1 C first quotient limb + stx %g1, [qp] + add qp, 8, qp + addcc %g0, 0, %g4 C clear the carry flag before the loop +L(top_norm): + umulxhi(d, %g1, %g1) C high half of d * previous quotient limb + ldx [ap+8], %g5 + add ap, 8, ap + addxc( %g1, %g0, %g1) C plus the borrow from the previous subtract + subcc %g5, %g1, %g1 + mulx %g1, dinv, %g1 C quotient limb + stx %g1, [qp] + add qp, 8, qp + brgz,pt n, L(top_norm) + add n, -1, n + + return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm new file mode 100644 index 0000000..20ed8bf --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm @@ -0,0 +1,78 @@ +dnl SPARC v9 mpn_hamdist for T3/T4.
+ +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 18 +C UltraSPARC T4: 3.5 + +C INPUT PARAMETERS +define(`up', `%o0') +define(`vp', `%o1') +define(`n', `%o2') +define(`pcnt', `%o5') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_hamdist) + subcc n, 1, n C n -= 1 and set the flags + be L(final_one) C exactly one limb: skip the two-limb loop + clr pcnt C delay slot: pcnt = 0 +L(top): + ldx [up + 0], %g1 + ldx [vp + 0], %g2 + ldx [up + 8], %o4 + ldx [vp + 8], %g3 + sub n, 2, n C two limbs are consumed per iteration + xor %g1, %g2, %g1 C bits that differ in the first limb pair + add up, 16, up + popc %g1, %g2 C count them + add vp, 16, vp + xor %o4, %g3, %o4 C bits that differ in the second limb pair + add pcnt, %g2, pcnt + popc %o4, %g3 + brgz n, L(top) C at least two limbs still to do + add pcnt, %g3, pcnt C delay slot: accumulate the second count + brlz,pt n, L(done) C n < 0: the limb count was even, nothing is left + nop +L(final_one): + ldx [up + 0], %g1 C one trailing limb pair + ldx [vp + 0], %g2 + xor %g1,%g2, %g1 + popc %g1, %g2 + add pcnt, %g2, pcnt +L(done): + retl C leaf routine: no register window was opened + mov pcnt, %o0 C delay slot: return the total +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm new file mode 100644 index 0000000..4da49cf --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm @@ -0,0 +1,92 @@ +dnl SPARC T3/T4/T5 mpn_invert_limb. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: ? +C UltraSPARC T4/T5: ? + +C INPUT PARAMETERS +define(`d', `%o0') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_invert_limb) + srlx d, 54, %g1 + LEA64(approx_tab, g2, g3) C g2 = address of approx_tab + and %g1, 0x1fe, %g1 C even byte offset built from bits 62..55 of d + srlx d, 24, %g4 + lduh [%g2+%g1], %g3 C 16-bit table entry, the first approximation + add %g4, 1, %g4 C g4 = (d >> 24) + 1 + sllx %g3, 11, %g2 + add %g2, -1, %g2 + mulx %g3, %g3, %g3 + mulx %g3, %g4, %g3 + srlx %g3, 40, %g3 + sub %g2, %g3, %g2 C second approximation, kept in g2 + sllx %g2, 60, %g1 + mulx %g2, %g2, %g3 + mulx %g3, %g4, %g4 + sub %g1, %g4, %g1 + srlx %g1, 47, %g1 + sllx %g2, 13, %g2 + add %g1, %g2, %g1 C third approximation, kept in g1 + and d, 1, %g2 C g2 = low bit of d + srlx %g1, 1, %g4 + sub %g0, %g2, %g3 C all-ones mask when d is odd, zero when even + and %g4, %g3, %g3 + srlx d, 1, %g4 + add %g4, %g2, %g2 C g2 = (d >> 1) + (d & 1) + mulx %g1, %g2, %g2 + sub %g3, %g2, %g2 + umulxhi(%g1, %g2, %g2) + srlx %g2, 1, %g2 + sllx %g1, 31, %g1 + add %g2, %g1, %g1 C fourth approximation, kept in g1 + mulx %g1, d, %g3 C low half of g1 * d + umulxhi(d, %g1, %g4) C high half of g1 * d + addcc %g3, d, %g0 C only the carry of low + d is kept + addxc( %g4, d, %o0) C o0 = high + d + carry + jmp %o7+8 C leaf return + sub %g1, %o0, %o0 C delay slot: result = g1 - o0 +EPILOGUE() + + RODATA + ALIGN(2) + TYPE( approx_tab, object) + SIZE( approx_tab, 512) C 256 halfword entries +approx_tab: C entry for i = 256..511 is 0x7fd00 / i, truncated by eval +forloop(i,256,512-1,dnl +` .half eval(0x7fd00/i) +')dnl diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm new file mode 100644 index 0000000..c79032d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm @@ -0,0 +1,77 @@ +dnl SPARC v9-2011 simulation support. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ASM_START() +PROLOGUE(__gmpn_umulh) C high 64 bits of an unsigned 64x64-bit product, built from 32-bit halves + save %sp, -176, %sp + ldx [%sp+2047+176+256], %o0 C first operand, read from the slot the caller filled + ldx [%sp+2047+176+256+8], %o1 C second operand + rd %ccr, %o4 C save the condition codes; restored before returning + srl %o0, 0, %l4 C l4 = low 32 bits of the first operand + srl %o1, 0, %l1 C l1 = low 32 bits of the second operand + srlx %o1, 32, %o1 C o1 = high 32 bits of the second operand + mulx %o1, %l4, %l2 C l2 = hi2 * lo1 + srlx %o0, 32, %o0 C o0 = high 32 bits of the first operand + mulx %o0, %l1, %l3 C l3 = hi1 * lo2 + mulx %l1, %l4, %l1 C l1 = lo1 * lo2 + srlx %l1, 32, %l1 C keep only its upper half + add %l2, %l1, %l2 + addcc %l2, %l3, %l2 C sum of the cross terms; the carry is picked up below + mulx %o1, %o0, %o1 C o1 = hi1 * hi2 + mov 0, %l1 + movcs %xcc, 1, %l1 + sllx %l1, 32, %l1 C a carry out of the cross-term sum weighs 2^32 in the result + add %o1, %l1, %o1 + srlx %l2, 32, %o0 + add %o1, %o0, %o0 + stx %o0, [%sp+2047+176+256] C the result overwrites the first operand slot + wr %o4, 0, %ccr C restore the condition codes + ret + restore +EPILOGUE() + +PROLOGUE(__gmpn_lzcnt) C leading-zero count of the 64-bit value in the argument slot; the count replaces it + save %sp, -176, %sp + ldx [%sp+2047+176+256], %o0 + brz,a %o0, 2f C zero input: the answer is 64 + mov 64, %o1 C annulled delay slot, runs only when the branch is taken + brlz %o0, 2f C top bit already set: the answer is 0 + mov 0, %o1 C delay slot: also initialises the counter for the loop +1: sllx %o0, 1, %o0 + brgz %o0, 1b C loop until the top bit is set + add %o1, 1, %o1 C delay slot: one more leading zero +2: stx %o1, [%sp+2047+176+256] C store the count on every path, including the two early exits + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 b/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 new file mode 100644 index 0000000..e5d6d8e --- /dev/null +++
b/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 @@ -0,0 +1,88 @@ +dnl SPARC v9-2011 simulation support. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +dnl Usage addxccc(r1,r2,r3, t1) +dnl 64-bit add with carry-in and carry-out +dnl FIXME: Register g2 must not be destination + +define(`addxccc',`dnl + add %sp, -512, %sp + stx %g2, [%sp+2047+256+16] + mov 0, %g2 + movcs %xcc, -1, %g2 + addcc %g2, 1, %g0 + addccc $1, $2, $3 + ldx [%sp+2047+256+16], %g2 + sub %sp, -512, %sp +') + + +dnl Usage addxc(r1,r2,r3, t1,t2) +dnl 64-bit add with carry-in + +define(`addxc',`dnl + bcc %xcc, 1f + add $1, $2, $3 + add $3, 1, $3 +1: +') + + +dnl Usage umulxhi(r1,r2,r3) +dnl 64-bit multiply returning upper 64 bits +dnl Calls __gmpn_umulh using a non-standard calling convention + +define(`umulxhi',`dnl + add %sp, -512, %sp + stx $1, [%sp+2047+256] + stx $2, [%sp+2047+256+8] + stx %o7, [%sp+2047+256+16] + call __gmpn_umulh + nop + ldx [%sp+2047+256+16], %o7 + ldx [%sp+2047+256], $3 + sub %sp, -512, %sp +') +dnl Usage lzcnt(r1,r2) +dnl Plain count leading zeros +dnl Calls __gmpn_lzcnt using a non-standard calling convention + +define(`lzcnt',`dnl + add %sp, -512, %sp + stx %o7, [%sp+2047+256+16] + call __gmpn_lzcnt + stx $1, [%sp+2047+256] + ldx [%sp+2047+256+16], %o7 + ldx [%sp+2047+256], $2 + sub %sp, -512, %sp +') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm new file mode 100644 index 0000000..08facbd --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm @@ -0,0 +1,233 @@ +dnl SPARC T3/T4/T5 mpn_mod_1s_4p. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 30 +C UltraSPARC T4/T5: 4 + +C INPUT PARAMETERS +define(`ap', `%o0') +define(`n', `%o1') +define(`d', `%o2') +define(`cps', `%o3') + + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_mod_1s_4p) + save %sp, -176, %sp + ldx [%i3+16], %o4 + ldx [%i3+24], %o3 + ldx [%i3+32], %o2 + ldx [%i3+40], %o1 + ldx [%i3+48], %o0 + + and %i1, 3, %g3 + sllx %i1, 3, %g1 + add %i0, %g1, %i0 + brz %g3, L(b00) + cmp %g3, 2 + bcs %xcc, L(b01) + nop + be %xcc, L(b10) + nop + +L(b11): ldx [%i0-16], %g2 + mulx %g2, %o4, %g5 + umulxhi(%g2, %o4, %g3) + ldx [%i0-24], %g4 + addcc %g5, %g4, %g5 + addxc( %g3, %g0, %g4) + ldx [%i0-8], %g2 + mulx %g2, %o3, %g1 + umulxhi(%g2, %o3, %g3) + addcc %g1, %g5, %g1 + addxc( %g3, %g4, %g2) + ba,pt %xcc, .L8 + add %i0, -32, %i0 + +L(b00): ldx [%i0-24], %g3 + mulx %g3, %o4, %g2 + umulxhi(%g3, %o4, %g5) + ldx [%i0-32], %g4 + addcc %g2, %g4, %g2 + addxc( %g5, %g0, %g3) + ldx [%i0-16], %g4 + mulx %g4, %o3, %g5 + umulxhi(%g4, %o3, %i5) + addcc %g2, %g5, %g5 + addxc( %g3, %i5, %g4) + ldx [%i0-8], %g2 + mulx %g2, %o2, %g1 + umulxhi(%g2, %o2, %g3) + addcc %g1, %g5, %g1 + addxc( %g3, %g4, %g2) + ba,pt %xcc, .L8 + add %i0, -40, %i0 + +L(b01): ldx [%i0-8], %g1 + mov 0, %g2 + ba,pt %xcc, .L8 + add %i0, 
-16, %i0 + +L(b10): ldx [%i0-8], %g2 + ldx [%i0-16], %g1 + add %i0, -24, %i0 + +.L8: add %i1, -5, %g3 + brlz,pn %g3, L(end) + nop + +L(top): ldx [%i0-16], %i4 + mulx %i4, %o4, %o5 + umulxhi(%i4, %o4, %i1) + ldx [%i0-24], %i5 + addcc %o5, %i5, %o5 + addxc( %i1, %g0, %i4) + ldx [%i0-8], %i5 + mulx %i5, %o3, %o7 + umulxhi(%i5, %o3, %i1) + addcc %o5, %o7, %o7 + addxc( %i4, %i1, %i5) + ldx [%i0+0], %g4 + mulx %g4, %o2, %i1 + umulxhi(%g4, %o2, %i4) + addcc %o7, %i1, %i1 + addxc( %i5, %i4, %g4) + mulx %g1, %o1, %i5 + umulxhi(%g1, %o1, %i4) + addcc %i1, %i5, %i5 + addxc( %g4, %i4, %g5) + mulx %g2, %o0, %g1 + umulxhi(%g2, %o0, %g4) + addcc %g1, %i5, %g1 + addxc( %g4, %g5, %g2) + add %g3, -4, %g3 + brgez,pt %g3, L(top) + add %i0, -32, %i0 + +L(end): mulx %g2, %o4, %g5 + umulxhi(%g2, %o4, %g3) + addcc %g1, %g5, %g5 + addxc( %g3, %g0, %g2) + ldx [%i3+8], %i0 + ldx [%i3], %g4 + sub %g0, %i0, %i5 + srlx %g5, %i5, %i5 + sllx %g2, %i0, %g2 + or %i5, %g2, %g1 + mulx %g1, %g4, %l7 + umulxhi(%g1, %g4, %g3) + sllx %g5, %i0, %g2 + add %g1, 1, %g1 + addcc %l7, %g2, %g5 + addxc( %g3, %g1, %g1) + mulx %g1, %i2, %g1 + sub %g2, %g1, %g2 + cmp %g2, %g5 + add %i2, %g2, %g1 + movlu %xcc, %g2, %g1 + subcc %g1, %i2, %g2 + movgeu %xcc, %g2, %g1 + return %i7+8 + srlx %g1, %o0, %o0 +EPILOGUE() + +PROLOGUE(mpn_mod_1s_4p_cps) + save %sp, -176, %sp + lzcnt( %i1, %i5) + sllx %i1, %i5, %i1 + call mpn_invert_limb, 0 + mov %i1, %o0 + stx %o0, [%i0] + sra %i5, 0, %g1 + stx %g1, [%i0+8] + sub %g0, %i5, %g2 + srlx %o0, %g2, %g2 + mov 1, %g1 + sllx %g1, %i5, %g1 + or %g2, %g1, %g2 + sub %g0, %i1, %g1 + mulx %g2, %g1, %g2 + srlx %g2, %i5, %g1 + stx %g1, [%i0+16] + + umulxhi(%o0, %g2, %g3) + add %g2, %g3, %g3 + xnor %g0, %g3, %g3 + mulx %g3, %i1, %g3 + mulx %g2, %o0, %g2 + cmp %g2, %g3 + add %i1, %g3, %g1 + movgeu %xcc, %g3, %g1 + srlx %g1, %i5, %g2 + stx %g2, [%i0+24] + + umulxhi(%o0, %g1, %g3) + add %g1, %g3, %g3 + xnor %g0, %g3, %g3 + mulx %g3, %i1, %g3 + mulx %g1, %o0, %g1 + cmp %g1, %g3 + add %i1, %g3, %g2 
+ movgeu %xcc, %g3, %g2 + srlx %g2, %i5, %g1 + stx %g1, [%i0+32] + + umulxhi(%o0, %g2, %g3) + add %g2, %g3, %g3 + xnor %g0, %g3, %g3 + mulx %g3, %i1, %g3 + mulx %g2, %o0, %g2 + cmp %g2, %g3 + add %i1, %g3, %g1 + movgeu %xcc, %g3, %g1 + srlx %g1, %i5, %g2 + stx %g2, [%i0+40] + + umulxhi(%o0, %g1, %g2) + add %g1, %g2, %g2 + xnor %g0, %g2, %g2 + mulx %g2, %i1, %g2 + mulx %g1, %o0, %o0 + cmp %o0, %g2 + add %i1, %g2, %g3 + movgeu %xcc, %g2, %g3 + srlx %g3, %i5, %i5 + stx %i5, [%i0+48] + + return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm new file mode 100644 index 0000000..8744280 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm @@ -0,0 +1,117 @@ +dnl SPARC v9 mpn_mod_34lsub1 for T3/T4/T5. + +dnl Copyright 2005, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: - +C UltraSPARC T3: 5 +C UltraSPARC T4: 1.57 + +C This is based on the powerpc64/mode64 code. + +C INPUT PARAMETERS +define(`up', `%i0') +define(`n', `%i1') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_mod_34lsub1) + save %sp, -176, %sp + + mov 0, %g1 + mov 0, %g3 + mov 0, %g4 + addcc %g0, 0, %g5 + + add n, -3, n + brlz n, L(lt3) + nop + + add n, -3, n + ldx [up+0], %l5 + ldx [up+8], %l6 + ldx [up+16], %l7 + brlz n, L(end) + add up, 24, up + + ALIGN(16) +L(top): addxccc(%g1, %l5, %g1) + ldx [up+0], %l5 + addxccc(%g3, %l6, %g3) + ldx [up+8], %l6 + addxccc(%g4, %l7, %g4) + ldx [up+16], %l7 + add n, -3, n + brgez n, L(top) + add up, 24, up + +L(end): addxccc( %g1, %l5, %g1) + addxccc(%g3, %l6, %g3) + addxccc(%g4, %l7, %g4) + addxc( %g5, %g0, %g5) + +L(lt3): cmp n, -2 + blt L(2) + nop + + ldx [up+0], %l5 + mov 0, %l6 + beq L(1) + addcc %g1, %l5, %g1 + + ldx [up+8], %l6 +L(1): addxccc(%g3, %l6, %g3) + addxccc(%g4, %g0, %g4) + addxc( %g5, %g0, %g5) + +L(2): sllx %g1, 16, %l0 + srlx %l0, 16, %l0 C %l0 = %g1 mod 2^48 + srlx %g1, 48, %l3 C %l3 = %g1 div 2^48 + srl %g3, 0, %g1 + sllx %g1, 16, %l4 C %l4 = (%g3 mod 2^32) << 16 + srlx %g3, 32, %l5 C %l5 = %g3 div 2^32 + sethi %hi(0xffff0000), %g1 + andn %g4, %g1, %g1 + sllx %g1, 32, %l6 C %l6 = (%g4 mod 2^16) << 32 + srlx %g4, 16, %l7 C %l7 = %g4 div 2^16 + + add %l0, %l3, %l0 + add %l4, %l5, %l4 + add %l6, %l7, %l6 + + add %l0, %l4, %l0 + add %l6, %g5, %l6 + + add %l0, %l6, %i0 + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm new file mode 100644 index 0000000..494e1d3 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm @@ -0,0 +1,82 @@ +dnl SPARC T3/T4/T5 mpn_modexact_1c_odd. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 30 +C UltraSPARC T4/T5: 26 + +C INPUT PARAMETERS +define(`ap', `%o0') +define(`n', `%o1') +define(`d', `%o2') +define(`cy', `%o3') + +define(`dinv',`%o5') +define(`a0', `%g1') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_modexact_1c_odd) + srlx d, 1, %g1 + and %g1, 127, %g1 + + LEA64(binvert_limb_table, g2, g4) + ldub [%g2+%g1], %g1 + add %g1, %g1, %g2 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + sub %g2, %g1, %g2 + add %g2, %g2, %g1 + mulx %g2, %g2, %g2 + mulx %g2, d, %g2 + sub %g1, %g2, %g1 + add %g1, %g1, %o5 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + sub %o5, %g1, dinv + add n, -1, n + +L(top): ldx [ap], a0 + add ap, 8, ap + subcc a0, cy, %g3 + mulx %g3, dinv, %g5 + umulxhi(d, %g5, %g5) + addxc( %g5, %g0, cy) + brnz,pt n, L(top) + add n, -1, n + + retl + mov cy, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm new file mode 100644 
index 0000000..af05d62 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm @@ -0,0 +1,174 @@ +dnl SPARC v9 mpn_mul_1 for T3/T4/T5. + +dnl Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 23 +C UltraSPARC T4: 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_mul_1) + save %sp, -176, %sp + + and n, 3, %g5 + add n, -4, n + brz %g5, L(b0) + cmp %g5, 2 + bcs %xcc, L(b1) + nop + be %xcc, L(b2) + nop + +L(b3): addcc %g0, %g0, %i5 + ldx [up+0], %l0 + ldx [up+8], %l1 + ldx [up+16], %l2 + mulx %l0, v0, %o0 + umulxhi(%l0, v0, %o1) + brgz n, L(gt3) + add rp, -8, rp + mulx %l1, v0, %o2 + umulxhi(%l1, v0, %o3) + b L(wd3) + nop +L(gt3): ldx [up+24], %l3 + mulx %l1, v0, %o2 + umulxhi(%l1, v0, %o3) + add up, 24, up + b L(lo3) + add n, -3, n + +L(b2): addcc %g0, %g0, %o1 + ldx [up+0], %l1 + ldx [up+8], %l2 + brgz n, L(gt2) + add rp, -16, rp + mulx %l1, v0, %o2 + umulxhi(%l1, v0, %o3) + mulx %l2, v0, %o4 + umulxhi(%l2, v0, %o5) + b L(wd2) + nop +L(gt2): ldx [up+16], %l3 + mulx %l1, v0, %o2 + umulxhi(%l1, v0, %o3) + ldx [up+24], %l0 + mulx %l2, v0, %o4 + umulxhi(%l2, v0, %o5) + add up, 16, up + b L(lo2) + add n, -2, n + +L(b1): addcc %g0, %g0, %o3 + ldx [up+0], %l2 + brgz n, L(gt1) + nop + mulx %l2, v0, %o4 + stx %o4, [rp+0] + umulxhi(%l2, v0, %i0) + ret + restore +L(gt1): ldx [up+8], %l3 + ldx [up+16], %l0 + mulx %l2, v0, %o4 + umulxhi(%l2, v0, %o5) + ldx [up+24], %l1 + mulx %l3, v0, %i4 + umulxhi(%l3, v0, %i5) + add rp, -24, rp + add up, 8, up + b L(lo1) + add n, -1, n + +L(b0): addcc %g0, %g0, %o5 + ldx [up+0], %l3 + ldx [up+8], %l0 + ldx [up+16], %l1 + mulx %l3, v0, %i4 + umulxhi(%l3, v0, %i5) + ldx [up+24], %l2 + mulx %l0, v0, %o0 + umulxhi(%l0, v0, %o1) + b L(lo0) + nop + + ALIGN(16) +L(top): ldx [up+0], %l3 C 0 + addxccc(%i4, %o5, %i4) C 0 + mulx %l1, v0, %o2 C 1 + stx %i4, [rp+0] C 1 + umulxhi(%l1, v0, %o3) C 2 +L(lo3): ldx [up+8], %l0 C 2 + addxccc(%o0, %i5, %o0) C 3 + mulx %l2, v0, %o4 C 3 + stx %o0, [rp+8] C 4 + umulxhi(%l2, v0, %o5) C 4 +L(lo2): ldx [up+16], %l1 
C 5 + addxccc(%o2, %o1, %o2) C 5 + mulx %l3, v0, %i4 C 6 + stx %o2, [rp+16] C 6 + umulxhi(%l3, v0, %i5) C 7 +L(lo1): ldx [up+24], %l2 C 7 + addxccc(%o4, %o3, %o4) C 8 + mulx %l0, v0, %o0 C 8 + stx %o4, [rp+24] C 9 + umulxhi(%l0, v0, %o1) C 9 + add rp, 32, rp C 10 +L(lo0): add up, 32, up C 10 + brgz n, L(top) C 11 + add n, -4, n C 11 + +L(end): addxccc(%i4, %o5, %i4) + mulx %l1, v0, %o2 + stx %i4, [rp+0] + umulxhi(%l1, v0, %o3) + addxccc(%o0, %i5, %o0) +L(wd3): mulx %l2, v0, %o4 + stx %o0, [rp+8] + umulxhi(%l2, v0, %o5) + addxccc(%o2, %o1, %o2) +L(wd2): stx %o2, [rp+16] + addxccc(%o4, %o3, %o4) + stx %o4, [rp+24] + addxc( %g0, %o5, %i0) + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm new file mode 100644 index 0000000..de80f3c --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm @@ -0,0 +1,70 @@ +dnl SPARC v9 mpn_popcount for T3/T4. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 15 +C UltraSPARC T4: 2.5 + +C INPUT PARAMETERS +define(`up', `%o0') +define(`n', `%o1') +define(`pcnt', `%o5') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_popcount) + subcc n, 1, n + be L(final_one) + clr pcnt +L(top): + ldx [up + 0], %g1 + sub n, 2, n + ldx [up + 8], %o4 + add up, 16, up + popc %g1, %g2 + popc %o4, %g3 + add pcnt, %g2, pcnt + brgz n, L(top) + add pcnt, %g3, pcnt + brlz,pt n, L(done) + nop +L(final_one): + ldx [up + 0], %g1 + popc %g1, %g2 + add pcnt, %g2, pcnt +L(done): + retl + mov pcnt, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm new file mode 100644 index 0000000..d46499f --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm @@ -0,0 +1,93 @@ +dnl SPARC v9 mpn_sqr_diag_addlsh1 for T3/T4/T5. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: ? +C UltraSPARC T4: >= 4.5 + + +define(`rp', `%i0') +define(`tp', `%i1') +define(`up', `%i2') +define(`n', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sqr_diag_addlsh1) + save %sp, -176, %sp + + ldx [up+0], %g1 + mulx %g1, %g1, %o0 + umulxhi(%g1, %g1, %g2) + stx %o0, [rp+0] + + ldx [up+8], %g1 + ldx [tp+0], %g4 + ldx [tp+8], %g5 + mulx %g1, %g1, %o0 + orcc %g0, %g0, %o5 + b L(dm) + add n, -2, n + + ALIGN(16) +L(top): ldx [up+8], %g1 + addcc %g4, %o2, %o2 + addxccc(%g5, %o0, %g3) + ldx [tp+16], %g4 + ldx [tp+24], %g5 + mulx %g1, %g1, %o0 + stx %o2, [rp+8] + stx %g3, [rp+16] + add rp, 16, rp + add tp, 16, tp +L(dm): add %g2, %o5, %o2 + umulxhi(%g1, %g1, %g2) + addxccc(%g4, %g4, %g4) + addxccc(%g5, %g5, %g5) + add up, 8, up + addxc( %g0, %g0, %o5) + brnz n, L(top) + add n, -1, n + + addcc %o2, %g4, %g4 + addxccc(%o0, %g5, %g5) + stx %g4, [rp+8] + stx %g5, [rp+16] + addxc( %o5, %g2, %g2) + stx %g2, [rp+24] + + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm new file mode 100644 index 0000000..0e4bc93 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm @@ -0,0 +1,144 @@ +dnl SPARC v9 mpn_sub_n for T3/T4. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 8 +C UltraSPARC T4: 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n', `%i3') +define(`cy', `%i4') + +define(`u0_off', `%l0') +define(`u1_off', `%l1') +define(`v0_off', `%l2') +define(`v1_off', `%l3') +define(`r0_off', `%l4') +define(`r1_off', `%l5') +define(`loop_n', `%l6') +define(`tmp', `%l7') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sub_nc) + save %sp, -176, %sp + ba,pt %xcc, L(ent) + xor cy, 1, cy +EPILOGUE() +PROLOGUE(mpn_sub_n) + save %sp, -176, %sp + mov 1, cy +L(ent): + subcc n, 1, n + be L(final_one) + cmp %g0, cy + + ldx [up + 0], %o4 + sllx n, 3, tmp + + ldx [vp + 0], %o5 + add up, tmp, u0_off + + ldx [up + 8], %g5 + add vp, tmp, v0_off + + ldx [vp + 8], %g1 + add rp, tmp, r0_off + + neg tmp, loop_n + add u0_off, 8, u1_off + + add v0_off, 8, v1_off + sub loop_n, -(2 * 8), loop_n + + sub r0_off, 16, r0_off + brgez,pn loop_n, L(loop_tail) + 
sub r0_off, 8, r1_off + + b,a L(top) + ALIGN(16) +L(top): + xnor %o5, 0, tmp + ldx [loop_n + v0_off], %o5 + + addxccc(%o4, tmp, %g3) + ldx [loop_n + u0_off], %o4 + + xnor %g1, 0, %g1 + stx %g3, [loop_n + r0_off] + + addxccc(%g5, %g1, tmp) + ldx [loop_n + v1_off], %g1 + + ldx [loop_n + u1_off], %g5 + sub loop_n, -(2 * 8), loop_n + + brlz loop_n, L(top) + stx tmp, [loop_n + r1_off] + +L(loop_tail): + xnor %o5, 0, tmp + xnor %g1, 0, %g1 + + addxccc(%o4, tmp, %g3) + add loop_n, u0_off, up + + addxccc(%g5, %g1, %g5) + add loop_n, r0_off, rp + + stx %g3, [rp + 0] + add loop_n, v0_off, vp + + brgz,pt loop_n, L(done) + stx %g5, [rp + 8] + + add rp, (2 * 8), rp + +L(final_one): + ldx [up+0], %o4 + ldx [vp+0], %o5 + xnor %o5, %g0, %o5 + addxccc(%o4, %o5, %g3) + stx %g3, [rp+0] + +L(done): + clr %i0 + movcc %xcc, 1, %i0 + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm new file mode 100644 index 0000000..5635d1b --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm @@ -0,0 +1,170 @@ +dnl SPARC v9 mpn_submul_1 for T3/T4/T5. + +dnl Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 26 +C UltraSPARC T4: 4.5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_submul_1) + save %sp, -176, %sp + ldx [up+0], %g1 + + and n, 3, %g5 + add n, -4, n + brz %g5, L(b00) + cmp %g5, 2 + bcs %xcc, L(b01) + nop + bne %xcc, L(b11) + ldx [up+8], %g4 + +L(b10): add up, 16, up + addcc %g0, 0, %g3 + mulx %g1, v0, %l4 + umulxhi(%g1, v0, %l5) + ldx [rp+0], %o2 + mulx %g4, v0, %l6 + umulxhi(%g4, v0, %l7) + brlz n, L(wd2) + nop +L(gt2): ldx [up+0], %o0 + b L(lo2) + nop + +L(b00): add rp, -16, rp + addcc %g0, 0, %g3 + ldx [up+8], %o1 + mulx %g1, v0, %l0 + umulxhi(%g1, v0, %l1) + ldx [up+16], %o0 + ldx [rp+16], %o2 + mulx %o1, v0, %l2 + umulxhi(%o1, v0, %l3) + b L(lo0) + nop + +L(b01): add up, 8, up + add rp, -8, rp + addcc %g0, 0, %g3 + ldx [rp+8], %o3 + mulx %g1, v0, %l6 + umulxhi(%g1, v0, %l7) + brlz n, L(wd1) + nop + ldx [up+0], %o0 + ldx [up+8], %o1 + mulx %o0, v0, %l0 + umulxhi(%o0, v0, %l1) + b L(lo1) + nop + +L(b11): add up, 24, up + add rp, 8, rp + addcc %g0, 0, %g3 + mulx %g1, v0, %l2 + umulxhi(%g1, v0, %l3) + ldx [up-8], %o1 + ldx [rp-8], %o3 + mulx %g4, v0, %l4 + umulxhi(%g4, v0, %l5) + brlz n, L(end) + nop + + ALIGN(16) +L(top): ldx [up+0], %o0 + addxccc(%g3, %l2, %g1) + ldx [rp+0], %o2 + addxc( %g0, %l3, %g3) + mulx %o1, v0, %l6 + subcc %o3, %g1, %g4 + umulxhi(%o1, v0, %l7) + stx %g4, [rp-8] +L(lo2): ldx 
[up+8], %o1 + addxccc(%g3, %l4, %g1) + ldx [rp+8], %o3 + addxc( %g0, %l5, %g3) + mulx %o0, v0, %l0 + subcc %o2, %g1, %g4 + umulxhi(%o0, v0, %l1) + stx %g4, [rp+0] +L(lo1): ldx [up+16], %o0 + addxccc(%g3, %l6, %g1) + ldx [rp+16], %o2 + addxc( %g0, %l7, %g3) + mulx %o1, v0, %l2 + subcc %o3, %g1, %g4 + umulxhi(%o1, v0, %l3) + stx %g4, [rp+8] +L(lo0): ldx [up+24], %o1 + addxccc(%g3, %l0, %g1) + ldx [rp+24], %o3 + addxc( %g0, %l1, %g3) + mulx %o0, v0, %l4 + subcc %o2, %g1, %g4 + umulxhi(%o0, v0, %l5) + stx %g4, [rp+16] + add n, -4, n + add up, 32, up + brgez n, L(top) + add rp, 32, rp + +L(end): addxccc(%g3, %l2, %g1) + ldx [rp+0], %o2 + addxc( %g0, %l3, %g3) + mulx %o1, v0, %l6 + subcc %o3, %g1, %g4 + umulxhi(%o1, v0, %l7) + stx %g4, [rp-8] +L(wd2): addxccc(%g3, %l4, %g1) + ldx [rp+8], %o3 + addxc( %g0, %l5, %g3) + subcc %o2, %g1, %g4 + stx %g4, [rp+0] +L(wd1): addxccc(%g3, %l6, %g1) + addxc( %g0, %l7, %g3) + subcc %o3, %g1, %g4 + stx %g4, [rp+8] + addxc( %g0, %g3, %i0) + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h b/gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h new file mode 100644 index 0000000..c10fd0d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h @@ -0,0 +1,174 @@ +/* Sparc64 T4-T5 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3600 MHz ultrasparct5 running GNU/Linux */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-10-01, gcc 7.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_1P_METHOD 2 /* 0.34% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 +/* From gcc105.fsffrance.org, 2023-07-25 */ +#define DIV_QR_1N_PI1_METHOD 4 /* 7.06% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD 2 +#define DIV_QR_2_PI2_THRESHOLD 5 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 19 + +#define DIV_1_VS_MUL_1_PERCENT 654 + +#define MUL_TOOM22_THRESHOLD 40 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 372 +#define MUL_TOOM6H_THRESHOLD 494 +#define MUL_TOOM8H_THRESHOLD 656 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 126 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 247 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 225 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 219 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 188 + +#define SQR_BASECASE_THRESHOLD 20 +#define SQR_TOOM2_THRESHOLD 59 +#define SQR_TOOM3_THRESHOLD 107 +#define SQR_TOOM4_THRESHOLD 298 +#define SQR_TOOM6_THRESHOLD 399 +#define 
SQR_TOOM8_THRESHOLD 562 + +#define MULMID_TOOM42_THRESHOLD 48 + +#define MULMOD_BNM1_THRESHOLD 25 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 555 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 555, 5}, { 29, 6}, { 31, 7}, { 31, 8}, \ + { 17, 7}, { 36, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 8}, { 29, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 35, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \ + { 57,10}, { 15, 8}, { 61, 9}, { 31, 8}, \ + { 67, 9}, { 35, 8}, { 71, 9}, { 39, 8}, \ + { 81, 9}, { 43,10}, { 23, 9}, { 59,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 87,10}, { 47, 9}, { 99,10}, { 55, 9}, \ + { 115,11}, { 31,10}, { 63, 9}, { 131,10}, \ + { 87,11}, { 47,10}, { 111, 9}, { 223,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 143,10}, { 287,11}, { 159,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 75 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 372 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 372, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \ + { 159,10}, { 319, 9}, { 639,12}, { 95,11}, \ + { 191,10}, { 383, 9}, { 767,11}, { 207,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 
65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 75 +#define SQR_FFT_THRESHOLD 3776 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 35 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 168 +#define SQRLO_SQR_THRESHOLD 7511 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 103 +#define DC_BDIV_QR_THRESHOLD 28 +#define DC_BDIV_Q_THRESHOLD 88 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 181 +#define INV_APPR_THRESHOLD 118 + +#define BINV_NEWTON_THRESHOLD 296 +#define REDC_1_TO_REDC_2_THRESHOLD 4 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1970 +#define MU_DIVAPPR_Q_THRESHOLD 1970 +#define MUPI_DIV_QR_THRESHOLD 82 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 1970 + +#define POWM_SEC_TABLE 1,58,102,1509 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 686 +#define SET_STR_PRECOMPUTE_THRESHOLD 2717 + +#define FAC_DSC_THRESHOLD 336 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 32 +#define HGCD2_DIV1_METHOD 1 /* 0.66% faster than 3 */ +#define HGCD_THRESHOLD 57 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 386 +#define GCDEXT_DC_THRESHOLD 288 +#define JACOBI_BASE_METHOD 4 /* 2.50% faster than 3 */ -- cgit v1.2.3