From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/alpha/ev6/add_n.asm | 283 ++++++++++++++++ gmp-6.3.0/mpn/alpha/ev6/aorslsh1_n.asm | 172 ++++++++++ gmp-6.3.0/mpn/alpha/ev6/aorsmul_1.asm | 398 +++++++++++++++++++++++ gmp-6.3.0/mpn/alpha/ev6/gmp-mparam.h | 209 ++++++++++++ gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm | 336 +++++++++++++++++++ gmp-6.3.0/mpn/alpha/ev6/mul_1.asm | 496 +++++++++++++++++++++++++++++ gmp-6.3.0/mpn/alpha/ev6/nails/README | 65 ++++ gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm | 396 +++++++++++++++++++++++ gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm | 146 +++++++++ gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm | 169 ++++++++++ gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm | 210 ++++++++++++ gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm | 233 ++++++++++++++ gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h | 72 +++++ gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm | 364 +++++++++++++++++++++ gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm | 396 +++++++++++++++++++++++ gmp-6.3.0/mpn/alpha/ev6/slot.pl | 318 ++++++++++++++++++ gmp-6.3.0/mpn/alpha/ev6/sub_n.asm | 283 ++++++++++++++++ 17 files changed, 4546 insertions(+) create mode 100644 gmp-6.3.0/mpn/alpha/ev6/add_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/aorslsh1_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/mul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/README create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h 
create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm create mode 100755 gmp-6.3.0/mpn/alpha/ev6/slot.pl create mode 100644 gmp-6.3.0/mpn/alpha/ev6/sub_n.asm (limited to 'gmp-6.3.0/mpn/alpha/ev6') diff --git a/gmp-6.3.0/mpn/alpha/ev6/add_n.asm b/gmp-6.3.0/mpn/alpha/ev6/add_n.asm new file mode 100644 index 0000000..9261f31 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/add_n.asm @@ -0,0 +1,283 @@ +dnl Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: ? +C EV5: 5.4 +C EV6: 2.125 + +C INPUT PARAMETERS +C rp r16 +C up r17 +C vp r18 +C n r19 +C cy r20 (for mpn_add_nc) + +C TODO +C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1) +C Use multi-pronged feed-in. 
+C Perform additional micro-tuning + +C This code was written in cooperation with ev6 pipeline expert Steve Root. + +C Pair loads and stores where possible +C Store pairs oct-aligned where possible (didn't need it here) +C Stores are delayed every third cycle +C Loads and stores are delayed by fills +C U stays still, put code there where possible (note alternation of U1 and U0) +C L moves because of loads and stores +C Note dampers in L to limit damage + +C This odd-looking optimization expects that we're having random bits in our +C data, so that a pure zero result is unlikely, so we penalize the unlikely +C case to help the common case. + +define(`u0', `r0') define(`u1', `r3') +define(`v0', `r1') define(`v1', `r4') + +define(`cy0', `r20') define(`cy1', `r21') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc) + +ASM_START() +PROLOGUE(mpn_add_nc) + br r31, $entry +EPILOGUE() +PROLOGUE(mpn_add_n) + bis r31, r31, cy0 C clear carry in +$entry: cmpult r19, 5, r22 C L1 move counter + ldq u1, 0(r17) C L0 get next ones + ldq v1, 0(r18) C L1 + bne r22, $Lsmall + + ldq u0, 8(r17) C L0 get next ones + ldq v0, 8(r18) C L1 + addq u1, v1, r5 C U0 add two data + + cmpult r5, v1, r23 C U0 did it carry + ldq u1, 16(r17) C L0 get next ones + ldq v1, 16(r18) C L1 + + addq u0, v0, r8 C U1 add two data + addq r5, cy0, r5 C U0 carry in + + cmpult r8, v0, r22 C U1 did it carry + beq r5, $fix5f C U0 fix exact zero +$ret5f: ldq u0, 24(r17) C L0 get next ones + ldq v0, 24(r18) C L1 + + addq r8, r23, r8 C U1 carry from last + addq u1, v1, r7 C U0 add two data + + beq r8, $fix6f C U1 fix exact zero +$ret6f: cmpult r7, v1, r23 C U0 did it carry + ldq u1, 32(r17) C L0 get next ones + ldq v1, 32(r18) C L1 + + lda r17, 40(r17) C L0 move pointer + lda r18, 40(r18) C L1 move pointer + + lda r16, -8(r16) + lda r19, -13(r19) C L1 move counter + blt r19, $Lend C U1 loop control + + +C Main loop. 8-way unrolled. 
+ ALIGN(16) +$Loop: addq u0, v0, r2 C U1 add two data + addq r7, r22, r7 C U0 add in carry + stq r5, 8(r16) C L0 put an answer + stq r8, 16(r16) C L1 pair + + cmpult r2, v0, cy1 C U1 did it carry + beq r7, $fix7 C U0 fix exact 0 +$ret7: ldq u0, 0(r17) C L0 get next ones + ldq v0, 0(r18) C L1 + + bis r31, r31, r31 C L damp out + addq r2, r23, r2 C U1 carry from last + bis r31, r31, r31 C L moves in L ! + addq u1, v1, r5 C U0 add two data + + beq r2, $fix0 C U1 fix exact zero +$ret0: cmpult r5, v1, cy0 C U0 did it carry + ldq u1, 8(r17) C L0 get next ones + ldq v1, 8(r18) C L1 + + addq u0, v0, r8 C U1 add two data + addq r5, cy1, r5 C U0 carry from last + stq r7, 24(r16) C L0 store pair + stq r2, 32(r16) C L1 + + cmpult r8, v0, r22 C U1 did it carry + beq r5, $fix1 C U0 fix exact zero +$ret1: ldq u0, 16(r17) C L0 get next ones + ldq v0, 16(r18) C L1 + + lda r16, 64(r16) C L0 move pointer + addq r8, cy0, r8 C U1 carry from last + lda r19, -8(r19) C L1 move counter + addq u1, v1, r7 C U0 add two data + + beq r8, $fix2 C U1 fix exact zero +$ret2: cmpult r7, v1, r23 C U0 did it carry + ldq u1, 24(r17) C L0 get next ones + ldq v1, 24(r18) C L1 + + addq u0, v0, r2 C U1 add two data + addq r7, r22, r7 C U0 add in carry + stq r5, -24(r16) C L0 put an answer + stq r8, -16(r16) C L1 pair + + cmpult r2, v0, cy1 C U1 did it carry + beq r7, $fix3 C U0 fix exact 0 +$ret3: ldq u0, 32(r17) C L0 get next ones + ldq v0, 32(r18) C L1 + + bis r31, r31, r31 C L damp out + addq r2, r23, r2 C U1 carry from last + bis r31, r31, r31 C L moves in L ! 
+ addq u1, v1, r5 C U0 add two data + + beq r2, $fix4 C U1 fix exact zero +$ret4: cmpult r5, v1, cy0 C U0 did it carry + ldq u1, 40(r17) C L0 get next ones + ldq v1, 40(r18) C L1 + + addq u0, v0, r8 C U1 add two data + addq r5, cy1, r5 C U0 carry from last + stq r7, -8(r16) C L0 store pair + stq r2, 0(r16) C L1 + + cmpult r8, v0, r22 C U1 did it carry + beq r5, $fix5 C U0 fix exact zero +$ret5: ldq u0, 48(r17) C L0 get next ones + ldq v0, 48(r18) C L1 + + ldl r31, 256(r17) C L0 prefetch + addq r8, cy0, r8 C U1 carry from last + ldl r31, 256(r18) C L1 prefetch + addq u1, v1, r7 C U0 add two data + + beq r8, $fix6 C U1 fix exact zero +$ret6: cmpult r7, v1, r23 C U0 did it carry + ldq u1, 56(r17) C L0 get next ones + ldq v1, 56(r18) C L1 + + lda r17, 64(r17) C L0 move pointer + bis r31, r31, r31 C U + lda r18, 64(r18) C L1 move pointer + bge r19, $Loop C U1 loop control +C ==== main loop end + +$Lend: addq u0, v0, r2 C U1 add two data + addq r7, r22, r7 C U0 add in carry + stq r5, 8(r16) C L0 put an answer + stq r8, 16(r16) C L1 pair + cmpult r2, v0, cy1 C U1 did it carry + beq r7, $fix7c C U0 fix exact 0 +$ret7c: addq r2, r23, r2 C U1 carry from last + addq u1, v1, r5 C U0 add two data + beq r2, $fix0c C U1 fix exact zero +$ret0c: cmpult r5, v1, cy0 C U0 did it carry + addq r5, cy1, r5 C U0 carry from last + stq r7, 24(r16) C L0 store pair + stq r2, 32(r16) C L1 + beq r5, $fix1c C U0 fix exact zero +$ret1c: stq r5, 40(r16) C L0 put an answer + lda r16, 48(r16) C L0 move pointer + + lda r19, 8(r19) + beq r19, $Lret + + ldq u1, 0(r17) + ldq v1, 0(r18) +$Lsmall: + lda r19, -1(r19) + beq r19, $Lend0 + + ALIGN(8) +$Loop0: addq u1, v1, r2 C main add + cmpult r2, v1, r8 C compute cy from last add + ldq u1, 8(r17) + ldq v1, 8(r18) + addq r2, cy0, r5 C carry add + lda r17, 8(r17) + lda r18, 8(r18) + stq r5, 0(r16) + cmpult r5, r2, cy0 C compute cy from last add + lda r19, -1(r19) C decr loop cnt + bis r8, cy0, cy0 C combine cy from the two adds + lda r16, 8(r16) + bne r19, 
$Loop0 +$Lend0: addq u1, v1, r2 C main add + addq r2, cy0, r5 C carry add + cmpult r2, v1, r8 C compute cy from last add + cmpult r5, r2, cy0 C compute cy from last add + stq r5, 0(r16) + bis r8, cy0, r0 C combine cy from the two adds + ret r31,(r26),1 + + ALIGN(8) +$Lret: lda r0, 0(cy0) C copy carry into return register + ret r31,(r26),1 + +$fix5f: bis r23, cy0, r23 C bring forward carry + br r31, $ret5f +$fix6f: bis r22, r23, r22 C bring forward carry + br r31, $ret6f +$fix0: bis cy1, r23, cy1 C bring forward carry + br r31, $ret0 +$fix1: bis cy0, cy1, cy0 C bring forward carry + br r31, $ret1 +$fix2: bis r22, cy0, r22 C bring forward carry + br r31, $ret2 +$fix3: bis r23, r22, r23 C bring forward carry + br r31, $ret3 +$fix4: bis cy1, r23, cy1 C bring forward carry + br r31, $ret4 +$fix5: bis cy1, cy0, cy0 C bring forward carry + br r31, $ret5 +$fix6: bis r22, cy0, r22 C bring forward carry + br r31, $ret6 +$fix7: bis r23, r22, r23 C bring forward carry + br r31, $ret7 +$fix0c: bis cy1, r23, cy1 C bring forward carry + br r31, $ret0c +$fix1c: bis cy0, cy1, cy0 C bring forward carry + br r31, $ret1c +$fix7c: bis r23, r22, r23 C bring forward carry + br r31, $ret7c + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/aorslsh1_n.asm b/gmp-6.3.0/mpn/alpha/ev6/aorslsh1_n.asm new file mode 100644 index 0000000..cb966ce --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/aorslsh1_n.asm @@ -0,0 +1,172 @@ +dnl Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1). + +dnl Copyright 2003, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: ? +C EV5: 7 +C EV6: 4 + +C TODO +C * Tune to reach 3.75 c/l on ev6. + +define(`rp',`r16') +define(`up',`r17') +define(`vp',`r18') +define(`n', `r19') + +define(`u0', `r8') +define(`u1', `r1') +define(`v0', `r4') +define(`v1', `r5') + +define(`cy0', `r0') +define(`cy1', `r20') +define(`cy', `r22') +define(`rr', `r24') +define(`ps', `r25') +define(`sl', `r28') + +ifdef(`OPERATION_addlsh1_n',` + define(ADDSUB, addq) + define(CARRY, `cmpult $1,$2,$3') + define(func, mpn_addlsh1_n) +') +ifdef(`OPERATION_sublsh1_n',` + define(ADDSUB, subq) + define(CARRY, `cmpult $2,$1,$3') + define(func, mpn_sublsh1_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) + +ASM_START() +PROLOGUE(func) + and n, 2, cy0 + blbs n, L(bx1) +L(bx0): ldq v1, 0(vp) + ldq u1, 0(up) + lda r2, 0(r31) + bne cy0, L(b10) + +L(b00): lda vp, 48(vp) + lda up, -16(up) + lda rp, -8(rp) + lda cy0, 0(r31) + br r31, L(lo0) + +L(b10): lda vp, 32(vp) + lda rp, 8(rp) + lda cy0, 0(r31) + br r31, L(lo2) + +L(bx1): ldq v0, 0(vp) + ldq u0, 0(up) + lda r3, 0(r31) + beq cy0, L(b01) + +L(b11): lda vp, 40(vp) + lda up, -24(up) + lda rp, 16(rp) + lda cy1, 0(r31) + br r31, L(lo3) + +L(b01): lda n, -4(n) + lda cy1, 0(r31) + ble n, L(end) + lda vp, 24(vp) + lda up, -8(up) + + ALIGN(16) +L(top): 
addq v0, v0, r6 + ldq v1, -16(vp) + addq r6, r3, sl C combined vlimb + ldq u1, 16(up) + ADDSUB u0, sl, ps C ulimb + (vlimb << 1) + cmplt v0, r31, r2 C high v bits + ADDSUB ps, cy1, rr C consume carry from previous operation + CARRY( ps, u0, cy0) C carry out #2 + stq rr, 0(rp) + CARRY( rr, ps, cy) C carry out #3 + lda vp, 32(vp) C bookkeeping + addq cy, cy0, cy0 C final carry out +L(lo0): addq v1, v1, r7 + ldq v0, -40(vp) + addq r7, r2, sl + ldq u0, 24(up) + ADDSUB u1, sl, ps + cmplt v1, r31, r3 + ADDSUB ps, cy0, rr + CARRY( ps, u1, cy1) + stq rr, 8(rp) + CARRY( rr, ps, cy) + lda rp, 32(rp) C bookkeeping + addq cy, cy1, cy1 +L(lo3): addq v0, v0, r6 + ldq v1, -32(vp) + addq r6, r3, sl + ldq u1, 32(up) + ADDSUB u0, sl, ps + cmplt v0, r31, r2 + ADDSUB ps, cy1, rr + CARRY( ps, u0, cy0) + stq rr, -16(rp) + CARRY( rr, ps, cy) + lda up, 32(up) C bookkeeping + addq cy, cy0, cy0 +L(lo2): addq v1, v1, r7 + ldq v0, -24(vp) + addq r7, r2, sl + ldq u0, 8(up) + ADDSUB u1, sl, ps + cmplt v1, r31, r3 + ADDSUB ps, cy0, rr + CARRY( ps, u1, cy1) + stq rr, -8(rp) + CARRY( rr, ps, cy) + lda n, -4(n) C bookkeeping + addq cy, cy1, cy1 + bgt n, L(top) + +L(end): addq v0, v0, r6 + addq r6, r3, sl + ADDSUB u0, sl, ps + cmplt v0, r31, r2 + ADDSUB ps, cy1, rr + CARRY( ps, u0, cy0) + stq rr, 0(rp) + CARRY( rr, ps, cy) + addq cy, cy0, cy0 + addq cy0, r2, r0 + + ret r31,(r26),1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/aorsmul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/aorsmul_1.asm new file mode 100644 index 0000000..0e68e6e --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/aorsmul_1.asm @@ -0,0 +1,398 @@ +dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 3.5 + +C INPUT PARAMETERS +define(`rp', `r16') +define(`up', `r17') +define(`n', `r18') +define(`v0', `r19') + +dnl This code was written in cooperation with ev6 pipeline expert Steve Root. + +dnl The stores can issue a cycle late so we have paired no-op's to 'catch' +dnl them, so that further disturbance to the schedule is damped. + +dnl We couldn't pair the loads, because the entangled schedule of the carries +dnl has to happen on one side {0} of the machine. + +dnl This is a great schedule for the d_cache, a poor schedule for the b_cache. +dnl The lockup on U0 means that any stall can't be recovered from. Consider a +dnl ldq in L1, say that load gets stalled because it collides with a fill from +dnl the b_cache. On the next cycle, this load gets priority. It first looks +dnl at L0, and goes there. The instruction we intended for L0 gets to look at +dnl L1, which is NOT where we want it. 
It either stalls 1, because it can't +dnl go in L0, or goes there, and causes a further instruction to stall. + +dnl So for b_cache, we're likely going to want to put one or more cycles back +dnl into the code! And, of course, put in lds prefetch for the rp[] operand. +dnl At a place where we have an mt followed by a bookkeeping, put the +dnl bookkeeping in upper, and the prefetch into lower. + +dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd +dnl like not to have an ldq or an stq to precede a conditional branch in a +dnl quadpack. The conditional branch moves the retire pointer one cycle +dnl later. + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `addq') + define(`CMPCY', `cmpult $2,$1') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `subq') + define(`CMPCY', `cmpult $1,$2') + define(`func', `mpn_submul_1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + ldq r3, 0(up) C + and r18, 7, r20 C + lda r18, -9(r18) C + cmpeq r20, 1, r21 C + beq r21, $L1 C + +$1mod8: ldq r5, 0(rp) C + mulq v0, r3, r7 C + umulh v0, r3, r8 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r8, r20, r0 C + stq r23, 0(rp) C + bge r18, $ent1 C + ret r31, (r26), 1 C + +$L1: lda r8, 0(r31) C zero carry reg + lda r24, 0(r31) C zero carry reg + cmpeq r20, 2, r21 C + bne r21, $2mod8 C + cmpeq r20, 3, r21 C + bne r21, $3mod8 C + cmpeq r20, 4, r21 C + bne r21, $4mod8 C + cmpeq r20, 5, r21 C + bne r21, $5mod8 C + cmpeq r20, 6, r21 C + bne r21, $6mod8 C + cmpeq r20, 7, r21 C + beq r21, $0mod8 C + +$7mod8: ldq r5, 0(rp) C + lda up, 8(up) C + mulq v0, r3, r7 C + umulh v0, r3, r24 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r24, r20, r24 C + stq r23, 0(rp) C + lda rp, 8(rp) C + ldq r3, 0(up) C +$6mod8: ldq r1, 8(up) C + mulq v0, r3, r25 C + umulh v0, r3, r3 C + mulq v0, r1, r28 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r8 C + ldq r1, 24(up) C + lda up, 48(up) C L1 
bookkeeping + mulq v0, r0, r2 C + ldq r5, 8(rp) C + lda rp, -32(rp) C L1 bookkeeping + umulh v0, r0, r6 C + ADDSUB r4, r25, r25 C lo + acc + mulq v0, r1, r7 C + br r31, $ent6 C + +$ent1: lda up, 8(up) C + lda rp, 8(rp) C + lda r8, 0(r0) C + ldq r3, 0(up) C +$0mod8: ldq r1, 8(up) C + mulq v0, r3, r2 C + umulh v0, r3, r6 C + mulq v0, r1, r7 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r24 C + ldq r1, 24(up) C + mulq v0, r0, r25 C + ldq r5, 8(rp) C + umulh v0, r0, r3 C + ADDSUB r4, r2, r2 C lo + acc + mulq v0, r1, r28 C + lda rp, -16(rp) C + br r31, $ent0 C + +$3mod8: ldq r5, 0(rp) C + lda up, 8(up) C + mulq v0, r3, r7 C + umulh v0, r3, r8 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r8, r20, r24 C + stq r23, 0(rp) C + lda rp, 8(rp) C + ldq r3, 0(up) C +$2mod8: ldq r1, 8(up) C + mulq v0, r3, r25 C + umulh v0, r3, r3 C + mulq v0, r1, r28 C + ble r18, $n23 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r8 C + ldq r1, 24(up) C + lda up, 16(up) C L1 bookkeeping + mulq v0, r0, r2 C + ldq r5, 8(rp) C + lda rp, 0(rp) C L1 bookkeeping + umulh v0, r0, r6 C + ADDSUB r4, r25, r25 C lo + acc + mulq v0, r1, r7 C + br r31, $ent2 C + +$5mod8: ldq r5, 0(rp) C + lda up, 8(up) C + mulq v0, r3, r7 C + umulh v0, r3, r24 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r24, r20, r8 C + stq r23, 0(rp) C + lda rp, 8(rp) C + ldq r3, 0(up) C +$4mod8: ldq r1, 8(up) C + mulq v0, r3, r2 C + umulh v0, r3, r6 C + mulq v0, r1, r7 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r24 C + ldq r1, 24(up) C + lda up, 32(up) C L1 bookkeeping + mulq v0, r0, r25 C + ldq r5, 8(rp) C + lda rp, 16(rp) C L1 bookkeeping + umulh v0, r0, r3 C + ADDSUB r4, r2, r2 C lo + acc + mulq v0, r1, r28 C + CMPCY( r4, r2), r20 C L0 lo add => carry + ADDSUB r2, r8, r22 C U0 hi add => answer + ble r18, $Lend C + ALIGN(16) +$Loop: + bis r31, r31, r31 C U1 mt + CMPCY( r2, r22), r21 C L0 hi add => carry + addq r6, r20, r6 C U0 hi mul + carry + ldq r0, 0(up) C + + bis r31, r31, r31 C U1 
mt + ADDSUB r5, r7, r7 C L0 lo + acc + addq r6, r21, r6 C U0 hi mul + carry + ldq r4, 0(rp) C L1 + + umulh v0, r1, r8 C U1 + CMPCY( r5, r7), r20 C L0 lo add => carry + ADDSUB r7, r6, r23 C U0 hi add => answer + ldq r1, 8(up) C L1 + + mulq v0, r0, r2 C U1 + CMPCY( r7, r23), r21 C L0 hi add => carry + addq r24, r20, r24 C U0 hi mul + carry + ldq r5, 8(rp) C L1 + + umulh v0, r0, r6 C U1 + ADDSUB r4, r25, r25 C U0 lo + acc + stq r22, -16(rp) C L0 + stq r23, -8(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r7 C U1 + bis r31, r31, r31 C L1 st slosh + addq r24, r21, r24 C U0 hi mul + carry +$ent2: + CMPCY( r4, r25), r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda r18, -8(r18) C L1 bookkeeping + ADDSUB r25, r24, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + CMPCY( r25, r22), r21 C L0 hi add => carry + addq r3, r20, r3 C U0 hi mul + carry + ldq r0, 16(up) C L1 + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r28, r28 C L0 lo + acc + addq r3, r21, r3 C U0 hi mul + carry + ldq r4, 16(rp) C L1 + + umulh v0, r1, r24 C U1 + CMPCY( r5, r28), r20 C L0 lo add => carry + ADDSUB r28, r3, r23 C U0 hi add => answer + ldq r1, 24(up) C L1 + + mulq v0, r0, r25 C U1 + CMPCY( r28, r23), r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r5, 24(rp) C L1 + + umulh v0, r0, r3 C U1 + ADDSUB r4, r2, r2 C U0 lo + acc + stq r22, 0(rp) C L0 + stq r23, 8(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r28 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C U0 hi mul + carry +$ent0: + CMPCY( r4, r2), r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda up, 64(up) C L1 bookkeeping + ADDSUB r2, r8, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + CMPCY( r2, r22), r21 C L0 hi add => carry + addq r6, r20, r6 C U0 hi mul + carry + ldq r0, -32(up) C L1 + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r7, r7 C L0 lo + acc + addq r6, r21, r6 C U0 hi mul + carry + ldq r4, 32(rp) C L1 + + umulh v0, r1, r8 C U1 + CMPCY( r5, r7), r20 
C L0 lo add => carry + ADDSUB r7, r6, r23 C U0 hi add => answer + ldq r1, -24(up) C L1 + + mulq v0, r0, r2 C U1 + CMPCY( r7, r23), r21 C L0 hi add => carry + addq r24, r20, r24 C U0 hi mul + carry + ldq r5, 40(rp) C L1 + + umulh v0, r0, r6 C U1 + ADDSUB r4, r25, r25 C U0 lo + acc + stq r22, 16(rp) C L0 + stq r23, 24(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r7 C U1 + bis r31, r31, r31 C L1 st slosh + addq r24, r21, r24 C U0 hi mul + carry +$ent6: + CMPCY( r4, r25), r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda rp, 64(rp) C L1 bookkeeping + ADDSUB r25, r24, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + CMPCY( r25, r22), r21 C L0 hi add => carry + addq r3, r20, r3 C U0 hi mul + carry + ldq r0, -16(up) C L1 + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r28, r28 C L0 lo + acc + addq r3, r21, r3 C U0 hi mul + carry + ldq r4, -16(rp) C L1 + + umulh v0, r1, r24 C U1 + CMPCY( r5, r28), r20 C L0 lo add => carry + ADDSUB r28, r3, r23 C U0 hi add => answer + ldq r1, -8(up) C L1 + + mulq v0, r0, r25 C U1 + CMPCY( r28, r23), r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r5, -8(rp) C L1 + + umulh v0, r0, r3 C U1 + ADDSUB r4, r2, r2 C U0 lo + acc + stq r22, -32(rp) C L0 + stq r23, -24(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r28 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C U0 hi mul + carry + + CMPCY( r4, r2), r20 C L0 lo add => carry + ADDSUB r2, r8, r22 C U0 hi add => answer + ldl r31, 256(up) C prefetch up[] + bgt r18, $Loop C U1 bookkeeping + +$Lend: CMPCY( r2, r22), r21 C + addq r6, r20, r6 C + ADDSUB r5, r7, r7 C + addq r6, r21, r6 C + ldq r4, 0(rp) C + umulh v0, r1, r8 C + CMPCY( r5, r7), r20 C + ADDSUB r7, r6, r23 C + CMPCY(r7, r23), r21 C + addq r24, r20, r24 C + ldq r5, 8(rp) C + ADDSUB r4, r25, r25 C + stq r22, -16(rp) C + stq r23, -8(rp) C + addq r24, r21, r24 C + br L(x) + + ALIGN(16) +$n23: ldq r4, 0(rp) C + ldq r5, 8(rp) C + umulh v0, r1, r8 C + ADDSUB r4, r25, r25 C 
+L(x): CMPCY( r4, r25), r20 C + ADDSUB r25, r24, r22 C + CMPCY( r25, r22), r21 C + addq r3, r20, r3 C + ADDSUB r5, r28, r28 C + addq r3, r21, r3 C + CMPCY( r5, r28), r20 C + ADDSUB r28, r3, r23 C + CMPCY( r28, r23), r21 C + addq r8, r20, r8 C + stq r22, 0(rp) C + stq r23, 8(rp) C + addq r8, r21, r0 C + ret r31, (r26), 1 C +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/gmp-mparam.h b/gmp-6.3.0/mpn/alpha/ev6/gmp-mparam.h new file mode 100644 index 0000000..e51d6b0 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/gmp-mparam.h @@ -0,0 +1,209 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free +Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#define DIVEXACT_BY3_METHOD 0 /* override ../diveby3.asm */ + +/* 500 MHz 21164 (agnesi.math.su.se) */ +/* FFT tuning limit = 20000000 */ +/* Generated by tuneup.c, 2014-03-14, gcc 3.3 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define DIV_QR_1N_PI1_METHOD 2 +#define DIV_QR_1_NORM_THRESHOLD 5 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD 8 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define MUL_TOOM22_THRESHOLD 32 +#define MUL_TOOM33_THRESHOLD 117 +#define MUL_TOOM44_THRESHOLD 124 +#define MUL_TOOM6H_THRESHOLD 230 +#define MUL_TOOM8H_THRESHOLD 357 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 59 +#define SQR_TOOM3_THRESHOLD 123 +#define SQR_TOOM4_THRESHOLD 163 +#define SQR_TOOM6_THRESHOLD 333 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 52 + +#define MULMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 5 + +#define MUL_FFT_MODF_THRESHOLD 468 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 468, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 19, 7}, { 10, 6}, \ + { 24, 7}, { 13, 6}, { 27, 7}, { 14, 6}, \ + { 29, 7}, { 17, 6}, { 35, 7}, { 29, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 29, 
9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 51, 9}, { 27, 8}, { 55, 9}, { 35, 8}, \ + { 71, 9}, { 39,10}, { 23, 9}, { 55,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ + { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \ + { 79,11}, { 47,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ + { 95,10}, { 199,11}, { 111,12}, { 63,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319,12}, { 95,11}, { 191,10}, { 383,11}, \ + { 207,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 335,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,14}, { 127,13}, \ + { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \ + { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 151 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 412 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 27, 7}, { 14, 6}, { 29, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 36, 8}, \ + { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 
39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \ + { 383,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \ + { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 703,11}, { 1407,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1151,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1791,15}, \ + { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 159 +#define SQR_FFT_THRESHOLD 5056 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 100 +#define MULLO_MUL_N_THRESHOLD 11355 + +#define DC_DIV_QR_THRESHOLD 124 +#define DC_DIVAPPR_Q_THRESHOLD 438 +#define DC_BDIV_QR_THRESHOLD 153 +#define DC_BDIV_Q_THRESHOLD 318 + +#define INV_MULMOD_BNM1_THRESHOLD 62 +#define INV_NEWTON_THRESHOLD 384 
+#define INV_APPR_THRESHOLD 402 + +#define BINV_NEWTON_THRESHOLD 381 +#define REDC_1_TO_REDC_N_THRESHOLD 110 + +#define MU_DIV_QR_THRESHOLD 1752 +#define MU_DIVAPPR_Q_THRESHOLD 1895 +#define MUPI_DIV_QR_THRESHOLD 174 +#define MU_BDIV_QR_THRESHOLD 1387 +#define MU_BDIV_Q_THRESHOLD 1787 + +#define POWM_SEC_TABLE 1,13,66,82,579 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 318 +#define HGCD_APPR_THRESHOLD 363 +#define HGCD_REDUCE_THRESHOLD 2384 +#define GCD_DC_THRESHOLD 2504 +#define GCDEXT_DC_THRESHOLD 671 +#define JACOBI_BASE_METHOD 3 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_DC_THRESHOLD 3754 +#define SET_STR_PRECOMPUTE_THRESHOLD 8097 + +#define FAC_DSC_THRESHOLD 951 +#define FAC_ODD_THRESHOLD 24 diff --git a/gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm b/gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm new file mode 100644 index 0000000..82c42ae --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm @@ -0,0 +1,336 @@ +dnl Alpha mpn_mod_1s_4p + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Optimise. 2.75 c/l should be possible. +C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated. +C * Optimise feed-in code, starting the sw pipeline in switch code. +C * Shorten software pipeline. The mul instructions are scheduled too far +C from their users. Fixing this will allow us to use fewer registers. +C * If we cannot reduce register usage, write perhaps small-n basecase. +C * Does this work for PIC? + +C cycles/limb +C EV4: ? +C EV5: 23 +C EV6: 3 + +define(`ap', `r16') +define(`n', `r17') +define(`pl', `r24') +define(`ph', `r25') +define(`rl', `r6') +define(`rh', `r7') +define(`B1modb', `r1') +define(`B2modb', `r2') +define(`B3modb', `r3') +define(`B4modb', `r4') +define(`B5modb', `r5') + +ASM_START() +PROLOGUE(mpn_mod_1s_4p) + lda r30, -64(r30) + stq r9, 8(r30) + ldq B1modb, 16(r19) + stq r10, 16(r30) + ldq B2modb, 24(r19) + stq r11, 24(r30) + ldq B3modb, 32(r19) + stq r12, 32(r30) + ldq B4modb, 40(r19) + stq r13, 40(r30) + ldq B5modb, 48(r19) + s8addq n, ap, ap C point ap at vector end + + and n, 3, r0 + lda n, -4(n) + beq r0, L(b0) + lda r6, -2(r0) + blt r6, L(b1) + beq r6, L(b2) + +L(b3): ldq r21, -16(ap) + ldq r22, -8(ap) + ldq r20, -24(ap) + mulq r21, B1modb, r8 + umulh r21, B1modb, r12 + mulq r22, B2modb, r9 + umulh r22, B2modb, r13 + addq r8, r20, pl + cmpult pl, r8, r0 + addq r0, r12, ph + addq r9, pl, rl + cmpult rl, r9, r0 + addq r13, ph, ph + addq r0, ph, rh + lda ap, -56(ap) + br L(com) + +L(b0): ldq r21, -24(ap) + ldq r22, -16(ap) + ldq r23, -8(ap) + ldq r20, -32(ap) + mulq r21, B1modb, r8 + umulh r21, B1modb, r12 + mulq r22, B2modb, r9 + umulh r22, B2modb, r13 + mulq r23, B3modb, r10 + umulh r23, B3modb, r27 + addq r8, r20, pl + cmpult pl, r8, r0 + addq r0, r12, ph + addq r9, pl, 
pl + cmpult pl, r9, r0 + addq r13, ph, ph + addq r0, ph, ph + addq r10, pl, rl + cmpult rl, r10, r0 + addq r27, ph, ph + addq r0, ph, rh + lda ap, -64(ap) + br L(com) + +L(b1): bis r31, r31, rh + ldq rl, -8(ap) + lda ap, -40(ap) + br L(com) + +L(b2): ldq rh, -8(ap) + ldq rl, -16(ap) + lda ap, -48(ap) + +L(com): ble n, L(ed3) + ldq r21, 8(ap) + ldq r22, 16(ap) + ldq r23, 24(ap) + ldq r20, 0(ap) + lda n, -4(n) + lda ap, -32(ap) + mulq r21, B1modb, r8 + umulh r21, B1modb, r12 + mulq r22, B2modb, r9 + umulh r22, B2modb, r13 + mulq r23, B3modb, r10 + umulh r23, B3modb, r27 + mulq rl, B4modb, r11 + umulh rl, B4modb, r28 + ble n, L(ed2) + + ALIGN(16) +L(top): ldq r21, 8(ap) + mulq rh, B5modb, rl + addq r8, r20, pl + ldq r22, 16(ap) + cmpult pl, r8, r0 + umulh rh, B5modb, rh + ldq r23, 24(ap) + addq r0, r12, ph + addq r9, pl, pl + mulq r21, B1modb, r8 + cmpult pl, r9, r0 + addq r13, ph, ph + umulh r21, B1modb, r12 + lda ap, -32(ap) + addq r0, ph, ph + addq r10, pl, pl + mulq r22, B2modb, r9 + cmpult pl, r10, r0 + addq r27, ph, ph + addq r11, pl, pl + umulh r22, B2modb, r13 + addq r0, ph, ph + cmpult pl, r11, r0 + addq r28, ph, ph + mulq r23, B3modb, r10 + ldq r20, 32(ap) + addq pl, rl, rl + umulh r23, B3modb, r27 + addq r0, ph, ph + cmpult rl, pl, r0 + mulq rl, B4modb, r11 + addq ph, rh, rh + umulh rl, B4modb, r28 + addq r0, rh, rh + lda n, -4(n) + bgt n, L(top) + +L(ed2): mulq rh, B5modb, rl + addq r8, r20, pl + umulh rh, B5modb, rh + cmpult pl, r8, r0 + addq r0, r12, ph + addq r9, pl, pl + cmpult pl, r9, r0 + addq r13, ph, ph + addq r0, ph, ph + addq r10, pl, pl + cmpult pl, r10, r0 + addq r27, ph, ph + addq r11, pl, pl + addq r0, ph, ph + cmpult pl, r11, r0 + addq r28, ph, ph + addq pl, rl, rl + addq r0, ph, ph + cmpult rl, pl, r0 + addq ph, rh, rh + addq r0, rh, rh + +L(ed3): mulq rh, B1modb, r8 + umulh rh, B1modb, rh + addq r8, rl, rl + cmpult rl, r8, r0 + addq r0, rh, rh + + ldq r24, 8(r19) C cnt + sll rh, r24, rh + subq r31, r24, r25 + srl rl, r25, r2 + sll rl, r24, 
rl + or r2, rh, rh + + ldq r23, 0(r19) C bi + mulq rh, r23, r8 + umulh rh, r23, r9 + addq rh, 1, r7 + addq r8, rl, r8 C ql + cmpult r8, rl, r0 + addq r9, r7, r9 + addq r0, r9, r9 C qh + mulq r9, r18, r21 C qh * b + subq rl, r21, rl + cmpult r8, rl, r0 C rl > ql + negq r0, r0 + and r0, r18, r0 + addq rl, r0, rl + cmpule r18, rl, r0 C rl >= b + negq r0, r0 + and r0, r18, r0 + subq rl, r0, rl + + srl rl, r24, r0 + + ldq r9, 8(r30) + ldq r10, 16(r30) + ldq r11, 24(r30) + ldq r12, 32(r30) + ldq r13, 40(r30) + lda r30, 64(r30) + ret r31, (r26), 1 +EPILOGUE() + +PROLOGUE(mpn_mod_1s_4p_cps,gp) + lda r30, -32(r30) + stq r26, 0(r30) + stq r9, 8(r30) + stq r10, 16(r30) + stq r11, 24(r30) + mov r16, r11 + LEA( r4, __clz_tab) + lda r10, 65(r31) + cmpbge r31, r17, r1 + srl r1, 1, r1 + xor r1, 127, r1 + addq r1, r4, r1 + ldq_u r2, 0(r1) + extbl r2, r1, r2 + s8subq r2, 7, r2 + srl r17, r2, r3 + subq r10, r2, r10 + addq r3, r4, r3 + ldq_u r1, 0(r3) + extbl r1, r3, r1 + subq r10, r1, r10 + sll r17, r10, r9 + mov r9, r16 + jsr r26, mpn_invert_limb + LDGP( r29, 0(r26)) + subq r31, r10, r2 + lda r1, 1(r31) + sll r1, r10, r1 + subq r31, r9, r3 + srl r0, r2, r2 + ldq r26, 0(r30) + bis r2, r1, r2 + stq r0, 0(r11) + stq r10, 8(r11) + mulq r2, r3, r2 + srl r2, r10, r3 + umulh r2, r0, r1 + stq r3, 16(r11) + mulq r2, r0, r3 + ornot r31, r1, r1 + subq r1, r2, r1 + mulq r1, r9, r1 + addq r1, r9, r2 + cmpule r1, r3, r3 + cmoveq r3, r2, r1 + srl r1, r10, r3 + umulh r1, r0, r2 + stq r3, 24(r11) + mulq r1, r0, r3 + ornot r31, r2, r2 + subq r2, r1, r2 + mulq r2, r9, r2 + addq r2, r9, r1 + cmpule r2, r3, r3 + cmoveq r3, r1, r2 + srl r2, r10, r1 + umulh r2, r0, r3 + stq r1, 32(r11) + mulq r2, r0, r1 + ornot r31, r3, r3 + subq r3, r2, r3 + mulq r3, r9, r3 + addq r3, r9, r2 + cmpule r3, r1, r1 + cmoveq r1, r2, r3 + srl r3, r10, r2 + umulh r3, r0, r1 + stq r2, 40(r11) + mulq r3, r0, r0 + ornot r31, r1, r1 + subq r1, r3, r1 + mulq r1, r9, r1 + addq r1, r9, r9 + cmpule r1, r0, r0 + cmoveq r0, r9, r1 + ldq 
r9, 8(r30) + srl r1, r10, r1 + ldq r10, 16(r30) + stq r1, 48(r11) + ldq r11, 24(r30) + lda r30, 32(r30) + ret r31, (r26), 1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/alpha/ev6/mul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/mul_1.asm new file mode 100644 index 0000000..8ee19cd --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/mul_1.asm @@ -0,0 +1,496 @@ +dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the +dnl result in a second limb vector. + +dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r16 +C s1_ptr r17 +C size r18 +C s2_limb r19 + +C This code runs at 2.25 cycles/limb on EV6. + +C This code was written in close cooperation with ev6 pipeline expert +C Steve Root. Any errors are tege's fault, though. 
+ +C Code structure: + +C code for n < 8 +C code for n >= 8 code for (n mod 8) +C code for (n div 8) feed-in code +C 8-way unrolled loop +C wind-down code + +C Some notes about unrolled loop: +C +C r1-r8 multiplies and workup +C r21-r28 multiplies and workup +C r9-r12 loads +C r0 -1 +C r20,r29,r13-r15 scramble +C +C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a +C put-the-carry-into-hi. The idea is that these branches are very rarely +C taken, and since a non-taken branch consumes no resources, that is better +C than an addq. +C +C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an +C add NEXT cycle #09 which feeds a store in NEXT cycle #02 + +C The code could use some further work: +C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is +C faster than this for size < 3. +C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless +C that is too costly. +C 3. Consider using 4-way unrolling, even if that runs slower. +C 4. Reduce register usage. In particular, try to avoid using r29.
+ +ASM_START() +PROLOGUE(mpn_mul_1) + cmpult r18, 8, r1 + beq r1, $Large +$Lsmall: + ldq r2,0(r17) C r2 = s1_limb + lda r18,-1(r18) C size-- + mulq r2,r19,r3 C r3 = prod_low + bic r31,r31,r4 C clear cy_limb + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Le1a C jump if size was == 1 + ldq r2,8(r17) C r2 = s1_limb + lda r18,-1(r18) C size-- + stq r3,0(r16) + beq r18,$Le2a C jump if size was == 2 + ALIGN(8) +$Lopa: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + lda r18,-1(r18) C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,16(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,8(r16) + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + lda r16,8(r16) C res_ptr++ + bne r18,$Lopa + +$Le2a: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,8(r16) + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Le1a: stq r3,0(r16) + ret r31,(r26),1 + +$Large: + lda r30, -224(r30) + stq r26, 0(r30) + stq r9, 8(r30) + stq r10, 16(r30) + stq r11, 24(r30) + stq r12, 32(r30) + stq r13, 40(r30) + stq r14, 48(r30) + stq r15, 56(r30) + stq r29, 64(r30) + + and r18, 7, r20 C count for the first loop, 0-7 + srl r18, 3, r18 C count for unrolled loop + bis r31, r31, r21 + beq r20, $L_8_or_more C skip first loop + +$L_9_or_more: + ldq r2,0(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + lda r20,-1(r20) C size-- + mulq r2,r19,r3 C r3 = prod_low + umulh r2,r19,r21 C r21 = prod_high + beq r20,$Le1b C jump if size was == 1 + bis r31, r31, r0 C FIXME: shouldn't need this + ldq r2,0(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + lda r20,-1(r20) C size-- + stq r3,0(r16) + lda r16,8(r16) C res_ptr++ + beq r20,$Le2b C jump if size was == 2 + ALIGN(8) +$Lopb: mulq r2,r19,r3 C r3 = prod_low + addq r21,r0,r0 C cy_limb = cy_limb + 'cy' + lda 
r20,-1(r20) C size-- + umulh r2,r19,r21 C r21 = prod_high + ldq r2,0(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,0(r16) + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + lda r16,8(r16) C res_ptr++ + bne r20,$Lopb + +$Le2b: mulq r2,r19,r3 C r3 = prod_low + addq r21,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r21 C r21 = prod_high + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,0(r16) + lda r16,8(r16) C res_ptr++ + addq r21,r0,r21 C cy_limb = prod_high + cy + br r31, $L_8_or_more +$Le1b: stq r3,0(r16) + lda r16,8(r16) C res_ptr++ + +$L_8_or_more: + lda r0, -1(r31) C put -1 in r0, for tricky loop control + lda r17, -32(r17) C L1 bookkeeping + lda r18, -1(r18) C decrement count + + ldq r9, 32(r17) C L1 + ldq r10, 40(r17) C L1 + mulq r9, r19, r22 C U1 #07 + ldq r11, 48(r17) C L1 + umulh r9, r19, r23 C U1 #08 + ldq r12, 56(r17) C L1 + mulq r10, r19, r24 C U1 #09 + ldq r9, 64(r17) C L1 + + lda r17, 64(r17) C L1 bookkeeping + + umulh r10, r19, r25 C U1 #11 + mulq r11, r19, r26 C U1 #12 + umulh r11, r19, r27 C U1 #13 + mulq r12, r19, r28 C U1 #14 + ldq r10, 8(r17) C L1 + umulh r12, r19, r1 C U1 #15 + ldq r11, 16(r17) C L1 + mulq r9, r19, r2 C U1 #16 + ldq r12, 24(r17) C L1 + umulh r9, r19, r3 C U1 #17 + addq r21, r22, r13 C L1 mov + mulq r10, r19, r4 C U1 #18 + addq r23, r24, r22 C L0 sum 2 mul's + cmpult r13, r21, r14 C L1 carry from sum + bgt r18, $L_16_or_more + + cmpult r22, r24, r24 C U0 carry from sum + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + umulh r11, r19, r7 C U1 #04 + addq r27, r28, r28 C U0 sum 2 mul's + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 32(r16) C L1 bookkeeping + addq r13, r31, r13 C U0 start carry cascade + umulh r12, r19, r21 C U1 #06 + br r31, $ret0c + +$L_16_or_more: +C 
--------------------------------------------------------------- + subq r18,1,r18 + cmpult r22, r24, r24 C U0 carry from sum + ldq r9, 32(r17) C L1 + + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + umulh r11, r19, r7 C U1 #04 + addq r27, r28, r28 C U0 sum 2 mul's + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 32(r16) C L1 bookkeeping + addq r13, r31, r13 C U0 start carry cascade + + umulh r12, r19, r21 C U1 #06 +C beq r13, $fix0w C U0 +$ret0w: addq r22, r14, r26 C L0 + ldq r10, 40(r17) C L1 + + mulq r9, r19, r22 C U1 #07 + beq r26, $fix1w C U0 +$ret1w: addq r23, r24, r27 C L0 + ldq r11, 48(r17) C L1 + + umulh r9, r19, r23 C U1 #08 + beq r27, $fix2w C U0 +$ret2w: addq r28, r25, r28 C L0 + ldq r12, 56(r17) C L1 + + mulq r10, r19, r24 C U1 #09 + beq r28, $fix3w C U0 +$ret3w: addq r1, r2, r20 C L0 sum 2 mul's + ldq r9, 64(r17) C L1 + + addq r3, r4, r2 C L0 #10 2 mul's + lda r17, 64(r17) C L1 bookkeeping + cmpult r20, r1, r29 C U0 carry from sum + + umulh r10, r19, r25 C U1 #11 + cmpult r2, r4, r4 C U0 carry from sum + stq r13, -32(r16) C L0 + stq r26, -24(r16) C L1 + + mulq r11, r19, r26 C U1 #12 + addq r5, r6, r14 C U0 sum 2 mul's + stq r27, -16(r16) C L0 + stq r28, -8(r16) C L1 + + umulh r11, r19, r27 C U1 #13 + cmpult r14, r6, r3 C U0 carry from sum +C could do cross-jumping here: +C bra $L_middle_of_unrolled_loop + mulq r12, r19, r28 C U1 #14 + addq r7, r3, r5 C L0 eat carry + addq r20, r15, r20 C U0 carry cascade + ldq r10, 8(r17) C L1 + + umulh r12, r19, r1 C U1 #15 + beq r20, $fix4 C U0 +$ret4w: addq r2, r29, r6 C L0 + ldq r11, 16(r17) C L1 + + mulq r9, r19, r2 C U1 #16 + beq r6, $fix5 C U0 +$ret5w: addq r14, r4, r7 C L0 + ldq r12, 24(r17) C L1 + + umulh r9, r19, r3 C U1 #17 + beq r7, $fix6 C U0 +$ret6w: addq r5, r8, r8 C L0 sum 2 + addq r21, r22, r13 C L1 sum 2 mul's + + mulq r10, r19, r4 C U1 #18 + addq r23, r24, r22 C L0 sum 2 mul's 
+ cmpult r13, r21, r14 C L1 carry from sum + ble r18, $Lend C U0 +C --------------------------------------------------------------- + ALIGN(16) +$Loop: + umulh r0, r18, r18 C U1 #01 decrement r18! + cmpult r8, r5, r29 C L0 carry from last bunch + cmpult r22, r24, r24 C U0 carry from sum + ldq r9, 32(r17) C L1 + + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + stq r20, 0(r16) C L0 + stq r6, 8(r16) C L1 + + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + stq r7, 16(r16) C L0 + stq r8, 24(r16) C L1 + + umulh r11, r19, r7 C U1 #04 + bis r31, r31, r31 C L0 st slosh + bis r31, r31, r31 C L1 st slosh + addq r27, r28, r28 C U0 sum 2 mul's + + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 64(r16) C L1 bookkeeping + addq r13, r29, r13 C U0 start carry cascade + + umulh r12, r19, r21 C U1 #06 + beq r13, $fix0 C U0 +$ret0: addq r22, r14, r26 C L0 + ldq r10, 40(r17) C L1 + + mulq r9, r19, r22 C U1 #07 + beq r26, $fix1 C U0 +$ret1: addq r23, r24, r27 C L0 + ldq r11, 48(r17) C L1 + + umulh r9, r19, r23 C U1 #08 + beq r27, $fix2 C U0 +$ret2: addq r28, r25, r28 C L0 + ldq r12, 56(r17) C L1 + + mulq r10, r19, r24 C U1 #09 + beq r28, $fix3 C U0 +$ret3: addq r1, r2, r20 C L0 sum 2 mul's + ldq r9, 64(r17) C L1 + + addq r3, r4, r2 C L0 #10 2 mul's + bis r31, r31, r31 C U1 mul hole + lda r17, 64(r17) C L1 bookkeeping + cmpult r20, r1, r29 C U0 carry from sum + + umulh r10, r19, r25 C U1 #11 + cmpult r2, r4, r4 C U0 carry from sum + stq r13, -32(r16) C L0 + stq r26, -24(r16) C L1 + + mulq r11, r19, r26 C U1 #12 + addq r5, r6, r14 C U0 sum 2 mul's + stq r27, -16(r16) C L0 + stq r28, -8(r16) C L1 + + umulh r11, r19, r27 C U1 #13 + bis r31, r31, r31 C L0 st slosh + bis r31, r31, r31 C L1 st slosh + cmpult r14, r6, r3 C U0 carry from sum +$L_middle_of_unrolled_loop: + mulq r12, r19, r28 C U1 #14 + addq r7, r3, r5 C L0 eat carry + addq r20, r15, r20 C U0 carry cascade + ldq r10, 8(r17) C L1 + + umulh r12, r19, r1 C 
U1 #15 + beq r20, $fix4 C U0 +$ret4: addq r2, r29, r6 C L0 + ldq r11, 16(r17) C L1 + + mulq r9, r19, r2 C U1 #16 + beq r6, $fix5 C U0 +$ret5: addq r14, r4, r7 C L0 + ldq r12, 24(r17) C L1 + + umulh r9, r19, r3 C U1 #17 + beq r7, $fix6 C U0 +$ret6: addq r5, r8, r8 C L0 sum 2 + addq r21, r22, r13 C L1 sum 2 mul's + + mulq r10, r19, r4 C U1 #18 + addq r23, r24, r22 C L0 sum 2 mul's + cmpult r13, r21, r14 C L1 carry from sum + bgt r18, $Loop C U0 +C --------------------------------------------------------------- +$Lend: + cmpult r8, r5, r29 C L0 carry from last bunch + cmpult r22, r24, r24 C U0 carry from sum + + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + stq r20, 0(r16) C L0 + stq r6, 8(r16) C L1 + + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + stq r7, 16(r16) C L0 + stq r8, 24(r16) C L1 + + umulh r11, r19, r7 C U1 #04 + addq r27, r28, r28 C U0 sum 2 mul's + + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 64(r16) C L1 bookkeeping + addq r13, r29, r13 C U0 start carry cascade + + umulh r12, r19, r21 C U1 #06 + beq r13, $fix0c C U0 +$ret0c: addq r22, r14, r26 C L0 + beq r26, $fix1c C U0 +$ret1c: addq r23, r24, r27 C L0 + beq r27, $fix2c C U0 +$ret2c: addq r28, r25, r28 C L0 + beq r28, $fix3c C U0 +$ret3c: addq r1, r2, r20 C L0 sum 2 mul's + addq r3, r4, r2 C L0 #10 2 mul's + lda r17, 64(r17) C L1 bookkeeping + cmpult r20, r1, r29 C U0 carry from sum + cmpult r2, r4, r4 C U0 carry from sum + stq r13, -32(r16) C L0 + stq r26, -24(r16) C L1 + addq r5, r6, r14 C U0 sum 2 mul's + stq r27, -16(r16) C L0 + stq r28, -8(r16) C L1 + cmpult r14, r6, r3 C U0 carry from sum + addq r7, r3, r5 C L0 eat carry + addq r20, r15, r20 C U0 carry cascade + beq r20, $fix4c C U0 +$ret4c: addq r2, r29, r6 C L0 + beq r6, $fix5c C U0 +$ret5c: addq r14, r4, r7 C L0 + beq r7, $fix6c C U0 +$ret6c: addq r5, r8, r8 C L0 sum 2 + cmpult r8, r5, r29 C L0 carry from last bunch + stq r20, 0(r16) C L0 + stq r6, 8(r16) C 
L1 + stq r7, 16(r16) C L0 + stq r8, 24(r16) C L1 + addq r29, r21, r0 + + ldq r26, 0(r30) + ldq r9, 8(r30) + ldq r10, 16(r30) + ldq r11, 24(r30) + ldq r12, 32(r30) + ldq r13, 40(r30) + ldq r14, 48(r30) + ldq r15, 56(r30) + ldq r29, 64(r30) + lda r30, 224(r30) + ret r31, (r26), 1 + +C $fix0w: bis r14, r29, r14 C join carries +C br r31, $ret0w +$fix1w: bis r24, r14, r24 C join carries + br r31, $ret1w +$fix2w: bis r25, r24, r25 C join carries + br r31, $ret2w +$fix3w: bis r15, r25, r15 C join carries + br r31, $ret3w +$fix0: bis r14, r29, r14 C join carries + br r31, $ret0 +$fix1: bis r24, r14, r24 C join carries + br r31, $ret1 +$fix2: bis r25, r24, r25 C join carries + br r31, $ret2 +$fix3: bis r15, r25, r15 C join carries + br r31, $ret3 +$fix4: bis r29, r15, r29 C join carries + br r31, $ret4 +$fix5: bis r4, r29, r4 C join carries + br r31, $ret5 +$fix6: addq r5, r4, r5 C can't carry twice! + br r31, $ret6 +$fix0c: bis r14, r29, r14 C join carries + br r31, $ret0c +$fix1c: bis r24, r14, r24 C join carries + br r31, $ret1c +$fix2c: bis r25, r24, r25 C join carries + br r31, $ret2c +$fix3c: bis r15, r25, r15 C join carries + br r31, $ret3c +$fix4c: bis r29, r15, r29 C join carries + br r31, $ret4c +$fix5c: bis r4, r29, r4 C join carries + br r31, $ret5c +$fix6c: addq r5, r4, r5 C can't carry twice! + br r31, $ret6c + +EPILOGUE(mpn_mul_1) +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/README b/gmp-6.3.0/mpn/alpha/ev6/nails/README new file mode 100644 index 0000000..b214ac5 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/README @@ -0,0 +1,65 @@ +Copyright 2002, 2005 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + +This directory contains assembly code for nails-enabled 21264. The code is not +very well optimized. + +For addmul_N, as N grows larger, we could make multiple loads together, then do +about 3.3 i/c. 10 cycles after the last load, we can increase to 4 i/c. This +would surely allow addmul_4 to run at 2 c/l, but the same should be possible +also for addmul_3 and perhaps even addmul_2. + + + current fair best +Routine c/l unroll c/l unroll c/l i/c +mul_1 3.25 2.75 2.75 3.273 +addmul_1 4.0 4 3.5 4 14 3.25 3.385 +addmul_2 4.0 1 2.5 2 10 2.25 3.333 +addmul_3 3.0 1 2.33 2 14 2 3.333 +addmul_4 2.5 1 2.125 2 17 2 3.135 + +addmul_5 2 1 10 +addmul_6 2 1 12 +addmul_7 2 1 14 + +(The "best" column doesn't account for bookkeeping instructions and +thereby assumes infinite unrolling.) + +Basecase usages: + +1 addmul_1 +2 addmul_2 +3 addmul_3 +4 addmul_4 +5 addmul_3 + addmul_2 2.3998 +6 addmul_4 + addmul_2 +7 addmul_4 + addmul_3 diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm new file mode 100644 index 0000000..711d4e6 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm @@ -0,0 +1,396 @@ +dnl Alpha ev6 nails mpn_addmul_1. + +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 4 + +C TODO +C * Reroll loop for 3.75 c/l with current 4-way unrolling. +C * The loop is overscheduled wrt loads and wrt multiplies, in particular +C umulh. +C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 +C and would work since the loop structure is really regular. 
+ +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n', `r18') +define(`vl0',`r19') + +define(`numb_mask',`r6') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') +define(`m3a',`r22') +define(`m3b',`r23') + +define(`acc0',`r25') +define(`acc1',`r27') + +define(`ul0',`r4') +define(`ul1',`r5') +define(`ul2',`r4') +define(`ul3',`r5') + +define(`rl0',`r24') +define(`rl1',`r24') +define(`rl2',`r24') +define(`rl3',`r24') + +define(`t0',`r7') +define(`t1',`r8') + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +dnl This declaration is munged by configure +NAILS_SUPPORT(2-63) + +ASM_START() +PROLOGUE(mpn_addmul_1) + sll vl0, NAIL_BITS, vl0 + lda numb_mask, -1(r31) + srl numb_mask, NAIL_BITS, numb_mask + + and n, 3, r25 + cmpeq r25, 1, r21 + bne r21, L(1m4) + cmpeq r25, 2, r21 + bne r21, L(2m4) + beq r25, L(0m4) + +L(3m4): ldq ul3, 0(up) + lda n, -4(n) + ldq ul0, 8(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 16(up) + lda up, 24(up) + lda rp, -8(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge3) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + addq t0, r31, acc1 + addq rl3, acc1, acc1 + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + br r31, L(ta3) + +L(ge3): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, r31, acc1 + umulh vl0, ul2, m2b + addq rl3, acc1, acc1 + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + br r31, L(el3) + +L(0m4): lda n, -8(n) + ldq ul2, 0(up) + ldq ul3, 8(up) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 24(up) + lda up, 32(up) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + 
bge n, L(ge4) + + ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + addq rl2, acc0, acc0 + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(ta4) + +L(ge4): ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + ldq ul2, 0(up) + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + addq rl2, acc0, acc0 + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(el0) + +L(2m4): lda n, -4(n) + ldq ul0, 0(up) + ldq ul1, 8(up) + lda up, 16(up) + lda rp, -16(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge2) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + addq t0, r31, acc0 + addq rl0, acc0, acc0 + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(ta2) + +L(ge2): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, r31, acc0 + umulh vl0, ul3, m3b + addq rl0, acc0, acc0 + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + bge n, L(el2) + + br r31, L(ta6) + +L(1m4): lda n, -4(n) + ldq ul1, 0(up) + lda up, 8(up) + lda rp, -24(rp) + bge n, L(ge1) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + addq rl1, t0, acc1 + and acc1,numb_mask, r28 + srl acc1,NUMB_BITS, t1 + stq r28, 24(rp) + addq t1, m1b, r0 + ret r31, (r26), 1 + +L(ge1): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + ldq ul1, 
24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, r31, acc1 + umulh vl0, ul0, m0b + addq rl1, acc1, acc1 + ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + srl acc1,NUMB_BITS, t1 + blt n, L(ta5) + +L(ge5): ldq ul2, 0(up) + br r31, L(el1) + + ALIGN(16) +L(top): mulq vl0, ul0, m0a C U1 + addq t0, m0b, acc1 C L0 + srl acc0,NUMB_BITS, t1 C U0 + stq r28, -24(rp) C L1 +C +L(el2): umulh vl0, ul0, m0b C U1 + and acc0,numb_mask, r28 C L0 + addq rl1, acc1, acc1 C U0 + ldq rl2, 0(rp) C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m2a,NAIL_BITS, t0 C U0 + ldq ul2, 0(up) C L1 +C + mulq vl0, ul1, m1a C U1 + addq t0, m1b, acc0 C L0 + srl acc1,NUMB_BITS, t1 C U0 + stq r28, -16(rp) C L1 +C +L(el1): umulh vl0, ul1, m1b C U1 + and acc1,numb_mask, r28 C L0 + addq rl2, acc0, acc0 C U0 + ldq rl3, 8(rp) C L1 +C + lda n, -4(n) C L1 + addq t1, acc0, acc0 C L0 + srl m3a,NAIL_BITS, t0 C U0 + ldq ul3, 8(up) C L1 +C + mulq vl0, ul2, m2a C U1 + addq t0, m2b, acc1 C L0 + srl acc0,NUMB_BITS, t1 C U0 + stq r28, -8(rp) C L1 +C +L(el0): umulh vl0, ul2, m2b C U1 + and acc0,numb_mask, r28 C L0 + addq rl3, acc1, acc1 C U0 + ldq rl0, 16(rp) C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m0a,NAIL_BITS, t0 C U0 + ldq ul0, 16(up) C L1 +C + mulq vl0, ul3, m3a C U1 + addq t0, m3b, acc0 C L0 + srl acc1,NUMB_BITS, t1 C U0 + stq r28, 0(rp) C L1 +C +L(el3): umulh vl0, ul3, m3b C U1 + and acc1,numb_mask, r28 C L0 + addq rl0, acc0, acc0 C U0 + ldq rl1, 24(rp) C L1 +C + unop C U1 + addq t1, acc0, acc0 C L0 + srl m1a,NAIL_BITS, t0 C U0 + ldq ul1, 24(up) C L1 +C + lda up, 32(up) C L0 + unop C U1 + lda rp, 32(rp) C L1 + bge n, L(top) C U0 + +L(end): mulq vl0, ul0, m0a + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, -24(rp) +L(ta6): umulh vl0, ul0, m0b + and acc0,numb_mask, r28 + addq rl1, acc1, acc1 + ldq rl2, 0(rp) + addq t1, acc1, acc1 + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + srl acc1,NUMB_BITS, t1 + stq r28, 
-16(rp) +L(ta5): umulh vl0, ul1, m1b + and acc1,numb_mask, r28 + addq rl2, acc0, acc0 + ldq rl3, 8(rp) + addq t1, acc0, acc0 + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, -8(rp) + unop + ALIGN(16) +L(ta4): and acc0,numb_mask, r28 + addq rl3, acc1, acc1 + ldq rl0, 16(rp) + addq t1, acc1, acc1 + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + stq r28, 0(rp) + unop + ALIGN(16) +L(ta3): and acc1,numb_mask, r28 + addq rl0, acc0, acc0 + ldq rl1, 24(rp) + addq t1, acc0, acc0 + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, 8(rp) + unop + ALIGN(16) +L(ta2): and acc0,numb_mask, r28 + addq rl1, acc1, acc1 + addq t1, acc1, acc1 + srl acc1,NUMB_BITS, t1 + stq r28, 16(rp) + and acc1,numb_mask, r28 + addq t1, m1b, r0 + stq r28, 24(rp) + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm new file mode 100644 index 0000000..6ff6b3a --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm @@ -0,0 +1,146 @@ +dnl Alpha ev6 nails mpn_addmul_2. + +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 4.0 cycles/limb.
+
+C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l,
+C or 4-way unrolling over 20 cycles, for 2.5 c/l.
+
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+
+define(`v0',`r6')
+define(`v1',`r7')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
+C mpn_addmul_2 (rp, up, n, vp)
+C
+C Multiplies the n limbs at up by the two limbs vp[0], vp[1] and adds the
+C result into the limbs at rp.  As coded, n limbs are read from rp, n+1 limbs
+C are stored (rp[0] .. rp[n], each masked with numb_mask) and the remaining
+C high word is returned in r0.
+C
+C The first up limb is loaded and multiplied before n is tested, so the code
+C does not handle n = 0.
+C
+C v0 and v1 are pre-shifted left by NAIL_BITS.  With that, umulh yields the
+C product bits from NUMB_BITS upwards and mulq followed by srl NAIL_BITS yields
+C the low NUMB_BITS (this assumes NAIL_BITS + NUMB_BITS = 64 -- confirm
+C against gmp.h).
+C
+C The U0/U1/L0/L1 tags on instructions look like EV6 issue-slot annotations;
+C confirm against the slotting tool before reordering anything.
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+C numb_mask = all ones shifted right by NAIL_BITS
+	lda	numb_mask,-1(r31)
+	srl	numb_mask,NAIL_BITS,numb_mask
+
+	ldq	v0, 0(vp)
+	ldq	v1, 8(vp)
+
+	bis	r31, r31, acc0	C zero acc0
+	sll	v0,NAIL_BITS, v0
+	bis	r31, r31, acc1	C zero acc1
+	sll	v1,NAIL_BITS, v1
+C vp (r19) is dead from here on; r19 is reused as the carry into the next limb
+	bis	r31, r31, r19
+
+C Start both products for up[0] ahead of the loop
+	ldq	ulimb, 0(up)
+	lda	up, 8(up)
+	mulq	v0, ulimb, m0a	C U1
+	umulh	v0, ulimb, m0b	C U1
+	mulq	v1, ulimb, m1a	C U1
+	umulh	v1, ulimb, m1b	C U1
+	lda	n, -1(n)
+	beq	n, L(end)	C U0
+
+C Main loop: each pass completes one result limb from the products started on
+C the previous pass while starting the products for the next up limb.
+	ALIGN(16)
+L(top):	bis	r31, r31, r31	C U1 nop
+	addq	r19, acc0, acc0	C U0 propagate nail
+	ldq	rlimb, 0(rp)	C L0
+	ldq	ulimb, 0(up)	C L1
+
+	lda	rp, 8(rp)	C L1
+	srl	m0a,NAIL_BITS, r8	C U0
+	lda	up, 8(up)	C L0
+	mulq	v0, ulimb, m0a	C U1
+
+	addq	r8, acc0, r19	C U0
+	addq	m0b, acc1, acc0	C L1
+	umulh	v0, ulimb, m0b	C U1
+	bis	r31, r31, r31	C L0 nop
+
+	addq	rlimb, r19, r19	C L1 FINAL PROD-SUM
+	srl	m1a,NAIL_BITS, r8	C U0
+	lda	n, -1(n)	C L0
+	mulq	v1, ulimb, m1a	C U1
+
+	addq	r8, acc0, acc0	C U0
+	bis	r31, m1b, acc1	C L1
+	umulh	v1, ulimb, m1b	C U1
+	and	r19,numb_mask, r28	C L0 extract numb part
+
+	unop
+	srl	r19,NUMB_BITS, r19	C U1 extract nail part
+	stq	r28, -8(rp)	C L1
+	bne	n, L(top)	C U0
+
+C Wind-down: same arithmetic as one loop pass for the last up limb, without
+C starting any new products.
+L(end):	ldq	rlimb, 0(rp)
+	addq	r19, acc0, acc0	C propagate nail
+	lda	rp, 8(rp)
+	srl	m0a,NAIL_BITS, r8	C U0
+	addq	r8, acc0, r19
+	addq	m0b, acc1, acc0
+	addq	rlimb, r19, r19
+	srl	m1a,NAIL_BITS, r8	C U0
+	addq	r8, acc0, acc0
+	bis	r31, m1b, acc1
+	and	r19,numb_mask, r28	C extract limb
+
+	srl	r19,NUMB_BITS, r19	C extract nail
+	stq	r28, -8(rp)
+
+C Flush the accumulators: acc0 becomes the extra limb stored at 0(rp), and
+C its overflow plus acc1 is the return value.
+	addq	r19, acc0, acc0	C propagate nail
+	and	acc0,numb_mask, r28
+	stq	r28, 0(rp)
+	srl	acc0,NUMB_BITS, r19
+	addq	r19, acc1, r0
+
+	ret	r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm
new file mode 100644
index 0000000..a1ffb68
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm
@@ -0,0 +1,169 @@
+dnl Alpha ev6 nails mpn_addmul_3.
+
+dnl Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.
If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 3.0 cycles/limb.
+
+C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c).
+
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
+C mpn_addmul_3 (rp, up, n, vp)
+C
+C Multiplies the n limbs at up by the three limbs vp[0..2] and adds the result
+C into the limbs at rp.  As coded, n limbs are read from rp, n+2 limbs are
+C stored (rp[0] .. rp[n+1], each masked with numb_mask) and the remaining high
+C word is returned in r0 (m0a).
+C
+C The first up limb is loaded and multiplied before n is tested, so the code
+C does not handle n = 0.
+C
+C v0..v2 are pre-shifted left by NAIL_BITS.  With that, umulh yields the
+C product bits from NUMB_BITS upwards and mulq followed by srl NAIL_BITS yields
+C the low NUMB_BITS (this assumes NAIL_BITS + NUMB_BITS = 64 -- confirm
+C against gmp.h).
+C
+C The U0/U1/L0/L1 tags on instructions look like EV6 issue-slot annotations;
+C confirm against the slotting tool before reordering anything.
+
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+C numb_mask = all ones shifted right by NAIL_BITS
+	lda	numb_mask,-1(r31)
+	srl	numb_mask,NAIL_BITS,numb_mask
+
+	ldq	v0, 0(vp)
+	ldq	v1, 8(vp)
+	ldq	v2, 16(vp)
+
+	bis	r31, r31, acc0	C zero acc0
+	sll	v0,NAIL_BITS, v0
+	bis	r31, r31, acc1	C zero acc1
+	sll	v1,NAIL_BITS, v1
+	bis	r31, r31, acc2	C zero acc2
+	sll	v2,NAIL_BITS, v2
+C vp (r19) is dead from here on; r19 is reused as the carry into the next limb
+	bis	r31, r31, r19
+
+C Start all three products for up[0] ahead of the loop
+	ldq	ulimb, 0(up)
+	lda	up, 8(up)
+	mulq	v0, ulimb, m0a	C U1
+	umulh	v0, ulimb, m0b	C U1
+	mulq	v1, ulimb, m1a	C U1
+	umulh	v1, ulimb, m1b	C U1
+	lda	n, -1(n)
+	mulq	v2, ulimb, m2a	C U1
+	umulh	v2, ulimb, m2b	C U1
+	beq	n, L(end)	C U0
+
+C Main loop: each pass completes one result limb from the products started on
+C the previous pass while starting the products for the next up limb.
+	ALIGN(16)
+L(top):	ldq	rlimb, 0(rp)	C L1
+	ldq	ulimb, 0(up)	C L0
+	bis	r31, r31, r31	C U0 nop
+	addq	r19, acc0, acc0	C U1 propagate nail
+
+	lda	rp, 8(rp)	C L1
+	srl	m0a,NAIL_BITS, r8	C U0
+	lda	up, 8(up)	C L0
+	mulq	v0, ulimb, m0a	C U1
+
+	addq	r8, acc0, r19	C U0
+	addq	m0b, acc1, acc0	C L1
+	umulh	v0, ulimb, m0b	C U1
+	bis	r31, r31, r31	C L0 nop
+
+	addq	rlimb, r19, r19	C L1
+	srl	m1a,NAIL_BITS, r8	C U0
+	bis	r31, r31, r31	C L0 nop
+	mulq	v1, ulimb, m1a	C U1
+
+	addq	r8, acc0, acc0	C U0
+	addq	m1b, acc2, acc1	C L1
+	umulh	v1, ulimb, m1b	C U1
+	and	r19,numb_mask, r28	C L0 extract numb part
+
+	bis	r31, r31, r31	C L1 nop
+	srl	m2a,NAIL_BITS, r8	C U0
+	lda	n, -1(n)	C L0
+	mulq	v2, ulimb, m2a	C U1
+
+	addq	r8, acc1, acc1	C L0
+	bis	r31, m2b, acc2	C L1
+	umulh	v2, ulimb, m2b	C U1
+	srl	r19,NUMB_BITS, r19	C U0 extract nail part
+
+	stq	r28, -8(rp)	C L
+	bne	n, L(top)	C U0
+
+C Wind-down: same arithmetic as one loop pass for the last up limb, without
+C starting any new products.
+L(end):	ldq	rlimb, 0(rp)
+	addq	r19, acc0, acc0	C propagate nail
+	lda	rp, 8(rp)
+	srl	m0a,NAIL_BITS, r8	C U0
+	addq	r8, acc0, r19
+	addq	m0b, acc1, acc0
+	addq	rlimb, r19, r19
+	srl	m1a,NAIL_BITS, r8	C U0
+	addq	r8, acc0, acc0
+	addq	m1b, acc2, acc1
+	and	r19,numb_mask, r28	C extract limb
+	srl	m2a,NAIL_BITS, r8	C U0
+	addq	r8, acc1, acc1
+	bis	r31, m2b, acc2
+	srl	r19,NUMB_BITS, r19	C extract nail
+	stq	r28, -8(rp)
+
+C Flush the accumulators: acc0 and acc1 become the two extra limbs stored at
+C 0(rp) and 8(rp); the final overflow plus acc2 is the return value.
+	addq	r19, acc0, acc0	C propagate nail
+	and	acc0,numb_mask, r28
+	stq	r28, 0(rp)
+	srl	acc0,NUMB_BITS, r19
+	addq	r19, acc1, acc1
+
+	and	acc1,numb_mask, r28
+	stq	r28, 8(rp)
+	srl	acc1,NUMB_BITS, r19
+	addq	r19, acc2, m0a
+
+	ret	r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm
new file mode 100644
index 0000000..77e02a4
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm
@@ -0,0 +1,210 @@
+dnl Alpha ev6 nails mpn_addmul_4.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 2.5 cycles/limb.
+
+C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
+C to 3.24 insn/cycle.
+
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r12')
+define(`m3b',`r13')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+define(`acc3',`r14')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+define(`v3',`r15')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C This declaration is munged by configure
+NAILS_SUPPORT(4-63)
+
+C mpn_addmul_4 (rp, up, n, vp)
+C
+C Multiplies the n limbs at up by the four limbs vp[0..3] and adds the result
+C into the limbs at rp.  As coded, n limbs are read from rp, n+3 limbs are
+C stored (rp[0] .. rp[n+2], each masked with numb_mask) and the remaining high
+C word is returned in r0.
+C
+C The first up limb is loaded and multiplied before n is tested, so the code
+C does not handle n = 0.
+C
+C r12-r15 (m3a, m3b, acc3, v3) are spilled to a 240-byte stack frame on entry
+C and reloaded before returning; only offsets 32..56 of the frame are used.
+C
+C v0..v3 are pre-shifted left by NAIL_BITS.  With that, umulh yields the
+C product bits from NUMB_BITS upwards and mulq followed by srl NAIL_BITS yields
+C the low NUMB_BITS (this assumes NAIL_BITS + NUMB_BITS = 64 -- confirm
+C against gmp.h).
+C
+C The U0/U1/L0/L1 tags on instructions look like EV6 issue-slot annotations;
+C confirm against the slotting tool before reordering anything.
+
+ASM_START()
+PROLOGUE(mpn_addmul_4)
+C Allocate the frame and save the registers this routine overwrites
+	lda	r30, -240(r30)
+	stq	r12, 32(r30)
+	stq	r13, 40(r30)
+	stq	r14, 48(r30)
+	stq	r15, 56(r30)
+
+C numb_mask = all ones shifted right by NAIL_BITS
+	lda	numb_mask,-1(r31)
+	srl	numb_mask,NAIL_BITS,numb_mask
+
+	ldq	v0, 0(vp)
+	ldq	v1, 8(vp)
+	ldq	v2, 16(vp)
+	ldq	v3, 24(vp)
+
+	bis	r31, r31, acc0	C zero acc0
+	sll	v0,NAIL_BITS, v0
+	bis	r31, r31, acc1	C zero acc1
+	sll	v1,NAIL_BITS, v1
+	bis	r31, r31, acc2	C zero acc2
+	sll	v2,NAIL_BITS, v2
+	bis	r31, r31, acc3	C zero acc3
+	sll	v3,NAIL_BITS, v3
+C vp (r19) is dead from here on; r19 is reused as the carry into the next limb
+	bis	r31, r31, r19
+
+C Start all four products for up[0] ahead of the loop
+	ldq	ulimb, 0(up)
+	lda	up, 8(up)
+	mulq	v0, ulimb, m0a	C U1
+	umulh	v0, ulimb, m0b	C U1
+	mulq	v1, ulimb, m1a	C U1
+	umulh	v1, ulimb, m1b	C U1
+	lda	n, -1(n)
+	mulq	v2, ulimb, m2a	C U1
+	umulh	v2, ulimb, m2b	C U1
+	mulq	v3, ulimb, m3a	C U1
+	umulh	v3, ulimb, m3b	C U1
+	beq	n, L(end)	C U0
+
+C Main loop: each pass completes one result limb from the products started on
+C the previous pass while starting the products for the next up limb.
+	ALIGN(16)
+L(top):	bis	r31, r31, r31	C U1 nop
+	ldq	rlimb, 0(rp)	C L0
+	ldq	ulimb, 0(up)	C L1
+	addq	r19, acc0, acc0	C U0 propagate nail
+
+	bis	r31, r31, r31	C L0 nop
+	bis	r31, r31, r31	C U1 nop
+	bis	r31, r31, r31	C L1 nop
+	bis	r31, r31, r31	C U0 nop
+
+	lda	rp, 8(rp)	C L0
+	srl	m0a,NAIL_BITS, r8	C U0
+	lda	up, 8(up)	C L1
+	mulq	v0, ulimb, m0a	C U1
+
+	addq	r8, acc0, r19	C U0
+	addq	m0b, acc1, acc0	C L0
+	umulh	v0, ulimb, m0b	C U1
+	bis	r31, r31, r31	C L1 nop
+
+	addq	rlimb, r19, r19	C L0
+	srl	m1a,NAIL_BITS, r8	C U0
+	bis	r31, r31, r31	C L1 nop
+	mulq	v1, ulimb, m1a	C U1
+
+	addq	r8, acc0, acc0	C U0
+	addq	m1b, acc2, acc1	C L0
+	umulh	v1, ulimb, m1b	C U1
+	and	r19,numb_mask, r28	C L1 extract numb part
+
+	bis	r31, r31, r31	C L0 nop
+	srl	m2a,NAIL_BITS, r8	C U0
+	lda	n, -1(n)	C L1
+	mulq	v2, ulimb, m2a	C U1
+
+	addq	r8, acc1, acc1	C L1
+	addq	m2b, acc3, acc2	C L0
+	umulh	v2, ulimb, m2b	C U1
+	srl	r19,NUMB_BITS, r19	C U0 extract nail part
+
+	bis	r31, r31, r31	C L0 nop
+	srl	m3a,NAIL_BITS, r8	C U0
+	stq	r28, -8(rp)	C L1
+	mulq	v3, ulimb, m3a	C U1
+
+	addq	r8, acc2, acc2	C L0
+	bis	r31, m3b, acc3	C L1
+	umulh	v3, ulimb, m3b	C U1
+	bne	n, L(top)	C U0
+
+C Wind-down: same arithmetic as one loop pass for the last up limb, without
+C starting any new products.
+L(end):	ldq	rlimb, 0(rp)
+	addq	r19, acc0, acc0	C propagate nail
+	lda	rp, 8(rp)	C FIXME: DELETE
+	srl	m0a,NAIL_BITS, r8	C U0
+	addq	r8, acc0, r19
+	addq	m0b, acc1, acc0
+	addq	rlimb, r19, r19
+	srl	m1a,NAIL_BITS, r8	C U0
+	addq	r8, acc0, acc0
+	addq	m1b, acc2, acc1
+	and	r19,numb_mask, r28	C extract limb
+	srl	m2a,NAIL_BITS, r8	C U0
+	addq	r8, acc1, acc1
+	addq	m2b, acc3, acc2
+	srl	r19,NUMB_BITS, r19	C extract nail
+	srl	m3a,NAIL_BITS, r8	C U0
+	stq	r28, -8(rp)
+	addq	r8, acc2, acc2
+	bis	r31, m3b, acc3
+
+C Flush the accumulators: acc0..acc2 become the three extra limbs stored at
+C 0(rp), 8(rp) and 16(rp); the final overflow plus acc3 is the return value.
+	addq	r19, acc0, acc0	C propagate nail
+	and	acc0,numb_mask, r28
+	stq	r28, 0(rp)
+	srl	acc0,NUMB_BITS, r19
+	addq	r19, acc1, acc1
+
+	and	acc1,numb_mask, r28
+	stq	r28, 8(rp)
+	srl	acc1,NUMB_BITS, r19
+	addq	r19, acc2, acc2
+
+	and	acc2,numb_mask, r28
+	stq	r28, 16(rp)
+	srl	acc2,NUMB_BITS, r19
+	addq	r19, acc3, r0
+
+C Restore the saved registers and release the frame
+	ldq	r12, 32(r30)
+	ldq	r13, 40(r30)
+	ldq	r14, 48(r30)
+	ldq	r15, 56(r30)
+	lda	r30, 240(r30)
+	ret	r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm
new file mode 100644
index 0000000..f658677
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm
@@ -0,0 +1,233 @@
+dnl Alpha ev6 nails mpn_add_n and mpn_sub_n.
+
+dnl Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb
+dnl with 8-way unrolling.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n',`r19')
+
+define(`rl0',`r0')
+define(`rl1',`r1')
+define(`rl2',`r2')
+define(`rl3',`r3')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r6')
+define(`ul3',`r7')
+
+define(`vl0',`r22')
+define(`vl1',`r23')
+define(`vl2',`r24')
+define(`vl3',`r25')
+
+define(`numb_mask',`r21')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+dnl Default carry shift; each OPERATION_* branch below defines CYSH again.
+define(`CYSH',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+dnl OP is the limb operation and CYSH the right shift that extracts the
+dnl carry/borrow from an unmasked result: GMP_NUMB_BITS for addq, and 63 (the
+dnl top bit of the register) for subq.
+ifdef(`OPERATION_add_n', `
+	define(`OP', addq)
+	define(`CYSH',`GMP_NUMB_BITS')
+	define(`func', mpn_add_n)')
+ifdef(`OPERATION_sub_n', `
+	define(`OP', subq)
+	define(`CYSH',63)
+	define(`func', mpn_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+dnl func (rp, up, vp, n)
+dnl
+dnl Applies OP limb by limb to the n limbs at up and vp, folding in the
+dnl carry/borrow from the previous limb (r20, zero on entry), and stores each
+dnl result masked with numb_mask at rp.  The low bit of the last carry/borrow
+dnl is returned in r0.
+dnl
+dnl The n mod 4 leading limbs are handled one at a time in L(lp0); the rest go
+dnl through 4-way unrolled code.  When n mod 4 is zero the code loads four
+dnl limb pairs unconditionally, so n = 0 is not handled.
+
+ASM_START()
+PROLOGUE(func)
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+	bis	r31, r31, r20
+
+dnl r25 = n mod 4 (trip count for L(lp0)); n is biased by -4 so that its sign
+dnl tells whether a full group of four limbs exists.
+	and	n, 3, r25
+	lda	n, -4(n)
+	beq	r25, L(ge4)
+
+L(lp0):	ldq	ul0, 0(up)
+	lda	up, 8(up)
+	ldq	vl0, 0(vp)
+	lda	vp, 8(vp)
+	lda	rp, 8(rp)
+	lda	r25, -1(r25)
+	OP	ul0, vl0, rl0
+	OP	rl0, r20, rl0
+	and	rl0, numb_mask, r28
+	stq	r28, -8(rp)
+	srl	rl0, CYSH, r20
+	bne	r25, L(lp0)
+
+dnl Fewer than four limbs in total: everything was done in L(lp0)
+	blt	n, L(ret)
+
+dnl Load four limb pairs.  If at least four more follow, take the pipelined
+dnl path at L(ge8); otherwise start limbs 0 and 1 here and finish at L(cj0).
+L(ge4):	ldq	ul0, 0(up)
+	ldq	vl0, 0(vp)
+	ldq	ul1, 8(up)
+	ldq	vl1, 8(vp)
+	ldq	ul2, 16(up)
+	ldq	vl2, 16(vp)
+	ldq	ul3, 24(up)
+	ldq	vl3, 24(vp)
+	lda	up, 32(up)
+	lda	vp, 32(vp)
+	lda	n, -4(n)
+	bge	n, L(ge8)
+
+	OP	ul0, vl0, rl0	C main-add 0
+	OP	rl0, r20, rl0	C cy-add 0
+	OP	ul1, vl1, rl1	C main-add 1
+	srl	rl0, CYSH, r20	C gen cy 0
+	OP	rl1, r20, rl1	C cy-add 1
+	and	rl0,numb_mask, r27
+	br	r31, L(cj0)
+
+dnl Feed-in: process the four loaded limb pairs while loading the next four.
+dnl Results for limbs 2 and 3 are left pending in r27 and rl3 for L(top)/L(end).
+L(ge8):	OP	ul0, vl0, rl0	C main-add 0
+	ldq	ul0, 0(up)
+	ldq	vl0, 0(vp)
+	OP	rl0, r20, rl0	C cy-add 0
+	OP	ul1, vl1, rl1	C main-add 1
+	srl	rl0, CYSH, r20	C gen cy 0
+	ldq	ul1, 8(up)
+	ldq	vl1, 8(vp)
+	OP	rl1, r20, rl1	C cy-add 1
+	and	rl0,numb_mask, r27
+	OP	ul2, vl2, rl2	C main-add 2
+	srl	rl1, CYSH, r20	C gen cy 1
+	ldq	ul2, 16(up)
+	ldq	vl2, 16(vp)
+	OP	rl2, r20, rl2	C cy-add 2
+	and	rl1,numb_mask, r28
+	stq	r27, 0(rp)
+	OP	ul3, vl3, rl3	C main-add 3
+	srl	rl2, CYSH, r20	C gen cy 2
+	ldq	ul3, 24(up)
+	ldq	vl3, 24(vp)
+	OP	rl3, r20, rl3	C cy-add 3
+	and	rl2,numb_mask, r27
+	stq	r28, 8(rp)
+	lda	rp, 32(rp)
+	lda	up, 32(up)
+	lda	vp, 32(vp)
+	lda	n, -4(n)
+	blt	n, L(end)
+
+dnl Main loop: four limbs per pass.  The two stores at negative offsets
+dnl complete the previous group; loads fetch the following group.
+	ALIGN(32)
+L(top):	OP	ul0, vl0, rl0	C main-add 0
+	srl	rl3, CYSH, r20	C gen cy 3
+	ldq	ul0, 0(up)
+	ldq	vl0, 0(vp)
+
+	OP	rl0, r20, rl0	C cy-add 0
+	and	rl3,numb_mask, r28
+	stq	r27, -16(rp)
+	bis	r31, r31, r31
+
+	OP	ul1, vl1, rl1	C main-add 1
+	srl	rl0, CYSH, r20	C gen cy 0
+	ldq	ul1, 8(up)
+	ldq	vl1, 8(vp)
+
+	OP	rl1, r20, rl1	C cy-add 1
+	and	rl0,numb_mask, r27
+	stq	r28, -8(rp)
+	bis	r31, r31, r31
+
+	OP	ul2, vl2, rl2	C main-add 2
+	srl	rl1, CYSH, r20	C gen cy 1
+	ldq	ul2, 16(up)
+	ldq	vl2, 16(vp)
+
+	OP	rl2, r20, rl2	C cy-add 2
+	and	rl1,numb_mask, r28
+	stq	r27, 0(rp)
+	bis	r31, r31, r31
+
+	OP	ul3, vl3, rl3	C main-add 3
+	srl	rl2, CYSH, r20	C gen cy 2
+	ldq	ul3, 24(up)
+	ldq	vl3, 24(vp)
+
+	OP	rl3, r20, rl3	C cy-add 3
+	and	rl2,numb_mask, r27
+	stq	r28, 8(rp)
+	bis	r31, r31, r31
+
+	bis	r31, r31, r31
+	lda	n, -4(n)
+	lda	up, 32(up)
+	lda	vp, 32(vp)
+
+	bis	r31, r31, r31
+	bis	r31, r31, r31
+	lda	rp, 32(rp)
+	bge	n, L(top)
+
+dnl Wind-down: the last four loaded limb pairs, with no further loads.
+L(end):	OP	ul0, vl0, rl0	C main-add 0
+	srl	rl3, CYSH, r20	C gen cy 3
+	OP	rl0, r20, rl0	C cy-add 0
+	and	rl3,numb_mask, r28
+	stq	r27, -16(rp)
+	OP	ul1, vl1, rl1	C main-add 1
+	srl	rl0, CYSH, r20	C gen cy 0
+	OP	rl1, r20, rl1	C cy-add 1
+	and	rl0,numb_mask, r27
+	stq	r28, -8(rp)
+L(cj0):	OP	ul2, vl2, rl2	C main-add 2
+	srl	rl1, CYSH, r20	C gen cy 1
+	OP	rl2, r20, rl2	C cy-add 2
+	and	rl1,numb_mask, r28
+	stq	r27, 0(rp)
+	OP	ul3, vl3, rl3	C main-add 3
+	srl	rl2, CYSH, r20	C gen cy 2
+	OP	rl3, r20, rl3	C cy-add 3
+	and	rl2,numb_mask, r27
+	stq	r28, 8(rp)
+
+	srl	rl3, CYSH, r20	C gen cy 3
+	and	rl3,numb_mask, r28
+	stq	r27, 16(rp)
+	stq	r28, 24(rp)
+
+dnl Return value: low bit of the last carry/borrow
+L(ret):	and	r20, 1, r0
+	ret	r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff
--git a/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h b/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h
new file mode 100644
index 0000000..7949fe8
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+/* Limb geometry: 64-bit limbs occupying 8 bytes each. */
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */
+
+/* Multiplication and squaring algorithm crossover points. */
+#define MUL_TOOM22_THRESHOLD 40
+#define MUL_TOOM33_THRESHOLD 236
+
+#define SQR_BASECASE_THRESHOLD 7 /* karatsuba */
+#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */
+#define SQR_TOOM3_THRESHOLD 120
+
+/* Division and modular exponentiation. */
+#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIV_DC_THRESHOLD 48
+#define POWM_THRESHOLD 113
+
+/* GCD and Jacobi symbol. */
+#define HGCD_THRESHOLD 78
+#define GCD_ACCEL_THRESHOLD 3
+#define GCD_DC_THRESHOLD 392
+#define JACOBI_BASE_METHOD 1
+
+/* Single-limb division.  Every entry commented "no preinv with nails" is set
+   to MP_SIZE_T_MAX or 0 in this nails configuration. */
+#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define USE_PREINV_DIVREM_1 0 /* no preinv with nails */
+#define USE_PREINV_MOD_1 0 /* no preinv with nails */
+#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */
+#define DIVEXACT_1_THRESHOLD 0 /* always */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always */
+
+/* Radix conversion. */
+#define GET_STR_DC_THRESHOLD 15
+#define GET_STR_PRECOMPUTE_THRESHOLD 24
+#define SET_STR_THRESHOLD 6336
+
+/* FFT multiplication and squaring.  Each table initializer ends with a 0
+   entry. */
+#define MUL_FFT_TABLE { 688, 1440, 3648, 6400, 25600, 0 }
+#define MUL_FFT_MODF_THRESHOLD 488
+#define MUL_FFT_THRESHOLD 3712
+
+#define SQR_FFT_TABLE { 432, 864, 3136, 6400, 25600, 0 }
+#define SQR_FFT_MODF_THRESHOLD 480
+#define SQR_FFT_THRESHOLD 2976
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm
new file mode 100644
index 0000000..da2ee3d
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm
@@ -0,0 +1,364 @@
+dnl Alpha ev6 nails mpn_mul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3.25
+
+C TODO
+C * Reroll loop for 3.0 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+C mpn_mul_1 (rp, up, n, vl0)
+C
+C Multiplies the n limbs at up by the single limb vl0, stores the n result
+C limbs (each masked with numb_mask) at rp and returns the final carry word
+C in r0.
+C
+C vl0 is pre-shifted left by NAIL_BITS.  With that, umulh yields the product
+C bits from NUMB_BITS upwards (mXb) and mulq followed by srl NAIL_BITS yields
+C the low NUMB_BITS (this assumes NAIL_BITS + NUMB_BITS = 64 -- confirm
+C against gmp.h).  Each result limb is low(i) + high(i-1) + t1, where t1 is
+C the previous sum shifted right by NUMB_BITS.  acc0 and acc1 alternate as
+C the running sum.
+C
+C Register sharing: ul0/ul2 are both r4 and ul1/ul3 are both r5, so every
+C load of a ulN must stay after the multiplies that read the value it
+C replaces.  rl0-rl3 are defined above but not referenced in this routine.
+C
+C Control flow: entry dispatches on n mod 4 to L(1m4), L(2m4), L(3m4) or
+C L(0m4).  Each feed-in sequence starts the first products and then either
+C joins the 4-way unrolled loop at one of L(el0)..L(el3) or, for short
+C operands, the wind-down code at one of L(ta2)..L(ta6).  n = 1 returns
+C directly from L(1m4).  rp is biased in the feed-in code so that the fixed
+C store offsets in the shared code hit the right limbs.
+C
+C The U0/U1/L0/L1 tags on instructions look like EV6 issue-slot annotations;
+C confirm against the slotting tool before reordering anything.
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	sll	vl0, NAIL_BITS, vl0
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+
+C Dispatch on n mod 4
+	and	n, 3, r25
+	cmpeq	r25, 1, r21
+	bne	r21, L(1m4)
+	cmpeq	r25, 2, r21
+	bne	r21, L(2m4)
+	beq	r25, L(0m4)
+
+C n mod 4 = 3
+L(3m4):	ldq	ul3, 0(up)
+	lda	n, -4(n)
+	ldq	ul0, 8(up)
+	mulq	vl0, ul3, m3a
+	umulh	vl0, ul3, m3b
+	ldq	ul1, 16(up)
+	lda	up, 24(up)
+	lda	rp, -8(rp)
+	mulq	vl0, ul0, m0a
+	umulh	vl0, ul0, m0b
+	bge	n, L(ge3)
+
+C n = 3: no loop, finish in the wind-down code
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	srl	m3a,NAIL_BITS, t0
+	addq	t0, r31, acc1
+	srl	m0a,NAIL_BITS, t0
+	addq	t0, m3b, acc0
+	srl	acc1,NUMB_BITS, t1
+	br	r31, L(ta3)
+
+L(ge3):	ldq	ul2, 0(up)
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	srl	m3a,NAIL_BITS, t0
+	ldq	ul3, 8(up)
+	lda	n, -4(n)
+	mulq	vl0, ul2, m2a
+	addq	t0, r31, acc1
+	umulh	vl0, ul2, m2b
+	srl	m0a,NAIL_BITS, t0
+	ldq	ul0, 16(up)
+	mulq	vl0, ul3, m3a
+	addq	t0, m3b, acc0
+	srl	acc1,NUMB_BITS, t1
+	br	r31, L(el3)
+
+C n mod 4 = 0
+L(0m4):	lda	n, -8(n)
+	ldq	ul2, 0(up)
+	ldq	ul3, 8(up)
+	mulq	vl0, ul2, m2a
+	umulh	vl0, ul2, m2b
+	ldq	ul0, 16(up)
+	mulq	vl0, ul3, m3a
+	umulh	vl0, ul3, m3b
+	ldq	ul1, 24(up)
+	lda	up, 32(up)
+	mulq	vl0, ul0, m0a
+	umulh	vl0, ul0, m0b
+	bge	n, L(ge4)
+
+C n = 4: no loop, finish in the wind-down code
+	srl	m2a,NAIL_BITS, t0
+	mulq	vl0, ul1, m1a
+	addq	t0, r31, acc0
+	umulh	vl0, ul1, m1b
+	srl	m3a,NAIL_BITS, t0
+	addq	t0, m2b, acc1
+	srl	acc0,NUMB_BITS, t1
+	br	r31, L(ta4)
+
+L(ge4):	srl	m2a,NAIL_BITS, t0
+	ldq	ul2, 0(up)
+	mulq	vl0, ul1, m1a
+	addq	t0, r31, acc0
+	umulh	vl0, ul1, m1b
+	srl	m3a,NAIL_BITS, t0
+	ldq	ul3, 8(up)
+	lda	n, -4(n)
+	mulq	vl0, ul2, m2a
+	addq	t0, m2b, acc1
+	srl	acc0,NUMB_BITS, t1
+	br	r31, L(el0)
+
+C n mod 4 = 2
+L(2m4):	lda	n, -4(n)
+	ldq	ul0, 0(up)
+	ldq	ul1, 8(up)
+	lda	up, 16(up)
+	lda	rp, -16(rp)
+	mulq	vl0, ul0, m0a
+	umulh	vl0, ul0, m0b
+	bge	n, L(ge2)
+
+C n = 2: no loop, finish in the wind-down code
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	srl	m0a,NAIL_BITS, t0
+	addq	t0, r31, acc0
+	srl	m1a,NAIL_BITS, t0
+	addq	t0, m0b, acc1
+	srl	acc0,NUMB_BITS, t1
+	br	r31, L(ta2)
+
+L(ge2):	ldq	ul2, 0(up)
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	ldq	ul3, 8(up)
+	lda	n, -4(n)
+	mulq	vl0, ul2, m2a
+	umulh	vl0, ul2, m2b
+	srl	m0a,NAIL_BITS, t0
+	ldq	ul0, 16(up)
+	mulq	vl0, ul3, m3a
+	addq	t0, r31, acc0
+	umulh	vl0, ul3, m3b
+	srl	m1a,NAIL_BITS, t0
+	ldq	ul1, 24(up)
+	lda	up, 32(up)
+	lda	rp, 32(rp)
+	mulq	vl0, ul0, m0a
+	addq	t0, m0b, acc1
+	srl	acc0,NUMB_BITS, t1
+	bge	n, L(el2)
+
+	br	r31, L(ta6)
+
+C n mod 4 = 1
+L(1m4):	lda	n, -4(n)
+	ldq	ul1, 0(up)
+	lda	up, 8(up)
+	lda	rp, -24(rp)
+	bge	n, L(ge1)
+
+C n = 1: single product, store the low part and return the high part
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	srl	m1a,NAIL_BITS, t0
+	addq	t0, r31, acc1
+	and	acc1,numb_mask, r28
+	srl	acc1,NUMB_BITS, t1
+	stq	r28, 24(rp)
+	addq	t1, m1b, r0
+	ret	r31, (r26), 1
+
+L(ge1):	ldq	ul2, 0(up)
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	ldq	ul3, 8(up)
+	lda	n, -4(n)
+	mulq	vl0, ul2, m2a
+	umulh	vl0, ul2, m2b
+	ldq	ul0, 16(up)
+	mulq	vl0, ul3, m3a
+	umulh	vl0, ul3, m3b
+	srl	m1a,NAIL_BITS, t0
+	ldq	ul1, 24(up)
+	lda	up, 32(up)
+	lda	rp, 32(rp)
+	mulq	vl0, ul0, m0a
+	addq	t0, r31, acc1
+	umulh	vl0, ul0, m0b
+	srl	m2a,NAIL_BITS, t0
+	mulq	vl0, ul1, m1a
+	addq	t0, m1b, acc0
+	srl	acc1,NUMB_BITS, t1
+	blt	n, L(ta5)
+
+L(ge5):	ldq	ul2, 0(up)
+	br	r31, L(el1)
+
+C Main loop, four limbs per pass.  L(el0)..L(el3) are the join points used by
+C the feed-in sequences above.
+	ALIGN(16)
+L(top):	mulq	vl0, ul0, m0a	C U1
+	addq	t0, m0b, acc1	C L0
+	srl	acc0,NUMB_BITS, t1	C U0
+	stq	r28, -24(rp)	C L1
+C
+L(el2):	umulh	vl0, ul0, m0b	C U1
+	and	acc0,numb_mask, r28	C L0
+	unop	C U0
+	unop	C L1
+C
+	unop	C U1
+	addq	t1, acc1, acc1	C L0
+	srl	m2a,NAIL_BITS, t0	C U0
+	ldq	ul2, 0(up)	C L1
+C
+	mulq	vl0, ul1, m1a	C U1
+	addq	t0, m1b, acc0	C L0
+	srl	acc1,NUMB_BITS, t1	C U0
+	stq	r28, -16(rp)	C L1
+C
+L(el1):	umulh	vl0, ul1, m1b	C U1
+	and	acc1,numb_mask, r28	C L0
+	unop	C U0
+	lda	n, -4(n)	C L1
+C
+	unop	C U1
+	addq	t1, acc0, acc0	C L0
+	srl	m3a,NAIL_BITS, t0	C U0
+	ldq	ul3, 8(up)	C L1
+C
+	mulq	vl0, ul2, m2a	C U1
+	addq	t0, m2b, acc1	C L0
+	srl	acc0,NUMB_BITS, t1	C U0
+	stq	r28, -8(rp)	C L1
+C
+L(el0):	umulh	vl0, ul2, m2b	C U1
+	and	acc0,numb_mask, r28	C L0
+	unop	C U0
+	unop	C L1
+C
+	unop	C U1
+	addq	t1, acc1, acc1	C L0
+	srl	m0a,NAIL_BITS, t0	C U0
+	ldq	ul0, 16(up)	C L1
+C
+	mulq	vl0, ul3, m3a	C U1
+	addq	t0, m3b, acc0	C L0
+	srl	acc1,NUMB_BITS, t1	C U0
+	stq	r28, 0(rp)	C L1
+C
+L(el3):	umulh	vl0, ul3, m3b	C U1
+	and	acc1,numb_mask, r28	C L0
+	unop	C U0
+	unop	C L1
+C
+	unop	C U1
+	addq	t1, acc0, acc0	C L0
+	srl	m1a,NAIL_BITS, t0	C U0
+	ldq	ul1, 24(up)	C L1
+C
+	lda	up, 32(up)	C L0
+	unop	C U1
+	lda	rp, 32(rp)	C L1
+	bge	n, L(top)	C U0
+
+C Wind-down: drain the products still in flight without loading more limbs.
+C L(ta6)..L(ta2) are the join points for operands too short to enter the loop.
+L(end):	mulq	vl0, ul0, m0a
+	addq	t0, m0b, acc1
+	srl	acc0,NUMB_BITS, t1
+	stq	r28, -24(rp)
+L(ta6):	umulh	vl0, ul0, m0b
+	and	acc0,numb_mask, r28
+	addq	t1, acc1, acc1
+	srl	m2a,NAIL_BITS, t0
+	mulq	vl0, ul1, m1a
+	addq	t0, m1b, acc0
+	srl	acc1,NUMB_BITS, t1
+	stq	r28, -16(rp)
+L(ta5):	umulh	vl0, ul1, m1b
+	and	acc1,numb_mask, r28
+	addq	t1, acc0, acc0
+	srl	m3a,NAIL_BITS, t0
+	addq	t0, m2b, acc1
+	srl	acc0,NUMB_BITS, t1
+	stq	r28, -8(rp)
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask, r28
+	addq	t1, acc1, acc1
+	srl	m0a,NAIL_BITS, t0
+	addq	t0, m3b, acc0
+	srl	acc1,NUMB_BITS, t1
+	stq	r28, 0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask, r28
+	addq	t1, acc0, acc0
+	srl	m1a,NAIL_BITS, t0
+	addq	t0, m0b, acc1
+	srl	acc0,NUMB_BITS, t1
+	stq	r28, 8(rp)
+	unop
+	ALIGN(16)
+C Last two limbs; the return value is the final carry plus the high part of
+C the last product.
+L(ta2):	and	acc0,numb_mask, r28
+	addq	t1, acc1, acc1
+	srl	acc1,NUMB_BITS, t1
+	stq	r28, 16(rp)
+	and	acc1,numb_mask, r28
+	addq	t1, m1b, r0
+	stq	r28, 24(rp)
+	ret	r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm
new file mode 100644
index 0000000..f473a59
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm
@@ -0,0 +1,396 @@
+dnl Alpha ev6 nails mpn_submul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 4
+
+C TODO
+C * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C and would work since the loop structure is really regular.
+
+C  INPUT PARAMETERS
+C  Register assignments.  Several names deliberately share one register:
+C  ul0/ul2 both map to r4, ul1/ul3 both map to r5, and rl0..rl3 all map to
+C  r24, so only one member of each aliased group can be live at a time.
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+C  mpn_submul_1, nails variant.
+C
+C  Every limb step below has the same shape: mulq/umulh form the low and high
+C  words of vl0 times a limb loaded from up; the low word shifted right by
+C  NAIL_BITS is added to the previous step's high word (r31, i.e. zero, for
+C  the first limb); that sum is subtracted from the limb loaded from rp; t1,
+C  the previous accumulator shifted arithmetically right by NUMB_BITS, is
+C  added in; and the accumulator masked with numb_mask is stored through rp.
+C  The value left in r0 at the ret is the last high product word minus the
+C  final t1.
+C  NOTE(review): presumably this implements the usual mpn_submul_1 contract
+C  ({rp,n} -= {up,n} * vl0, returning the borrow limb); that contract is not
+C  restated in this file, confirm against the generic implementation.
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	sll	vl0, NAIL_BITS, vl0		C pre-shift the multiplier left by NAIL_BITS
+	lda	numb_mask, -1(r31)		C numb_mask = all ones ...
+	srl	numb_mask, NAIL_BITS, numb_mask	C ... with the top NAIL_BITS bits cleared
+
+C  Dispatch on n mod 4 to one of four feed-in sequences; remainder 3 falls
+C  through to L(3m4).
+	and	n, 3, r25
+	cmpeq	r25, 1, r21
+	bne	r21, L(1m4)
+	cmpeq	r25, 2, r21
+	bne	r21, L(2m4)
+	beq	r25, L(0m4)
+
+L(3m4):	ldq	ul3, 0(up)
+	lda	n, -4(n)
+	ldq	ul0, 8(up)
+	mulq	vl0, ul3, m3a
+	umulh	vl0, ul3, m3b
+	ldq	ul1, 16(up)
+	lda	up, 24(up)
+	lda	rp, -8(rp)
+	mulq	vl0, ul0, m0a
+	umulh	vl0, ul0, m0b
+	bge	n, L(ge3)			C more than 3 limbs: join the loop at L(el3)
+
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	ldq	rl3, 8(rp)
+	srl	m3a,NAIL_BITS, t0
+	addq	t0, r31, acc1
+	subq	rl3, acc1, acc1
+	ldq	rl0, 16(rp)
+	srl	m0a,NAIL_BITS, t0
+	addq	t0, m3b, acc0
+	sra	acc1,NUMB_BITS, t1
+	br	r31, L(ta3)
+
+L(ge3):	ldq	ul2, 0(up)
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	ldq	rl3, 8(rp)
+	srl	m3a,NAIL_BITS, t0
+	ldq	ul3, 8(up)
+	lda	n, -4(n)
+	mulq	vl0, ul2, m2a
+	addq	t0, r31, acc1
+	umulh	vl0, ul2, m2b
+	subq	rl3, acc1, acc1
+	ldq	rl0, 16(rp)
+	srl	m0a,NAIL_BITS, t0
+	ldq	ul0, 16(up)
+	mulq	vl0, ul3, m3a
+	addq	t0, m3b, acc0
+	sra	acc1,NUMB_BITS, t1
+	br	r31, L(el3)
+
+L(0m4):	lda	n, -8(n)
+	ldq	ul2, 0(up)
+	ldq	ul3, 8(up)
+	mulq	vl0, ul2, m2a
+	umulh	vl0, ul2, m2b
+	ldq	ul0, 16(up)
+	mulq	vl0, ul3, m3a
+	umulh	vl0, ul3, m3b
+	ldq	ul1, 24(up)
+	lda	up, 32(up)
+	mulq	vl0, ul0, m0a
+	umulh	vl0, ul0, m0b
+	bge	n, L(ge4)			C more than 4 limbs: join the loop at L(el0)
+
+	ldq	rl2, 0(rp)
+	srl	m2a,NAIL_BITS, t0
+	mulq	vl0, ul1, m1a
+	addq	t0, r31, acc0
+	umulh	vl0, ul1, m1b
+	subq	rl2, acc0, acc0
+	ldq	rl3, 8(rp)
+	srl	m3a,NAIL_BITS, t0
+	addq	t0, m2b, acc1
+	sra	acc0,NUMB_BITS, t1
+	br	r31, L(ta4)
+
+L(ge4):	ldq	rl2, 0(rp)
+	srl	m2a,NAIL_BITS, t0
+	ldq	ul2, 0(up)
+	mulq	vl0, ul1, m1a
+	addq	t0, r31, acc0
+	umulh	vl0, ul1, m1b
+	subq	rl2, acc0, acc0
+	ldq	rl3, 8(rp)
+	srl	m3a,NAIL_BITS, t0
+	ldq	ul3, 8(up)
+	lda	n, -4(n)
+	mulq	vl0, ul2, m2a
+	addq	t0, m2b, acc1
+	sra	acc0,NUMB_BITS, t1
+	br	r31, L(el0)
+
+L(2m4):	lda	n, -4(n)
+	ldq	ul0, 0(up)
+	ldq	ul1, 8(up)
+	lda	up, 16(up)
+	lda	rp, -16(rp)
+	mulq	vl0, ul0, m0a
+	umulh	vl0, ul0, m0b
+	bge	n, L(ge2)
+
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	ldq	rl0, 16(rp)
+	srl	m0a,NAIL_BITS, t0
+	addq	t0, r31, acc0
+	subq	rl0, acc0, acc0
+	ldq	rl1, 24(rp)
+	srl	m1a,NAIL_BITS, t0
+	addq	t0, m0b, acc1
+	sra	acc0,NUMB_BITS, t1
+	br	r31, L(ta2)
+
+L(ge2):	ldq	ul2, 0(up)
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	ldq	ul3, 8(up)
+	lda	n, -4(n)
+	mulq	vl0, ul2, m2a
+	umulh	vl0, ul2, m2b
+	ldq	rl0, 16(rp)
+	srl	m0a,NAIL_BITS, t0
+	ldq	ul0, 16(up)
+	mulq	vl0, ul3, m3a
+	addq	t0, r31, acc0
+	umulh	vl0, ul3, m3b
+	subq	rl0, acc0, acc0
+	ldq	rl1, 24(rp)
+	srl	m1a,NAIL_BITS, t0
+	ldq	ul1, 24(up)
+	lda	up, 32(up)
+	lda	rp, 32(rp)
+	mulq	vl0, ul0, m0a
+	addq	t0, m0b, acc1
+	sra	acc0,NUMB_BITS, t1
+	bge	n, L(el2)
+
+	br	r31, L(ta6)
+
+L(1m4):	lda	n, -4(n)
+	ldq	ul1, 0(up)
+	lda	up, 8(up)
+	lda	rp, -24(rp)
+	bge	n, L(ge1)
+
+C  Not taken above: n - 4 is negative with n mod 4 == 1, so this is the
+C  single-limb case, handled completely here.
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	ldq	rl1, 24(rp)
+	srl	m1a,NAIL_BITS, t0
+	subq	rl1, t0, acc1
+	and	acc1,numb_mask, r28
+	sra	acc1,NUMB_BITS, t1
+	stq	r28, 24(rp)
+	subq	m1b, t1, r0			C result in r0: high word minus t1
+	ret	r31, (r26), 1
+
+L(ge1):	ldq	ul2, 0(up)
+	mulq	vl0, ul1, m1a
+	umulh	vl0, ul1, m1b
+	ldq	ul3, 8(up)
+	lda	n, -4(n)
+	mulq	vl0, ul2, m2a
+	umulh	vl0, ul2, m2b
+	ldq	ul0, 16(up)
+	mulq	vl0, ul3, m3a
+	umulh	vl0, ul3, m3b
+	ldq	rl1, 24(rp)
+	srl	m1a,NAIL_BITS, t0
+	ldq	ul1, 24(up)
+	lda	up, 32(up)
+	lda	rp, 32(rp)
+	mulq	vl0, ul0, m0a
+	addq	t0, r31, acc1
+	umulh	vl0, ul0, m0b
+	subq	rl1, acc1, acc1
+	ldq	rl2, 0(rp)
+	srl	m2a,NAIL_BITS, t0
+	mulq	vl0, ul1, m1a
+	addq	t0, m1b, acc0
+	sra	acc1,NUMB_BITS, t1
+	blt	n, L(ta5)
+
+L(ge5):	ldq	ul2, 0(up)
+	br	r31, L(el1)
+
+C  Main loop, four limbs per pass.  The feed-in code above enters it at
+C  L(el0), L(el1), L(el2) or L(el3); n is stepped by -4 per pass and the
+C  loop repeats while n >= 0.  The U0/U1/L0/L1 tags are the existing slot
+C  annotations.
+	ALIGN(16)
+L(top):	mulq	vl0, ul0, m0a		C U1
+	addq	t0, m0b, acc1		C L0
+	sra	acc0,NUMB_BITS, t1	C U0
+	stq	r28, -24(rp)		C L1
+C
+L(el2):	umulh	vl0, ul0, m0b		C U1
+	and	acc0,numb_mask, r28	C L0
+	subq	rl1, acc1, acc1		C U0
+	ldq	rl2, 0(rp)		C L1
+C
+	unop				C U1
+	addq	t1, acc1, acc1		C L0
+	srl	m2a,NAIL_BITS, t0	C U0
+	ldq	ul2, 0(up)		C L1
+C
+	mulq	vl0, ul1, m1a		C U1
+	addq	t0, m1b, acc0		C L0
+	sra	acc1,NUMB_BITS, t1	C U0
+	stq	r28, -16(rp)		C L1
+C
+L(el1):	umulh	vl0, ul1, m1b		C U1
+	and	acc1,numb_mask, r28	C L0
+	subq	rl2, acc0, acc0		C U0
+	ldq	rl3, 8(rp)		C L1
+C
+	lda	n, -4(n)		C L1
+	addq	t1, acc0, acc0		C L0
+	srl	m3a,NAIL_BITS, t0	C U0
+	ldq	ul3, 8(up)		C L1
+C
+	mulq	vl0, ul2, m2a		C U1
+	addq	t0, m2b, acc1		C L0
+	sra	acc0,NUMB_BITS, t1	C U0
+	stq	r28, -8(rp)		C L1
+C
+L(el0):	umulh	vl0, ul2, m2b		C U1
+	and	acc0,numb_mask, r28	C L0
+	subq	rl3, acc1, acc1		C U0
+	ldq	rl0, 16(rp)		C L1
+C
+	unop				C U1
+	addq	t1, acc1, acc1		C L0
+	srl	m0a,NAIL_BITS, t0	C U0
+	ldq	ul0, 16(up)		C L1
+C
+	mulq	vl0, ul3, m3a		C U1
+	addq	t0, m3b, acc0		C L0
+	sra	acc1,NUMB_BITS, t1	C U0
+	stq	r28, 0(rp)		C L1
+C
+L(el3):	umulh	vl0, ul3, m3b		C U1
+	and	acc1,numb_mask, r28	C L0
+	subq	rl0, acc0, acc0		C U0
+	ldq	rl1, 24(rp)		C L1
+C
+	unop				C U1
+	addq	t1, acc0, acc0		C L0
+	srl	m1a,NAIL_BITS, t0	C U0
+	ldq	ul1, 24(up)		C L1
+C
+	lda	up, 32(up)		C L0
+	unop				C U1
+	lda	rp, 32(rp)		C L1
+	bge	n, L(top)		C U0
+
+C  Wind-down: the same limb steps as the loop but with no further loads from
+C  up.  L(ta6) down to L(ta2) are the entry points used by the short feed-in
+C  paths above.
+L(end):	mulq	vl0, ul0, m0a
+	addq	t0, m0b, acc1
+	sra	acc0,NUMB_BITS, t1
+	stq	r28, -24(rp)
+L(ta6):	umulh	vl0, ul0, m0b
+	and	acc0,numb_mask, r28
+	subq	rl1, acc1, acc1
+	ldq	rl2, 0(rp)
+	addq	t1, acc1, acc1
+	srl	m2a,NAIL_BITS, t0
+	mulq	vl0, ul1, m1a
+	addq	t0, m1b, acc0
+	sra	acc1,NUMB_BITS, t1
+	stq	r28, -16(rp)
+L(ta5):	umulh	vl0, ul1, m1b
+	and	acc1,numb_mask, r28
+	subq	rl2, acc0, acc0
+	ldq	rl3, 8(rp)
+	addq	t1, acc0, acc0
+	srl	m3a,NAIL_BITS, t0
+	addq	t0, m2b, acc1
+	sra	acc0,NUMB_BITS, t1
+	stq	r28, -8(rp)
+	unop
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask, r28
+	subq	rl3, acc1, acc1
+	ldq	rl0, 16(rp)
+	addq	t1, acc1, acc1
+	srl	m0a,NAIL_BITS, t0
+	addq	t0, m3b, acc0
+	sra	acc1,NUMB_BITS, t1
+	stq	r28, 0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask, r28
+	subq	rl0, acc0, acc0
+	ldq	rl1, 24(rp)
+	addq	t1, acc0, acc0
+	srl	m1a,NAIL_BITS, t0
+	addq	t0, m0b, acc1
+	sra	acc0,NUMB_BITS, t1
+	stq	r28, 8(rp)
+	unop
+	ALIGN(16)
+L(ta2):	and	acc0,numb_mask, r28
+	subq	rl1, acc1, acc1
+	addq	t1, acc1, acc1
+	sra	acc1,NUMB_BITS, t1
+	stq	r28, 16(rp)
+	and	acc1,numb_mask, r28
+	subq	m1b, t1, r0			C result in r0: last high word minus t1
+	stq	r28, 24(rp)
+	ret	r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/ev6/slot.pl b/gmp-6.3.0/mpn/alpha/ev6/slot.pl
new file mode 100755
index 0000000..a4c8a36
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/slot.pl
@@ -0,0 +1,318 @@
+#!/usr/bin/perl -w
+
+# Copyright 2000, 2001, 2003-2005, 2011 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: slot.pl [filename.o]...
+#
+# Run "objdump" to produce a disassembly of the given object file(s) and
+# annotate the output with "U" or "L" slotting which Alpha EV6 will use.
+#
+# When an instruction is E (ie. either U or L), an "eU" or "eL" is shown, as
+# a reminder that it wasn't a fixed requirement that gave the U or L, but
+# the octaword slotting rules.
+#
+# If an instruction is not recognised, that octaword does not get any U/L
+# shown, only lower-case "u", "l" or "e" for the instructions which are
+# known.  Add any unknown instructions to %optable below.
+
+
+use strict;
+
+# The U or L which various instructions demand, or E if either.
+#
+# Keys are the lower-case mnemonics captured from the objdump output by
+# disassemble(); values are one of 'U', 'L' or 'E'.  A mnemonic that is not
+# listed here is treated as unknown by disassemble(), which then prints no
+# slotting for the octaword containing it.
+#
+my %optable =
+  (
+   'addq'   => 'E',
+   'and'    => 'E',
+   'andnot' => 'E',
+   'beq'    => 'U',
+   'bge'    => 'U',
+   'bgt'    => 'U',
+   'bic'    => 'E',
+   'bis'    => 'E',
+   'blt'    => 'U',
+   'bne'    => 'U',
+   'br'     => 'L',
+   'clr'    => 'E',
+   'cmpule' => 'E',
+   'cmpult' => 'E',
+   'cmpeq'  => 'E',
+   'cmoveq' => 'E',
+   'cmovne' => 'E',
+   'ctpop'  => 'U',
+   'ctlz'   => 'U',
+   'cttz'   => 'U',
+   'extbl'  => 'U',
+   'extlh'  => 'U',
+   'extll'  => 'U',
+   'extqh'  => 'U',
+   'extql'  => 'U',
+   'extwh'  => 'U',
+   'extwl'  => 'U',
+   'jsr'    => 'L',
+   'lda'    => 'E',
+   'ldah'   => 'E',
+   'ldbu'   => 'L',
+   'ldl'    => 'L',
+   'ldq'    => 'L',
+   'ldt'    => 'L',
+   'ret'    => 'L',
+   'mov'    => 'E',
+   'mull'   => 'U',
+   'mulq'   => 'U',
+   'negq'   => 'E',
+   'nop'    => 'E',
+   'not'    => 'E',
+   's8addq' => 'E',
+   's8subq' => 'E',
+   # The two entries below are left disabled with "?" as their value;
+   # presumably their slot class had not been determined -- TODO confirm.
+   # 'sextb'  => ?
+   # 'sextl'  => ?
+   'sll'    => 'U',
+   'srl'    => 'U',
+   'stq'    => 'L',
+   'subq'   => 'E',
+   'umulh'  => 'U',
+   'unop'   => 'E',
+   'xor'    => 'E',
+  );
+
+# Slottings used for a given pattern of U/L/E in an octaword.  This is as
+# per the "Ebox Slotting" section of the EV6 hardware reference manual.
+#
+# Keys are four letters from U/L/E, one per instruction of an octaword, in
+# the order disassemble() builds them (highest address first).  Values give
+# the U or L slot actually used for each of the four positions.
+#
+my %slottable =
+  (
+   'EEEE' => 'ULUL',
+   'EEEL' => 'ULUL',
+   'EEEU' => 'ULLU',
+   'EELE' => 'ULLU',
+   'EELL' => 'UULL',
+   'EELU' => 'ULLU',
+   'EEUE' => 'ULUL',
+   'EEUL' => 'ULUL',
+   'EEUU' => 'LLUU',
+   'ELEE' => 'ULUL',
+   'ELEL' => 'ULUL',
+   'ELEU' => 'ULLU',
+   'ELLE' => 'ULLU',
+   'ELLL' => 'ULLL',
+   'ELLU' => 'ULLU',
+   'ELUE' => 'ULUL',
+   'ELUL' => 'ULUL',
+
+   'LLLL' => 'LLLL',
+   'LLLU' => 'LLLU',
+   'LLUE' => 'LLUU',
+   'LLUL' => 'LLUL',
+   'LLUU' => 'LLUU',
+   'LUEE' => 'LULU',
+   'LUEL' => 'LUUL',
+   'LUEU' => 'LULU',
+   'LULE' => 'LULU',
+   'LULL' => 'LULL',
+   'LULU' => 'LULU',
+   'LUUE' => 'LUUL',
+   'LUUL' => 'LUUL',
+   'LUUU' => 'LUUU',
+   'UEEE' => 'ULUL',
+   'UEEL' => 'ULUL',
+   'UEEU' => 'ULLU',
+
+   'ELUU' => 'LLUU',
+   'EUEE' => 'LULU',
+   'EUEL' => 'LUUL',
+   'EUEU' => 'LULU',
+   'EULE' => 'LULU',
+   'EULL' => 'UULL',
+   'EULU' => 'LULU',
+   'EUUE' => 'LUUL',
+   'EUUL' => 'LUUL',
+   'EUUU' => 'LUUU',
+   'LEEE' => 'LULU',
+   'LEEL' => 'LUUL',
+   'LEEU' => 'LULU',
+   'LELE' => 'LULU',
+   'LELL' => 'LULL',
+   'LELU' => 'LULU',
+   'LEUE' => 'LUUL',
+   'LEUL' => 'LUUL',
+   'LEUU' => 'LLUU',
+   'LLEE' => 'LLUU',
+   'LLEL' => 'LLUL',
+   'LLEU' => 'LLUU',
+   'LLLE' => 'LLLU',
+
+   'UELE' => 'ULLU',
+   'UELL' => 'UULL',
+   'UELU' => 'ULLU',
+   'UEUE' => 'ULUL',
+   'UEUL' => 'ULUL',
+   'UEUU' => 'ULUU',
+   'ULEE' => 'ULUL',
+   'ULEL' => 'ULUL',
+   'ULEU' => 'ULLU',
+   'ULLE' => 'ULLU',
+   'ULLL' => 'ULLL',
+   'ULLU' => 'ULLU',
+   'ULUE' => 'ULUL',
+   'ULUL' => 'ULUL',
+   'ULUU' => 'ULUU',
+   'UUEE' => 'UULL',
+   'UUEL' => 'UULL',
+   'UUEU' => 'UULU',
+   'UULE' => 'UULL',
+   'UULL' => 'UULL',
+   'UULU' => 'UULU',
+   'UUUE' => 'UUUL',
+   'UUUL' => 'UUUL',
+   'UUUU' => 'UUUU',
+  );
+
+# Check all combinations of U/L/E are present in %slottable.
+# Prints one "slottable missing" line per absent key; prints nothing when
+# all 81 four-letter combinations are defined.
+sub coverage {
+  foreach my $a ('U', 'L', 'E') {
+    foreach my $b ('U', 'L', 'E') {
+      foreach my $c ('U', 'L', 'E') {
+        foreach my $d ('U', 'L', 'E') {
+          my $x = $a . $b . $c . $d;
+          if (! defined $slottable{$x}) {
+            print "slottable missing: $x\n"
+          }
+        }
+      }
+    }
+  }
+}
+
+# Certain consistency checks for %slottable.
+# Only patterns whose expected slotting follows from the two simple rules
+# below are verified (no E, one E, or two E's with two equal fixed letters);
+# every other pattern is skipped.  Mismatches are reported on stdout.
+sub check {
+  foreach my $x (keys %slottable) {
+    my $a = substr($x,0,1);
+    my $b = substr($x,1,1);
+    my $c = substr($x,2,1);
+    my $d = substr($x,3,1);
+    # Count how many of the four positions are E, L and U respectively.
+    my $es = ($a eq 'E') + ($b eq 'E') + ($c eq 'E') + ($d eq 'E');
+    my $ls = ($a eq 'L') + ($b eq 'L') + ($c eq 'L') + ($d eq 'L');
+    my $us = ($a eq 'U') + ($b eq 'U') + ($c eq 'U') + ($d eq 'U');
+
+    my $got = $slottable{$x};
+    my $want = $x;
+
+    if ($es == 0) {
+      # nothing to substitute: the table entry must equal its own key
+
+    } elsif ($es == 1) {
+      # when only one E, it's mapped to whichever of U or L is otherwise
+      # used the least
+      if ($ls > $us) {
+        $want =~ s/E/U/;
+      } else {
+        $want =~ s/E/L/;
+      }
+    } elsif ($es == 2) {
+      # when two E's and two U, then the E's map to L; vice versa for two E
+      # and two L
+      if ($ls == 2) {
+        $want =~ s/E/U/g;
+      } elsif ($us == 2) {
+        $want =~ s/E/L/g;
+      } else {
+        next;
+      }
+    } elsif ($es == 3) {
+      next;
+
+    } else {  # $es == 4
+      next;
+    }
+
+    if ($want ne $got) {
+      print "slottable $x want $want got $got\n";
+    }
+  }
+}
+
+# Disassemble $file with objdump and print each instruction line annotated
+# with its slotting.
+#
+# Instruction lines are collected per octaword, keyed by the last hex digit
+# of the address (0, 4, 8, c).  When the instruction at "c" has been read,
+# the four types are looked up in %slottable and the four lines are printed
+# in address order.  If any of the four opcodes is unknown the lookup fails
+# and only the lower-case types of the known instructions are shown.
+sub disassemble {
+  my ($file) = @_;
+
+  open (IN, "objdump -Srfh $file |") || die "Cannot open pipe from objdump\n";
+
+  my (%pre, %post, %type);
+  # Read the objdump output one line at a time.  The read from IN must be
+  # explicit: an empty while() condition never reads and never terminates.
+  while (defined (my $line = readline (IN))) {
+
+    if ($line =~ /(^[ \t]*[0-9a-f]*([0-9a-f]):[ \t]*[0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] )\t(([a-z0-9]+).*)/) {
+      my ($this_pre, $addr, $this_post, $opcode) = ($1, $2, $3, $4);
+
+      my $this_type = $optable{$opcode};
+      if (! defined ($this_type)) { $this_type = ' '; }
+
+      $pre{$addr} = $this_pre;
+      $post{$addr} = $this_post;
+      $type{$addr} = $this_type;
+
+      if ($addr eq 'c') {
+        my %slot = ('0'=>' ', '4'=>' ', '8'=>' ', 'c'=>' ');
+
+        # %slottable keys run from the highest address down to the lowest.
+        my $str = $type{'c'} . $type{'8'} . $type{'4'} . $type{'0'};
+        $str = $slottable{$str};
+        if (defined $str) {
+          $slot{'c'} = substr($str,0,1);
+          $slot{'8'} = substr($str,1,1);
+          $slot{'4'} = substr($str,2,1);
+          $slot{'0'} = substr($str,3,1);
+        }
+
+        foreach my $i ('0', '4', '8', 'c') {
+          # A fixed U or L is shown on its own; an E keeps its lower-case
+          # "e" in front of the slot chosen for it.
+          if ($slot{$i} eq $type{$i}) { $type{$i} = ' '; }
+          print $pre{$i}, ' ', lc($type{$i}),$slot{$i}, ' ', $post{$i}, "\n";
+        }
+
+        %pre = ();
+        %type = ();
+        %post = ();
+      }
+    }
+  }
+
+  close IN || die "Error from objdump (or objdump not available)\n";
+}
+
+# Self-check the tables before doing any work; problems are only reported,
+# they do not stop the run.
+coverage();
+check();
+
+my @files;
+if ($#ARGV >= 0) {
+  @files = @ARGV;
+} else {
+  die "Usage: slot.pl [filename.o]...\n";
+}
+
+foreach (@files)  {
+  disassemble($_);
+}
diff --git a/gmp-6.3.0/mpn/alpha/ev6/sub_n.asm b/gmp-6.3.0/mpn/alpha/ev6/sub_n.asm
new file mode 100644
index 0000000..a35ba40
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/sub_n.asm
@@ -0,0 +1,283 @@
+dnl  Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl  and store difference in a third limb vector.
+
+dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     5.4
+C EV6:     2.125
+
+C  INPUT PARAMETERS
+C  rp	r16
+C  up	r17
+C  vp	r18
+C  n	r19
+C  cy	r20   (for mpn_sub_nc)
+
+C TODO
+C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
+C   Use multi-pronged feed-in.
+C   Perform additional micro-tuning
+
+C  This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+C  Pair loads and stores where possible
+C  Store pairs oct-aligned where possible (didn't need it here)
+C  Stores are delayed every third cycle
+C  Loads and stores are delayed by fills
+C  U stays still, put code there where possible (note alternation of U1 and U0)
+C  L moves because of loads and stores
+C  Note dampers in L to limit damage
+
+C  This odd-looking optimization expects that we're having random bits in our
+C  data, so that a pure zero result is unlikely, so we penalize the unlikely
+C  case to help the common case.
+
+C  u0/u1 and v0/v1 hold the limbs loaded through r17 and r18 respectively;
+C  cy0/cy1 (together with r22 and r23) carry borrow bits between limb steps.
+define(`u0', `r0')  define(`u1', `r3')
+define(`v0', `r1')  define(`v1', `r4')
+
+define(`cy0', `r20')  define(`cy1', `r21')
+
+MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
+
+C  Two entry points share one body.  mpn_sub_nc branches straight to $entry,
+C  leaving whatever the caller passed in cy0 (r20) as the incoming borrow;
+C  mpn_sub_n clears cy0 first and falls into the same code.  Results are
+C  stored through r16, operands are read through r17 and r18, r19 is the
+C  limb count, and the final borrow is returned in r0.
+C
+C  In the unrolled code each limb's borrow-out is taken as cmpult(u, v) only.
+C  That is incomplete when u - v is exactly zero and a borrow comes in, so
+C  every limb step tests the raw difference with beq and, in that case,
+C  detours through a $fix stub at the end of the file which ORs the incoming
+C  borrow into the borrow-out before resuming at the matching $ret label.
+ASM_START()
+PROLOGUE(mpn_sub_nc)
+	br	r31, $entry
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+	bis	r31, r31, cy0	C clear carry in
+$entry:	cmpult	r19, 5, r22	C L1 move counter
+	ldq	u1, 0(r17)	C L0 get next ones
+	ldq	v1, 0(r18)	C L1
+	bne	r22, $Lsmall	C fewer than 5 limbs: one-limb-at-a-time loop
+
+	ldq	u0, 8(r17)	C L0 get next ones
+	ldq	v0, 8(r18)	C L1
+	subq	u1, v1, r5	C U0 sub two data
+
+	cmpult	u1, v1, r23	C U0 did it borrow
+	ldq	u1, 16(r17)	C L0 get next ones
+	ldq	v1, 16(r18)	C L1
+
+	subq	u0, v0, r8	C U1 sub two data
+	subq	r5, cy0, r24	C U0 borrow in
+
+	cmpult	u0, v0, r22	C U1 did it borrow
+	beq	r5, $fix5f	C U0 fix exact zero
+$ret5f:	ldq	u0, 24(r17)	C L0 get next ones
+	ldq	v0, 24(r18)	C L1
+
+	subq	r8, r23, r25	C U1 borrow from last
+	subq	u1, v1, r7	C U0 sub two data
+
+	beq	r8, $fix6f	C U1 fix exact zero
+$ret6f:	cmpult	u1, v1, r23	C U0 did it borrow
+	ldq	u1, 32(r17)	C L0 get next ones
+	ldq	v1, 32(r18)	C L1
+
+	lda	r17, 40(r17)	C L0 move pointer
+	lda	r18, 40(r18)	C L1 move pointer
+
+	lda	r16, -8(r16)
+	lda	r19, -13(r19)	C L1 move counter
+	blt	r19, $Lend	C U1 loop control
+
+
+C Main loop.  8-way unrolled.
+	ALIGN(16)
+$Loop:	subq	u0, v0, r2	C U1 sub two data
+	stq	r24, 8(r16)	C L0 put an answer
+	subq	r7, r22, r24	C U0 borrow from last
+	stq	r25, 16(r16)	C L1 pair
+
+	cmpult	u0, v0, cy1	C U1 did it borrow
+	beq	r7, $fix7	C U0 fix exact 0
+$ret7:	ldq	u0, 0(r17)	C L0 get next ones
+	ldq	v0, 0(r18)	C L1
+
+	bis	r31, r31, r31	C L  damp out
+	subq	r2, r23, r25	C U1 borrow from last
+	bis	r31, r31, r31	C L  moves in L !
+	subq	u1, v1, r5	C U0 sub two data
+
+	beq	r2, $fix0	C U1 fix exact zero
+$ret0:	cmpult	u1, v1, cy0	C U0 did it borrow
+	ldq	u1, 8(r17)	C L0 get next ones
+	ldq	v1, 8(r18)	C L1
+
+	subq	u0, v0, r8	C U1 sub two data
+	stq	r24, 24(r16)	C L0 store pair
+	subq	r5, cy1, r24	C U0 borrow from last
+	stq	r25, 32(r16)	C L1
+
+	cmpult	u0, v0, r22	C U1 did it borrow
+	beq	r5, $fix1	C U0 fix exact zero
+$ret1:	ldq	u0, 16(r17)	C L0 get next ones
+	ldq	v0, 16(r18)	C L1
+
+	lda	r16, 64(r16)	C L0 move pointer
+	subq	r8, cy0, r25	C U1 borrow from last
+	lda	r19, -8(r19)	C L1 move counter
+	subq	u1, v1, r7	C U0 sub two data
+
+	beq	r8, $fix2	C U1 fix exact zero
+$ret2:	cmpult	u1, v1, r23	C U0 did it borrow
+	ldq	u1, 24(r17)	C L0 get next ones
+	ldq	v1, 24(r18)	C L1
+
+	subq	u0, v0, r2	C U1 sub two data
+	stq	r24, -24(r16)	C L0 put an answer
+	subq	r7, r22, r24	C U0 borrow from last
+	stq	r25, -16(r16)	C L1 pair
+
+	cmpult	u0, v0, cy1	C U1 did it borrow
+	beq	r7, $fix3	C U0 fix exact 0
+$ret3:	ldq	u0, 32(r17)	C L0 get next ones
+	ldq	v0, 32(r18)	C L1
+
+	bis	r31, r31, r31	C L  damp out
+	subq	r2, r23, r25	C U1 borrow from last
+	bis	r31, r31, r31	C L  moves in L !
+	subq	u1, v1, r5	C U0 sub two data
+
+	beq	r2, $fix4	C U1 fix exact zero
+$ret4:	cmpult	u1, v1, cy0	C U0 did it borrow
+	ldq	u1, 40(r17)	C L0 get next ones
+	ldq	v1, 40(r18)	C L1
+
+	subq	u0, v0, r8	C U1 sub two data
+	stq	r24, -8(r16)	C L0 store pair
+	subq	r5, cy1, r24	C U0 borrow from last
+	stq	r25, 0(r16)	C L1
+
+	cmpult	u0, v0, r22	C U1 did it borrow
+	beq	r5, $fix5	C U0 fix exact zero
+$ret5:	ldq	u0, 48(r17)	C L0 get next ones
+	ldq	v0, 48(r18)	C L1
+
+	ldl	r31, 256(r17)	C L0 prefetch
+	subq	r8, cy0, r25	C U1 borrow from last
+	ldl	r31, 256(r18)	C L1 prefetch
+	subq	u1, v1, r7	C U0 sub two data
+
+	beq	r8, $fix6	C U1 fix exact zero
+$ret6:	cmpult	u1, v1, r23	C U0 did it borrow
+	ldq	u1, 56(r17)	C L0 get next ones
+	ldq	v1, 56(r18)	C L1
+
+	lda	r17, 64(r17)	C L0 move pointer
+	bis	r31, r31, r31	C U
+	lda	r18, 64(r18)	C L1 move pointer
+	bge	r19, $Loop	C U1 loop control
+C ==== main loop end
+
+C  Drain the limbs already loaded, with no further loads.  Afterwards r19 is
+C  re-biased by 8: zero means everything is done and the borrow in cy0 is
+C  returned via $Lret, otherwise the remaining limbs go through the one-limb
+C  loop below.
+$Lend:	subq	u0, v0, r2	C U1 sub two data
+	stq	r24, 8(r16)	C L0 put an answer
+	subq	r7, r22, r24	C U0 borrow from last
+	stq	r25, 16(r16)	C L1 pair
+	cmpult	u0, v0, cy1	C U1 did it borrow
+	beq	r7, $fix7c	C U0 fix exact 0
+$ret7c:	subq	r2, r23, r25	C U1 borrow from last
+	subq	u1, v1, r5	C U0 sub two data
+	beq	r2, $fix0c	C U1 fix exact zero
+$ret0c:	cmpult	u1, v1, cy0	C U0 did it borrow
+	stq	r24, 24(r16)	C L0 store pair
+	subq	r5, cy1, r24	C U0 borrow from last
+	stq	r25, 32(r16)	C L1
+	beq	r5, $fix1c	C U0 fix exact zero
+$ret1c:	stq	r24, 40(r16)	C L0 put an answer
+	lda	r16, 48(r16)	C L0 move pointer
+
+	lda	r19, 8(r19)
+	beq	r19, $Lret
+
+	ldq	u1, 0(r17)
+	ldq	v1, 0(r18)
+C  One limb per pass.  The borrow-out here is computed in full, as
+C  (u < v) | ((u - v) < cy0), so no fix-up stubs are needed.  The last limb
+C  is peeled off into $Lend0, which does not load past the end of the operands.
+$Lsmall:
+	lda	r19, -1(r19)
+	beq	r19, $Lend0
+
+	ALIGN(8)
+$Loop0:	subq	u1, v1, r2	C main sub
+	cmpult	u1, v1, r8	C compute bw from last sub
+	ldq	u1, 8(r17)
+	ldq	v1, 8(r18)
+	subq	r2, cy0, r5	C borrow sub
+	lda	r17, 8(r17)
+	lda	r18, 8(r18)
+	stq	r5, 0(r16)
+	cmpult	r2, cy0, cy0	C compute bw from last sub
+	lda	r19, -1(r19)	C decr loop cnt
+	bis	r8, cy0, cy0	C combine bw from the two subs
+	lda	r16, 8(r16)
+	bne	r19, $Loop0
+$Lend0:	subq	u1, v1, r2	C main sub
+	subq	r2, cy0, r5	C borrow sub
+	cmpult	u1, v1, r8	C compute bw from last sub
+	cmpult	r2, cy0, cy0	C compute bw from last sub
+	stq	r5, 0(r16)
+	bis	r8, cy0, r0	C combine bw from the two subs
+	ret	r31,(r26),1
+
+	ALIGN(8)
+$Lret:	lda	r0, 0(cy0)	C copy borrow into return register
+	ret	r31,(r26),1
+
+C  Borrow fix-up stubs, reached only when a limb difference was exactly zero.
+C  Each ORs the incoming borrow into that limb's borrow-out register and
+C  jumps back to the $ret label of the same name.
+$fix5f:	bis	r23, cy0, r23	C bring forward borrow
+	br	r31, $ret5f
+$fix6f:	bis	r22, r23, r22	C bring forward borrow
+	br	r31, $ret6f
+$fix0:	bis	cy1, r23, cy1	C bring forward borrow
+	br	r31, $ret0
+$fix1:	bis	cy0, cy1, cy0	C bring forward borrow
+	br	r31, $ret1
+$fix2:	bis	r22, cy0, r22	C bring forward borrow
+	br	r31, $ret2
+$fix3:	bis	r23, r22, r23	C bring forward borrow
+	br	r31, $ret3
+$fix4:	bis	cy1, r23, cy1	C bring forward borrow
+	br	r31, $ret4
+$fix5:	bis	cy1, cy0, cy0	C bring forward borrow
+	br	r31, $ret5
+$fix6:	bis	r22, cy0, r22	C bring forward borrow
+	br	r31, $ret6
+$fix7:	bis	r23, r22, r23	C bring forward borrow
+	br	r31, $ret7
+$fix0c:	bis	cy1, r23, cy1	C bring forward borrow
+	br	r31, $ret0c
+$fix1c:	bis	cy0, cy1, cy0	C bring forward borrow
+	br	r31, $ret1c
+$fix7c:	bis	r23, r22, r23	C bring forward borrow
+	br	r31, $ret7c
+
+EPILOGUE()
+ASM_END()
--
cgit v1.2.3