From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/pa64/README           |  78 ++++
 gmp-6.3.0/mpn/pa64/addmul_1.asm     | 693 ++++++++++++++++++++++++++++++++++++
 gmp-6.3.0/mpn/pa64/aors_n.asm       | 130 +++++++
 gmp-6.3.0/mpn/pa64/aorslsh1_n.asm   | 228 ++++++++++++
 gmp-6.3.0/mpn/pa64/gmp-mparam.h     | 247 +++++++++++++
 gmp-6.3.0/mpn/pa64/lshift.asm       | 114 ++++++
 gmp-6.3.0/mpn/pa64/mul_1.asm        | 646 +++++++++++++++++++++++++++++
 gmp-6.3.0/mpn/pa64/rshift.asm       | 111 ++++++
 gmp-6.3.0/mpn/pa64/sqr_diagonal.asm | 191 ++++++++++
 gmp-6.3.0/mpn/pa64/submul_1.asm     | 700 ++++++++++++++++++++++++++++++++++++
 gmp-6.3.0/mpn/pa64/udiv.asm         | 125 +++++++
 gmp-6.3.0/mpn/pa64/umul.asm         |  97 +++++
 12 files changed, 3360 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/pa64/README
 create mode 100644 gmp-6.3.0/mpn/pa64/addmul_1.asm
 create mode 100644 gmp-6.3.0/mpn/pa64/aors_n.asm
 create mode 100644 gmp-6.3.0/mpn/pa64/aorslsh1_n.asm
 create mode 100644 gmp-6.3.0/mpn/pa64/gmp-mparam.h
 create mode 100644 gmp-6.3.0/mpn/pa64/lshift.asm
 create mode 100644 gmp-6.3.0/mpn/pa64/mul_1.asm
 create mode 100644 gmp-6.3.0/mpn/pa64/rshift.asm
 create mode 100644 gmp-6.3.0/mpn/pa64/sqr_diagonal.asm
 create mode 100644 gmp-6.3.0/mpn/pa64/submul_1.asm
 create mode 100644 gmp-6.3.0/mpn/pa64/udiv.asm
 create mode 100644 gmp-6.3.0/mpn/pa64/umul.asm

diff --git a/gmp-6.3.0/mpn/pa64/README b/gmp-6.3.0/mpn/pa64/README
new file mode 100644
index 0000000..a51ce02
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa64/README
@@ -0,0 +1,78 @@
+Copyright 1999, 2001, 2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+This directory contains mpn functions for 64-bit PA-RISC 2.0.
+
+PIPELINE SUMMARY
+
+The PA8x00 processors have an orthogonal 4-way out-of-order pipeline.  Each
+cycle two ALU operations and two MEM operations can issue, but just one of the
+MEM operations may be a store.  The two ALU operations can be almost any
+combination of non-memory operations.  Unlike on other processors, integer and
+fp operations are completely equal here; they both count as just ALU
+operations.
+
+Unfortunately, some operations cause hiccups in the pipeline.  Combining
+carry-consuming operations like ADD,DC with operations that do not set carry,
+like ADD,L, causes long delays.  Skip operations also seem to cause hiccups.
+If several ADD,DC are issued consecutively, or if a plain carry-generating ADD
+feeds ADD,DC, stalling does not occur.  We can effectively issue two ADD,DC
+operations/cycle.
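+
+To make the carry discussion concrete, the ADD,DC chains in this directory
+compute limb-wise sums of roughly the following shape (a plain C sketch in
+the style of mpn_add_n, not the assembly itself; limbs are 64-bit words):
+
+    mp_limb_t cy = 0;                  /* incoming carry, 0 or 1        */
+    for (mp_size_t i = 0; i < n; i++)
+      {
+        mp_limb_t s = up[i] + cy;      /* consume the incoming carry    */
+        cy = s < cy;                   /* carry out of that addition    */
+        s += vp[i];
+        cy += s < vp[i];               /* carry out of this addition    */
+        rp[i] = s;
+      }
+    /* cy is the carry out of the top limb */
+
+In C the carry has to be propagated by hand; ADD,DC both consumes and
+produces it in a single instruction, which is why keeping an unbroken chain
+of carry-setting instructions matters in the loops in this directory.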
+
+Latency scheduling is not as important as making sure to have a mix of ALU and
+MEM operations, but for full pipeline utilization, it is still a good idea to
+do some amount of latency scheduling.
+
+As for all other processors, RAW memory scheduling is critically important.
+Since integer multiplication takes place in the floating-point unit, the GMP
+code needs to handle this problem frequently.
+
+STATUS
+
+* mpn_lshift and mpn_rshift run at 1.5 cycles/limb on PA8000 and at 1.0
+  cycles/limb on PA8500.  With latency scheduling, the numbers could
+  probably be improved to 1.0 cycles/limb for all PA8x00 chips.
+
+* mpn_add_n and mpn_sub_n run at 2.0 cycles/limb on PA8000 and at about
+  1.6875 cycles/limb on PA8500.  With latency scheduling, this could
+  probably be improved to get close to 1.5 cycles/limb.  A problem is the
+  stalling of carry-inputting instructions after instructions that do not
+  write to carry.
+
+* mpn_mul_1, mpn_addmul_1, and mpn_submul_1 run at between 5.625 and 6.375
+  cycles/limb on PA8500 and later, and about a cycle/limb slower on older
+  chips.  The code uses ADD,DC for adjacent limbs, and relies heavily on
+  reordering.
+
+
+REFERENCES
+
+Hewlett Packard, "64-Bit Runtime Architecture for PA-RISC 2.0", version 3.3,
+October 1997.
diff --git a/gmp-6.3.0/mpn/pa64/addmul_1.asm b/gmp-6.3.0/mpn/pa64/addmul_1.asm
new file mode 100644
index 0000000..2cb9af9
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa64/addmul_1.asm
@@ -0,0 +1,693 @@
+dnl HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                    cycles/limb
+C 8000,8200:             7
+C 8500,8600,8700:        6.375
+
+C The feed-in and wind-down code has not yet been scheduled.  Many cycles
+C could be saved there per call.
+
+C DESCRIPTION:
+C The main loop "BIG" is 4-way unrolled, mainly to allow
+C effective use of ADD,DC.  Delays in moving data via the cache from the FP
+C registers to the IU registers have demanded a deep software pipeline and
+C a lot of stack slots for partial products in flight.
+C
+C CODE STRUCTURE:
+C   save-some-registers
+C   do 0, 1, 2, or 3 limbs
+C   if done, restore-some-regs and return
+C   save-many-regs
+C   do 4, 8, ... limbs
+C   restore-all-regs
+
+C STACK LAYOUT:
+C The HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
+C slots marked FREE, as well as some slots in the caller's "frame marker".
+C +C -00 <- r30 +C -08 FREE +C -10 tmp +C -18 tmp +C -20 tmp +C -28 tmp +C -30 tmp +C -38 tmp +C -40 tmp +C -48 tmp +C -50 tmp +C -58 tmp +C -60 tmp +C -68 tmp +C -70 tmp +C -78 tmp +C -80 tmp +C -88 tmp +C -90 FREE +C -98 FREE +C -a0 FREE +C -a8 FREE +C -b0 r13 +C -b8 r12 +C -c0 r11 +C -c8 r10 +C -d0 r8 +C -d8 r8 +C -e0 r7 +C -e8 r6 +C -f0 r5 +C -f8 r4 +C -100 r3 +C Previous frame: +C [unused area] +C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here. + + +include(`../config.m4') + +C INPUT PARAMETERS: +define(`rp',`%r26') C +define(`up',`%r25') C +define(`n',`%r24') C +define(`vlimb',`%r23') C + +define(`climb',`%r23') C + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_addmul_1) + +ifdef(`HAVE_ABI_2_0w', +` std vlimb, -0x38(%r30) C store vlimb into "home" slot +') + std,ma %r3, 0x100(%r30) + std %r4, -0xf8(%r30) + std %r5, -0xf0(%r30) + ldo 0(%r0), climb C clear climb + fldd -0x138(%r30), %fr8 C put vlimb in fp register + +define(`p032a1',`%r1') C +define(`p032a2',`%r19') C + +define(`m032',`%r20') C +define(`m096',`%r21') C + +define(`p000a',`%r22') C +define(`p064a',`%r29') C + +define(`s000',`%r31') C + +define(`ma000',`%r4') C +define(`ma064',`%r20') C + +define(`r000',`%r3') C + + extrd,u n, 63, 2, %r5 + cmpb,= %r5, %r0, L(BIG) + nop + + fldd 0(up), %fr4 + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + addib,<> -1, %r5, L(two_or_more) + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +LDEF(one) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x80(%r30), p000a + b L(0_one_out) + ldd -0x68(%r30), p064a + +LDEF(two_or_more) + fldd 0(up), %fr4 + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + ldd -0x68(%r30), p064a + addib,<> -1, %r5, L(three_or_more) + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +LDEF(two) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + b L(0_two_out) + depd m096, 31, 32, ma064 + +LDEF(three_or_more) + fldd 0(up), %fr4 + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 +C addib,= -1, %r5, L(0_out) + depd m096, 31, 32, ma064 +LDEF(loop0) +C xmpyu %fr8R, %fr4L, %fr22 +C xmpyu %fr8L, %fr4R, %fr23 +C ldd -0x78(%r30), p032a1 +C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 +C +C xmpyu %fr8R, %fr4R, %fr24 +C xmpyu %fr8L, %fr4L, %fr25 +C ldd -0x70(%r30), p032a2 +C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 +C +C ldo 8(rp), rp +C add climb, p000a, s000 +C ldd -0x80(%r30), p000a +C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 +C +C add,dc p064a, %r0, climb +C ldo 8(up), up +C ldd -0x68(%r30), p064a +C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +C +C add ma000, s000, s000 +C add,dc ma064, climb, climb +C fldd 0(up), %fr4 +C +C add r000, s000, s000 +C add,dc %r0, climb, climb +C std s000, -8(rp) +C +C add p032a1, p032a2, m032 +C add,dc %r0, %r0, m096 
+C +C depd,z m032, 31, 32, ma000 +C extrd,u m032, 31, 32, ma064 +C ldd 0(rp), r000 +C addib,<> -1, %r5, L(loop0) +C depd m096, 31, 32, ma064 +LDEF(0_out) + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 + add ma000, s000, s000 + add,dc ma064, climb, climb + add r000, s000, s000 + add,dc %r0, climb, climb + std s000, -8(rp) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + depd m096, 31, 32, ma064 +LDEF(0_two_out) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + add ma000, s000, s000 + add,dc ma064, climb, climb + add r000, s000, s000 + add,dc %r0, climb, climb + std s000, -8(rp) +LDEF(0_one_out) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + depd m096, 31, 32, ma064 + + add climb, p000a, s000 + add,dc p064a, %r0, climb + add ma000, s000, s000 + add,dc ma064, climb, climb + add r000, s000, s000 + add,dc %r0, climb, climb + std s000, 0(rp) + + cmpib,>= 4, n, L(done) + ldo 8(rp), rp + +C 4-way unrolled code. + +LDEF(BIG) + +define(`p032a1',`%r1') C +define(`p032a2',`%r19') C +define(`p096b1',`%r20') C +define(`p096b2',`%r21') C +define(`p160c1',`%r22') C +define(`p160c2',`%r29') C +define(`p224d1',`%r31') C +define(`p224d2',`%r3') C + C +define(`m032',`%r4') C +define(`m096',`%r5') C +define(`m160',`%r6') C +define(`m224',`%r7') C +define(`m288',`%r8') C + C +define(`p000a',`%r1') C +define(`p064a',`%r19') C +define(`p064b',`%r20') C +define(`p128b',`%r21') C +define(`p128c',`%r22') C +define(`p192c',`%r29') C +define(`p192d',`%r31') C +define(`p256d',`%r3') C + C +define(`s000',`%r10') C +define(`s064',`%r11') C +define(`s128',`%r12') C +define(`s192',`%r13') C + C +define(`ma000',`%r9') C +define(`ma064',`%r4') C +define(`ma128',`%r5') C +define(`ma192',`%r6') C +define(`ma256',`%r7') C + C +define(`r000',`%r1') C +define(`r064',`%r19') C +define(`r128',`%r20') C +define(`r192',`%r21') C + + std %r6, -0xe8(%r30) + std %r7, -0xe0(%r30) + std %r8, -0xd8(%r30) + std %r9, -0xd0(%r30) + std %r10, -0xc8(%r30) + std %r11, -0xc0(%r30) + std %r12, -0xb8(%r30) + std %r13, -0xb0(%r30) + +ifdef(`HAVE_ABI_2_0w', +` extrd,u n, 61, 62, n C right shift 2 +',` extrd,u n, 61, 30, n C right shift 2, zero extend +') + +LDEF(4_or_more) + fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + 
xmpyu %fr8R, %fr6R, %fr24 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,<> -1, n, L(8_or_more) + xmpyu %fr8L, %fr7L, %fr27 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + b L(end1) + nop + +LDEF(8_or_more) + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,= -1, n, L(end2) + xmpyu %fr8L, %fr7L, %fr27 +LDEF(loop) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high 
product to -0x28..-0x21 + + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + + add,dc ma128, s128, s128 C accum mid 2 + fldd 0(up), %fr4 + add,dc ma192, s192, s192 C accum mid 3 + fldd 8(up), %fr5 + + add,dc ma256, climb, climb + fldd 16(up), %fr6 + add r000, s000, s000 C accum rlimb 0 + fldd 24(up), %fr7 + + add,dc r064, s064, s064 C accum rlimb 1 + add,dc r128, s128, s128 C accum rlimb 2 + std s000, 0(rp) + + add,dc r192, s192, s192 C accum rlimb 3 + add,dc %r0, climb, climb + std s064, 8(rp) + + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + std s128, 16(rp) + + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + std s192, 24(rp) + + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + xmpyu %fr8L, %fr7L, %fr27 + + addib,<> -1, n, L(loop) + ldo 32(rp), rp + +LDEF(end2) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + ldd 
16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + add r000, s000, s000 C accum rlimb 0 + add,dc r064, s064, s064 C accum rlimb 1 + add,dc r128, s128, s128 C accum rlimb 2 + std s000, 0(rp) + add,dc r192, s192, s192 C accum rlimb 3 + add,dc %r0, climb, climb + std s064, 8(rp) + ldd -0x78(%r30), p032a1 + std s128, 16(rp) + ldd -0x70(%r30), p032a2 + std s192, 24(rp) + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + ldo 32(rp), rp + +LDEF(end1) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + add r000, s000, s000 C accum rlimb 0 + add,dc r064, s064, s064 C accum rlimb 1 + add,dc r128, s128, s128 C accum rlimb 2 + std s000, 0(rp) + add,dc r192, s192, s192 C accum rlimb 3 + add,dc %r0, climb, climb + std s064, 8(rp) + std s128, 16(rp) + std s192, 24(rp) + + ldd -0xb0(%r30), %r13 + ldd -0xb8(%r30), %r12 + ldd -0xc0(%r30), %r11 + ldd -0xc8(%r30), %r10 + ldd -0xd0(%r30), %r9 + ldd -0xd8(%r30), %r8 + ldd -0xe0(%r30), %r7 + ldd -0xe8(%r30), %r6 +LDEF(done) +ifdef(`HAVE_ABI_2_0w', +` copy climb, %r28 +',` extrd,u climb, 63, 32, %r29 + extrd,u climb, 31, 32, %r28 +') + ldd -0xf0(%r30), %r5 + ldd -0xf8(%r30), %r4 + bve (%r2) + ldd,mb -0x100(%r30), %r3 +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/pa64/aors_n.asm b/gmp-6.3.0/mpn/pa64/aors_n.asm new file mode 100644 index 0000000..ab4536f --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/aors_n.asm @@ -0,0 +1,130 @@ +dnl HP-PA 2.0 mpn_add_n, mpn_sub_n + +dnl Copyright 1997, 2000, 2002, 2003, 2009, 2010 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl This runs at 2 cycles/limb on PA8000 and 1.6875 cycles/limb on PA8500. It +dnl should be possible to reach the cache bandwidth 1.5 cycles/limb at least +dnl with PA8500. The problem now is stalling of the first ADD,DC after LDO, +dnl where the processor gets confused about where carry comes from. + +include(`../config.m4') + +dnl INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`vp',`%r24') +define(`n',`%r23') + +ifdef(`OPERATION_add_n', ` + define(ADCSBC, `add,dc') + define(INITCY, `addi -1,%r22,%r0') + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBC, `sub,db') + define(INITCY, `subi 0,%r22,%r0') + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(func_nc) +ifdef(`HAVE_ABI_2_0w', +` b L(com) + nop +',` b L(com) + ldw -52(%r30), %r22 +') +EPILOGUE() +PROLOGUE(func) + ldi 0, %r22 +LDEF(com) + sub %r0, n, %r21 + depw,z %r21, 30, 3, %r28 C r28 = 2 * (-n & 7) + depw,z %r21, 28, 3, %r21 C r21 = 8 * (-n & 7) + sub up, %r21, up C offset up + sub vp, %r21, vp C offset vp + sub rp, %r21, rp C offset rp + blr %r28, %r0 C branch into loop + INITCY + +LDEF(loop) + ldd 0(up), %r20 + ldd 0(vp), %r31 + ADCSBC %r20, %r31, %r20 + std %r20, 0(rp) +LDEF(7) ldd 8(up), %r21 + ldd 8(vp), %r19 + ADCSBC %r21, %r19, %r21 + std %r21, 8(rp) +LDEF(6) ldd 16(up), %r20 + ldd 16(vp), %r31 + ADCSBC %r20, %r31, %r20 + std %r20, 16(rp) +LDEF(5) ldd 24(up), %r21 + ldd 24(vp), %r19 + ADCSBC %r21, %r19, %r21 + std %r21, 24(rp) +LDEF(4) ldd 32(up), %r20 + ldd 32(vp), %r31 + ADCSBC %r20, %r31, %r20 + std %r20, 32(rp) +LDEF(3) ldd 40(up), %r21 + ldd 40(vp), %r19 + ADCSBC %r21, %r19, %r21 + std %r21, 40(rp) +LDEF(2) ldd 48(up), %r20 + ldd 48(vp), %r31 + ADCSBC %r20, %r31, %r20 + std %r20, 48(rp) +LDEF(1) ldd 56(up), %r21 + ldd 56(vp), %r19 + ADCSBC %r21, %r19, %r21 + ldo 64(up), up + std %r21, 56(rp) + ldo 64(vp), vp + addib,> -8, n, L(loop) + ldo 64(rp), rp + + add,dc %r0, %r0, %r29 +ifdef(`OPERATION_sub_n',` + subi 1, %r29, %r29 +') + bve (%r2) +ifdef(`HAVE_ABI_2_0w', +` copy %r29, %r28 +',` ldi 0, %r28 +') +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa64/aorslsh1_n.asm b/gmp-6.3.0/mpn/pa64/aorslsh1_n.asm new file mode 100644 index 0000000..2a55dde --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/aorslsh1_n.asm @@ -0,0 +1,228 @@ +dnl PA64 mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1). + +dnl Copyright 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 8000,8200: 2 +C 8500,8600,8700: 1.75 + +C TODO +C * Write special feed-in code for each (n mod 8). (See the ia64 code.) +C * Try to make this run at closer to 1.5 c/l. +C * Set up register aliases (define(`u0',`%r19')). +C * Explicitly align loop. + +dnl INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`vp',`%r24') +define(`n',`%r23') + +ifdef(`OPERATION_addlsh1_n',` + define(ADCSBC, `add,dc') + define(INITC, `ldi 0,') + define(func, mpn_addlsh1_n) +') +ifdef(`OPERATION_sublsh1_n',` + define(ADCSBC, `sub,db') + define(INITC, `ldi 1,') + define(func, mpn_sublsh1_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) + +ifdef(`HAVE_ABI_2_0w',` + define(LEVEL, `.level 2.0w') + define(RETREG, `%r28') + define(CLRRET1, `dnl') +') +ifdef(`HAVE_ABI_2_0n',` + define(LEVEL, `.level 2.0') + define(RETREG, `%r29') + define(CLRRET1, `ldi 0, %r28') +') + + LEVEL +PROLOGUE(func) + std,ma %r3, 0x100(%r30) C save reg + + INITC %r1 C init saved cy + +C Primitive code for the first (n mod 8) limbs: + extrd,u n, 63, 3, %r22 C count for loop0 + comib,= 0, %r22, L(unrolled) C skip loop0? + copy %r0, %r28 +LDEF(loop0) + ldd 0(vp), %r21 + ldo 8(vp), vp + ldd 0(up), %r19 + ldo 8(up), up + shrpd %r21, %r28, 63, %r31 + addi -1, %r1, %r0 C restore cy + ADCSBC %r19, %r31, %r29 + std %r29, 0(rp) + add,dc %r0, %r0, %r1 C save cy + copy %r21, %r28 + addib,> -1, %r22, L(loop0) + ldo 8(rp), rp + + addib,>= -8, n, L(unrolled) + addi -1, %r1, %r0 C restore cy + + shrpd %r0, %r28, 63, %r28 + ADCSBC %r0, %r28, RETREG +ifdef(`OPERATION_sublsh1_n', +` sub %r0, RETREG, RETREG') + CLRRET1 + + bve (%r2) + ldd,mb -0x100(%r30), %r3 + + +LDEF(unrolled) + std %r4, -0xf8(%r30) C save reg + ldd 0(vp), %r4 + std %r5, -0xf0(%r30) C save reg + ldd 8(vp), %r5 + std %r6, -0xe8(%r30) C save reg + ldd 16(vp), %r6 + std %r7, -0xe0(%r30) C save reg + + ldd 24(vp), %r7 + shrpd %r4, %r28, 63, %r31 + std %r8, -0xd8(%r30) C save reg + ldd 32(vp), %r8 + shrpd %r5, %r4, 63, %r4 + std %r9, -0xd0(%r30) C save reg + ldd 40(vp), %r9 + shrpd %r6, %r5, 63, %r5 + ldd 48(vp), %r3 + shrpd %r7, %r6, 63, %r6 + ldd 56(vp), %r28 + shrpd %r8, %r7, 63, %r7 + ldd 0(up), %r19 + shrpd %r9, %r8, 63, %r8 + ldd 8(up), %r20 + shrpd %r3, %r9, 63, %r9 + ldd 16(up), %r21 + shrpd %r28, %r3, 63, %r3 + ldd 24(up), %r22 + + nop C alignment FIXME + addib,<= -8, n, L(end) + addi -1, %r1, %r0 C restore cy +LDEF(loop) + ADCSBC %r19, %r31, %r29 + ldd 32(up), %r19 + std %r29, 0(rp) + ADCSBC %r20, %r4, %r29 + ldd 40(up), %r20 + std %r29, 8(rp) + ADCSBC %r21, %r5, %r29 + ldd 48(up), %r21 + std %r29, 16(rp) + ADCSBC %r22, %r6, %r29 + ldd 56(up), %r22 + std %r29, 24(rp) + ADCSBC %r19, %r7, %r29 + ldd 64(vp), %r4 + std %r29, 32(rp) + ADCSBC %r20, %r8, %r29 + ldd 72(vp), %r5 + std %r29, 40(rp) + ADCSBC %r21, %r9, %r29 + ldd 80(vp), %r6 + std %r29, 48(rp) + ADCSBC %r22, %r3, %r29 + std %r29, 56(rp) + + add,dc %r0, %r0, %r1 C save cy + + ldd 88(vp), %r7 + shrpd %r4, %r28, 63, %r31 + ldd 96(vp), %r8 + shrpd %r5, %r4, 63, %r4 + ldd 104(vp), %r9 + shrpd %r6, %r5, 63, %r5 + 
ldd 112(vp), %r3 + shrpd %r7, %r6, 63, %r6 + ldd 120(vp), %r28 + shrpd %r8, %r7, 63, %r7 + ldd 64(up), %r19 + shrpd %r9, %r8, 63, %r8 + ldd 72(up), %r20 + shrpd %r3, %r9, 63, %r9 + ldd 80(up), %r21 + shrpd %r28, %r3, 63, %r3 + ldd 88(up), %r22 + + ldo 64(vp), vp + ldo 64(rp), rp + ldo 64(up), up + addib,> -8, n, L(loop) + addi -1, %r1, %r0 C restore cy +LDEF(end) + ADCSBC %r19, %r31, %r29 + ldd 32(up), %r19 + std %r29, 0(rp) + ADCSBC %r20, %r4, %r29 + ldd 40(up), %r20 + std %r29, 8(rp) + ADCSBC %r21, %r5, %r29 + ldd 48(up), %r21 + std %r29, 16(rp) + ADCSBC %r22, %r6, %r29 + ldd 56(up), %r22 + std %r29, 24(rp) + ADCSBC %r19, %r7, %r29 + ldd -0xf8(%r30), %r4 C restore reg + std %r29, 32(rp) + ADCSBC %r20, %r8, %r29 + ldd -0xf0(%r30), %r5 C restore reg + std %r29, 40(rp) + ADCSBC %r21, %r9, %r29 + ldd -0xe8(%r30), %r6 C restore reg + std %r29, 48(rp) + ADCSBC %r22, %r3, %r29 + ldd -0xe0(%r30), %r7 C restore reg + std %r29, 56(rp) + + shrpd %r0, %r28, 63, %r28 + ldd -0xd8(%r30), %r8 C restore reg + ADCSBC %r0, %r28, RETREG +ifdef(`OPERATION_sublsh1_n', +` sub %r0, RETREG, RETREG') + CLRRET1 + + ldd -0xd0(%r30), %r9 C restore reg + bve (%r2) + ldd,mb -0x100(%r30), %r3 C restore reg +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa64/gmp-mparam.h b/gmp-6.3.0/mpn/pa64/gmp-mparam.h new file mode 100644 index 0000000..c2719c3 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/gmp-mparam.h @@ -0,0 +1,247 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004, 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 440MHz PA8200 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 14 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD 21 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 31 +#define MUL_TOOM33_THRESHOLD 114 +#define MUL_TOOM44_THRESHOLD 179 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 296 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 130 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 229 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 54 + +#define SQR_BASECASE_THRESHOLD 5 +#define SQR_TOOM2_THRESHOLD 58 +#define SQR_TOOM3_THRESHOLD 153 +#define SQR_TOOM4_THRESHOLD 278 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 56 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define POWM_SEC_TABLE 2,23,228,1084 + +#define MUL_FFT_MODF_THRESHOLD 336 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 336, 5}, { 11, 4}, { 23, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ + { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \ + { 63, 9}, { 127,10}, { 71, 8}, { 287,10}, \ + { 79,11}, { 47,10}, { 95, 9}, { 191, 8}, \ + { 383, 7}, { 767,10}, { 103, 9}, { 207, 8}, \ + { 415, 7}, { 831,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 543, 7}, { 1087, 6}, \ + { 2175,10}, { 143, 9}, { 287, 8}, { 575,11}, \ + { 79, 9}, { 319, 8}, { 639, 7}, { 1279, 9}, \ + { 335, 8}, { 671,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,10}, { 207, 9}, { 415, 8}, { 831, 7}, \ + { 1663,11}, { 111,10}, { 223, 9}, { 447, 8}, \ + { 895,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 543, 8}, { 1087, 7}, { 2175,10}, { 287, 9}, \ + { 575, 8}, { 1215, 7}, { 2431,10}, { 319, 9}, \ + { 639, 8}, { 1279,10}, { 335, 9}, { 671, 8}, \ + { 1343, 9}, { 703, 8}, { 1407,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207, 9}, { 831, 8}, \ + { 1663,11}, { 223,10}, { 447, 9}, { 959,13}, \ + { 63,12}, { 127,11}, { 255, 8}, { 2047,11}, \ + { 271,10}, { 543, 9}, { 1087, 8}, { 2175,11}, \ + { 287,10}, { 575, 9}, { 1215, 8}, { 2431,11}, \ + { 319,10}, { 671, 9}, { 1343, 8}, { 2687,11}, \ + { 351,10}, { 703, 9}, { 1471, 8}, { 2943,12}, \ + { 191,11}, { 383, 8}, { 3071,11}, { 415,10}, \ + { 831, 9}, { 1663,11}, { 479,10}, { 959, 9}, \ + { 1919, 8}, { 3839,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087, 9}, { 2175,12}, { 287,11}, \ + { 607,10}, { 1215, 9}, { 2431, 8}, { 4863,12}, \ + { 319,11}, { 671,10}, { 1343,13}, { 191, 9}, \ + { 3071,12}, { 415,11}, { 831,10}, { 1663, 8}, \ + { 6655, 9}, { 3455,12}, { 447, 9}, { 3583,13}, \ + { 255,12}, { 511,11}, { 1023,10}, { 2175,13}, \ + { 319,11}, { 1279,12}, { 671,10}, { 2815,12}, \ + { 
735,10}, { 2943, 9}, { 5887,13}, { 383,12}, \ + { 767,11}, { 1535,10}, { 3071,13}, { 447,10}, \ + { 3583,12}, { 959,13}, { 511,12}, { 1087,13}, \ + { 639,12}, { 1343,13}, { 767,11}, { 3071,13}, \ + { 831,12}, { 1663,11}, { 3455,10}, { 6911,13}, \ + { 895,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2303,13}, { 1215,12}, { 2431,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \ + { 2687,11}, { 5375,13}, { 1407,12}, { 2815,11}, \ + { 5631,12}, { 2943,13}, { 1535,12}, { 3199,13}, \ + { 1663,12}, { 3327,13}, { 1727,14}, { 895,13}, \ + { 1791,12}, { 3583,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2047,12}, { 4095,14}, { 1151,13}, \ + { 2431,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,12}, { 5631,15}, { 767,14}, { 1535,13}, \ + { 3071,14}, { 1663,13}, { 3327,14}, { 1791,13}, \ + { 3583,14}, { 1919,15}, { 1023,14}, { 2303,13}, \ + { 4607,14}, { 2431,13}, { 4863,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 252 +#define MUL_FFT_THRESHOLD 2368 + +#define SQR_FFT_MODF_THRESHOLD 284 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 284, 5}, { 9, 4}, { 21, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 8}, { 255, 7}, { 511,10}, \ + { 71, 8}, { 287, 7}, { 575,10}, { 79,11}, \ + { 47,10}, { 95, 9}, { 191, 8}, { 383, 7}, \ + { 767,10}, { 103, 9}, { 207, 8}, { 415,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 543, 7}, { 1087, 8}, { 575, 7}, { 1151,11}, \ + { 79, 8}, { 639, 7}, { 1279, 9}, { 335, 8}, \ + { 671, 7}, { 1343,10}, { 175, 8}, { 703, 7}, \ + { 1407,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,10}, { 207, 9}, { 415, 8}, { 831, 7}, \ + { 1663, 9}, { 447, 8}, { 895,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 543, 8}, { 1087, 7}, \ + { 2175, 9}, { 575, 8}, { 1151,10}, { 303, 9}, \ + { 607, 8}, { 1215, 7}, { 2431,10}, { 319, 9}, \ + { 639, 8}, { 1279, 9}, { 671, 8}, { 1343, 7}, \ + { 2687,10}, { 351, 9}, { 703, 8}, { 1407,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831, 8}, { 1663,11}, \ + { 223,10}, { 447, 9}, { 895,13}, { 63,11}, \ + { 255,10}, { 543, 8}, { 2175,11}, { 287,10}, \ + { 575, 9}, { 1151,10}, { 607, 9}, { 1215, 8}, \ + { 2431,11}, { 319, 9}, { 1279,10}, { 671, 9}, \ + { 1343, 8}, { 2687,11}, { 351,10}, { 703, 9}, \ + { 1407,10}, { 735,12}, { 191,11}, { 383,10}, \ + { 831, 9}, { 1663,12}, { 223,11}, { 447,10}, \ + { 895,11}, { 479, 9}, { 1919, 8}, { 3839,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087, 9}, { 2175,12}, { 287,11}, { 575,10}, \ + { 1151,11}, { 607,10}, { 1215, 9}, { 2431, 8}, \ + { 4863,10}, { 1279,11}, { 671,10}, { 1343, 9}, \ + { 2687,12}, { 351,11}, { 703,10}, { 1407,11}, \ + { 735,13}, { 191, 9}, { 3071, 7}, { 12287,11}, \ + { 799,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447, 8}, { 7167,12}, { 479, 9}, { 3839,14}, \ + { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,10}, { 2175, 9}, { 4607,11}, { 1215,10}, \ + { 2431,11}, { 1279,10}, { 2559,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,10}, { 3199, 9}, \ + { 6399,12}, { 895,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,13}, { 575,12}, { 1151,10}, \ + { 4607,13}, { 639,12}, { 1279,11}, { 2687,14}, \ + { 
383,13}, { 767,11}, { 3071,12}, { 1599,13}, \ + { 895,12}, { 1791,11}, { 3583,13}, { 959,15}, \ + { 255,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1279,12}, { 2559,13}, { 1343,12}, { 2687,13}, \ + { 1471,11}, { 5887,14}, { 767,13}, { 1535,12}, \ + { 3071,13}, { 1599,12}, { 3199,13}, { 1663,12}, \ + { 3327,13}, { 1727,14}, { 895,13}, { 1791,12}, \ + { 3583,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,12}, { 4607,13}, { 2431,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2815,15}, { 767,13}, \ + { 3199,14}, { 1663,13}, { 3327,14}, { 1791,13}, \ + { 3583,14}, { 1919,15}, { 1023,14}, { 2047,13}, \ + { 4095,14}, { 2303,13}, { 4607,14}, { 2431,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 257 +#define SQR_FFT_THRESHOLD 1856 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 113 +#define MULLO_MUL_N_THRESHOLD 4658 + +#define DC_DIV_QR_THRESHOLD 123 +#define DC_DIVAPPR_Q_THRESHOLD 372 +#define DC_BDIV_QR_THRESHOLD 142 +#define DC_BDIV_Q_THRESHOLD 312 + +#define INV_MULMOD_BNM1_THRESHOLD 58 +#define INV_NEWTON_THRESHOLD 315 +#define INV_APPR_THRESHOLD 315 + +#define BINV_NEWTON_THRESHOLD 360 +#define REDC_1_TO_REDC_N_THRESHOLD 101 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 93 +#define MU_BDIV_QR_THRESHOLD 889 +#define MU_BDIV_Q_THRESHOLD 1187 + +#define MATRIX22_STRASSEN_THRESHOLD 9 +#define HGCD_THRESHOLD 234 +#define HGCD_APPR_THRESHOLD 300 +#define HGCD_REDUCE_THRESHOLD 1553 +#define GCD_DC_THRESHOLD 684 +#define GCDEXT_DC_THRESHOLD 525 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 21 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 1951 +#define SET_STR_PRECOMPUTE_THRESHOLD 4034 diff --git a/gmp-6.3.0/mpn/pa64/lshift.asm b/gmp-6.3.0/mpn/pa64/lshift.asm new file mode 100644 index 0000000..c0fc292 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/lshift.asm @@ -0,0 +1,114 @@ +dnl HP-PA 2.0 mpn_lshift -- Left shift. + +dnl Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl This runs at 1.5 cycles/limb on PA8000 and 1.0 cycles/limb on PA8500. 
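+dnl
+dnl For reference, what this function computes can be sketched in portable C
+dnl as follows (an illustration only, assuming the usual mpn_lshift interface,
+dnl 64-bit limbs and a shift count 1 <= cnt <= 63; this is not the scheduled
+dnl code below):
+dnl
+dnl   mp_limb_t
+dnl   mpn_lshift (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, unsigned cnt)
+dnl   {
+dnl     mp_limb_t high = up[n - 1];
+dnl     mp_limb_t retval = high >> (64 - cnt);   /* bits shifted out at the top */
+dnl     for (mp_size_t i = n - 1; i > 0; i--)    /* from the most significant end */
+dnl       {
+dnl         mp_limb_t low = up[i - 1];
+dnl         rp[i] = (high << cnt) | (low >> (64 - cnt));
+dnl         high = low;
+dnl       }
+dnl     rp[0] = high << cnt;
+dnl     return retval;
+dnl   }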
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') +define(`cnt',`%r23') + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_lshift) + shladd n, 3, up, up + shladd n, 3, rp, rp + subi 64, cnt, cnt + mtsar cnt + ldd -8(up), %r21 + addib,= -1, n, L(end) + shrpd %r0, %r21, %sar, %r29 C compute carry out limb + depw,z n, 31, 3, %r28 C r28 = (size & 7) + sub %r0, n, %r22 + depw,z %r22, 28, 3, %r22 C r22 = 8 * (-size & 7) + add up, %r22, up C offset up + blr %r28, %r0 C branch into jump table + add rp, %r22, rp C offset rp + b L(0) + nop + b L(1) + copy %r21, %r20 + b L(2) + nop + b L(3) + copy %r21, %r20 + b L(4) + nop + b L(5) + copy %r21, %r20 + b L(6) + nop + b L(7) + copy %r21, %r20 + +LDEF(loop) +LDEF(0) ldd -16(up), %r20 + shrpd %r21, %r20, %sar, %r21 + std %r21, -8(rp) +LDEF(7) ldd -24(up), %r21 + shrpd %r20, %r21, %sar, %r20 + std %r20, -16(rp) +LDEF(6) ldd -32(up), %r20 + shrpd %r21, %r20, %sar, %r21 + std %r21, -24(rp) +LDEF(5) ldd -40(up), %r21 + shrpd %r20, %r21, %sar, %r20 + std %r20, -32(rp) +LDEF(4) ldd -48(up), %r20 + shrpd %r21, %r20, %sar, %r21 + std %r21, -40(rp) +LDEF(3) ldd -56(up), %r21 + shrpd %r20, %r21, %sar, %r20 + std %r20, -48(rp) +LDEF(2) ldd -64(up), %r20 + shrpd %r21, %r20, %sar, %r21 + std %r21, -56(rp) +LDEF(1) ldd -72(up), %r21 + ldo -64(up), up + shrpd %r20, %r21, %sar, %r20 + std %r20, -64(rp) + addib,> -8, n, L(loop) + ldo -64(rp), rp + +LDEF(end) + shrpd %r21, %r0, %sar, %r21 + std %r21, -8(rp) + bve (%r2) +ifdef(`HAVE_ABI_2_0w', +` copy %r29,%r28 +',` extrd,u %r29, 31, 32, %r28 +') +EPILOGUE(mpn_lshift) diff --git a/gmp-6.3.0/mpn/pa64/mul_1.asm b/gmp-6.3.0/mpn/pa64/mul_1.asm new file mode 100644 index 0000000..6935c23 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/mul_1.asm @@ -0,0 +1,646 @@ +dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 8000,8200: 6.5 +C 8500,8600,8700: 5.625 + +C The feed-in and wind-down code has not yet been scheduled. Many cycles +C could be saved there per call. + +C DESCRIPTION: +C The main loop "BIG" is 4-way unrolled, mainly to allow +C effective use of ADD,DC. Delays in moving data via the cache from the FP +C registers to the IU registers, have demanded a deep software pipeline, and +C a lot of stack slots for partial products in flight. 
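+C
+C As a rough illustration (C-style sketch, not code from this file; 64-bit
+C limbs assumed), each 64x64 product of a limb u from up[] and the invariant
+C vlimb v is assembled from four 32x32 xmpyu partial products, the "low",
+C "mid" and "high" products referred to in the comments below:
+C
+C   mp_limb_t ul = u & 0xffffffff, uh = u >> 32;
+C   mp_limb_t vl = v & 0xffffffff, vh = v >> 32;
+C   mp_limb_t low  = ul * vl;                  /* low product              */
+C   mp_limb_t mid1 = uh * vl, mid2 = ul * vh;  /* the two mid products     */
+C   mp_limb_t high = uh * vh;                  /* high product             */
+C   mp_limb_t m032 = mid1 + mid2;              /* summed mids (reg m032)   */
+C   mp_limb_t m096 = m032 < mid1;              /* their carry, via ADD,DC  */
+C   mp_limb_t p_lo = low + (m032 << 32);
+C   mp_limb_t p_hi = high + (m032 >> 32) + (m096 << 32) + (p_lo < low);
+C
+C p_hi:p_lo is the full 128-bit product; the loop adds the running carry limb
+C (climb) to p_lo to form the limb stored in rp[], and p_hi plus any carry-out
+C becomes the next climb.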
+C +C CODE STRUCTURE: +C save-some-registers +C do 0, 1, 2, or 3 limbs +C if done, restore-some-regs and return +C save-many-regs +C do 4, 8, ... limb +C restore-all-regs + +C STACK LAYOUT: +C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the +C slots marked FREE, as well as some slots in the caller's "frame marker". +C +C -00 <- r30 +C -08 FREE +C -10 tmp +C -18 tmp +C -20 tmp +C -28 tmp +C -30 tmp +C -38 tmp +C -40 tmp +C -48 tmp +C -50 tmp +C -58 tmp +C -60 tmp +C -68 tmp +C -70 tmp +C -78 tmp +C -80 tmp +C -88 tmp +C -90 FREE +C -98 FREE +C -a0 FREE +C -a8 FREE +C -b0 r13 +C -b8 r12 +C -c0 r11 +C -c8 r10 +C -d0 r8 +C -d8 r8 +C -e0 r7 +C -e8 r6 +C -f0 r5 +C -f8 r4 +C -100 r3 +C Previous frame: +C [unused area] +C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here. + + +include(`../config.m4') + +C INPUT PARAMETERS: +define(`rp',`%r26') C +define(`up',`%r25') C +define(`n',`%r24') C +define(`vlimb',`%r23') C + +define(`climb',`%r23') C + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_mul_1) + +ifdef(`HAVE_ABI_2_0w', +` std vlimb, -0x38(%r30) C store vlimb into "home" slot +') + std,ma %r3, 0x100(%r30) + std %r4, -0xf8(%r30) + std %r5, -0xf0(%r30) + ldo 0(%r0), climb C clear climb + fldd -0x138(%r30), %fr8 C put vlimb in fp register + +define(`p032a1',`%r1') C +define(`p032a2',`%r19') C + +define(`m032',`%r20') C +define(`m096',`%r21') C + +define(`p000a',`%r22') C +define(`p064a',`%r29') C + +define(`s000',`%r31') C + +define(`ma000',`%r4') C +define(`ma064',`%r20') C + +C define(`r000',`%r3') C FIXME don't save r3 for n < 4. + + extrd,u n, 63, 2, %r5 + cmpb,= %r5, %r0, L(BIG) + nop + + fldd 0(up), %fr4 + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + addib,<> -1, %r5, L(two_or_more) + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +LDEF(one) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x80(%r30), p000a + b L(0_one_out) + ldd -0x68(%r30), p064a + +LDEF(two_or_more) + fldd 0(up), %fr4 + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + ldd -0x68(%r30), p064a + addib,<> -1, %r5, L(three_or_more) + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +LDEF(two) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + b L(0_two_out) + depd m096, 31, 32, ma064 + +LDEF(three_or_more) + fldd 0(up), %fr4 + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 +C addib,= -1, %r5, L(0_out) + depd m096, 31, 32, ma064 +LDEF(loop0) +C xmpyu %fr8R, %fr4L, %fr22 +C xmpyu %fr8L, %fr4R, %fr23 +C ldd -0x78(%r30), p032a1 +C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 +C +C xmpyu %fr8R, %fr4R, %fr24 +C xmpyu %fr8L, %fr4L, %fr25 +C ldd -0x70(%r30), p032a2 +C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 +C +C ldo 8(rp), rp +C add climb, p000a, s000 +C ldd -0x80(%r30), p000a +C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 +C +C 
add,dc p064a, %r0, climb +C ldo 8(up), up +C ldd -0x68(%r30), p064a +C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +C +C add ma000, s000, s000 +C add,dc ma064, climb, climb +C fldd 0(up), %fr4 +C +C std s000, -8(rp) +C +C add p032a1, p032a2, m032 +C add,dc %r0, %r0, m096 +C +C depd,z m032, 31, 32, ma000 +C extrd,u m032, 31, 32, ma064 +C addib,<> -1, %r5, L(loop0) +C depd m096, 31, 32, ma064 +LDEF(0_out) + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 + add ma000, s000, s000 + add,dc ma064, climb, climb + std s000, -8(rp) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + depd m096, 31, 32, ma064 +LDEF(0_two_out) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + add ma000, s000, s000 + add,dc ma064, climb, climb + std s000, -8(rp) +LDEF(0_one_out) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + depd m096, 31, 32, ma064 + + add climb, p000a, s000 + add,dc p064a, %r0, climb + add ma000, s000, s000 + add,dc ma064, climb, climb + std s000, 0(rp) + + cmpib,>= 4, n, L(done) + ldo 8(rp), rp + +C 4-way unrolled code. + +LDEF(BIG) + +define(`p032a1',`%r1') C +define(`p032a2',`%r19') C +define(`p096b1',`%r20') C +define(`p096b2',`%r21') C +define(`p160c1',`%r22') C +define(`p160c2',`%r29') C +define(`p224d1',`%r31') C +define(`p224d2',`%r3') C + C +define(`m032',`%r4') C +define(`m096',`%r5') C +define(`m160',`%r6') C +define(`m224',`%r7') C +define(`m288',`%r8') C + C +define(`p000a',`%r1') C +define(`p064a',`%r19') C +define(`p064b',`%r20') C +define(`p128b',`%r21') C +define(`p128c',`%r22') C +define(`p192c',`%r29') C +define(`p192d',`%r31') C +define(`p256d',`%r3') C + C +define(`s000',`%r10') C +define(`s064',`%r11') C +define(`s128',`%r12') C +define(`s192',`%r13') C + C +define(`ma000',`%r9') C +define(`ma064',`%r4') C +define(`ma128',`%r5') C +define(`ma192',`%r6') C +define(`ma256',`%r7') C + + std %r6, -0xe8(%r30) + std %r7, -0xe0(%r30) + std %r8, -0xd8(%r30) + std %r9, -0xd0(%r30) + std %r10, -0xc8(%r30) + std %r11, -0xc0(%r30) + std %r12, -0xb8(%r30) + std %r13, -0xb0(%r30) + +ifdef(`HAVE_ABI_2_0w', +` extrd,u n, 61, 62, n C right shift 2 +',` extrd,u n, 61, 30, n C right shift 2, zero extend +') + +LDEF(4_or_more) + fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + 
xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,<> -1, n, L(8_or_more) + xmpyu %fr8L, %fr7L, %fr27 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + b L(end1) + nop + +LDEF(8_or_more) + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,= -1, n, L(end2) + xmpyu %fr8L, %fr7L, %fr27 +LDEF(loop) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + + 
depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + + add,dc p064a, p064b, s064 + add,dc p128b, p128c, s128 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + + add,dc p192c, p192d, s192 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + + add ma000, s000, s000 C accum mid 0 + fldd 0(up), %fr4 + add,dc ma064, s064, s064 C accum mid 1 + std s000, 0(rp) + + add,dc ma128, s128, s128 C accum mid 2 + fldd 8(up), %fr5 + add,dc ma192, s192, s192 C accum mid 3 + std s064, 8(rp) + + add,dc ma256, climb, climb + fldd 16(up), %fr6 + std s128, 16(rp) + + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + fldd 24(up), %fr7 + + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + std s192, 24(rp) + + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + xmpyu %fr8L, %fr7L, %fr27 + + addib,<> -1, n, L(loop) + ldo 32(rp), rp + +LDEF(end2) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + add,dc p064a, p064b, s064 + add,dc p128b, p128c, s128 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + add,dc p192c, p192d, s192 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + add ma000, s000, s000 C accum mid 0 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + std s000, 0(rp) + std s064, 8(rp) + ldd -0x78(%r30), p032a1 + std s128, 16(rp) + ldd -0x70(%r30), p032a2 + std s192, 24(rp) + ldd -0x38(%r30), p096b1 
+ ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + ldo 32(rp), rp + +LDEF(end1) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + add,dc p064a, p064b, s064 + add,dc p128b, p128c, s128 + add,dc p192c, p192d, s192 + add,dc p256d, %r0, climb + add ma000, s000, s000 C accum mid 0 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + std s000, 0(rp) + std s064, 8(rp) + std s128, 16(rp) + std s192, 24(rp) + + ldd -0xb0(%r30), %r13 + ldd -0xb8(%r30), %r12 + ldd -0xc0(%r30), %r11 + ldd -0xc8(%r30), %r10 + ldd -0xd0(%r30), %r9 + ldd -0xd8(%r30), %r8 + ldd -0xe0(%r30), %r7 + ldd -0xe8(%r30), %r6 +LDEF(done) +ifdef(`HAVE_ABI_2_0w', +` copy climb, %r28 +',` extrd,u climb, 63, 32, %r29 + extrd,u climb, 31, 32, %r28 +') + ldd -0xf0(%r30), %r5 + ldd -0xf8(%r30), %r4 + bve (%r2) + ldd,mb -0x100(%r30), %r3 +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/pa64/rshift.asm b/gmp-6.3.0/mpn/pa64/rshift.asm new file mode 100644 index 0000000..cfc242e --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/rshift.asm @@ -0,0 +1,111 @@ +dnl HP-PA 2.0 mpn_rshift -- Right shift. + +dnl Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl This runs at 1.5 cycles/limb on PA8000 and 1.0 cycles/limb on PA8500. 
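+dnl
+dnl  For reference, a rough C sketch of the operation performed here (an
+dnl  illustrative assumption, not code taken from GMP; 64-bit limbs, n >= 1
+dnl  and 1 <= cnt <= 63 are assumed).  The bits shifted out at the right are
+dnl  returned in the most significant bits of the return value:
+dnl
+dnl    #include <stdint.h>
+dnl
+dnl    uint64_t rshift_sketch (uint64_t *rp, const uint64_t *up,
+dnl                            long n, unsigned cnt)
+dnl    {
+dnl      uint64_t retval = up[0] << (64 - cnt);    /* bits shifted out */
+dnl      for (long i = 0; i < n - 1; i++)
+dnl        rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
+dnl      rp[n - 1] = up[n - 1] >> cnt;
+dnl      return retval;
+dnl    }
+dnl
+dnl  The assembly below gets its speed from shrpd (double-word funnel shifts)
+dnl  and an 8-way unrolled loop entered through a blr jump table.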
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') +define(`cnt',`%r23') + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_rshift) + mtsar cnt + ldd 0(up), %r21 + addib,= -1, n, L(end) + shrpd %r21, %r0, %sar, %r29 C compute carry out limb + depw,z n, 31, 3, %r28 C r28 = (size & 7) + sub %r0, n, %r22 + depw,z %r22, 28, 3, %r22 C r22 = 8 * (-size & 7) + sub up, %r22, up C offset up + blr %r28, %r0 C branch into jump table + sub rp, %r22, rp C offset rp + b L(0) + nop + b L(1) + copy %r21, %r20 + b L(2) + nop + b L(3) + copy %r21, %r20 + b L(4) + nop + b L(5) + copy %r21, %r20 + b L(6) + nop + b L(7) + copy %r21, %r20 + +LDEF(loop) +LDEF(0) ldd 8(up), %r20 + shrpd %r20, %r21, %sar, %r21 + std %r21, 0(rp) +LDEF(7) ldd 16(up), %r21 + shrpd %r21, %r20, %sar, %r20 + std %r20, 8(rp) +LDEF(6) ldd 24(up), %r20 + shrpd %r20, %r21, %sar, %r21 + std %r21, 16(rp) +LDEF(5) ldd 32(up), %r21 + shrpd %r21, %r20, %sar, %r20 + std %r20, 24(rp) +LDEF(4) ldd 40(up), %r20 + shrpd %r20, %r21, %sar, %r21 + std %r21, 32(rp) +LDEF(3) ldd 48(up), %r21 + shrpd %r21, %r20, %sar, %r20 + std %r20, 40(rp) +LDEF(2) ldd 56(up), %r20 + shrpd %r20, %r21, %sar, %r21 + std %r21, 48(rp) +LDEF(1) ldd 64(up), %r21 + ldo 64(up), up + shrpd %r21, %r20, %sar, %r20 + std %r20, 56(rp) + addib,> -8, n, L(loop) + ldo 64(rp), rp + +LDEF(end) + shrpd %r0, %r21, %sar, %r21 + std %r21, 0(rp) + bve (%r2) +ifdef(`HAVE_ABI_2_0w', +` copy %r29,%r28 +',` extrd,u %r29, 31, 32, %r28 +') +EPILOGUE(mpn_rshift) diff --git a/gmp-6.3.0/mpn/pa64/sqr_diagonal.asm b/gmp-6.3.0/mpn/pa64/sqr_diagonal.asm new file mode 100644 index 0000000..f6fadc9 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/sqr_diagonal.asm @@ -0,0 +1,191 @@ +dnl HP-PA 2.0 64-bit mpn_sqr_diagonal. + +dnl Copyright 2001-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on +dnl PA8500. The cache would saturate at 5 cycles/limb, so there is some room +dnl for optimization. 
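+dnl
+dnl  For reference, a rough C sketch of what this routine computes (an
+dnl  illustrative assumption, not code taken from GMP; 64-bit limbs and a
+dnl  compiler providing unsigned __int128 are assumed).  Each input limb is
+dnl  squared and the two-limb result is written to the corresponding pair of
+dnl  result limbs:
+dnl
+dnl    #include <stdint.h>
+dnl
+dnl    void sqr_diagonal_sketch (uint64_t *rp, const uint64_t *up, long n)
+dnl    {
+dnl      for (long i = 0; i < n; i++)
+dnl        {
+dnl          unsigned __int128 p = (unsigned __int128) up[i] * up[i];
+dnl          rp[2 * i]     = (uint64_t) p;          /* low limb of up[i]^2 */
+dnl          rp[2 * i + 1] = (uint64_t) (p >> 64);  /* high limb of up[i]^2 */
+dnl        }
+dnl    }
+dnl
+dnl  The code below builds each 64x64 square from 32x32 xmpyu pieces in the
+dnl  FP unit and folds in the doubled cross product with integer adds.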
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') + +define(`p00',`%r28') +define(`p32',`%r29') +define(`p64',`%r31') +define(`t0',`%r19') +define(`t1',`%r20') + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_sqr_diagonal) + ldo 128(%r30),%r30 + + fldds,ma 8(up),%fr8 + addib,= -1,n,L(end1) + nop + fldds,ma 8(up),%fr4 + xmpyu %fr8l,%fr8r,%fr10 + fstd %fr10,-120(%r30) + xmpyu %fr8r,%fr8r,%fr9 + fstd %fr9,0(rp) + xmpyu %fr8l,%fr8l,%fr11 + fstd %fr11,8(rp) + addib,= -1,n,L(end2) + ldo 16(rp),rp + +LDEF(loop) + fldds,ma 8(up),%fr8 C load next up limb + xmpyu %fr4l,%fr4r,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr4r,%fr4r,%fr5 C multiply in fp regs + fstd %fr5,0(rp) + xmpyu %fr4l,%fr4l,%fr7 + fstd %fr7,8(rp) + ldd -120(%r30),p32 + ldd -16(rp),p00 C accumulate in int regs + ldd -8(rp),p64 + depd,z p32,30,31,t0 + add t0,p00,p00 + std p00,-16(rp) + extrd,u p32,32,33,t1 + add,dc t1,p64,p64 + std p64,-8(rp) + addib,= -1,n,L(exit) + ldo 16(rp),rp + + fldds,ma 8(up),%fr4 + xmpyu %fr8l,%fr8r,%fr10 + fstd %fr10,-120(%r30) + xmpyu %fr8r,%fr8r,%fr9 + fstd %fr9,0(rp) + xmpyu %fr8l,%fr8l,%fr11 + fstd %fr11,8(rp) + ldd -128(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,30,31,t0 + add t0,p00,p00 + std p00,-16(rp) + extrd,u p32,32,33,t1 + add,dc t1,p64,p64 + std p64,-8(rp) + addib,<> -1,n,L(loop) + ldo 16(rp),rp + +LDEF(end2) + xmpyu %fr4l,%fr4r,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr4r,%fr4r,%fr5 + fstd %fr5,0(rp) + xmpyu %fr4l,%fr4l,%fr7 + fstd %fr7,8(rp) + ldd -120(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,30,31,t0 + add t0,p00,p00 + std p00,-16(rp) + extrd,u p32,32,33,t1 + add,dc t1,p64,p64 + std p64,-8(rp) + ldo 16(rp),rp + ldd -128(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,30,31,t0 + add t0,p00,p00 + std p00,-16(rp) + extrd,u p32,32,33,t1 + add,dc t1,p64,p64 + std p64,-8(rp) + bve (%r2) + ldo -128(%r30),%r30 + +LDEF(exit) + xmpyu %fr8l,%fr8r,%fr10 + fstd %fr10,-120(%r30) + xmpyu %fr8r,%fr8r,%fr9 + fstd %fr9,0(rp) + xmpyu %fr8l,%fr8l,%fr11 + fstd %fr11,8(rp) + ldd -128(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,31,32,t0 + add t0,p00,p00 + extrd,u p32,31,32,t1 + add,dc t1,p64,p64 + add t0,p00,p00 + add,dc t1,p64,p64 + std p00,-16(rp) + std p64,-8(rp) + ldo 16(rp),rp + ldd -120(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,31,32,t0 + add t0,p00,p00 + extrd,u p32,31,32,t1 + add,dc t1,p64,p64 + add t0,p00,p00 + add,dc t1,p64,p64 + std p00,-16(rp) + std p64,-8(rp) + bve (%r2) + ldo -128(%r30),%r30 + +LDEF(end1) + xmpyu %fr8l,%fr8r,%fr10 + fstd %fr10,-128(%r30) + xmpyu %fr8r,%fr8r,%fr9 + fstd %fr9,0(rp) + xmpyu %fr8l,%fr8l,%fr11 + fstd %fr11,8(rp) + ldo 16(rp),rp + ldd -128(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,31,32,t0 + add t0,p00,p00 + extrd,u p32,31,32,t1 + add,dc t1,p64,p64 + add t0,p00,p00 + add,dc t1,p64,p64 + std p00,-16(rp) + std p64,-8(rp) + bve (%r2) + ldo -128(%r30),%r30 +EPILOGUE(mpn_sqr_diagonal) diff --git a/gmp-6.3.0/mpn/pa64/submul_1.asm b/gmp-6.3.0/mpn/pa64/submul_1.asm new file mode 100644 index 0000000..f8a1968 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/submul_1.asm @@ -0,0 +1,700 @@ +dnl HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C 8000,8200: 7
+C 8500,8600,8700: 6.5
+
+C The feed-in and wind-down code has not yet been scheduled. Many cycles
+C could be saved there per call.
+
+C DESCRIPTION:
+C The main loop "BIG" is 4-way unrolled, mainly to allow
+C effective use of ADD,DC. Delays in moving data via the cache from the FP
+C registers to the IU registers have demanded a deep software pipeline, and
+C a lot of stack slots for partial products in flight.
+C
+C CODE STRUCTURE:
+C save-some-registers
+C do 0, 1, 2, or 3 limbs
+C if done, restore-some-regs and return
+C save-many-regs
+C do 4, 8, ... limbs
+C restore-all-regs
+
+C STACK LAYOUT:
+C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
+C slots marked FREE, as well as some slots in the caller's "frame marker".
+C
+C -00 <- r30
+C -08 FREE
+C -10 tmp
+C -18 tmp
+C -20 tmp
+C -28 tmp
+C -30 tmp
+C -38 tmp
+C -40 tmp
+C -48 tmp
+C -50 tmp
+C -58 tmp
+C -60 tmp
+C -68 tmp
+C -70 tmp
+C -78 tmp
+C -80 tmp
+C -88 tmp
+C -90 FREE
+C -98 FREE
+C -a0 FREE
+C -a8 FREE
+C -b0 r13
+C -b8 r12
+C -c0 r11
+C -c8 r10
+C -d0 r9
+C -d8 r8
+C -e0 r7
+C -e8 r6
+C -f0 r5
+C -f8 r4
+C -100 r3
+C Previous frame:
+C [unused area]
+C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
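+
+C For orientation, a rough C sketch of the function's effect (an illustrative
+C assumption, not GMP's code; 64-bit limbs and unsigned __int128 are assumed).
+C {rp,n} -= {up,n} * vlimb, and the borrow out of the top limb is returned:
+C
+C   #include <stdint.h>
+C
+C   uint64_t submul_1_sketch (uint64_t *rp, const uint64_t *up,
+C                             long n, uint64_t v)
+C   {
+C     uint64_t borrow = 0;
+C     for (long i = 0; i < n; i++)
+C       {
+C         unsigned __int128 t = (unsigned __int128) up[i] * v + borrow;
+C         uint64_t lo = (uint64_t) t;
+C         borrow = (uint64_t) (t >> 64) + (rp[i] < lo);  /* propagate borrow */
+C         rp[i] -= lo;
+C       }
+C     return borrow;
+C   }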
+ + +include(`../config.m4') + +C INPUT PARAMETERS: +define(`rp',`%r26') C +define(`up',`%r25') C +define(`n',`%r24') C +define(`vlimb',`%r23') C + +define(`climb',`%r23') C + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_submul_1) + +ifdef(`HAVE_ABI_2_0w', +` std vlimb, -0x38(%r30) C store vlimb into "home" slot +') + std,ma %r3, 0x100(%r30) + std %r4, -0xf8(%r30) + std %r5, -0xf0(%r30) + ldo 0(%r0), climb C clear climb + fldd -0x138(%r30), %fr8 C put vlimb in fp register + +define(`p032a1',`%r1') C +define(`p032a2',`%r19') C + +define(`m032',`%r20') C +define(`m096',`%r21') C + +define(`p000a',`%r22') C +define(`p064a',`%r29') C + +define(`s000',`%r31') C + +define(`ma000',`%r4') C +define(`ma064',`%r20') C + +define(`r000',`%r3') C + + extrd,u n, 63, 2, %r5 + cmpb,= %r5, %r0, L(BIG) + nop + + fldd 0(up), %fr4 + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + addib,<> -1, %r5, L(two_or_more) + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +LDEF(one) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x80(%r30), p000a + b L(0_one_out) + ldd -0x68(%r30), p064a + +LDEF(two_or_more) + fldd 0(up), %fr4 + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + ldd -0x68(%r30), p064a + addib,<> -1, %r5, L(three_or_more) + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +LDEF(two) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + b L(0_two_out) + depd m096, 31, 32, ma064 + +LDEF(three_or_more) + fldd 0(up), %fr4 + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 +C addib,= -1, %r5, L(0_out) + depd m096, 31, 32, ma064 +LDEF(loop0) +C xmpyu %fr8R, %fr4L, %fr22 +C xmpyu %fr8L, %fr4R, %fr23 +C ldd -0x78(%r30), p032a1 +C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 +C +C xmpyu %fr8R, %fr4R, %fr24 +C xmpyu %fr8L, %fr4L, %fr25 +C ldd -0x70(%r30), p032a2 +C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 +C +C ldo 8(rp), rp +C add climb, p000a, s000 +C ldd -0x80(%r30), p000a +C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 +C +C add,dc p064a, %r0, climb +C ldo 8(up), up +C ldd -0x68(%r30), p064a +C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +C +C add ma000, s000, s000 +C add,dc ma064, climb, climb +C fldd 0(up), %fr4 +C +C sub r000, s000, s000 +C sub,db %r0, climb, climb +C sub %r0, climb, climb +C std s000, -8(rp) +C +C add p032a1, p032a2, m032 +C add,dc %r0, %r0, m096 +C +C depd,z m032, 31, 32, ma000 +C extrd,u m032, 31, 32, ma064 +C ldd 0(rp), r000 +C addib,<> -1, %r5, L(loop0) +C depd m096, 31, 32, ma064 +LDEF(0_out) + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + 
ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 + add ma000, s000, s000 + add,dc ma064, climb, climb + sub r000, s000, s000 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s000, -8(rp) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + depd m096, 31, 32, ma064 +LDEF(0_two_out) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + add ma000, s000, s000 + add,dc ma064, climb, climb + sub r000, s000, s000 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s000, -8(rp) +LDEF(0_one_out) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + depd m096, 31, 32, ma064 + + add climb, p000a, s000 + add,dc p064a, %r0, climb + add ma000, s000, s000 + add,dc ma064, climb, climb + sub r000, s000, s000 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s000, 0(rp) + + cmpib,>= 4, n, L(done) + ldo 8(rp), rp + +C 4-way unrolled code. + +LDEF(BIG) + +define(`p032a1',`%r1') C +define(`p032a2',`%r19') C +define(`p096b1',`%r20') C +define(`p096b2',`%r21') C +define(`p160c1',`%r22') C +define(`p160c2',`%r29') C +define(`p224d1',`%r31') C +define(`p224d2',`%r3') C + C +define(`m032',`%r4') C +define(`m096',`%r5') C +define(`m160',`%r6') C +define(`m224',`%r7') C +define(`m288',`%r8') C + C +define(`p000a',`%r1') C +define(`p064a',`%r19') C +define(`p064b',`%r20') C +define(`p128b',`%r21') C +define(`p128c',`%r22') C +define(`p192c',`%r29') C +define(`p192d',`%r31') C +define(`p256d',`%r3') C + C +define(`s000',`%r10') C +define(`s064',`%r11') C +define(`s128',`%r12') C +define(`s192',`%r13') C + C +define(`ma000',`%r9') C +define(`ma064',`%r4') C +define(`ma128',`%r5') C +define(`ma192',`%r6') C +define(`ma256',`%r7') C + C +define(`r000',`%r1') C +define(`r064',`%r19') C +define(`r128',`%r20') C +define(`r192',`%r21') C + + std %r6, -0xe8(%r30) + std %r7, -0xe0(%r30) + std %r8, -0xd8(%r30) + std %r9, -0xd0(%r30) + std %r10, -0xc8(%r30) + std %r11, -0xc0(%r30) + std %r12, -0xb8(%r30) + std %r13, -0xb0(%r30) + +ifdef(`HAVE_ABI_2_0w', +` extrd,u n, 61, 62, n C right shift 2 +',` extrd,u n, 61, 30, n C right shift 2, zero extend +') + +LDEF(4_or_more) + fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,<> -1, n, L(8_or_more) + xmpyu %fr8L, %fr7L, %fr27 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + fstd %fr30, 
-0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + b L(end1) + nop + +LDEF(8_or_more) + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,= -1, n, L(end2) + xmpyu %fr8L, %fr7L, %fr27 +LDEF(loop) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + 
fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + + add,dc ma128, s128, s128 C accum mid 2 + fldd 0(up), %fr4 + add,dc ma192, s192, s192 C accum mid 3 + fldd 8(up), %fr5 + + add,dc ma256, climb, climb + fldd 16(up), %fr6 + sub r000, s000, s000 C accum rlimb 0 + fldd 24(up), %fr7 + + sub,db r064, s064, s064 C accum rlimb 1 + sub,db r128, s128, s128 C accum rlimb 2 + std s000, 0(rp) + + sub,db r192, s192, s192 C accum rlimb 3 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s064, 8(rp) + + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + std s128, 16(rp) + + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + std s192, 24(rp) + + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + xmpyu %fr8L, %fr7L, %fr27 + + addib,<> -1, n, L(loop) + ldo 32(rp), rp + +LDEF(end2) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + sub r000, s000, s000 C accum rlimb 0 + sub,db r064, s064, s064 C accum rlimb 1 + sub,db r128, s128, s128 C accum rlimb 2 + std 
s000, 0(rp) + sub,db r192, s192, s192 C accum rlimb 3 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s064, 8(rp) + ldd -0x78(%r30), p032a1 + std s128, 16(rp) + ldd -0x70(%r30), p032a2 + std s192, 24(rp) + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + ldo 32(rp), rp + +LDEF(end1) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + sub r000, s000, s000 C accum rlimb 0 + sub,db r064, s064, s064 C accum rlimb 1 + sub,db r128, s128, s128 C accum rlimb 2 + std s000, 0(rp) + sub,db r192, s192, s192 C accum rlimb 3 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s064, 8(rp) + std s128, 16(rp) + std s192, 24(rp) + + ldd -0xb0(%r30), %r13 + ldd -0xb8(%r30), %r12 + ldd -0xc0(%r30), %r11 + ldd -0xc8(%r30), %r10 + ldd -0xd0(%r30), %r9 + ldd -0xd8(%r30), %r8 + ldd -0xe0(%r30), %r7 + ldd -0xe8(%r30), %r6 +LDEF(done) +ifdef(`HAVE_ABI_2_0w', +` copy climb, %r28 +',` extrd,u climb, 63, 32, %r29 + extrd,u climb, 31, 32, %r28 +') + ldd -0xf0(%r30), %r5 + ldd -0xf8(%r30), %r4 + bve (%r2) + ldd,mb -0x100(%r30), %r3 +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/pa64/udiv.asm b/gmp-6.3.0/mpn/pa64/udiv.asm new file mode 100644 index 0000000..1380a85 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/udiv.asm @@ -0,0 +1,125 @@ +dnl HP-PA 2.0 64-bit mpn_udiv_qrnnd_r. + +dnl Copyright 2001-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
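+
+dnl  For reference, a rough C sketch of the operation (an illustrative
+dnl  assumption, not GMP code): the double limb n1:n0 is divided by d, the
+dnl  remainder is stored through remptr and the quotient is returned.  As is
+dnl  usual for udiv_qrnnd-style helpers, n1 < d is assumed so the quotient
+dnl  fits in one limb.
+dnl
+dnl    #include <stdint.h>
+dnl
+dnl    uint64_t udiv_qrnnd_r_sketch (uint64_t n1, uint64_t n0,
+dnl                                  uint64_t d, uint64_t *remptr)
+dnl    {
+dnl      unsigned __int128 n = ((unsigned __int128) n1 << 64) | n0;
+dnl      *remptr = (uint64_t) (n % d);
+dnl      return (uint64_t) (n / d);
+dnl    }
+dnl
+dnl  The assembly below instead uses a shift-and-subtract divstep macro,
+dnl  unrolled 8 ways, which is where the roughly 4 cycles/bit figure comes
+dnl  from.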
+ +include(`../config.m4') + +C This runs at about 280 cycles on both PA8000 and PA8500, corresponding to a +C bit more than 4 cycles/bit. + +C INPUT PARAMETERS +define(`n1',`%r26') +define(`n0',`%r25') +define(`d',`%r24') +define(`remptr',`%r23') + +define(`q',`%r28') +define(`dn',`%r29') + +define(`old_divstep', + `add,dc n0,n0,n0 + add,dc n1,n1,n1 + sub,*<< n1,d,%r22 + copy %r22,n1') + +define(`divstep', + `add n0,n0,n0 + add,dc n1,n1,n1 + sub n1,d,%r1 + add,dc q,q,q + cmpclr,*<< n1,d,%r0 + copy %r1,n1 +') + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_udiv_qrnnd_r) +ifdef(`HAVE_ABI_2_0n', +` depd %r25,31,32,%r26 + depd %r23,31,32,%r24 + copy %r24,%r25 + ldd -56(%r30),%r24 + ldw -60(%r30),%r23 +') + ldi 0,q + cmpib,*>= 0,d,L(large_divisor) + ldi 8,%r31 C setup loop counter + + sub %r0,d,dn +LDEF(Loop) + divstep divstep divstep divstep divstep divstep divstep divstep + addib,<> -1,%r31,L(Loop) + nop + +ifdef(`HAVE_ABI_2_0n', +` copy %r28,%r29 + extrd,u %r28,31,32,%r28 +') + bve (%r2) + std n1,0(remptr) C store remainder + +LDEF(large_divisor) + extrd,u n0,63,1,%r19 C save lsb of dividend + shrpd n1,n0,1,n0 C n0 = lo(n1n0 >> 1) + shrpd %r0,n1,1,n1 C n1 = hi(n1n0 >> 1) + extrd,u d,63,1,%r20 C save lsb of divisor + shrpd %r0,d,1,d C d = floor(orig_d / 2) + add,l %r20,d,d C d = ceil(orig_d / 2) + + sub %r0,d,dn +LDEF(Loop2) + divstep divstep divstep divstep divstep divstep divstep divstep + addib,<> -1,%r31,L(Loop2) + nop + + cmpib,*= 0,%r20,L(even_divisor) + shladd n1,1,%r19,n1 C shift in omitted dividend lsb + + add d,d,d C restore orig... + sub d,%r20,d C ...d value + sub %r0,d,dn C r21 = -d + + add,*nuv n1,q,n1 C fix remainder for omitted divisor lsb + add,l n1,dn,n1 C adjust remainder if rem. fix carried + add,dc %r0,q,q C adjust quotient accordingly + + sub,*<< n1,d,%r0 C remainder >= divisor? + add,l n1,dn,n1 C adjust remainder + add,dc %r0,q,q C adjust quotient + +LDEF(even_divisor) +ifdef(`HAVE_ABI_2_0n', +` copy %r28,%r29 + extrd,u %r28,31,32,%r28 +') + bve (%r2) + std n1,0(remptr) C store remainder +EPILOGUE(mpn_udiv_qrnnd_r) diff --git a/gmp-6.3.0/mpn/pa64/umul.asm b/gmp-6.3.0/mpn/pa64/umul.asm new file mode 100644 index 0000000..bd5a71f --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/umul.asm @@ -0,0 +1,97 @@ +dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
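+
+dnl  For reference, a rough C sketch of the operation (an illustrative
+dnl  assumption, not GMP code): the full 128-bit product of two limbs is
+dnl  formed, the low half is stored through the result pointer and the high
+dnl  half is returned.
+dnl
+dnl    #include <stdint.h>
+dnl
+dnl    uint64_t umul_ppmm_r_sketch (uint64_t u, uint64_t v, uint64_t *lowptr)
+dnl    {
+dnl      unsigned __int128 p = (unsigned __int128) u * v;
+dnl      *lowptr = (uint64_t) p;
+dnl      return (uint64_t) (p >> 64);
+dnl    }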
+ + +dnl Optimizations: +dnl * Avoid skip instructions +dnl * Put carry-generating and carry-consuming insns consecutively +dnl * Don't allocate any stack, "home" positions for parameters could be used. + +include(`../config.m4') + +define(`p0',`%r28') +define(`p1',`%r29') +define(`t32',`%r19') +define(`t0',`%r20') +define(`t1',`%r21') +define(`x',`%r22') +define(`m0',`%r23') +define(`m1',`%r24') + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_umul_ppmm_r) + ldo 128(%r30),%r30 +ifdef(`HAVE_ABI_2_0w', +` std %r26,-64(%r30) + std %r25,-56(%r30) + copy %r24,%r31 +',` + depd %r25,31,32,%r26 + std %r26,-64(%r30) + depd %r23,31,32,%r24 + std %r24,-56(%r30) + ldw -180(%r30),%r31 +') + + fldd -64(%r30),%fr4 + fldd -56(%r30),%fr5 + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + + depdi,z 1,31,1,t32 C t32 = 2^32 + + ldd -128(%r30),p0 C lo = low 64 bit of product + ldd -120(%r30),m0 C m0 = mid0 64 bit of product + ldd -112(%r30),m1 C m1 = mid1 64 bit of product + ldd -104(%r30),p1 C hi = high 64 bit of product + + add,l,*nuv m0,m1,x C x = m1+m0 + add,l t32,p1,p1 C propagate carry to mid of p1 + depd,z x,31,32,t0 C lo32(m1+m0) + add t0,p0,p0 + extrd,u x,31,32,t1 C hi32(m1+m0) + add,dc t1,p1,p1 + + std p0,0(%r31) C store low half of product +ifdef(`HAVE_ABI_2_0w', +` copy p1,%r28 C return val in %r28 +',` extrd,u p1,31,32,%r28 C return val in %r28,%r29 +') + bve (%r2) + ldo -128(%r30),%r30 +EPILOGUE(mpn_umul_ppmm_r) -- cgit v1.2.3