From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/pa32/README | 162 ++++++++++++++ gmp-6.3.0/mpn/pa32/add_n.asm | 63 ++++++ gmp-6.3.0/mpn/pa32/gmp-mparam.h | 61 ++++++ gmp-6.3.0/mpn/pa32/hppa1_1/addmul_1.asm | 106 +++++++++ gmp-6.3.0/mpn/pa32/hppa1_1/gmp-mparam.h | 72 ++++++ gmp-6.3.0/mpn/pa32/hppa1_1/mul_1.asm | 102 +++++++++ gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm | 83 +++++++ gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm | 201 +++++++++++++++++ gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm | 95 ++++++++ gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm | 92 ++++++++ gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm | 84 +++++++ gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm | 207 ++++++++++++++++++ gmp-6.3.0/mpn/pa32/hppa1_1/sqr_diagonal.asm | 60 +++++ gmp-6.3.0/mpn/pa32/hppa1_1/submul_1.asm | 115 ++++++++++ gmp-6.3.0/mpn/pa32/hppa1_1/udiv.asm | 102 +++++++++ gmp-6.3.0/mpn/pa32/hppa1_1/umul.asm | 47 ++++ gmp-6.3.0/mpn/pa32/hppa2_0/add_n.asm | 107 +++++++++ gmp-6.3.0/mpn/pa32/hppa2_0/gmp-mparam.h | 167 ++++++++++++++ gmp-6.3.0/mpn/pa32/hppa2_0/sqr_diagonal.asm | 112 ++++++++++ gmp-6.3.0/mpn/pa32/hppa2_0/sub_n.asm | 107 +++++++++ gmp-6.3.0/mpn/pa32/lshift.asm | 75 +++++++ gmp-6.3.0/mpn/pa32/pa-defs.m4 | 64 ++++++ gmp-6.3.0/mpn/pa32/rshift.asm | 72 ++++++ gmp-6.3.0/mpn/pa32/sub_n.asm | 64 ++++++ gmp-6.3.0/mpn/pa32/udiv.asm | 291 +++++++++++++++++++++++++ 25 files changed, 2711 insertions(+) create mode 100644 gmp-6.3.0/mpn/pa32/README create mode 100644 gmp-6.3.0/mpn/pa32/add_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/mul_1.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm create mode 100644 
gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/submul_1.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/udiv.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/umul.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa2_0/add_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa2_0/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/pa32/hppa2_0/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa2_0/sub_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/lshift.asm create mode 100644 gmp-6.3.0/mpn/pa32/pa-defs.m4 create mode 100644 gmp-6.3.0/mpn/pa32/rshift.asm create mode 100644 gmp-6.3.0/mpn/pa32/sub_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/udiv.asm (limited to 'gmp-6.3.0/mpn/pa32') diff --git a/gmp-6.3.0/mpn/pa32/README b/gmp-6.3.0/mpn/pa32/README new file mode 100644 index 0000000..4323390 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/README @@ -0,0 +1,162 @@ +Copyright 1996, 1999, 2001, 2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + +This directory contains mpn functions for various HP PA-RISC chips. Code +that runs faster on the PA7100 and later implementations is in the pa7100 +directory. + +RELEVANT OPTIMIZATION ISSUES + + Load and Store timing + +On the PA7000 no memory instructions can issue in the two cycles after a store. +For the PA7100, this is reduced to one cycle. + +The PA7100 has a lockup-free cache, so it helps to schedule loads and the +dependent instruction really far from each other. + +STATUS + +1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the + instructions below (but some sw pipelining is needed to avoid the + xmpyu-fstds delay): + + fldds s1_ptr + + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + + addc + stws res_ptr + addc + stws res_ptr + + addib Loop + +2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb + (asymptotically) on the PA7100, using the instructions below. With proper + sw pipelining and the unrolling level below, the speed becomes 8 + cycles/limb. + + fldds s1_ptr + fldds s1_ptr + + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + addc + addc + addc + addc + addc %r0,%r0,cy-limb + + ldws res_ptr + ldws res_ptr + ldws res_ptr + ldws res_ptr + add + stws res_ptr + addc + stws res_ptr + addc + stws res_ptr + addc + stws res_ptr + + addib + +3. For the PA8000 we have to stick to using 32-bit limbs before compiler + support emerges. But we want to use 64-bit operations whenever possible, + in particular for loads and stores. 
It is possible to handle mpn_add_n + efficiently by rotating (when s1/s2 are aligned), or masking+bit field + inserting (when they are not). The speed should double compared to the + code used today. + + + + +LABEL SYNTAX + +The HP-UX assembler takes labels starting in column 0 with no colon, + + L$loop ldws,mb -4(0,%r25),%r22 + +Gas on hppa GNU/Linux however requires a colon, + + L$loop: ldws,mb -4(0,%r25),%r22 + +This is covered by using LDEF() from asm-defs.m4. An alternative would be +to use ".label" which is accepted by both, + + .label L$loop + ldws,mb -4(0,%r25),%r22 + +but that's not as nice to look at, not if you're used to assembler code +having labels in column 0. + + + + +REFERENCES + +Hewlett Packard, "HP Assembler Reference Manual", 9th edition, June 1998, +part number 92432-90012. + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/mpn/pa32/add_n.asm b/gmp-6.3.0/mpn/pa32/add_n.asm new file mode 100644 index 0000000..46f3937 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/add_n.asm @@ -0,0 +1,63 @@ +dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C One might want to unroll this as for other processors, but it turns out that +C the data cache contention after a store makes such unrolling useless. We +C can't come under 5 cycles/limb anyway. + +ASM_START() +PROLOGUE(mpn_add_n) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L(end) C check for (SIZE == 1) + add %r20,%r19,%r28 C add first limbs ignoring cy + +LDEF(loop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L(loop) + addc %r20,%r19,%r28 + +LDEF(end) + stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/gmp-mparam.h b/gmp-6.3.0/mpn/pa32/gmp-mparam.h new file mode 100644 index 0000000..377efcb --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/gmp-mparam.h @@ -0,0 +1,61 @@ +/* HP-PA 1.0 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* These values are for the PA7100 using GCC. */ +/* Generated by tuneup.c, 2000-10-27. */ + +#ifndef MUL_TOOM22_THRESHOLD +#define MUL_TOOM22_THRESHOLD 30 +#endif +#ifndef MUL_TOOM33_THRESHOLD +#define MUL_TOOM33_THRESHOLD 141 +#endif + +#ifndef SQR_TOOM2_THRESHOLD +#define SQR_TOOM2_THRESHOLD 59 +#endif +#ifndef SQR_TOOM3_THRESHOLD +#define SQR_TOOM3_THRESHOLD 177 +#endif + +#ifndef DIV_DC_THRESHOLD +#define DIV_DC_THRESHOLD 108 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 18 +#endif + +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 33 +#endif diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/addmul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/addmul_1.asm new file mode 100644 index 0000000..ec2f219 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/addmul_1.asm @@ -0,0 +1,106 @@ +dnl HP-PA 1.1 mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C size r24 +C s2_limb r23 + +C This runs at 11 cycles/limb on a PA7000. With the used instructions, it can +C not become faster due to data cache contention after a store. On the PA7100 +C it runs at 10 cycles/limb. + +C There are some ideas described in mul_1.asm that applies to this code too. + +ASM_START() +PROLOGUE(mpn_addmul_1) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) C move s2_limb ... + addib,= -1,%r24,L(just_one_limb) + fldws -16(%r30),%fr4 C ... 
into fr4 + add %r0,%r0,%r0 C clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 C least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L(end) + ldw -12(%r30),%r1 + +C Main loop +LDEF(loop) + ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L(loop) + ldw -12(%r30),%r1 + +LDEF(end) + ldw 0(%r26),%r29 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +LDEF(just_one_limb) + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + add %r29,%r1,%r19 + stw %r19,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/gmp-mparam.h b/gmp-6.3.0/mpn/pa32/hppa1_1/gmp-mparam.h new file mode 100644 index 0000000..1261b24 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/gmp-mparam.h @@ -0,0 +1,72 @@ +/* HP-PA 1.1 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* Generated by tuneup.c, 2004-02-07, gcc 2.8 (pa7100/100MHz) */ + +#define MUL_TOOM22_THRESHOLD 30 +#define MUL_TOOM33_THRESHOLD 89 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 55 +#define SQR_TOOM3_THRESHOLD 101 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 84 +#define POWM_THRESHOLD 166 + +#define HGCD_THRESHOLD 231 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 823 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 5 +#define DIVREM_1_UNNORM_THRESHOLD 11 +#define MOD_1_NORM_THRESHOLD 5 +#define MOD_1_UNNORM_THRESHOLD 10 +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 23 +#define SET_STR_THRESHOLD 6589 + +#define MUL_FFT_TABLE { 464, 928, 1920, 4608, 14336, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 480 +#define MUL_FFT_THRESHOLD 3328 + +#define SQR_FFT_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 520 +#define SQR_FFT_THRESHOLD 3328 diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/mul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/mul_1.asm new file mode 100644 index 0000000..6e60c2f --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/mul_1.asm @@ -0,0 +1,102 @@ +dnl HP-PA 1.1 mpn_mul_1 -- Multiply a limb vector with a limb and store the +dnl result in a second limb vector. 
+ +dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C size r24 +C s2_limb r23 + +C This runs at 9 cycles/limb on a PA7000. With the used instructions, it can +C not become faster due to data cache contention after a store. On the PA7100 +C it runs at 7 cycles/limb. + +C We could use fldds to read two limbs at a time from the S1 array, and that +C could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and +C PA7100, respectively. We don't do that since it does not seem worth the +C (alignment) troubles... + +C At least the PA7100 is rumored to be able to deal with cache-misses without +C stalling instruction issue. If this is true, and the cache is actually also +C lockup-free, we should use a deeper software pipeline, and load from S1 very +C early! (The loads and stores to -12(sp) will surely be in the cache.) 
+ +ASM_START() +PROLOGUE(mpn_mul_1) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) C move s2_limb ... + addib,= -1,%r24,L(just_one_limb) + fldws -16(%r30),%fr4 C ... into fr4 + add %r0,%r0,%r0 C clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 C least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L(end) + ldw -12(%r30),%r1 + +C Main loop +LDEF(loop) + fldws,ma 4(%r25),%fr5 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addib,<> -1,%r24,L(loop) + ldw -12(%r30),%r1 + +LDEF(end) + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +LDEF(just_one_limb) + xmpyu %fr4,%fr5,%fr6 + fstds %fr6,-16(%r30) + ldw -16(%r30),%r28 + ldo -64(%r30),%r30 + bv 0(%r2) + fstws %fr6R,0(%r26) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm new file mode 100644 index 0000000..b96d403 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm @@ -0,0 +1,83 @@ +dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. Optimized for the PA7100, where is runs at +dnl 4.25 cycles/limb. + +dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C s2_ptr r24 +C size r23 + +ASM_START() +PROLOGUE(mpn_add_n) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L(rest) + add %r20,%r19,%r28 C add first limbs ignoring cy + +LDEF(loop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L(loop) + addc %r20,%r19,%r28 + +LDEF(rest) + addib,= 4,%r23,L(end) + nop + +LDEF(eloop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L(eloop) + addc %r20,%r19,%r28 + +LDEF(end) + stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm new file mode 100644 index 0000000..fb16100 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm @@ -0,0 +1,201 @@ +dnl HP-PA 7100/7200 mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second 
limb vector. + +dnl Copyright 1995, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`res_ptr',`%r26') +define(`s1_ptr',`%r25') +define(`size_param',`%r24') +define(`s2_limb',`%r23') + +define(`cylimb',`%r28') +define(`s0',`%r19') +define(`s1',`%r20') +define(`s2',`%r3') +define(`s3',`%r4') +define(`lo0',`%r21') +define(`lo1',`%r5') +define(`lo2',`%r6') +define(`lo3',`%r7') +define(`hi0',`%r22') +define(`hi1',`%r23') C safe to reuse +define(`hi2',`%r29') +define(`hi3',`%r1') + +ASM_START() +PROLOGUE(mpn_addmul_1) +C .callinfo frame=128,no_calls + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb C clear cy and cylimb + addib,< -4,size_param,L(few_limbs) + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L(0) + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + add s0,lo0,s0 + addib,< -1,size_param,L(few_limbs) + stws,ma s0,4(res_ptr) + +C start software pipeline ---------------------------------------------------- +LDEF(0) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size_param,L(end) + addc %r0,hi3,cylimb C propagate carry into cylimb +C main loop ------------------------------------------------------------------ +LDEF(loop) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 
+ ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + add s0,lo0,s0 + fstds %fr6,-8(%r31) + addc s1,lo1,s1 + fstds %fr9,0(%r31) + addc s2,lo2,s2 + fstds %fr10,8(%r31) + addc s3,lo3,s3 + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size_param,L(loop) + addc %r0,hi3,cylimb C propagate carry into cylimb +C finish software pipeline --------------------------------------------------- +LDEF(end) + ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addc s1,lo1,s1 + stws,ma s1,4(res_ptr) + addc s2,lo2,s2 + stws,ma s2,4(res_ptr) + addc s3,lo3,s3 + stws,ma s3,4(res_ptr) + +C restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + +LDEF(few_limbs) + addib,=,n 4,size_param,L(ret) + +LDEF(loop2) + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addib,<> -1,size_param,L(loop2) + nop + +LDEF(ret) + addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm new file mode 100644 index 0000000..d65db2a --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm @@ -0,0 +1,95 @@ +dnl HP-PA mpn_lshift -- Shift a number left. +dnl Optimized for the PA7100, where is runs at 3.25 cycles/limb. + +dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s_ptr r25 +C size r24 +C cnt r23 + +ASM_START() +PROLOGUE(mpn_lshift) + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L(0004) + vshd %r0,%r22,%r28 C compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,<= -5,%r24,L(rest) + vshd %r22,%r29,%r20 + +LDEF(loop) + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + vshd %r22,%r29,%r20 + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,> -4,%r24,L(loop) + vshd %r22,%r29,%r20 + +LDEF(rest) + addib,= 4,%r24,L(end1) + nop + +LDEF(eloop) + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,<= -1,%r24,L(end2) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,> -1,%r24,L(eloop) + vshd %r22,%r29,%r20 + +LDEF(end1) + stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + +LDEF(end2) + stws,mb %r20,-4(0,%r26) + +LDEF(0004) + vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm new file mode 100644 index 0000000..f7896fc --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm @@ -0,0 +1,92 @@ +dnl HP-PA mpn_rshift -- Shift a number right. +dnl Optimized for the PA7100, where is runs at 3.25 cycles/limb. + +dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s_ptr r25 +C size r24 +C cnt r23 + +ASM_START() +PROLOGUE(mpn_rshift) + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L(0004) + vshd %r22,%r0,%r28 C compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,<= -5,%r24,L(rest) + vshd %r29,%r22,%r20 + +LDEF(loop) + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + vshd %r29,%r22,%r20 + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,> -4,%r24,L(loop) + vshd %r29,%r22,%r20 + +LDEF(rest) + addib,= 4,%r24,L(end1) + nop + +LDEF(eloop) + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,<= -1,%r24,L(end2) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,> -1,%r24,L(eloop) + vshd %r29,%r22,%r20 + +LDEF(end1) + stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + +LDEF(end2) + stws,ma %r20,4(0,%r26) + +LDEF(0004) + vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm new file mode 100644 index 0000000..df3f6e8 --- /dev/null +++ 
b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm @@ -0,0 +1,84 @@ +dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. Optimized for the PA7100, where +dnl it runs at 4.25 cycles/limb. + +dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C s2_ptr r24 +C size r23 + +ASM_START() +PROLOGUE(mpn_sub_n) + ldws,ma 4(0,%r25),%r20 C first s1 limb, then s1_ptr += 4 + ldws,ma 4(0,%r24),%r19 C first s2 limb, then s2_ptr += 4 + + addib,<= -5,%r23,L(rest) C size -= 5; bypass the unrolled loop if the result is <= 0 + sub %r20,%r19,%r28 C subtract first limbs ignoring cy + +LDEF(loop) C unrolled: four limbs per pass, borrow chained through subb + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L(loop) C size -= 4; repeat while the result is > 0 + subb %r20,%r19,%r28 + +LDEF(rest) + addib,= 4,%r23,L(end) C add 4 back: r23 = limb pairs not yet loaded; none left goes to end + nop + +LDEF(eloop) C cleanup loop: one limb per pass + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L(eloop) + subb %r20,%r19,%r28 + +LDEF(end) + stws %r28,0(0,%r26) C store the last difference limb + addc %r0,%r0,%r28 C r28 = carry bit left by the last subtract + bv 0(%r2) C return through r2 + subi 1,%r28,%r28 C delay slot: return value = 1 - carry +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm new file mode 100644 index 0000000..5ea08cb --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm @@ -0,0 +1,207 @@ +dnl HP-PA 7100/7200 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 1995, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`res_ptr',`%r26') +define(`s1_ptr',`%r25') +define(`size_param',`%r24') +define(`s2_limb',`%r23') + +define(`cylimb',`%r28') +define(`s0',`%r19') +define(`s1',`%r20') +define(`s2',`%r3') +define(`s3',`%r4') +define(`lo0',`%r21') +define(`lo1',`%r5') +define(`lo2',`%r6') +define(`lo3',`%r7') +define(`hi0',`%r22') +define(`hi1',`%r23') C safe to reuse +define(`hi2',`%r29') +define(`hi3',`%r1') + +ASM_START() +PROLOGUE(mpn_submul_1) +C .callinfo frame=128,no_calls + + ldo 128(%r30),%r30 C allocate a 128-byte frame + stws s2_limb,-16(%r30) C spill the multiplier to the frame ... + add %r0,%r0,cylimb C clear cy and cylimb + addib,< -4,size_param,L(few_limbs) C size -= 4; fewer than four limbs use the one-limb loop + fldws -16(%r30),%fr31R C delay slot: ... and reload it into fr31R + + ldo -112(%r30),%r31 C r31 = base used for the product scratch slots below + stw %r3,-96(%r30) C save r3-r7, which hold s2, s3 and lo1-lo3 + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L(0) C bit 29 of s1_ptr clear: skip the one-limb step (presumably aligns s1_ptr for fldds; confirm) + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + sub s0,lo0,s0 + add s0,lo0,%r0 C invert cy + addib,< -1,size_param,L(few_limbs) C size -= 1; fewer than four limbs left use the one-limb loop + stws,ma s0,4(res_ptr) + +C start software pipeline ---------------------------------------------------- +LDEF(0) + fldds,ma 8(s1_ptr),%fr4 C load four source limbs as two doublewords + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) C move the four 64-bit products to integer registers via memory + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws
4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 C chain each high word into the next low word + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size_param,L(end) C size -= 4; no further group of four: drain at end + addc %r0,hi3,cylimb C propagate carry into cylimb +C main loop ------------------------------------------------------------------ +LDEF(loop) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + sub s0,lo0,s0 + fstds %fr6,-8(%r31) + subb s1,lo1,s1 + fstds %fr9,0(%r31) + subb s2,lo2,s2 + fstds %fr10,8(%r31) + subb s3,lo3,s3 + subb %r0,%r0,lo0 C these two insns ... + add lo0,lo0,%r0 C ... just invert cy + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size_param,L(loop) C size -= 4; repeat while the result is >= 0 + addc %r0,hi3,cylimb C propagate carry into cylimb +C finish software pipeline --------------------------------------------------- +LDEF(end) + ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + sub s0,lo0,s0 + stws,ma s0,4(res_ptr) + subb s1,lo1,s1 + stws,ma s1,4(res_ptr) + subb s2,lo2,s2 + stws,ma s2,4(res_ptr) + subb s3,lo3,s3 + stws,ma s3,4(res_ptr) + subb %r0,%r0,lo0 C these two insns ... + add lo0,lo0,%r0 C ...
invert cy + +C restore callee-saved registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + +LDEF(few_limbs) + addib,=,n 4,size_param,L(ret) C add 4 back to size; zero means no limbs remain + +LDEF(loop2) C one limb per pass + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + sub s0,lo0,s0 + add s0,lo0,%r0 C invert cy + stws,ma s0,4(res_ptr) + addib,<> -1,size_param,L(loop2) + nop + +LDEF(ret) + addc %r0,cylimb,cylimb C fold the pending carry bit into the returned limb + bv 0(%r2) C return through r2 + ldo -128(%r30),%r30 C delay slot: release the frame +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/sqr_diagonal.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/sqr_diagonal.asm new file mode 100644 index 0000000..1c7a18e --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/sqr_diagonal.asm @@ -0,0 +1,60 @@ +dnl HP-PA 1.1 32-bit mpn_sqr_diagonal. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library.
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C This code runs at 6 cycles/limb on the PA7100 and 2.5 cycles/limb on PA8x00. +C 2-way unrolling wouldn't help the PA7100; it could however bring times down +C to 2.0 cycles/limb for the PA8x00. + +C INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) + ldo 4(rp),rp C bias rp by 4 so each square is stored at offsets -4 and 0 + fldws,ma 4(up),%fr4r C load a source limb, then up += 4 + addib,= -1,n,L(exit) C n -= 1; a single limb goes straight to exit + xmpyu %fr4r,%fr4r,%fr5 C delay slot: fr5 = 64-bit square of the limb + +LDEF(loop) C one limb per pass: store the previous square, start the next + fldws,ma 4(up),%fr4r + fstws %fr5r,-4(rp) C right (low-order) half of the previous square + fstws,ma %fr5l,8(rp) C left (high-order) half at 0(rp), then rp += 8 + addib,<> -1,n,L(loop) + xmpyu %fr4r,%fr4r,%fr5 + +LDEF(exit) + fstws %fr5r,-4(rp) C store the final square + bv 0(%r2) C return through r2 + fstws %fr5l,0(rp) +EPILOGUE(mpn_sqr_diagonal) diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/submul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/submul_1.asm new file mode 100644 index 0000000..a9b11d2 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/submul_1.asm @@ -0,0 +1,115 @@ +dnl HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details.
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C size r24 +C s2_limb r23 + +C This runs at 12 cycles/limb on a PA7000. With the used instructions, it can +C not become faster due to data cache contention after a store. On the PA7100 +C it runs at 11 cycles/limb. + +C There are some ideas described in mul_1.asm that apply to this code too. + +C It seems possible to make this run as fast as mpn_addmul_1, if we use +C sub,>>= %r29,%r19,%r22 +C addi 1,%r28,%r28 +C but that requires reworking the hairy software pipeline... + +ASM_START() +PROLOGUE(mpn_submul_1) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 C allocate a 64-byte frame; -16(%r30) is the scratch doubleword + fldws,ma 4(%r25),%fr5 C first source limb, then s1_ptr += 4 + stw %r23,-16(%r30) C move s2_limb ... + addib,= -1,%r24,L(just_one_limb) C size -= 1; a single limb is handled separately + fldws -16(%r30),%fr4 C ... into fr4 + add %r0,%r0,%r0 C clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 C least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L(end) C size -= 1; exactly two limbs skip the main loop + ldw -12(%r30),%r1 + +C Main loop +LDEF(loop) + ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + sub %r29,%r19,%r22 C r22 = result limb minus the pending product word + add %r22,%r19,%r0 C sum is discarded; carry is set exactly when the sub borrowed + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 C next pending word: old high word, new low word and that carry + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L(loop) + ldw -12(%r30),%r1 + +LDEF(end) C drain the pipeline: two result limbs remain + ldw 0(%r26),%r29 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r0,%r28,%r28 C r28 = final high word plus carry + bv 0(%r2) C return through r2 + ldo -64(%r30),%r30 C delay slot: release the frame + +LDEF(just_one_limb) + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + sub %r29,%r1,%r22
+ add %r22,%r1,%r0 C sum is discarded; carry is set exactly when the sub borrowed + stw %r22,0(%r26) + addc %r0,%r28,%r28 C r28 = high product word plus that carry + bv 0(%r2) C return through r2 + ldo -64(%r30),%r30 C delay slot: release the frame +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/udiv.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/udiv.asm new file mode 100644 index 0000000..626ecd2 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/udiv.asm @@ -0,0 +1,102 @@ +dnl HP-PA __udiv_qrnnd division support, used from longlong.h. +dnl This version runs fast on PA 7000 and later. + +dnl Copyright 1993, 1994, 2000, 2001, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr gr26 +C n1 gr25 +C n0 gr24 +C d gr23 + +C This file has caused a lot of trouble, since it demands PIC reference to +C static data, which triggers bugs in gas (at least version 2.7 through +C 2.11.2). When the bug is triggered, many bogus relocs are generated. The +C current solution is to stuff data right into the code, and refer it using +C absolute offsets.
Fragile to be sure, but nothing else seems to work. + +ASM_START() +ifdef(`PIC',`', +` RODATA + INT64(0000, 0x43f00000, 0x0) C 2^64 +') + +PROLOGUE(mpn_udiv_qrnnd) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 C allocate a 64-byte frame + + stws %r25,-16(0,%r30) C n_hi + stws %r24,-12(0,%r30) C n_lo + +ifdef(`PIC', +` bl .+20,%r31 + dep %r0,31,2,%r31 + .word 0x0 C padding for alignment + .word 0x43f00000, 0x0 C 2^64 + ldo 4(%r31),%r31', +` ldil `L'%L(0000),%r31 + ldo R%L(0000)(%r31),%r31') + + fldds -16(0,%r30),%fr5 C fr5 = n1:n0 as one 64-bit integer + stws %r23,-12(0,%r30) C spill d for the fldws below + comib,<= 0,%r25,L(1) C n1 non-negative as a signed word: no correction needed + fcnvxf,dbl,dbl %fr5,%fr5 C delay slot: convert the 64-bit integer to double + fldds 0(0,%r31),%fr4 C load the 2^64 constant + fadd,dbl %fr4,%fr5,%fr5 C add 2^64 to undo the signed interpretation + +LDEF(1) + fcpy,sgl %fr0,%fr6L C zero the left half so fr6 holds d zero-extended + fldws -12(0,%r30),%fr6R + fcnvxf,dbl,dbl %fr6,%fr4 C fr4 = d as a double + + fdiv,dbl %fr5,%fr4,%fr5 C floating-point quotient n / d + + fcnvfx,dbl,dbl %fr5,%fr4 C convert the quotient back to an integer + fstws %fr4R,-16(%r30) + xmpyu %fr4R,%fr6R,%fr6 C fr6 = quotient estimate times d + ldws -16(%r30),%r28 C r28 = quotient estimate + fstds %fr6,-16(0,%r30) + ldws -12(0,%r30),%r21 + ldws -16(0,%r30),%r20 + sub %r24,%r21,%r22 C r22 = low word of n minus q*d + subb %r25,%r20,%r20 C r20 = high word of that difference + comib,= 0,%r20,L(2) C high word zero: the estimate is taken as final + ldo -64(%r30),%r30 C delay slot: release the frame + + add %r22,%r23,%r22 C otherwise add d back to the remainder ... + ldo -1(%r28),%r28 C ... and decrement the quotient + +LDEF(2) + bv 0(%r2) C return through r2 + stws %r22,0(0,%r26) C delay slot: store the remainder through rem_ptr + +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/umul.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/umul.asm new file mode 100644 index 0000000..18b923c --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/umul.asm @@ -0,0 +1,47 @@ +dnl Copyright 1999, 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 C allocate a 64-byte frame + stw %r25,-16(0,%r30) C move r25 ... + fldws -16(0,%r30),%fr22R C ... into fr22R through the frame + stw %r24,-16(0,%r30) C move r24 ... + fldws -16(0,%r30),%fr22L C ... into fr22L through the frame + xmpyu %fr22R,%fr22L,%fr22 C fr22 = 64-bit unsigned product of the two words + fstds %fr22,-16(0,%r30) + ldw -16(0,%r30),%r28 C r28 = first (high-order) word of the product + ldw -12(0,%r30),%r29 C r29 = second (low-order) word of the product + stw %r29,0(0,%r26) C store the low-order word through the pointer in r26 + bv 0(%r2) C return through r2 + ldo -64(%r30),%r30 C delay slot: release the frame +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa2_0/add_n.asm b/gmp-6.3.0/mpn/pa32/hppa2_0/add_n.asm new file mode 100644 index 0000000..8d881b8 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa2_0/add_n.asm @@ -0,0 +1,107 @@ +dnl HP-PA 2.0 32-bit mpn_add_n -- Add two limb vectors of the same length > 0 +dnl and store sum in a third limb vector. + +dnl Copyright 1997, 1998, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C This runs at 2 cycles/limb on PA8000. + +ASM_START() +PROLOGUE(mpn_add_n) + sub %r0,%r23,%r22 C r22 = -n + zdep %r22,30,3,%r28 C r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 C r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 C offset s1_ptr + sub %r24,%r22,%r24 C offset s2_ptr + sub %r26,%r22,%r26 C offset res_ptr + blr %r28,%r0 C branch into loop + add %r0,%r0,%r0 C reset carry + +LDEF(loop) C eight limbs per pass; the first pass is entered part-way via blr + ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,0(%r26) + +LDEF(7) + ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,4(%r26) + +LDEF(6) + ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,8(%r26) + +LDEF(5) + ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,12(%r26) + +LDEF(4) + ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,16(%r26) + +LDEF(3) + ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,20(%r26) + +LDEF(2) + ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,24(%r26) + +LDEF(1) + ldw 28(%r25),%r21 + ldo 32(%r25),%r25 C advance s1_ptr by eight limbs + ldw 28(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 C advance s2_ptr by eight limbs + addib,> -8,%r23,L(loop) C n -= 8; another pass while the result is > 0 + ldo 32(%r26),%r26 C delay slot: advance res_ptr by eight limbs + + bv (%r2) C return through r2 + addc %r0,%r0,%r28 C delay slot: return value = final carry bit +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa2_0/gmp-mparam.h b/gmp-6.3.0/mpn/pa32/hppa2_0/gmp-mparam.h new file mode 100644 index 0000000..6016274 --- /dev/null +++
b/gmp-6.3.0/mpn/pa32/hppa2_0/gmp-mparam.h @@ -0,0 +1,167 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2009, 2010 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 552 MHz PA8600 (gcc61.fsffrance.org) */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 11 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 28 +#define USE_PREINV_DIVREM_1 1 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 36 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 202 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102 + +#define SQR_BASECASE_THRESHOLD 7 +#define SQR_TOOM2_THRESHOLD 55 +#define SQR_TOOM3_THRESHOLD 93 +#define SQR_TOOM4_THRESHOLD 250 +#define SQR_TOOM6_THRESHOLD 306 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 244, 5}, { 8, 4}, { 17, 5}, { 13, 6}, \ + { 7, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 11, 6}, { 24, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 8}, \ + { 15, 7}, { 33, 8}, { 23, 9}, { 15, 8}, \ + { 39, 9}, { 23,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47,10}, \ + { 31, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135, 8}, { 271, 9}, { 143,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175, 8}, \ + { 351,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575,10}, { 159, 9}, { 319,10}, { 175, 
9}, \ + { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207, 9}, { 415,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 351, 9}, { 703, 8}, { 1407,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223, 9}, \ + { 895,10}, { 479,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 351,10}, { 703, 9}, { 1407,12}, \ + { 191,11}, { 415,10}, { 831,11}, { 479,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 107 +#define MUL_FFT_THRESHOLD 2112 + +#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 240, 5}, { 8, 4}, { 17, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 9}, { 15, 8}, \ + { 39, 9}, { 23,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 47,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 7}, { 511, 9}, { 135, 8}, { 271, 9}, \ + { 143,10}, { 79, 9}, { 159, 8}, { 319, 9}, \ + { 175, 8}, { 351, 7}, { 703,10}, { 95, 9}, \ + { 191, 8}, { 383, 9}, { 207,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351, 8}, { 703,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351, 9}, { 703, 8}, { 1407,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223, 8}, \ + { 1791,10}, { 479, 9}, { 959,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703, 9}, \ + { 1407,12}, { 191,11}, { 415,10}, { 831,11}, \ + { 479,10}, { 959,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 109 +#define 
SQR_FFT_THRESHOLD 1600 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 116 +#define MULLO_MUL_N_THRESHOLD 3574 + +#define DC_DIV_QR_THRESHOLD 100 +#define DC_DIVAPPR_Q_THRESHOLD 348 +#define DC_BDIV_QR_THRESHOLD 109 +#define DC_BDIV_Q_THRESHOLD 254 + +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 276 +#define INV_APPR_THRESHOLD 276 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_N_THRESHOLD 78 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 263 +#define MUPI_DIV_QR_THRESHOLD 102 +#define MU_BDIV_QR_THRESHOLD 807 +#define MU_BDIV_Q_THRESHOLD 1187 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD_THRESHOLD 100 +#define GCD_DC_THRESHOLD 379 +#define GCDEXT_DC_THRESHOLD 249 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 7 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 270 +#define SET_STR_PRECOMPUTE_THRESHOLD 782 diff --git a/gmp-6.3.0/mpn/pa32/hppa2_0/sqr_diagonal.asm b/gmp-6.3.0/mpn/pa32/hppa2_0/sqr_diagonal.asm new file mode 100644 index 0000000..c55112f --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa2_0/sqr_diagonal.asm @@ -0,0 +1,112 @@ +dnl HP-PA 32-bit mpn_sqr_diagonal optimized for the PA8x00. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C This code runs at 6 cycles/limb on the PA7100 and 2 cycles/limb on PA8x00. +C The 2-way unrolling is actually not helping the PA7100. + +C INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) + + fldws,ma 4(up),%fr4r C first limb, then up += 4 + addib,= -1,n,L(end1) C n -= 1; exactly one limb finishes at end1 + ldo 4(rp),rp C delay slot: bias rp by 4 so stores use offsets -4 and 0 + + fldws,ma 4(up),%fr6r C second limb + addib,= -1,n,L(end2) C exactly two limbs finish at end2 + xmpyu %fr4r,%fr4r,%fr5 C delay slot: fr5 = square of the first limb + + fldws,ma 4(up),%fr4r C third limb + addib,= -1,n,L(end3) C exactly three limbs finish at end3 + xmpyu %fr6r,%fr6r,%fr7 C delay slot: fr7 = square of the second limb + + +LDEF(loop) C two limbs per pass, squares alternate between fr5 and fr7 + fldws,ma 4(up),%fr6r + fstws %fr5r,-4(rp) + fstws,ma %fr5l,8(rp) C store at 0(rp), then rp += 8 + addib,= -1,n,L(exite) + xmpyu %fr4r,%fr4r,%fr5 + fldws,ma 4(up),%fr4r + fstws %fr7r,-4(rp) + fstws,ma %fr7l,8(rp) + addib,<> -1,n,L(loop) + xmpyu %fr6r,%fr6r,%fr7 + +LDEF(exito) C loop left with the newest limb in fr4r + fstws %fr5r,-4(rp) + fstws %fr5l,0(rp) + xmpyu %fr4r,%fr4r,%fr5 C square of the last limb + fstws %fr7r,4(rp) + fstws %fr7l,8(rp) + fstws,mb %fr5r,12(rp) C rp += 12 first, then store + bv 0(%r2) C return through r2 + fstws %fr5l,4(rp) + +LDEF(exite) C loop left with the newest limb in fr6r + fstws %fr7r,-4(rp) + fstws %fr7l,0(rp) + xmpyu %fr6r,%fr6r,%fr7 C square of the last limb + fstws %fr5r,4(rp) + fstws %fr5l,8(rp) + fstws,mb %fr7r,12(rp) C rp += 12 first, then store + bv 0(%r2) C return through r2 + fstws %fr7l,4(rp) + +LDEF(end1) + xmpyu %fr4r,%fr4r,%fr5 + fstws %fr5r,-4(rp) + bv 0(%r2) + fstws,ma %fr5l,8(rp) + +LDEF(end2) + xmpyu %fr6r,%fr6r,%fr7 + fstws %fr5r,-4(rp) + fstws %fr5l,0(rp) + fstws %fr7r,4(rp) + bv 0(%r2) + fstws %fr7l,8(rp) + +LDEF(end3) + fstws %fr5r,-4(rp) + fstws %fr5l,0(rp) + xmpyu %fr4r,%fr4r,%fr5 + fstws %fr7r,4(rp) + fstws %fr7l,8(rp) + fstws,mb %fr5r,12(rp) + bv 0(%r2) + fstws %fr5l,4(rp)
+EPILOGUE(mpn_sqr_diagonal) diff --git a/gmp-6.3.0/mpn/pa32/hppa2_0/sub_n.asm b/gmp-6.3.0/mpn/pa32/hppa2_0/sub_n.asm new file mode 100644 index 0000000..47b3163 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa2_0/sub_n.asm @@ -0,0 +1,107 @@ +dnl HP-PA 2.0 32-bit mpn_sub_n -- Subtract two limb vectors of the same +dnl length > 0 and store difference in a third limb vector. + +dnl Copyright 1997, 1998, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C This runs at 2 cycles/limb on PA8000.
+ +ASM_START() +PROLOGUE(mpn_sub_n) + sub %r0,%r23,%r22 C r22 = -n + zdep %r22,30,3,%r28 C r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 C r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 C offset s1_ptr + sub %r24,%r22,%r24 C offset s2_ptr + blr %r28,%r0 C branch into loop + sub %r26,%r22,%r26 C offset res_ptr and set carry + +LDEF(loop) C eight limbs per pass; the first pass is entered part-way via blr + ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,0(%r26) + +LDEF(7) + ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,4(%r26) + +LDEF(6) + ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,8(%r26) + +LDEF(5) + ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,12(%r26) + +LDEF(4) + ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,16(%r26) + +LDEF(3) + ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,20(%r26) + +LDEF(2) + ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,24(%r26) + +LDEF(1) + ldw 28(%r25),%r21 + ldo 32(%r25),%r25 C advance s1_ptr by eight limbs + ldw 28(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 C advance s2_ptr by eight limbs + addib,> -8,%r23,L(loop) C n -= 8; another pass while the result is > 0 + ldo 32(%r26),%r26 C delay slot: advance res_ptr by eight limbs + + addc %r0,%r0,%r28 C r28 = carry bit left by the last subb + bv (%r2) C return through r2 + subi 1,%r28,%r28 C delay slot: return value = 1 - carry +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/lshift.asm b/gmp-6.3.0/mpn/pa32/lshift.asm new file mode 100644 index 0000000..5ea497c --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/lshift.asm @@ -0,0 +1,75 @@ +dnl HP-PA mpn_lshift -- Shift a number left. + +dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr gr26   (destination limb vector)
+C s_ptr gr25     (source limb vector)
+C size gr24      (number of limbs)
+C cnt gr23       (shift count in bits)
+C Limbs are processed from the highest address down; gr28 returns the bits shifted out.
+ASM_START()
+PROLOGUE(mpn_lshift)
+       sh2add          %r24,%r25,%r25          C r25 = s_ptr + 4*size (just past the last limb)
+       sh2add          %r24,%r26,%r26          C r26 = res_ptr + 4*size
+       ldws,mb         -4(0,%r25),%r22         C pre-decrement and load the topmost source limb
+       subi            32,%r23,%r1             C r1 = 32 - cnt
+       mtsar           %r1                     C shift amount register = 32 - cnt
+       addib,=         -1,%r24,L(0004)         C size -= 1; only one limb? then finish up
+       vshd            %r0,%r22,%r28           C compute carry out limb
+       ldws,mb         -4(0,%r25),%r29         C load next lower limb
+       addib,=         -1,%r24,L(0002)         C size -= 1; no limbs left to load? then finish up
+       vshd            %r22,%r29,%r20          C r20 = (r22:r29) >> (32-cnt), a finished result limb
+
+LDEF(loop)
+       ldws,mb         -4(0,%r25),%r22         C loop is unrolled twice; r22 and r29 alternate as newest limb
+       stws,mb         %r20,-4(0,%r26)         C store the pending result limb
+       addib,=         -1,%r24,L(0003)         C size -= 1; exit when the limb in r22 was the last
+       vshd            %r29,%r22,%r20          C next result limb (delay slot)
+       ldws,mb         -4(0,%r25),%r29
+       stws,mb         %r20,-4(0,%r26)
+       addib,<>        -1,%r24,L(loop)         C size -= 1; continue while limbs remain
+       vshd            %r22,%r29,%r20          C next result limb (delay slot)
+
+LDEF(0002)
+       stws,mb         %r20,-4(0,%r26)         C store the pending result limb
+       vshd            %r29,%r0,%r20           C lowest result limb = r29 << cnt
+       bv              0(%r2)                  C return to caller
+       stw             %r20,-4(0,%r26)         C store the lowest limb (delay slot)
+
+LDEF(0003)
+       stws,mb         %r20,-4(0,%r26)         C store the pending result limb, then fall through
+
+LDEF(0004)
+       vshd            %r22,%r0,%r20           C lowest result limb = r22 << cnt
+       bv              0(%r2)                  C return to caller
+       stw             %r20,-4(0,%r26)         C store the lowest limb (delay slot)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/pa32/pa-defs.m4 b/gmp-6.3.0/mpn/pa32/pa-defs.m4
new file mode 100644
index 0000000..b26e715
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/pa-defs.m4
@@ -0,0 +1,64 @@
+divert(-1)
+
+dnl m4 macros for HPPA assembler.
+
+dnl Copyright 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl hppa assembler comments are introduced with ";".
+dnl
+dnl For cooperation with cpp, apparently lines "# 123" set the line number,
+dnl and other lines starting with a "#" are ignored.
+
+changecom(;)
+
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl These are the same as the basic PROLOGUE_cpu and EPILOGUE_cpu in
+dnl mpn/asm-defs.m4, but using .proc / .procend. These are standard and on
+dnl an ELF system they do what .type and .size normally do.
+dnl PROLOGUE_cpu takes one argument, the symbol: it is exported as an entry point and used as the label.
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+       `.code
+       ALIGN(8)
+       .export `$1',entry
+`$1'LABEL_SUFFIX'
+       .proc
+       .callinfo) dnl This is really bogus, but allows us to compile
+       dnl again on hppa machines.
+
+dnl EPILOGUE_cpu takes one argument, which its body does not use, and emits only .procend.
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`      .procend')
+
+divert
diff --git a/gmp-6.3.0/mpn/pa32/rshift.asm b/gmp-6.3.0/mpn/pa32/rshift.asm
new file mode 100644
index 0000000..c5eac83
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/rshift.asm
@@ -0,0 +1,72 @@
+dnl HP-PA mpn_rshift -- Shift a number right.
+
+dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr gr26   (destination limb vector)
+C s_ptr gr25     (source limb vector)
+C size gr24      (number of limbs)
+C cnt gr23       (shift count in bits)
+C Limbs are processed from the lowest address up; gr28 returns the bits shifted out.
+ASM_START()
+PROLOGUE(mpn_rshift)
+       ldws,ma         4(0,%r25),%r22          C load the first source limb, post-increment s_ptr
+       mtsar           %r23                    C shift amount register = cnt
+       addib,=         -1,%r24,L(0004)         C size -= 1; only one limb? then finish up
+       vshd            %r22,%r0,%r28           C compute carry out limb
+       ldws,ma         4(0,%r25),%r29          C load next higher limb
+       addib,=         -1,%r24,L(0002)         C size -= 1; no limbs left to load? then finish up
+       vshd            %r29,%r22,%r20          C r20 = (r29:r22) >> cnt, a finished result limb
+
+LDEF(loop)
+       ldws,ma         4(0,%r25),%r22          C loop is unrolled twice; r22 and r29 alternate as newest limb
+       stws,ma         %r20,4(0,%r26)          C store the pending result limb
+       addib,=         -1,%r24,L(0003)         C size -= 1; exit when the limb in r22 was the last
+       vshd            %r22,%r29,%r20          C next result limb (delay slot)
+       ldws,ma         4(0,%r25),%r29
+       stws,ma         %r20,4(0,%r26)
+       addib,<>        -1,%r24,L(loop)         C size -= 1; continue while limbs remain
+       vshd            %r29,%r22,%r20          C next result limb (delay slot)
+
+LDEF(0002)
+       stws,ma         %r20,4(0,%r26)          C store the pending result limb
+       vshd            %r0,%r29,%r20           C last result limb = r29 >> cnt
+       bv              0(%r2)                  C return to caller
+       stw             %r20,0(0,%r26)          C store the last limb (delay slot)
+
+LDEF(0003)
+       stws,ma         %r20,4(0,%r26)          C store the pending result limb, then fall through
+
+LDEF(0004)
+       vshd            %r0,%r22,%r20           C last result limb = r22 >> cnt
+       bv              0(%r2)                  C return to caller
+       stw             %r20,0(0,%r26)          C store the last limb (delay slot)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/pa32/sub_n.asm b/gmp-6.3.0/mpn/pa32/sub_n.asm
new file mode 100644
index 0000000..9c71655
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/sub_n.asm
@@ -0,0 +1,64 @@
+dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr gr26   (destination limb vector)
+C s1_ptr gr25    (minuend limb vector)
+C s2_ptr gr24    (subtrahend limb vector)
+C size gr23      (number of limbs, > 0)
+
+C One might want to unroll this as for other processors, but it turns out that
+C the data cache contention after a store makes such unrolling useless. We
+C can't come under 5 cycles/limb anyway.
+C On return gr28 holds the borrow out of the last limb (0 or 1).
+ASM_START()
+PROLOGUE(mpn_sub_n)
+       ldws,ma         4(0,%r25),%r20          C load first s1 limb, post-increment s1_ptr
+       ldws,ma         4(0,%r24),%r19          C load first s2 limb, post-increment s2_ptr
+
+       addib,=         -1,%r23,L(end)          C check for (SIZE == 1)
+       sub             %r20,%r19,%r28          C subtract first limbs ignoring cy
+
+LDEF(loop)
+       ldws,ma         4(0,%r25),%r20          C load next s1 limb
+       ldws,ma         4(0,%r24),%r19          C load next s2 limb
+       stws,ma         %r28,4(0,%r26)          C store the previous difference limb
+       addib,<>        -1,%r23,L(loop)         C size -= 1; continue while limbs remain
+       subb            %r20,%r19,%r28          C subtract with borrow (delay slot)
+
+LDEF(end)
+       stws            %r28,0(0,%r26)          C store the last difference limb
+       addc            %r0,%r0,%r28            C r28 = carry bit left by the last subtract
+       bv              0(%r2)                  C return to caller
+       subi            1,%r28,%r28             C r28 = 1 - carry, i.e. the borrow (delay slot)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/pa32/udiv.asm b/gmp-6.3.0/mpn/pa32/udiv.asm
new file mode 100644
index 0000000..addbf41
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/udiv.asm
@@ -0,0 +1,291 @@
+dnl HP-PA __udiv_qrnnd division support, used from longlong.h.
+dnl This version runs fast on pre-PA7000 CPUs.
+
+dnl Copyright 1993, 1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr gr26   (the remainder is stored through this pointer)
+C n1 gr25        (high word of the dividend)
+C n0 gr24        (low word of the dividend)
+C d gr23         (divisor)
+
+C The code size is a bit excessive. We could merge the last two ds;addc
+C sequences by simply moving the "bb,< Odd" instruction down. The only
+C trouble is the FFFFFFFF code that would need some hacking.
+C Every return path leaves the quotient in gr28 and stores the remainder at rem_ptr.
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+       comb,<          %r23,0,L(largedivisor)  C d negative as a signed word (top bit set)? handle separately
+       sub             %r0,%r23,%r1            C clear cy as side-effect
+       ds              %r0,%r1,%r0             C start the divide-step sequence using r1 = -d; result discarded
+       addc            %r24,%r24,%r24          C shift n0 left by one; its top bit goes to carry
+       ds              %r25,%r23,%r25          C divide steps 1-8 (each ds is followed by an addc shift of r24)
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25          C divide steps 9-16
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25          C divide steps 17-24
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25          C divide steps 25-30
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r23,%r25          C divide step 31
+       addc            %r24,%r24,%r28          C from here on the shifted word lives in r28
+       ds              %r25,%r23,%r25          C divide step 32
+       comclr,>=       %r25,%r0,%r0            C skip the next insn if r25 >= 0 (signed)
+       addl            %r25,%r23,%r25          C otherwise add d back to r25
+       stws            %r25,0(0,%r26)          C store remainder at rem_ptr
+       bv              0(%r2)                  C return to caller
+       addc            %r28,%r28,%r28          C final shift-with-carry into r28 (delay slot)
+
+LDEF(largedivisor)
+       extru           %r24,31,1,%r19          C r19 = n0 & 1
+       bb,<            %r23,31,L(odd)          C lowest bit of d set? then take the odd-divisor path
+       extru           %r23,30,31,%r22         C r22 = d >> 1
+       shd             %r25,%r24,1,%r24        C r24 = new n0
+       extru           %r25,30,31,%r25         C r25 = new n1
+       sub             %r0,%r22,%r21           C r21 = -r22
+       ds              %r0,%r21,%r0            C start the divide-step sequence using r21; result discarded
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25          C divide steps 1-8, now dividing by r22
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25          C divide steps 9-16
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25          C divide steps 17-24
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25          C divide steps 25-31
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25          C divide step 32
+       comclr,>=       %r25,%r0,%r0            C skip the next insn if r25 >= 0 (signed)
+       addl            %r25,%r22,%r25          C otherwise add r22 back to r25
+       sh1addl         %r25,%r19,%r25          C r25 = 2*r25 + r19, restoring the saved low bit of n0
+       stws            %r25,0(0,%r26)          C store remainder at rem_ptr
+       bv              0(%r2)                  C return to caller
+       addc            %r24,%r24,%r28          C final shift-with-carry gives the quotient in r28 (delay slot)
+
+LDEF(odd)
+       addib,sv,n      1,%r22,L(FFFFFFFF)      C r22 = (d / 2 + 1)
+       shd             %r25,%r24,1,%r24        C r24 = new n0
+       extru           %r25,30,31,%r25         C r25 = new n1
+       sub             %r0,%r22,%r21           C r21 = -r22
+       ds              %r0,%r21,%r0            C start the divide-step sequence using r21; result discarded
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25          C divide steps 1-8, dividing by r22
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25          C divide steps 9-16
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25          C divide steps 17-24
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25          C divide steps 25-31
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25
+       addc            %r24,%r24,%r24
+       ds              %r25,%r22,%r25          C divide step 32
+       addc            %r24,%r24,%r28          C final shift-with-carry gives q in r28
+       comclr,>=       %r25,%r0,%r0            C skip the next insn if r25 >= 0 (signed)
+       addl            %r25,%r22,%r25          C otherwise add r22 back to r25
+       sh1addl         %r25,%r19,%r25          C r25 = 2*r25 + r19, restoring the saved low bit of n0
+C We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+       add,nuv         %r28,%r25,%r25          C r25 = r + q; skip the next insn unless the add carried out
+       addl            %r25,%r1,%r25           C r25 -= d (r1 = -d, set in the entry delay slot)
+       addc            %r0,%r28,%r28           C q += carry
+       sub,<<          %r25,%r23,%r0           C skip the next insn if r25 < d (unsigned)
+       addl            %r25,%r1,%r25           C r25 -= d
+       stws            %r25,0(0,%r26)          C store remainder at rem_ptr
+       bv              0(%r2)                  C return to caller
+       addc            %r0,%r28,%r28           C q += carry (delay slot)
+
+C This is just a special case of the code above.
+C We come here when d == 0xFFFFFFFF
+LDEF(FFFFFFFF)
+       add,uv          %r25,%r24,%r24          C r24 = n1 + n0; skip the next insn if the add carried out
+       sub,<<          %r24,%r23,%r0           C skip the next insn if r24 < d (unsigned)
+       ldo             1(%r24),%r24            C r24 += 1
+       stws            %r24,0(0,%r26)          C store remainder at rem_ptr
+       bv              0(%r2)                  C return to caller
+       addc            %r0,%r25,%r28           C quotient = n1 + carry (delay slot)
+EPILOGUE()
-- 
cgit v1.2.3