From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/generic/invertappr.c | 300 +++++++++++++++++++++++++++++++++++++
 1 file changed, 300 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/generic/invertappr.c

diff --git a/gmp-6.3.0/mpn/generic/invertappr.c b/gmp-6.3.0/mpn/generic/invertappr.c
new file mode 100644
index 0000000..3be5596
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/invertappr.c
@@ -0,0 +1,300 @@
/* mpn_invertappr and helper functions.  Compute I such that
   floor((B^{2n}-1)/U) - 1 <= I + B^n <= floor((B^{2n}-1)/U).

   Contributed to the GNU project by Marco Bodrato.

   The algorithm used here was inspired by ApproximateReciprocal from "Modern
   Computer Arithmetic", by Richard P. Brent and Paul Zimmermann.  Special
   thanks to Paul Zimmermann for his very valuable suggestions on all the
   theoretical aspects during the work on this code.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright (C) 2007, 2009, 2010, 2012, 2015, 2016 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

/* FIXME: The iterative version splits the operand into two slightly
   unbalanced parts, so using log_2 (or counting the bits) underestimates
   the maximum number of iterations.  */

#if TUNE_PROGRAM_BUILD
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
#define MAYBE_dcpi1_divappr   1
#else
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
#define MAYBE_dcpi1_divappr \
  (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
    (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
#undef  INV_MULMOD_BNM1_THRESHOLD
#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
#endif
#endif

/* All three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch) take the
   strictly normalised value {dp,n} (i.e., the most significant bit must be
   set) as input, and compute {ip,n}: the approximate reciprocal of {dp,n}.

   Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
   following conditions are satisfied by the output:
     0 <= e <= 1;
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) .
   I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert;
   e=1 means that the result _may_ be one less than expected.

   The _bc version returns e=1 most of the time.
   The _ni version should return e=0 most of the time; only about 1% of
   random inputs should give e=1.

   When the strict result is needed, i.e., e=0 in the relation above:
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ;
   the function mpn_invert (ip, dp, n, scratch) should be used instead.  */
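/* A worked toy instance of the relation above (illustration only: GMP's B is
   the full limb base, but take B = 256 and n = 1 for readability).  For the
   normalised divisor {dp,1} = 128, i.e. 1/2, we have
   floor((B^2-1)/128) = 511, so the exact reciprocal is I = 511 - 256 = 255.
   With e = 0 only {ip,1} = 255 is allowed:
     128*(256+255) = 65408 < 65536 <= 128*(256+255+1) = 65536.
   With e = 1 the weaker relation also admits {ip,1} = 254, one below the
   exact value:
     128*(256+254) = 65280 < 65536 <= 128*(256+254+1+1) = 65536.  */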
/* Maximum scratch needed by this branch (at xp): 2*n limbs.  */
static mp_limb_t
mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n)));

  /* Compute a base value of n limbs.  */
  if (n == 1)
    invert_limb (*ip, *dp);
  else {
    /* n > 1 here */
    MPN_FILL (xp, n, GMP_NUMB_MAX);
    mpn_com (xp + n, dp, n);

    /* Now xp contains B^2n - {dp,n}*B^n - 1 */

    /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
    if (n == 2) {
      mpn_divrem_2 (ip, 0, xp, 4, dp);
    } else {
      gmp_pi1_t inv;
      invert_pi1 (inv, dp[n-1], dp[n-2]);
      if (! MAYBE_dcpi1_divappr
	  || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
	mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
      else
	mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
      MPN_DECR_U (ip, n, CNST_LIMB (1));
      return 1;
    }
  }
  return 0;
}

/* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
   iterations (at least one).

   Inspired by Algorithm "ApproximateReciprocal", published in "Modern
   Computer Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm
   3.5, page 121 in version 0.4 of the book.

   Some adaptations were introduced, to allow a product mod B^m-1 and to
   return the value e.

   We introduced a correction in such a way that "the value of B^{n+h}-T
   computed at step 8 cannot exceed B^n-1" (the book reads "2B^n-1").

   Maximum scratch needed by this branch is <= 2*n, but we also have to fit
   3*rn limbs in the scratch, i.e. 3*rn <= 2*n: this is why we require n > 4.

   We use a wrapped product modulo B^m-1.  NOTE: is there any normalisation
   problem for the [0] class?  There shouldn't be: we compute
   2*|A*X_h - B^{n+h}| < B^m-1.  We may get [0] if and only if we get
   A*X_h = B^{n+h}.  This can happen only if A = B^n/2, but this implies
   X_h = B^h*2-1, i.e. A*X_h = B^{n+h} - A, and then we get into the
   "negative" branch, where X_h is not incremented (because A < B^n).

   FIXME: the scratch needed by mulmod_bnm1 does not currently fit in the
   scratch area; it is allocated separately.  */

mp_limb_t
mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  mp_limb_t cy;
  mp_size_t rn, mn;
  mp_size_t sizes[NPOWS], *sizp;
  mp_ptr tp;
  TMP_DECL;
#define xp scratch

  ASSERT (n > 4);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  /* Compute the computation precisions from highest to lowest, leaving the
     base case size in 'rn'.  */
  sizp = sizes;
  rn = n;
  do {
    *sizp = rn;
    rn = (rn >> 1) + 1;
    ++sizp;
  } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));
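  /* Illustration of the precision sequence computed above, with hypothetical
     numbers: for n = 100 and INV_NEWTON_THRESHOLD = 10, the loop stores
     100, 51, 26, 14 in sizes[] and leaves rn = 8.  The base case below is
     then run at 8 limbs, and each Newton pass roughly doubles the precision:
     8 -> 14 -> 26 -> 51 -> 100.  */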
  /* We search the inverse of 0.{dp,n}; we compute it as 1.{ip,n}.  */
  dp += n;
  ip += n;

  /* Compute a base value of rn limbs.  */
  mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);

  TMP_MARK;

  if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
    {
      mn = mpn_mulmod_bnm1_next_size (n + 1);
      tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
    }

  /* Use Newton's iterations to get the desired precision.  */

  while (1) {
    n = *--sizp;
    /*
       v    n  v
       +----+--+
       ^ rn ^
    */

    /* Compute i_j * d.  */
    if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
	|| ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
      /* FIXME: we only need {xp,n+1}.  */
      mpn_mul (xp, dp - n, n, ip - rn, rn);
      mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
      cy = CNST_LIMB (1); /* Remember we truncated, mod B^(n+1).  */
      /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n}.  */
    } else { /* Use B^mn-1 wraparound.  */
      mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
      /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1).  */
      /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1.  */
      /* Add dp*B^rn mod (B^mn-1).  */
      ASSERT (n >= mn - rn);
      cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
      cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy);
      /* Subtract B^{rn+n}; maybe only compensate the carry.  */
      xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */
      MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy);
      MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */
      cy = CNST_LIMB (0); /* Remember we are working mod B^mn-1.  */
    }

    if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */
      cy = xp[n]; /* 0 <= cy <= 1 here.  */
#if HAVE_NATIVE_mpn_sublsh1_n
      if (cy++) {
	if (mpn_cmp (xp, dp - n, n) > 0) {
	  mp_limb_t chk;
	  chk = mpn_sublsh1_n (xp, xp, dp - n, n);
	  ASSERT (chk == xp[n]);
	  ++cy;
	} else
	  ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
      }
#else /* no mpn_sublsh1_n */
      if (cy++ && ! mpn_sub_n (xp, xp, dp - n, n)) {
	ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
	++cy;
      }
#endif
      /* 1 <= cy <= 3 here.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
      if (mpn_cmp (xp, dp - n, n) > 0) {
	ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n));
	++cy;
      } else
	ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn,
				    mpn_cmp (xp, dp - n, n - rn) > 0));
#else /* no mpn_rsblsh1_n */
      if (mpn_cmp (xp, dp - n, n) > 0) {
	ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n));
	++cy;
      }
      ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn,
				  mpn_cmp (xp, dp - n, n - rn) > 0));
#endif
      MPN_DECR_U (ip - rn, rn, cy); /* 1 <= cy <= 4 here.  */
    } else { /* "negative" residue class */
      ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB (1));
      MPN_DECR_U (xp, n + 1, cy);
      if (xp[n] != GMP_NUMB_MAX) {
	MPN_INCR_U (ip - rn, rn, CNST_LIMB (1));
	ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n));
      }
      mpn_com (xp + 2 * n - rn, xp + n - rn, rn);
    }

    /* Compute x_j * u_j.  FIXME: we only need {xp+rn,rn}; mulhi?  */
    mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn);
    cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n);
    cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy);
    MPN_INCR_U (ip - rn, rn, cy);
    if (sizp == sizes) { /* Get out of the cycle */
      /* Check for possible carry propagation from below.  */
      cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative.  */
      /* cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */
      break;
    }
    rn = n;
  }
  TMP_FREE;

  return cy;
#undef xp
}
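/* Illustrative sketch only, not part of the library and not necessarily how
   mpn_invert performs its adjustment: one way a caller needing the strict
   inverse could consume the returned e.  When e == 0 the strict relation
   already holds; when e == 1 the result may be one too small, and a single
   2n-limb product decides it.  Here 'tp' simply reuses 'scratch', whose
   contents are dead after the call.  */
#if 0
{
  mp_limb_t e, cy;
  mp_ptr tp = scratch;	/* 2n limbs, free once mpn_ni_invertappr returns */

  e = mpn_ni_invertappr (ip, dp, n, scratch);
  if (e != 0)
    {
      mpn_mul_n (tp, dp, ip, n);		/* tp = {dp,n} * {ip,n}     */
      cy = mpn_add_n (tp + n, tp + n, dp, n);	/* += {dp,n} * B^n          */
      cy += mpn_add (tp, tp, 2 * n, dp, n);	/* += {dp,n}, i.e. the "+1" */
      /* Now cy != 0 iff {dp,n}*(B^n+{ip,n}+1) >= B^{2n}: if the carry is
	 missing, the approximation was one too small.  */
      if (cy == 0)
	MPN_INCR_U (ip, n, CNST_LIMB (1));
    }
}
#endif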
mp_limb_t
mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
    return mpn_bc_invertappr (ip, dp, n, scratch);
  else
    return mpn_ni_invertappr (ip, dp, n, scratch);
}
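/* Illustrative sketch only (not part of the library): a hypothetical call of
   the wrapper above.  The size and divisor are made up, mpn_random is used
   just to get some limbs, and the scratch is sized with mpn_invertappr_itch
   as in the ASSERTs elsewhere in this file.  */
#if 0
{
  mp_size_t n = 8;
  mp_limb_t dp[8], ip[8];
  mp_ptr scratch;
  mp_limb_t e;
  TMP_DECL;

  TMP_MARK;
  mpn_random (dp, n);
  dp[n - 1] |= GMP_NUMB_HIGHBIT;	/* mpn_invertappr requires a
					   strictly normalised divisor */
  scratch = TMP_ALLOC_LIMBS (mpn_invertappr_itch (n));
  e = mpn_invertappr (ip, dp, n, scratch);
  /* Here {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) holds.  */
  TMP_FREE;
}
#endif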