From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/generic/rootrem.c | 515 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 515 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/generic/rootrem.c

diff --git a/gmp-6.3.0/mpn/generic/rootrem.c b/gmp-6.3.0/mpn/generic/rootrem.c
new file mode 100644
index 0000000..a79099e
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/rootrem.c
@@ -0,0 +1,515 @@
+/* mpn_rootrem(rootp,remp,ap,an,nth) -- Compute the nth root of {ap,an}, and
+   store the truncated integer part at rootp and the remainder at remp.
+
+   Contributed by Paul Zimmermann (algorithm) and
+   Paul Zimmermann and Torbjorn Granlund (implementation).
+   Marco Bodrato wrote logbased_root to seed the loop.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL, AND HAVE MUTABLE INTERFACES.  IT'S
+   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT'S ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2002, 2005, 2009-2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* FIXME:
+   This implementation is not optimal when remp == NULL, since the complexity
+   is M(n), whereas it should be M(n/k) on average.
+*/
+
+#include <stdio.h>  /* for NULL */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t,
+                                       mp_limb_t, int);
+
+#define MPN_RSHIFT(rp,up,un,cnt) \
+  do {                                          \
+    if ((cnt) != 0)                             \
+      mpn_rshift (rp, up, un, cnt);             \
+    else                                        \
+      {                                         \
+        MPN_COPY_INCR (rp, up, un);             \
+      }                                         \
+  } while (0)
+
+#define MPN_LSHIFT(cy,rp,up,un,cnt) \
+  do {                                          \
+    if ((cnt) != 0)                             \
+      cy = mpn_lshift (rp, up, un, cnt);        \
+    else                                        \
+      {                                         \
+        MPN_COPY_DECR (rp, up, un);             \
+        cy = 0;                                 \
+      }                                         \
+  } while (0)
+
+
+/* Put in {rootp, ceil(un/k)} the kth root of {up, un}, rounded toward zero.
+   If remp <> NULL, put in {remp, un} the remainder.
+   Return the size (in limbs) of the remainder if remp <> NULL,
+   or a non-zero value iff the remainder is non-zero when remp = NULL.
+   Assumes:
+   (a) up[un-1] is not zero
+   (b) rootp has at least space for ceil(un/k) limbs
+   (c) remp has at least space for un limbs (in case remp <> NULL)
+   (d) the operands do not overlap.
+
+   The auxiliary memory usage is 3*un+2 if remp = NULL,
+   and 2*un+2 if remp <> NULL.  FIXME: This is an incorrect comment.
+*/
+mp_size_t
+mpn_rootrem (mp_ptr rootp, mp_ptr remp,
+             mp_srcptr up, mp_size_t un, mp_limb_t k)
+{
+  ASSERT (un > 0);
+  ASSERT (up[un - 1] != 0);
+  ASSERT (k > 1);
+
+  if (UNLIKELY (k == 2))
+    return mpn_sqrtrem (rootp, remp, up, un);
+  /* (un-1)/k > 2 <=> un > 3k <=> (un + 2)/3 > k */
+  if (remp == NULL && (un + 2) / 3 > k)
+    /* Pad {up,un} with k zero limbs.  This will produce an approximate root
+       with one more limb, allowing us to compute the exact integral result. */
+    {
+      mp_ptr sp, wp;
+      mp_size_t rn, sn, wn;
+      TMP_DECL;
+      TMP_MARK;
+      wn = un + k;
+      sn = (un - 1) / k + 2; /* ceil(un/k) + 1 */
+      TMP_ALLOC_LIMBS_2 (wp, wn, /* will contain the padded input */
+                         sp, sn); /* approximate root of padded input */
+      MPN_COPY (wp + k, up, un);
+      MPN_FILL (wp, k, 0);
+      rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1);
+      /* The approximate root S = {sp,sn} is either the correct root of
+         {sp,sn}, or 1 too large.  Thus unless the least significant limb of
+         S is 0 or 1, we can deduce the root of {up,un} is S truncated by one
+         limb.  (In case sp[0]=1, we can deduce the root, but not decide
+         whether it is exact or not.) */
+      MPN_COPY (rootp, sp + 1, sn - 1);
+      TMP_FREE;
+      return rn;
+    }
+  else
+    {
+      return mpn_rootrem_internal (rootp, remp, up, un, k, 0);
+    }
+}
+
+#define LOGROOT_USED_BITS 8
+#define LOGROOT_NEEDS_TWO_CORRECTIONS 1
+#define LOGROOT_RETURNED_BITS (LOGROOT_USED_BITS + LOGROOT_NEEDS_TWO_CORRECTIONS)
+/* Puts in *rootp some bits of the k^nth root of the number
+   2^bitn * 1.op ; where op represents the "fractional" bits.
+
+   The returned value is the number of bits of the root minus one;
+   i.e. an approximation of the root will be
+   (*rootp) * 2^(retval-LOGROOT_RETURNED_BITS+1).
+
+   Currently, only LOGROOT_USED_BITS bits of op are used (the implicit
+   one is not counted).
+ */
+static unsigned
+logbased_root (mp_ptr rootp, mp_limb_t op, mp_bitcnt_t bitn, mp_limb_t k)
+{
+  /* vlog=vector(256,i,floor((log(256+i)/log(2)-8)*256)-(i>255)) */
+  static const
+  unsigned char vlog[] = {1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22,
+    23, 25, 26, 27, 29, 30, 31, 33, 34, 35, 37, 38, 39, 40, 42, 43,
+    44, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 61, 62, 63,
+    64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 78, 80, 81, 82,
+    83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100,
+    101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
+    118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134,
+    135, 136, 137, 138, 139, 140, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+    150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 162, 163, 164,
+    165, 166, 167, 168, 169, 170, 171, 172, 173, 173, 174, 175, 176, 177, 178, 179,
+    180, 181, 181, 182, 183, 184, 185, 186, 187, 188, 188, 189, 190, 191, 192, 193,
+    194, 194, 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, 204, 205, 205, 206,
+    207, 208, 209, 209, 210, 211, 212, 213, 214, 214, 215, 216, 217, 218, 218, 219,
+    220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 228, 229, 229, 230, 231, 232,
+    232, 233, 234, 235, 235, 236, 237, 238, 239, 239, 240, 241, 242, 242, 243, 244,
+    245, 245, 246, 247, 247, 248, 249, 250, 250, 251, 252, 253, 253, 254, 255, 255};
+
+  /* vexp=vector(256,i,floor(2^(8+i/256)-256)-(i>255)) */
+  static const
+  unsigned char vexp[] = {0, 1, 2, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 9, 10, 11,
+    12, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23,
+    23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35,
+    36, 37, 37, 38, 39, 40, 41, 41, 42, 43, 44, 45, 45, 46, 47, 48,
+    49, 50, 50, 51, 52, 53, 54, 55, 55, 56, 57, 58, 59, 60, 61, 61,
+    62, 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75,
+    76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, 87, 88, 89, 90,
+    91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
+    107, 108, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122,
+    123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
+    139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156,
+    157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174,
+    175, 176, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 191, 192, 193,
+    194, 196, 197, 198, 199, 200, 202, 203, 204, 205, 207, 208, 209, 210, 212, 213,
+    214, 216, 217, 218, 219, 221, 222, 223, 225, 226, 227, 229, 230, 231, 232, 234,
+    235, 236, 238, 239, 240, 242, 243, 245, 246, 247, 249, 250, 251, 253, 254, 255};
+  mp_bitcnt_t retval;
+
+  if (UNLIKELY (bitn > (~ (mp_bitcnt_t) 0) >> LOGROOT_USED_BITS))
+    {
+      /* In the unlikely case, we use two divisions and a modulo. */
+      retval = bitn / k;
+      bitn %= k;
+      bitn = (bitn << LOGROOT_USED_BITS |
+              vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k;
+    }
+  else
+    {
+      bitn = (bitn << LOGROOT_USED_BITS |
+              vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k;
+      retval = bitn >> LOGROOT_USED_BITS;
+      bitn &= (CNST_LIMB (1) << LOGROOT_USED_BITS) - 1;
+    }
+  ASSERT (bitn < CNST_LIMB (1) << LOGROOT_USED_BITS);
+  *rootp = CNST_LIMB(1) << (LOGROOT_USED_BITS - ! LOGROOT_NEEDS_TWO_CORRECTIONS)
+    | vexp[bitn] >> ! LOGROOT_NEEDS_TWO_CORRECTIONS;
+  return retval;
+}
+
+/* if approx is non-zero, does not compute the final remainder */
+static mp_size_t
+mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
+                      mp_limb_t k, int approx)
+{
+  mp_ptr qp, rp, sp, wp, scratch;
+  mp_size_t qn, rn, sn, wn, nl, bn;
+  mp_limb_t save, save2, cy, uh;
+  mp_bitcnt_t unb; /* number of significant bits of {up,un} */
+  mp_bitcnt_t xnb; /* number of significant bits of the result */
+  mp_bitcnt_t b, kk;
+  mp_bitcnt_t sizes[GMP_NUMB_BITS + 1];
+  int ni;
+  int perf_pow;
+  unsigned ulz, snb, c, logk;
+  TMP_DECL;
+
+  /* MPN_SIZEINBASE_2EXP(unb, up, un, 1); --unb; */
+  uh = up[un - 1];
+  count_leading_zeros (ulz, uh);
+  ulz = ulz - GMP_NAIL_BITS + 1; /* Ignore the first 1. */
+  unb = (mp_bitcnt_t) un * GMP_NUMB_BITS - ulz;
+  /* unb is the (truncated) logarithm of the input U in base 2 */
+
+  if (unb < k) /* root is 1 */
+    {
+      rootp[0] = 1;
+      if (remp == NULL)
+        un -= (*up == CNST_LIMB (1)); /* Non-zero iff {up,un} > 1 */
+      else
+        {
+          mpn_sub_1 (remp, up, un, CNST_LIMB (1));
+          un -= (remp [un - 1] == 0); /* There should be at most one zero limb,
+                                         if we demand u to be normalized */
+        }
+      return un;
+    }
+  /* if (unb - k < k/2 + k/16) // root is 2 */
+
+  if (ulz == GMP_NUMB_BITS)
+    uh = up[un - 2];
+  else
+    uh = (uh << ulz & GMP_NUMB_MASK) | up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz);
+  ASSERT (un != 1 || up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz) == 1);
+
+  xnb = logbased_root (rootp, uh, unb, k);
+  snb = LOGROOT_RETURNED_BITS - 1;
+  /* xnb+1 is the number of bits of the root R */
+  /* snb+1 is the number of bits of the current approximation S */
+
+  kk = k * xnb;	/* number of truncated bits in the input */
+
+  /* FIXME: Should we skip the next two loops when xnb <= snb ? */
+  for (uh = (k - 1) / 2, logk = 3; (uh >>= 1) != 0; ++logk )
+    ;
+  /* logk = ceil(log(k)/log(2)) + 1 */
+
+  /* xnb is the number of remaining bits to determine in the kth root */
+  for (ni = 0; (sizes[ni] = xnb) > snb; ++ni)
+    {
+      /* invariant: here we want xnb+1 total bits for the kth root */
+
+      /* if c is the new value of xnb, this means that we'll go from a
+         root of c+1 bits (say s') to a root of xnb+1 bits.
+         It is proved in the book "Modern Computer Arithmetic" by Brent
+         and Zimmermann, Chapter 1, that
+         if s' >= k*beta, then at most one correction is necessary.
+         Here beta = 2^(xnb-c), and s' >= 2^c, thus it suffices that
+         c >= ceil((xnb + log2(k))/2). */
+      if (xnb > logk)
+        xnb = (xnb + logk) / 2;
+      else
+        --xnb; /* add just one bit at a time */
+    }
+
+  *rootp >>= snb - xnb;
+  kk -= xnb;
+
+  ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1);
+  /* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with
+     sizes[i] <= 2 * sizes[i+1].
+     Newton iteration will first compute sizes[ni-1] extra bits,
+     then sizes[ni-2], ..., then sizes[0] = b. */
+
+  TMP_MARK;
+  /* qp and wp need enough space to store S'^k where S' is an approximate
+     root.  Since S' can be as large as S+2, the worst case is when S=2 and
+     S'=4.  But then since we know the number of bits of S in advance, S'
+     can only be 3 at most.  Similarly for S=4, then S' can be 6 at most.
+     So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k.  Since S^k
+     fits in un limbs, the number of extra limbs needed is bounded by
+     ceil(k*log2(3/2)/GMP_NUMB_BITS). */
+  /* THINK: with the use of logbased_root, maybe the constant is
+     258/256 instead of 3/2 ?
+     log2(258/256) < 1/89 < 1/64 */
+#define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS)
+  TMP_ALLOC_LIMBS_3 (scratch, un + 1, /* used by mpn_div_q */
+                     qp, un + EXTRA,  /* will contain quotient and remainder
+                                         of R/(k*S^(k-1)), and S^k */
+                     wp, un + EXTRA); /* will contain S^(k-1), k*S^(k-1),
+                                         and temporary for mpn_pow_1 */
+
+  if (remp == NULL)
+    rp = scratch; /* will contain the remainder */
+  else
+    rp = remp;
+  sp = rootp;
+
+  sn = 1; /* Initial approximation has one limb */
+
+  for (b = xnb; ni != 0; --ni)
+    {
+      /* 1: loop invariant:
+         {sp, sn} is the current approximation of the root, which has
+                  exactly 1 + sizes[ni] bits.
+         {rp, rn} is the current remainder
+         {wp, wn} = {sp, sn}^(k-1)
+         kk = number of truncated bits of the input
+      */
+
+      /* Since each iteration treats b bits from the root and thus k*b bits
+         from the input, and we already considered b bits from the input,
+         we now have to take another (k-1)*b bits from the input. */
+      kk -= (k - 1) * b; /* remaining input bits */
+      /* {rp, rn} = floor({up, un} / 2^kk) */
+      rn = un - kk / GMP_NUMB_BITS;
+      MPN_RSHIFT (rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS);
+      rn -= rp[rn - 1] == 0;
+
+      /* 9: current buffers: {sp,sn}, {rp,rn} */
+
+      for (c = 0;; c++)
+        {
+          /* Compute S^k in {qp,qn}. */
+          /* W <- S^(k-1) for the next iteration,
+             and S^k = W * S. */
+          wn = mpn_pow_1 (wp, sp, sn, k - 1, qp);
+          mpn_mul (qp, wp, wn, sp, sn);
+          qn = wn + sn;
+          qn -= qp[qn - 1] == 0;
+
+          perf_pow = 1;
+          /* if S^k > floor(U/2^kk), the root approximation was too large */
+          if (qn > rn || (qn == rn && (perf_pow=mpn_cmp (qp, rp, rn)) > 0))
+            MPN_DECR_U (sp, sn, 1);
+          else
+            break;
+        }
+
+      /* 10: current buffers: {sp,sn}, {rp,rn}, {qp,qn}, {wp,wn} */
+
+      /* sometimes two corrections are needed with logbased_root*/
+      ASSERT (c <= 1 + LOGROOT_NEEDS_TWO_CORRECTIONS);
+      ASSERT_ALWAYS (rn >= qn);
+
+      b = sizes[ni - 1] - sizes[ni]; /* number of bits to compute in the
+                                        next iteration */
+      bn = b / GMP_NUMB_BITS; /* lowest limb from high part of rp[], after shift */
+
+      kk = kk - b;
+      /* nl is the number of limbs in U which contain bits [kk,kk+b-1] */
+      nl = 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk / GMP_NUMB_BITS);
+      /* nl = 1 + floor((kk + b - 1) / GMP_NUMB_BITS)
+            - floor(kk / GMP_NUMB_BITS)
+           <= 1 + (kk + b - 1) / GMP_NUMB_BITS
+            - (kk - GMP_NUMB_BITS + 1) / GMP_NUMB_BITS
+            = 2 + (b - 2) / GMP_NUMB_BITS
+         thus since nl is an integer:
+         nl <= 2 + floor(b/GMP_NUMB_BITS) <= 2 + bn. */
+
+      /* 11: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
+
+      /* R = R - Q = floor(U/2^kk) - S^k */
+      if (perf_pow != 0)
+        {
+          mpn_sub (rp, rp, rn, qp, qn);
+          MPN_NORMALIZE_NOT_ZERO (rp, rn);
+
+          /* first multiply the remainder by 2^b */
+          MPN_LSHIFT (cy, rp + bn, rp, rn, b % GMP_NUMB_BITS);
+          rn = rn + bn;
+          if (cy != 0)
+            {
+              rp[rn] = cy;
+              rn++;
+            }
+
+          save = rp[bn];
+          /* we have to save rp[bn] up to rp[nl-1], i.e. 1 or 2 limbs */
+          if (nl - 1 > bn)
+            save2 = rp[bn + 1];
+        }
+      else
+        {
+          rn = bn;
+          save2 = save = 0;
+        }
+      /* 2: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
+
+      /* Now insert bits [kk,kk+b-1] from the input U */
+      MPN_RSHIFT (rp, up + kk / GMP_NUMB_BITS, nl, kk % GMP_NUMB_BITS);
+      /* set to zero high bits of rp[bn] */
+      rp[bn] &= (CNST_LIMB (1) << (b % GMP_NUMB_BITS)) - 1;
+      /* restore corresponding bits */
+      rp[bn] |= save;
+      if (nl - 1 > bn)
+        rp[bn + 1] = save2; /* the low b bits go in rp[0..bn] only, since
+                               they start by bit 0 in rp[0], so they use
+                               at most ceil(b/GMP_NUMB_BITS) limbs */
+      /* FIXME: Should we normalise {rp,rn} here ?*/
+
+      /* 3: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
+
+      /* compute {wp, wn} = k * {sp, sn}^(k-1) */
+      cy = mpn_mul_1 (wp, wp, wn, k);
+      wp[wn] = cy;
+      wn += cy != 0;
+
+      /* 6: current buffers: {sp,sn}, {qp,qn} */
+
+      /* multiply the root approximation by 2^b */
+      MPN_LSHIFT (cy, sp + b / GMP_NUMB_BITS, sp, sn, b % GMP_NUMB_BITS);
+      sn = sn + b / GMP_NUMB_BITS;
+      if (cy != 0)
+        {
+          sp[sn] = cy;
+          sn++;
+        }
+
+      save = sp[b / GMP_NUMB_BITS];
+
+      /* Number of limbs used by b bits, when least significant bit is
+         aligned to least limb */
+      bn = (b - 1) / GMP_NUMB_BITS + 1;
+
+      /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
+
+      /* now divide {rp, rn} by {wp, wn} to get the low part of the root */
+      if (UNLIKELY (rn < wn))
+        {
+          MPN_FILL (sp, bn, 0);
+        }
+      else
+        {
+          qn = rn - wn; /* expected quotient size */
+          if (qn <= bn) { /* Divide only if result is not too big. */
+            mpn_div_q (qp, rp, rn, wp, wn, scratch);
+            qn += qp[qn] != 0;
+          }
+
+          /* 5: current buffers: {sp,sn}, {qp,qn}.
+             Note: {rp,rn} is not needed any more since we'll compute it from
+             scratch at the end of the loop.
+          */
+
+          /* the quotient should be smaller than 2^b, since the previous
+             approximation was correctly rounded toward zero */
+          if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) &&
+                          qp[qn - 1] >= (CNST_LIMB (1) << (b % GMP_NUMB_BITS))))
+            {
+              for (qn = 1; qn < bn; ++qn)
+                sp[qn - 1] = GMP_NUMB_MAX;
+              sp[qn - 1] = GMP_NUMB_MAX >> (GMP_NUMB_BITS - 1 - ((b - 1) % GMP_NUMB_BITS));
+            }
+          else
+            {
+              /* 7: current buffers: {sp,sn}, {qp,qn} */
+
+              /* Combine sB and q to form sB + q. */
+              MPN_COPY (sp, qp, qn);
+              MPN_ZERO (sp + qn, bn - qn);
+            }
+        }
+      sp[b / GMP_NUMB_BITS] |= save;
+
+      /* 8: current buffer: {sp,sn} */
+
+    }
+
+  /* otherwise we have rn > 0, thus the return value is ok */
+  if (!approx || sp[0] <= CNST_LIMB (1))
+    {
+      for (c = 0;; c++)
+        {
+          /* Compute S^k in {qp,qn}. */
+          /* Last iteration: we don't need W anymore. */
+          /* mpn_pow_1 requires that both qp and wp have enough
+             space to store the result {sp,sn}^k + 1 limb */
+          qn = mpn_pow_1 (qp, sp, sn, k, wp);
+
+          perf_pow = 1;
+          if (qn > un || (qn == un && (perf_pow=mpn_cmp (qp, up, un)) > 0))
+            MPN_DECR_U (sp, sn, 1);
+          else
+            break;
+        };
+
+      /* sometimes two corrections are needed with logbased_root*/
+      ASSERT (c <= 1 + LOGROOT_NEEDS_TWO_CORRECTIONS);
+
+      rn = perf_pow != 0;
+      if (rn != 0 && remp != NULL)
+        {
+          mpn_sub (remp, up, un, qp, qn);
+          rn = un;
+          MPN_NORMALIZE_NOT_ZERO (remp, rn);
+        }
+    }
+
+  TMP_FREE;
+  return rn;
+}
--
cgit v1.2.3
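
Usage note (editor's addition, not part of the patch): the functions in this file are internal, so the documented way to reach this code is GMP's mpz layer. The short program below is a minimal sketch assuming a standard GMP installation (compile with -lgmp); it computes a truncated cube root and its remainder with mpz_rootrem, which for multi-limb operands ends up in the mpn_rootrem code above.

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t u, root, rem;

  mpz_inits (u, root, rem, NULL);
  mpz_ui_pow_ui (u, 10, 20);       /* u = 10^20 */

  /* root = floor(cbrt(u)), rem = u - root^3 */
  mpz_rootrem (root, rem, u, 3);
  gmp_printf ("root = %Zd\nrem  = %Zd\n", root, rem);

  mpz_clears (u, root, rem, NULL);
  return 0;
}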
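The zero-limb padding trick in mpn_rootrem (pad the input with k zero limbs so that an approximate root, which may overshoot by one, still yields the exact truncated root after dropping its lowest limb) can also be illustrated with ordinary machine integers. The sketch below is purely illustrative and not GMP code: it uses a made-up 8-bit "limb" base B and a hypothetical kth_root helper, and deliberately simulates a root that is one too large.

#include <stdint.h>
#include <stdio.h>

/* Truncated k-th root by naive search; stands in for an approximate root
   routine.  Small inputs only, so the products below cannot overflow. */
static uint64_t
kth_root (uint64_t u, unsigned k)
{
  uint64_t s = 0;
  for (;;)
    {
      uint64_t t = s + 1, p = 1;
      for (unsigned i = 0; i < k; i++)
        p *= t;                 /* p = (s+1)^k */
      if (p > u)
        return s;               /* s^k <= u < (s+1)^k */
      s = t;
    }
}

int
main (void)
{
  const uint64_t B = 1 << 8;    /* artificial "limb" base */
  uint64_t u = 2023;
  unsigned k = 3;

  uint64_t padded = u * B * B * B;          /* u * B^k, i.e. k padding limbs */
  uint64_t s = kth_root (padded, k) + 1;    /* simulate an overshoot by one */

  /* Dropping the last base-B digit still gives floor(cbrt(u)) = 12,
     because that digit of the true root is neither 0 nor 1. */
  printf ("floor(cbrt(%llu)) = %llu\n",
          (unsigned long long) u, (unsigned long long) (s / B));
  return 0;
}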