From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/generic/diveby3.c | 173 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 gmp-6.3.0/mpn/generic/diveby3.c (limited to 'gmp-6.3.0/mpn/generic/diveby3.c') diff --git a/gmp-6.3.0/mpn/generic/diveby3.c b/gmp-6.3.0/mpn/generic/diveby3.c new file mode 100644 index 0000000..7dee0bc --- /dev/null +++ b/gmp-6.3.0/mpn/generic/diveby3.c @@ -0,0 +1,173 @@ +/* mpn_divexact_by3c -- mpn exact division by 3. + +Copyright 2000-2003, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if DIVEXACT_BY3_METHOD == 0 + +mp_limb_t +mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c) +{ + mp_limb_t r; + r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c); + + /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3. + We want to return C. We compute the remainder mod 4 and notice that the + inverse of (2^(2k)-1)/3 mod 4 is 1. */ + return r & 3; +} + +#endif + +#if DIVEXACT_BY3_METHOD == 1 + +/* The algorithm here is basically the same as mpn_divexact_1, as described + in the manual. Namely at each step q = (src[i]-c)*inverse, and new c = + borrow(src[i]-c) + high(divisor*q). But because the divisor is just 3, + high(divisor*q) can be determined with two comparisons instead of a + multiply. + + The "c += ..."s add the high limb of 3*l to c. That high limb will be 0, + 1 or 2. Doing two separate "+="s seems to give better code on gcc (as of + 2.95.2 at least). + + It will be noted that the new c is formed by adding three values each 0 + or 1. But the total is only 0, 1 or 2. When the subtraction src[i]-c + causes a borrow, that leaves a limb value of either 0xFF...FF or + 0xFF...FE. The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or + 0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1 + respectively, hence a total of no more than 2. + + Alternatives: + + This implementation has each multiply on the dependent chain, due to + "l=s-c". See below for alternative code which avoids that. */ + +mp_limb_t +mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c) +{ + mp_limb_t l, q, s; + mp_size_t i; + + ASSERT (un >= 1); + ASSERT (c == 0 || c == 1 || c == 2); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); + + i = 0; + do + { + s = up[i]; + SUBC_LIMB (c, l, s, c); + + q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; + rp[i] = q; + + c += (q >= GMP_NUMB_CEIL_MAX_DIV3); + c += (q >= GMP_NUMB_CEIL_2MAX_DIV3); + } + while (++i < un); + + ASSERT (c == 0 || c == 1 || c == 2); + return c; +} + + +#endif + +#if DIVEXACT_BY3_METHOD == 2 + +/* The following alternative code re-arranges the quotient calculation from + (src[i]-c)*inverse to instead + + q = src[i]*inverse - c*inverse + + thereby allowing src[i]*inverse to be scheduled back as far as desired, + making full use of multiplier throughput and leaving just some carry + handing on the dependent chain. + + The carry handling consists of determining the c for the next iteration. + This is the same as described above, namely look for any borrow from + src[i]-c, and at the high of 3*q. + + high(3*q) is done with two comparisons as above (in c2 and c3). The + borrow from src[i]-c is incorporated into those by noting that if there's + a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn + giving q = 0x55..55 or 0xAA..AA. Adding 1 to either of those q values is + enough to make high(3*q) come out 1 bigger, as required. + + l = -c*inverse is calculated at the same time as c, since for most chips + it can be more conveniently derived from separate c1/c2/c3 values than + from a combined c equal to 0, 1 or 2. + + The net effect is that with good pipelining this loop should be able to + run at perhaps 4 cycles/limb, depending on available execute resources + etc. + + Usage: + + This code is not used by default, since we really can't rely on the + compiler generating a good software pipeline, nor on such an approach + even being worthwhile on all CPUs. + + Itanium is one chip where this algorithm helps though, see + mpn/ia64/diveby3.asm. */ + +mp_limb_t +mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy) +{ + mp_limb_t s, sm, cl, q, qx, c2, c3; + mp_size_t i; + + ASSERT (un >= 1); + ASSERT (cy == 0 || cy == 1 || cy == 2); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); + + cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3; + + for (i = 0; i < un; i++) + { + s = up[i]; + sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; + + q = (cl + sm) & GMP_NUMB_MASK; + rp[i] = q; + qx = q + (s < cy); + + c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3; + c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ; + + cy = c2 + c3; + cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3); + } + + return cy; +} + +#endif -- cgit v1.2.3