From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/generic/div_qr_2.c | 314 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 314 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/generic/div_qr_2.c

diff --git a/gmp-6.3.0/mpn/generic/div_qr_2.c b/gmp-6.3.0/mpn/generic/div_qr_2.c
new file mode 100644
index 0000000..c3c8f57
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/div_qr_2.c
@@ -0,0 +1,314 @@
+/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
+   quotient.  The divisor is two limbs.
+
+   Contributed to the GNU project by Torbjorn Granlund and Niels Möller
+
+   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef DIV_QR_2_PI2_THRESHOLD
+/* Disabled unless explicitly tuned.  */
+#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
+#endif
+
+#ifndef SANITY_CHECK
+#define SANITY_CHECK 0
+#endif
+
+/* Define some longlong.h-style macros, but for wider operations.
+ * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
+   an additional sum operand.
+ * add_csaac accepts two addends and a carry in, and generates a sum and a
+   carry out.  A little like a "full adder".
+*/
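+
+/* Editorial note, not part of upstream GMP: in base B = 2^GMP_LIMB_BITS,
+   add_sssaaaa (s2,s1,s0, a1,a0, b1,b0) computes
+
+     <s1, s0>  <-  (<a1, a0> + <b1, b0>) mod B^2
+     s2        <-  s2 + carry-out of that two-limb addition
+
+   For example, with <a1, a0> = <B-1, B-1> and <b1, b0> = <0, 1>, the sum
+   wraps to <0, 0> and s2 is incremented by 1.  */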
+#if defined (__GNUC__) && ! defined (NO_ASM)
+
+#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"               \
+           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+           : "0"  ((USItype)(s2)),                                      \
+             "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
+             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"               \
+           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+           : "0"  ((UDItype)(s2)),                                      \
+             "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
+             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\
+           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+           : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)    \
+             __CLOBBER_CC)
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
+/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
+   processor running in 32-bit mode, since the carry flag then gets the 32-bit
+   carry.  */
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"        \
+           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+           : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0)         \
+             __CLOBBER_CC)
+#endif
+
+#endif /* __GNUC__ */
+
+#ifndef add_sssaaaa
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  do {                                                                  \
+    UWtype __s0, __s1, __c0, __c1;                                      \
+    __s0 = (a0) + (b0);                                                 \
+    __s1 = (a1) + (b1);                                                 \
+    __c0 = __s0 < (a0);                                                 \
+    __c1 = __s1 < (a1);                                                 \
+    (s0) = __s0;                                                        \
+    __s1 = __s1 + __c0;                                                 \
+    (s1) = __s1;                                                        \
+    (s2) += __c1 + (__s1 < __c0);                                       \
+  } while (0)
+#endif
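+
+/* Editorial sketch, not part of upstream GMP: the C fallback above can be
+   checked in isolation along these lines (assuming a 64-bit UWtype; the
+   harness below is illustrative, not GMP code):
+
+     #include <assert.h>
+     #include <stdint.h>
+     typedef uint64_t UWtype;
+     // ... generic add_sssaaaa definition from above ...
+     int main (void)
+     {
+       UWtype s2 = 5, s1, s0;
+       add_sssaaaa (s2, s1, s0, UINT64_MAX, UINT64_MAX, 0, 1);
+       assert (s2 == 6 && s1 == 0 && s0 == 0);  // carry propagated into s2
+       return 0;
+     }
+*/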
+
+/* Typically used with r1, r0 same as n3, n2.  Other types of overlap
+   between inputs and outputs are not supported.  */
+#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)         \
+  do {                                                                  \
+    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;               \
+    mp_limb_t _t1, _t0;                                                 \
+    mp_limb_t _mask;                                                    \
+                                                                        \
+    /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */ \
+    umul_ppmm (_q2,_q1, n2, di1);                                       \
+    umul_ppmm (_q3,_q2a, n3, di1);                                      \
+    ++_q2;      /* _q2 cannot overflow */                               \
+    add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);                             \
+    umul_ppmm (_q2c,_q1c, n3, di0);                                     \
+    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);                        \
+    umul_ppmm (_q1d,_q0, n2, di0);                                      \
+    add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
+    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);                      \
+                                                                        \
+    umul_ppmm (_t1,_t0, _q2, d0);                                       \
+    _t1 += _q2 * d1 + _q3 * d0;                                         \
+                                                                        \
+    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);                              \
+                                                                        \
+    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0))); /* (r1,r0) >= (q1,q0) */ \
+    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);                \
+    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);              \
+                                                                        \
+    if (UNLIKELY (r1 >= d1))                                            \
+      {                                                                 \
+        if (r1 > d1 || r0 >= d0)                                        \
+          {                                                             \
+            sub_ddmmss (r1, r0, r1, r0, d1, d0);                        \
+            add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
+          }                                                             \
+      }                                                                 \
+    (q1) = _q3;                                                         \
+    (q0) = _q2;                                                         \
+  } while (0)
+
+static void
+invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
+{
+  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
+  invert_limb (v1, d1);
+  p1 = d1 * v1;
+  /* <1, v1> * d1 = <B-1, p1> */
+  p1 += d0;
+  if (p1 < d0)
+    {
+      v1--;
+      mask = -(mp_limb_t) (p1 >= d1);
+      p1 -= d1;
+      v1 += mask;
+      p1 -= mask & d1;
+    }
+  /* <1, v1> * d1 + d0 = <B-1, p1> */
+  umul_ppmm (t1, p0, d0, v1);
+  p1 += t1;
+  if (p1 < t1)
+    {
+      if (UNLIKELY (p1 >= d1))
+        {
+          if (p1 > d1 || p0 >= d0)
+            {
+              sub_ddmmss (p1, p0, p1, p0, d1, d0);
+              v1--;
+            }
+        }
+      sub_ddmmss (p1, p0, p1, p0, d1, d0);
+      v1--;
+    }
+  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
+   * with <p1, p0> + <d1, d0> >= B^2.
+   *
+   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>.  The
+   * partial remainder after <1, v1> is
+   *
+   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
+   *                              = <~p1, ~p0, B-1>
+   */
+  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
+  di[0] = v0;
+  di[1] = v1;
+
+#if SANITY_CHECK
+  {
+    mp_limb_t tp[4];
+    mp_limb_t dp[2];
+    dp[0] = d0;
+    dp[1] = d1;
+    mpn_mul_n (tp, dp, di, 2);
+    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
+    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
+    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
+    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
+  }
+#endif
+}
+
+static mp_limb_t
+mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+                   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
+{
+  mp_limb_t qh;
+  mp_size_t i;
+  mp_limb_t r1, r0;
+
+  ASSERT (nn >= 2);
+  ASSERT (d1 & GMP_NUMB_HIGHBIT);
+
+  r1 = np[nn-1];
+  r0 = np[nn-2];
+
+  qh = 0;
+  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
+    {
+#if GMP_NAIL_BITS == 0
+      sub_ddmmss (r1, r0, r1, r0, d1, d0);
+#else
+      r0 = r0 - d0;
+      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
+      r0 &= GMP_NUMB_MASK;
+#endif
+      qh = 1;
+    }
+
+  for (i = nn - 2; i >= 2; i -= 2)
+    {
+      mp_limb_t n1, n0, q1, q0;
+      n1 = np[i-1];
+      n0 = np[i-2];
+      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
+      qp[i-1] = q1;
+      qp[i-2] = q0;
+    }
+
+  if (i > 0)
+    {
+      mp_limb_t q;
+      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
+      qp[0] = q;
+    }
+  rp[1] = r1;
+  rp[0] = r0;
+
+  return qh;
+}
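+
+/* Editorial note, not part of upstream GMP: the _pi2 variant above consumes
+   two numerator limbs per iteration, using the two-limb inverse <di1, di0>
+   computed by invert_4by2, which satisfies
+
+     <1, di1, di0> * <d1, d0>  =  B^4 - 1 - r,   0 <= r < <d1, d0>
+
+   (this is the relation the SANITY_CHECK block verifies).  The _pi1 paths
+   instead use the single-limb 3/2 inverse from invert_pi1 and consume one
+   numerator limb per udiv_qr_3by2 step.  */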
+
+/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
+   significant quotient limbs at qp and the 2-limb remainder at rp.
+   Return the most significant limb of the quotient.
+
+   Preconditions:
+   1. qp must either not overlap with the other operands at all, or
+      qp >= np + 2 must hold true.  (This means that it's possible to put
+      the quotient in the high part of {np,nn}, right above the remainder.)
+   2. nn >= 2.  */
+
+mp_limb_t
+mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+              mp_srcptr dp)
+{
+  mp_limb_t d1;
+  mp_limb_t d0;
+  gmp_pi1_t dinv;
+
+  ASSERT (nn >= 2);
+  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
+  ASSERT_MPN (np, nn);
+  ASSERT_MPN (dp, 2);
+
+  d1 = dp[1]; d0 = dp[0];
+
+  ASSERT (d1 > 0);
+
+  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
+    {
+      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
+        {
+          gmp_pi1_t dinv;
+          invert_pi1 (dinv, d1, d0);
+          return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
+        }
+      else
+        {
+          mp_limb_t di[2];
+          invert_4by2 (di, d1, d0);
+          return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
+        }
+    }
+  else
+    {
+      int shift;
+      count_leading_zeros (shift, d1);
+      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
+      d0 <<= shift;
+      invert_pi1 (dinv, d1, d0);
+      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
+    }
+}
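
Editorial note, not part of the patch: a minimal caller sketch for
mpn_div_qr_2.  This assumes an internal-build context where "gmp-impl.h"
is available (the function is an internal interface, as the header comment
warns), and the operand values are illustrative only:

    #include "gmp-impl.h"

    mp_limb_t np[4] = { 1, 2, 3, 4 };  /* numerator, least significant limb first */
    mp_limb_t dp[2] = { 5, 6 };        /* divisor; dp[1] != 0 required */
    mp_limb_t qp[2], rp[2];            /* nn - 2 = 2 quotient limbs, 2-limb remainder */
    mp_limb_t qh = mpn_div_qr_2 (qp, rp, np, 4, dp);
    /* Quotient is <qh, qp[1], qp[0]>; remainder is <rp[1], rp[0]>.  */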