Diffstat (limited to 'gmp-6.3.0/mpn/generic/div_qr_2.c')
-rw-r--r--  gmp-6.3.0/mpn/generic/div_qr_2.c  314
1 file changed, 314 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/generic/div_qr_2.c b/gmp-6.3.0/mpn/generic/div_qr_2.c
new file mode 100644
index 0000000..c3c8f57
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/div_qr_2.c
@@ -0,0 +1,314 @@
+/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
+ quotient. The divisor is two limbs.
+
+ Contributed to the GNU project by Torbjorn Granlund and Niels Möller
+
+ THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY
+ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef DIV_QR_2_PI2_THRESHOLD
+/* Disabled unless explicitly tuned. */
+#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
+#endif
+
+#ifndef SANITY_CHECK
+#define SANITY_CHECK 0
+#endif
+
+/* Define some longlong.h-style macros, but for wider operations.
+ * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
+ an additional sum operand.
+ * add_csaac accepts two addends and a carry in, and generates a sum and a
+ carry out. A little like a "full adder".
+*/
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
+
+#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "0" ((USItype)(s2)), \
+ "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
+ "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "0" ((UDItype)(s2)), \
+ "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
+ "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) \
+ __CLOBBER_CC)
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
+/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
+ processor running in 32-bit mode, since the carry flag then gets the 32-bit
+ carry. */
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \
+ __CLOBBER_CC)
+#endif
+
+#endif /* __GNUC__ */
+
+#ifndef add_sssaaaa
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ do { \
+ UWtype __s0, __s1, __c0, __c1; \
+ __s0 = (a0) + (b0); \
+ __s1 = (a1) + (b1); \
+ __c0 = __s0 < (a0); \
+ __c1 = __s1 < (a1); \
+ (s0) = __s0; \
+ __s1 = __s1 + __c0; \
+ (s1) = __s1; \
+ (s2) += __c1 + (__s1 < __c0); \
+ } while (0)
+#endif
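
For readers not used to the longlong.h conventions, here is a standalone sketch (not part of the GMP sources; plain uint64_t stands in for UWtype, and the helper name add_sssaaaa_ref is made up) mirroring the generic C fallback above: (s1,s0) becomes the two-word sum of (a1,a0) and (b1,b0), and the carry out of that sum is added into the existing value of s2.

/* Standalone illustration of the generic add_sssaaaa fallback. */
#include <assert.h>
#include <stdint.h>

static void
add_sssaaaa_ref (uint64_t *s2, uint64_t *s1, uint64_t *s0,
                 uint64_t a1, uint64_t a0, uint64_t b1, uint64_t b0)
{
  uint64_t lo = a0 + b0;
  uint64_t c0 = lo < a0;          /* carry out of the low word */
  uint64_t hi = a1 + b1;
  uint64_t c1 = hi < a1;          /* carry out of the high word */
  hi += c0;
  *s0 = lo;
  *s1 = hi;
  *s2 += c1 + (hi < c0);          /* propagate into the third word */
}

int
main (void)
{
  uint64_t s2 = 0, s1, s0;
  /* (2^128 - 1) + 1 = 2^128, so the carry reaches s2: result (1, 0, 0). */
  add_sssaaaa_ref (&s2, &s1, &s0, UINT64_MAX, UINT64_MAX, 0, 1);
  assert (s2 == 1 && s1 == 0 && s0 == 0);
  return 0;
}
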
+
+/* Typically used with r1, r0 same as n3, n2. Other types of overlap
+ between inputs and outputs are not supported. */
+#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \
+ do { \
+ mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \
+ mp_limb_t _t1, _t0; \
+ mp_limb_t _mask; \
+ \
+ /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */ \
+ umul_ppmm (_q2,_q1, n2, di1); \
+ umul_ppmm (_q3,_q2a, n3, di1); \
+ ++_q2; /* _q2 cannot overflow */ \
+ add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a); \
+ umul_ppmm (_q2c,_q1c, n3, di0); \
+ add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c); \
+ umul_ppmm (_q1d,_q0, n2, di0); \
+ add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
+ add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d); \
+ \
+ umul_ppmm (_t1,_t0, _q2, d0); \
+ _t1 += _q2 * d1 + _q3 * d0; \
+ \
+ sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \
+ \
+ _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0))); /* (r1,r0) >= (q1,q0) */ \
+ add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask); \
+ sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask); \
+ \
+ if (UNLIKELY (r1 >= d1)) \
+ { \
+ if (r1 > d1 || r0 >= d0) \
+ { \
+ sub_ddmmss (r1, r0, r1, r0, d1, d0); \
+ add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
+ } \
+ } \
+ (q1) = _q3; \
+ (q0) = _q2; \
+ } while (0)
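
To make the estimate-then-correct structure of udiv_qr_4by2 easier to follow, here is a toy-scale transcription (standalone, not GMP code; all names are ad hoc): limbs shrink to 8 bits, so the whole 4-limb dividend fits in a uint64_t and the carry chains collapse into ordinary wide arithmetic. di plays the role of <di1,di0>, the quantity invert_4by2 computes, obtained here by direct division.

#include <assert.h>
#include <stdint.h>

#define B2 ((uint64_t) 1 << 16)            /* B^2, with B = 2^8 */

/* Preconditions mirror the real macro's use: d is normalized (d >= B^2/2),
   the two high limbs of n (n / B^2) are below d, so the quotient fits in
   two limbs, and di = floor((B^4-1)/d) - B^2. */
static void
toy_udiv_qr_4by2 (uint64_t *q, uint64_t *r,
                  uint64_t n, uint64_t d, uint64_t di)
{
  uint64_t n_hi = n / B2, n_lo = n % B2;
  /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0]:
     the top two limbs are the quotient estimate, the low two the fraction. */
  uint64_t est = n_hi * di + n + B2;
  uint64_t q_est = est / B2, frac = est % B2;
  /* Candidate remainder, reduced modulo B^2 exactly as sub_ddmmss does. */
  uint64_t rem = (n_lo - q_est * d) % B2;
  if (rem >= frac)                 /* first fix-up: estimate one too high */
    {
      rem = (rem + d) % B2;
      q_est--;
    }
  if (rem >= d)                    /* second fix-up: estimate one too low */
    {
      rem -= d;
      q_est++;
    }
  *q = q_est;
  *r = rem;
}

int
main (void)
{
  uint64_t d = 0xbe02;                      /* arbitrary normalized divisor */
  uint64_t di = 0xffffffffULL / d - B2;     /* what invert_4by2 would produce */
  for (uint64_t n = 0; n < B2 * d; n += 12345)
    {
      uint64_t q, r;
      toy_udiv_qr_4by2 (&q, &r, n, d, di);
      assert (q == n / d && r == n % d);
    }
  return 0;
}
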
+
+static void
+invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
+{
+ mp_limb_t v1, v0, p1, t1, t0, p0, mask;
+ invert_limb (v1, d1);
+ p1 = d1 * v1;
+ /* <1, v1> * d1 = <B-1, p1> */
+ p1 += d0;
+ if (p1 < d0)
+ {
+ v1--;
+ mask = -(mp_limb_t) (p1 >= d1);
+ p1 -= d1;
+ v1 += mask;
+ p1 -= mask & d1;
+ }
+ /* <1, v1> * d1 + d0 = <B-1, p1> */
+ umul_ppmm (t1, p0, d0, v1);
+ p1 += t1;
+ if (p1 < t1)
+ {
+ if (UNLIKELY (p1 >= d1))
+ {
+ if (p1 > d1 || p0 >= d0)
+ {
+ sub_ddmmss (p1, p0, p1, p0, d1, d0);
+ v1--;
+ }
+ }
+ sub_ddmmss (p1, p0, p1, p0, d1, d0);
+ v1--;
+ }
+ /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
+ * with <p1, p0> + <d1, d0> >= B^2.
+ *
+ * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
+ * partial remainder after <1, v1> is
+ *
+ * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
+ * = <~p1, ~p0, B-1>
+ */
+ udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
+ di[0] = v0;
+ di[1] = v1;
+
+#if SANITY_CHECK
+ {
+ mp_limb_t tp[4];
+ mp_limb_t dp[2];
+ dp[0] = d0;
+ dp[1] = d1;
+ mpn_mul_n (tp, dp, di, 2);
+ ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
+ ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
+ ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
+ ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
+ }
+#endif
+}
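
The SANITY_CHECK block above verifies, limb by limb, that <1, di1, di0> is exactly floor((B^4 - 1) / <d1,d0>). The same property can be checked at toy scale with B = 2^8, so plain 64-bit arithmetic suffices (standalone, not GMP code; the divisor value is arbitrary, and di below stands for <di1,di0> packed into one number).

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const uint64_t B = 1 << 8, B2 = B * B, B4m1 = B2 * B2 - 1;
  uint64_t d1 = 0xb3, d0 = 0x17;            /* d1 >= B/2, i.e. normalized */
  uint64_t d = d1 * B + d0;
  uint64_t inv = B4m1 / d;
  assert (inv / B2 == 1);                   /* inverse really is <1, di1, di0> */
  uint64_t di = inv - B2;                   /* the two limbs stored in di[] */
  /* Mirror of the SANITY_CHECK: d * <di1,di0> + d * B^2 has B-1 in both
     high limbs, and adding d once more carries out of the fourth limb. */
  uint64_t t = d * di + d * B2;
  assert (t / B2 == B2 - 1);
  assert (t + d > B4m1);
  return 0;
}
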
+
+static mp_limb_t
+mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+ mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
+{
+ mp_limb_t qh;
+ mp_size_t i;
+ mp_limb_t r1, r0;
+
+ ASSERT (nn >= 2);
+ ASSERT (d1 & GMP_NUMB_HIGHBIT);
+
+ r1 = np[nn-1];
+ r0 = np[nn-2];
+
+ qh = 0;
+ if (r1 >= d1 && (r1 > d1 || r0 >= d0))
+ {
+#if GMP_NAIL_BITS == 0
+ sub_ddmmss (r1, r0, r1, r0, d1, d0);
+#else
+ r0 = r0 - d0;
+ r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
+ r0 &= GMP_NUMB_MASK;
+#endif
+ qh = 1;
+ }
+
+ for (i = nn - 2; i >= 2; i -= 2)
+ {
+ mp_limb_t n1, n0, q1, q0;
+ n1 = np[i-1];
+ n0 = np[i-2];
+ udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
+ qp[i-1] = q1;
+ qp[i-2] = q0;
+ }
+
+ if (i > 0)
+ {
+ mp_limb_t q;
+ udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
+ qp[0] = q;
+ }
+ rp[1] = r1;
+ rp[0] = r0;
+
+ return qh;
+}
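
The driver loop above handles a possible leading quotient limb (qh) up front, consumes two dividend limbs per iteration with udiv_qr_4by2, and finishes with a single udiv_qr_3by2 step when nn is odd. The same flow at toy scale, with 8-bit limbs so each step can use plain 64-bit division (standalone sketch, not GMP code; all names are ad hoc):

#include <assert.h>
#include <stdint.h>

#define B 256u

/* Divide {np,nn} (least significant limb first) by the normalized two-limb
   divisor <d1,d0>: the nn-2 low quotient limbs go to qp, the remainder to
   rp[0..1], and the high quotient limb (0 or 1) is the return value. */
static unsigned
toy_div_qr_2 (uint8_t *qp, uint8_t *rp, const uint8_t *np, int nn,
              uint8_t d1, uint8_t d0)
{
  uint64_t d = (uint64_t) d1 * B + d0;
  uint64_t r = (uint64_t) np[nn-1] * B + np[nn-2];
  unsigned qh = 0;
  if (r >= d)                               /* same up-front reduction */
    {
      r -= d;
      qh = 1;
    }
  int i;
  for (i = nn - 2; i >= 2; i -= 2)          /* two limbs per iteration */
    {
      uint64_t n = r * B * B + (uint64_t) np[i-1] * B + np[i-2];
      qp[i-1] = (uint8_t) (n / d / B);
      qp[i-2] = (uint8_t) (n / d % B);
      r = n % d;
    }
  if (i > 0)                                /* odd nn: one last 3/2 step */
    {
      uint64_t n = r * B + np[0];
      qp[0] = (uint8_t) (n / d);
      r = n % d;
    }
  rp[1] = (uint8_t) (r / B);
  rp[0] = (uint8_t) (r % B);
  return qh;
}

int
main (void)
{
  /* 0x04d2162e divided by 0xbeef is 0x0676, remainder 0x7a04. */
  const uint8_t np[4] = { 0x2e, 0x16, 0xd2, 0x04 };
  uint8_t qp[2], rp[2];
  unsigned qh = toy_div_qr_2 (qp, rp, np, 4, 0xbe, 0xef);
  assert (qh == 0 && qp[1] == 0x06 && qp[0] == 0x76);
  assert (rp[1] == 0x7a && rp[0] == 0x04);
  return 0;
}
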
+
+
+/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
+   significant quotient limbs at qp and the 2-limb remainder at {rp,2}.
+ Return the most significant limb of the quotient.
+
+ Preconditions:
+ 1. qp must either not overlap with the other operands at all, or
+ qp >= np + 2 must hold true. (This means that it's possible to put
+ the quotient in the high part of {np,nn}, right above the remainder.)
+ 2. nn >= 2. */
+
+mp_limb_t
+mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+ mp_srcptr dp)
+{
+ mp_limb_t d1;
+ mp_limb_t d0;
+ gmp_pi1_t dinv;
+
+ ASSERT (nn >= 2);
+ ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
+ ASSERT_MPN (np, nn);
+ ASSERT_MPN (dp, 2);
+
+ d1 = dp[1]; d0 = dp[0];
+
+ ASSERT (d1 > 0);
+
+ if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
+ {
+ if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
+ {
+ gmp_pi1_t dinv;
+ invert_pi1 (dinv, d1, d0);
+ return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
+ }
+ else
+ {
+ mp_limb_t di[2];
+ invert_4by2 (di, d1, d0);
+ return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
+ }
+ }
+ else
+ {
+ int shift;
+ count_leading_zeros (shift, d1);
+ d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
+ d0 <<= shift;
+ invert_pi1 (dinv, d1, d0);
+ return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
+ }
+}
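
Since the file header warns against calling these internal entry points directly, a caller would normally reach this code through the documented low-level interface mpn_tdiv_qr; whether a 2-limb divisor actually routes to mpn_div_qr_2 is an internal detail. A minimal sketch (not from the GMP sources; the limb values are arbitrary) that divides a 4-limb number by a 2-limb divisor and multiplies back to check, built with -lgmp:

#include <assert.h>
#include <gmp.h>

int
main (void)
{
  mp_limb_t np[4] = { 5, 6, 7, 8 };   /* dividend, least significant limb first */
  mp_limb_t dp[2] = { 3, 4 };         /* two-limb divisor, dp[1] != 0 */
  mp_limb_t qp[3], rp[2];             /* nn-dn+1 quotient limbs, dn remainder limbs */
  mp_limb_t tp[5];

  mpn_tdiv_qr (qp, rp, 0, np, 4, dp, 2);

  /* Multiply back and add the remainder; the result must equal np. */
  mpn_mul (tp, qp, 3, dp, 2);                       /* tp[0..4] = q * d */
  assert (mpn_add (tp, tp, 5, rp, 2) == 0);
  assert (tp[4] == 0 && mpn_cmp (tp, np, 4) == 0);
  return 0;
}
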