Diffstat (limited to 'gmp-6.3.0/mpn/generic/div_qr_2.c')
-rw-r--r--  gmp-6.3.0/mpn/generic/div_qr_2.c  314
1 file changed, 314 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/generic/div_qr_2.c b/gmp-6.3.0/mpn/generic/div_qr_2.c
new file mode 100644
index 0000000..c3c8f57
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/div_qr_2.c
@@ -0,0 +1,314 @@
+/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
+ quotient. The divisor is two limbs.
+
+ Contributed to the GNU project by Torbjorn Granlund and Niels Möller
+
+ THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY
+ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef DIV_QR_2_PI2_THRESHOLD
+/* Disabled unless explicitly tuned. */
+#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
+#endif
+
+#ifndef SANITY_CHECK
+#define SANITY_CHECK 0
+#endif
+
+/* Define some longlong.h-style macros, but for wider operations.
+ * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
+ an additional sum operand.
+ * add_csaac accepts two addends and a carry in, and generates a sum and a
+ carry out. A little like a "full adder".
+*/
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
+
+#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "0" ((USItype)(s2)), \
+ "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
+ "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "0" ((UDItype)(s2)), \
+ "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
+ "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) \
+ __CLOBBER_CC)
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
+/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
+ processor running in 32-bit mode, since the carry flag then gets the 32-bit
+ carry. */
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3" \
+ : "=r" (s2), "=&r" (s1), "=&r" (s0) \
+ : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \
+ __CLOBBER_CC)
+#endif
+
+#endif /* __GNUC__ */
+
+#ifndef add_sssaaaa
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
+ do { \
+ UWtype __s0, __s1, __c0, __c1; \
+ __s0 = (a0) + (b0); \
+ __s1 = (a1) + (b1); \
+ __c0 = __s0 < (a0); \
+ __c1 = __s1 < (a1); \
+ (s0) = __s0; \
+ __s1 = __s1 + __c0; \
+ (s1) = __s1; \
+ (s2) += __c1 + (__s1 < __c0); \
+ } while (0)
+#endif
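
For readers not used to the longlong.h conventions, here is a standalone sketch (not part of the GMP sources; plain uint64_t stands in for UWtype, and the helper name add_sssaaaa_ref is made up) mirroring the generic C fallback above: (s1,s0) becomes the two-word sum of (a1,a0) and (b1,b0), and the carry out of that sum is added into the existing value of s2.

/* Standalone illustration of the generic add_sssaaaa fallback. */
#include <assert.h>
#include <stdint.h>

static void
add_sssaaaa_ref (uint64_t *s2, uint64_t *s1, uint64_t *s0,
                 uint64_t a1, uint64_t a0, uint64_t b1, uint64_t b0)
{
  uint64_t lo = a0 + b0;
  uint64_t c0 = lo < a0;          /* carry out of the low word */
  uint64_t hi = a1 + b1;
  uint64_t c1 = hi < a1;          /* carry out of the high word */
  hi += c0;
  *s0 = lo;
  *s1 = hi;
  *s2 += c1 + (hi < c0);          /* propagate into the third word */
}

int
main (void)
{
  uint64_t s2 = 0, s1, s0;
  /* (2^128 - 1) + 1 = 2^128, so the carry reaches s2: result (1, 0, 0). */
  add_sssaaaa_ref (&s2, &s1, &s0, UINT64_MAX, UINT64_MAX, 0, 1);
  assert (s2 == 1 && s1 == 0 && s0 == 0);
  return 0;
}
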
+
+/* Typically used with r1, r0 same as n3, n2. Other types of overlap
+ between inputs and outputs are not supported. */
+#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \
+ do { \
+ mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \
+ mp_limb_t _t1, _t0; \
+ mp_limb_t _mask; \
+ \
+ /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */ \
+ umul_ppmm (_q2,_q1, n2, di1); \
+ umul_ppmm (_q3,_q2a, n3, di1); \
+ ++_q2; /* _q2 cannot overflow */ \
+ add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a); \
+ umul_ppmm (_q2c,_q1c, n3, di0); \
+ add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c); \
+ umul_ppmm (_q1d,_q0, n2, di0); \
+ add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
+ add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d); \
+ \
+ umul_ppmm (_t1,_t0, _q2, d0); \
+ _t1 += _q2 * d1 + _q3 * d0; \
+ \
+ sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \
+ \
+ _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0))); /* (r1,r0) >= (q1,q0) */ \
+ add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask); \
+ sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask); \
+ \
+ if (UNLIKELY (r1 >= d1)) \
+ { \
+ if (r1 > d1 || r0 >= d0) \
+ { \
+ sub_ddmmss (r1, r0, r1, r0, d1, d0); \
+ add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
+ } \
+ } \
+ (q1) = _q3; \
+ (q0) = _q2; \
+ } while (0)
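
To make the estimate-then-correct structure of udiv_qr_4by2 easier to follow, here is a toy-scale transcription (standalone, not GMP code; all names are ad hoc): limbs shrink to 8 bits, so the whole 4-limb dividend fits in a uint64_t and the carry chains collapse into ordinary wide arithmetic. di plays the role of <di1,di0>, the quantity invert_4by2 computes, obtained here by direct division.

#include <assert.h>
#include <stdint.h>

#define B2 ((uint64_t) 1 << 16)            /* B^2, with B = 2^8 */

/* Preconditions mirror the real macro's use: d is normalized (d >= B^2/2),
   the two high limbs of n (n / B^2) are below d, so the quotient fits in
   two limbs, and di = floor((B^4-1)/d) - B^2. */
static void
toy_udiv_qr_4by2 (uint64_t *q, uint64_t *r,
                  uint64_t n, uint64_t d, uint64_t di)
{
  uint64_t n_hi = n / B2, n_lo = n % B2;
  /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0]:
     the top two limbs are the quotient estimate, the low two the fraction. */
  uint64_t est = n_hi * di + n + B2;
  uint64_t q_est = est / B2, frac = est % B2;
  /* Candidate remainder, reduced modulo B^2 exactly as sub_ddmmss does. */
  uint64_t rem = (n_lo - q_est * d) % B2;
  if (rem >= frac)                 /* first fix-up: estimate one too high */
    {
      rem = (rem + d) % B2;
      q_est--;
    }
  if (rem >= d)                    /* second fix-up: estimate one too low */
    {
      rem -= d;
      q_est++;
    }
  *q = q_est;
  *r = rem;
}

int
main (void)
{
  uint64_t d = 0xbe02;                      /* arbitrary normalized divisor */
  uint64_t di = 0xffffffffULL / d - B2;     /* what invert_4by2 would produce */
  for (uint64_t n = 0; n < B2 * d; n += 12345)
    {
      uint64_t q, r;
      toy_udiv_qr_4by2 (&q, &r, n, d, di);
      assert (q == n / d && r == n % d);
    }
  return 0;
}
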
+
+static void
+invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
+{
+ mp_limb_t v1, v0, p1, t1, t0, p0, mask;
+ invert_limb (v1, d1);
+ p1 = d1 * v1;
+ /* <1, v1> * d1 = <B-1, p1> */
+ p1 += d0;
+ if (p1 < d0)
+ {
+ v1--;
+ mask = -(mp_limb_t) (p1 >= d1);
+ p1 -= d1;
+ v1 += mask;
+ p1 -= mask & d1;
+ }
+ /* <1, v1> * d1 + d0 = <B-1, p1> */
+ umul_ppmm (t1, p0, d0, v1);
+ p1 += t1;
+ if (p1 < t1)
+ {
+ if (UNLIKELY (p1 >= d1))
+ {
+ if (p1 > d1 || p0 >= d0)
+ {
+ sub_ddmmss (p1, p0, p1, p0, d1, d0);
+ v1--;
+ }
+ }
+ sub_ddmmss (p1, p0, p1, p0, d1, d0);
+ v1--;
+ }
+ /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
+ * with <p1, p0> + <d1, d0> >= B^2.
+ *
+ * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
+ * partial remainder after <1, v1> is
+ *
+ * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
+ * = <~p1, ~p0, B-1>
+ */
+ udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
+ di[0] = v0;
+ di[1] = v1;
+
+#if SANITY_CHECK
+ {
+ mp_limb_t tp[4];
+ mp_limb_t dp[2];
+ dp[0] = d0;
+ dp[1] = d1;
+ mpn_mul_n (tp, dp, di, 2);
+ ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
+ ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
+ ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
+ ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
+ }
+#endif
+}
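
The SANITY_CHECK block above verifies, limb by limb, that <1, di1, di0> is exactly floor((B^4 - 1) / <d1,d0>). The same property can be checked at toy scale with B = 2^8, so plain 64-bit arithmetic suffices (standalone, not GMP code; the divisor value is arbitrary, and di below stands for <di1,di0> packed into one number).

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const uint64_t B = 1 << 8, B2 = B * B, B4m1 = B2 * B2 - 1;
  uint64_t d1 = 0xb3, d0 = 0x17;            /* d1 >= B/2, i.e. normalized */
  uint64_t d = d1 * B + d0;
  uint64_t inv = B4m1 / d;
  assert (inv / B2 == 1);                   /* inverse really is <1, di1, di0> */
  uint64_t di = inv - B2;                   /* the two limbs stored in di[] */
  /* Mirror of the SANITY_CHECK: d * <di1,di0> + d * B^2 has B-1 in both
     high limbs, and adding d once more carries out of the fourth limb. */
  uint64_t t = d * di + d * B2;
  assert (t / B2 == B2 - 1);
  assert (t + d > B4m1);
  return 0;
}
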
+
+static mp_limb_t
+mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+ mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
+{
+ mp_limb_t qh;
+ mp_size_t i;
+ mp_limb_t r1, r0;
+
+ ASSERT (nn >= 2);
+ ASSERT (d1 & GMP_NUMB_HIGHBIT);
+
+ r1 = np[nn-1];
+ r0 = np[nn-2];
+
+ qh = 0;
+ if (r1 >= d1 && (r1 > d1 || r0 >= d0))
+ {
+#if GMP_NAIL_BITS == 0
+ sub_ddmmss (r1, r0, r1, r0, d1, d0);
+#else
+ r0 = r0 - d0;
+ r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
+ r0 &= GMP_NUMB_MASK;
+#endif
+ qh = 1;
+ }
+
+ for (i = nn - 2; i >= 2; i -= 2)
+ {
+ mp_limb_t n1, n0, q1, q0;
+ n1 = np[i-1];
+ n0 = np[i-2];
+ udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
+ qp[i-1] = q1;
+ qp[i-2] = q0;
+ }
+
+ if (i > 0)
+ {
+ mp_limb_t q;
+ udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
+ qp[0] = q;
+ }
+ rp[1] = r1;
+ rp[0] = r0;
+
+ return qh;
+}
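
The driver loop above handles a possible leading quotient limb (qh) up front, consumes two dividend limbs per iteration with udiv_qr_4by2, and finishes with a single udiv_qr_3by2 step when nn is odd. The same flow at toy scale, with 8-bit limbs so each step can use plain 64-bit division (standalone sketch, not GMP code; all names are ad hoc):

#include <assert.h>
#include <stdint.h>

#define B 256u

/* Divide {np,nn} (least significant limb first) by the normalized two-limb
   divisor <d1,d0>: the nn-2 low quotient limbs go to qp, the remainder to
   rp[0..1], and the high quotient limb (0 or 1) is the return value. */
static unsigned
toy_div_qr_2 (uint8_t *qp, uint8_t *rp, const uint8_t *np, int nn,
              uint8_t d1, uint8_t d0)
{
  uint64_t d = (uint64_t) d1 * B + d0;
  uint64_t r = (uint64_t) np[nn-1] * B + np[nn-2];
  unsigned qh = 0;
  if (r >= d)                               /* same up-front reduction */
    {
      r -= d;
      qh = 1;
    }
  int i;
  for (i = nn - 2; i >= 2; i -= 2)          /* two limbs per iteration */
    {
      uint64_t n = r * B * B + (uint64_t) np[i-1] * B + np[i-2];
      qp[i-1] = (uint8_t) (n / d / B);
      qp[i-2] = (uint8_t) (n / d % B);
      r = n % d;
    }
  if (i > 0)                                /* odd nn: one last 3/2 step */
    {
      uint64_t n = r * B + np[0];
      qp[0] = (uint8_t) (n / d);
      r = n % d;
    }
  rp[1] = (uint8_t) (r / B);
  rp[0] = (uint8_t) (r % B);
  return qh;
}

int
main (void)
{
  /* 0x04d2162e divided by 0xbeef is 0x0676, remainder 0x7a04. */
  const uint8_t np[4] = { 0x2e, 0x16, 0xd2, 0x04 };
  uint8_t qp[2], rp[2];
  unsigned qh = toy_div_qr_2 (qp, rp, np, 4, 0xbe, 0xef);
  assert (qh == 0 && qp[1] == 0x06 && qp[0] == 0x76);
  assert (rp[1] == 0x7a && rp[0] == 0x04);
  return 0;
}
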
+
+
+/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
+   significant quotient limbs at qp and the 2-limb remainder at {rp,2}.
+ Return the most significant limb of the quotient.
+
+ Preconditions:
+ 1. qp must either not overlap with the other operands at all, or
+ qp >= np + 2 must hold true. (This means that it's possible to put
+ the quotient in the high part of {np,nn}, right above the remainder.)
+ 2. nn >= 2. */
+
+mp_limb_t
+mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+ mp_srcptr dp)
+{
+ mp_limb_t d1;
+ mp_limb_t d0;
+ gmp_pi1_t dinv;
+
+ ASSERT (nn >= 2);
+ ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
+ ASSERT_MPN (np, nn);
+ ASSERT_MPN (dp, 2);
+
+ d1 = dp[1]; d0 = dp[0];
+
+ ASSERT (d1 > 0);
+
+ if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
+ {
+ if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
+ {
+ gmp_pi1_t dinv;
+ invert_pi1 (dinv, d1, d0);
+ return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
+ }
+ else
+ {
+ mp_limb_t di[2];
+ invert_4by2 (di, d1, d0);
+ return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
+ }
+ }
+ else
+ {
+ int shift;
+ count_leading_zeros (shift, d1);
+ d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
+ d0 <<= shift;
+ invert_pi1 (dinv, d1, d0);
+ return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
+ }
+}
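
Since the file header warns against calling these internal entry points directly, a caller would normally reach this code through the documented low-level interface mpn_tdiv_qr; whether a 2-limb divisor actually routes to mpn_div_qr_2 is an internal detail. A minimal sketch (not from the GMP sources; the limb values are arbitrary) that divides a 4-limb number by a 2-limb divisor and multiplies back to check, built with -lgmp:

#include <assert.h>
#include <gmp.h>

int
main (void)
{
  mp_limb_t np[4] = { 5, 6, 7, 8 };   /* dividend, least significant limb first */
  mp_limb_t dp[2] = { 3, 4 };         /* two-limb divisor, dp[1] != 0 */
  mp_limb_t qp[3], rp[2];             /* nn-dn+1 quotient limbs, dn remainder limbs */
  mp_limb_t tp[5];

  mpn_tdiv_qr (qp, rp, 0, np, 4, dp, 2);

  /* Multiply back and add the remainder; the result must equal np. */
  mpn_mul (tp, qp, 3, dp, 2);                       /* tp[0..4] = q * d */
  assert (mpn_add (tp, tp, 5, rp, 2) == 0);
  assert (tp[4] == 0 && mpn_cmp (tp, np, 4) == 0);
  return 0;
}
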