From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c | 505 ++++++++++++++++++++++++++++++++++ 1 file changed, 505 insertions(+) create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c (limited to 'gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c') diff --git a/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c b/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c new file mode 100644 index 0000000..4977131 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c @@ -0,0 +1,505 @@ +/* mpn_div_qr_1n_pi1 + + Contributed to the GNU project by Niels Möller + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#if GMP_NAIL_BITS > 0 +#error Nail bits not supported +#endif + +#ifndef DIV_QR_1N_METHOD +#define DIV_QR_1N_METHOD 2 +#endif + +/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */ + +#if defined (__GNUC__) && ! 
defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %k2\n\t" \ + "adc %4, %k1\n\t" \ + "sbb %k0, %k0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %q2\n\t" \ + "adc %4, %q1\n\t" \ + "sbb %q0, %q0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxcc %r3, %4, %1\n\t" \ + "subx %%g0, %%g0, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addccc %r7, %8, %%g0\n\t" \ + "addccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \ + "rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#if __VIS__ >= 0x300 +#undef add_mssaaaa +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. 
*/ +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add%I6c %2, %5, %6\n\t" \ + "adde %1, %3, %4\n\t" \ + "subfe %0, %0, %0\n\t" \ + "nor %0, %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if defined (__s390x__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "algr %2, %6\n\t" \ + "alcgr %1, %4\n\t" \ + "lghi %0, 0\n\t" \ + "alcgr %0, %0\n\t" \ + "lcgr %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC) +#endif + +#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %5, %6\n\t" \ + "adcs %1, %3, %4\n\t" \ + "movcc %0, #0\n\t" \ + "movcs %0, #-1" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %x5, %6\n\t" \ + "adcs %1, %x3, %x4\n\t" \ + "csinv %0, xzr, xzr, cc\n\t" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rZ" (ah), "rZ" (bh), "%rZ" (al), "rI" (bl) __CLOBBER_CC) +#endif +#endif /* defined (__GNUC__) */ + +#ifndef add_mssaaaa +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (m) = - (__c1 + (__s1 < __c0)); \ + } while (0) +#endif + +#if DIV_QR_1N_METHOD == 1 + +/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}. + Requires that uh < d. */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh, + mp_limb_t d, mp_limb_t dinv) +{ + ASSERT (n > 0); + ASSERT (uh < d); + ASSERT (d & GMP_NUMB_HIGHBIT); + ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n)); + + do + { + mp_limb_t q, ul; + + ul = up[--n]; + udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv); + qp[n] = q; + } + while (n > 0); + + return uh; +} + +#elif DIV_QR_1N_METHOD == 2 + +/* The main idea of this algorithm is to write B^2 = d (B + dinv) + + B2, where 1 <= B2 < d. Similarly to mpn_mod_1_1p, each iteration + can then replace + + u1 B^2 = u1 B2 (mod d) + + which gives a very short critical path for computing the remainder + (with some tricks to handle the carry when the next two lower limbs + are added in). To also get the quotient, include the corresponding + multiple of d in the expression, + + u1 B^2 = u1 B2 + (u1 dinv + u1 B) d + + We get the quotient by accumulating the (u1 dinv + u1 B) terms. The + two multiplies, u1 * B2 and u1 * dinv, are independent, and can be + executed in parallel. + */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t u0, u2; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + u0 = up[n-1]; /* Early read, to allow qp == up. 
*/ + qp[n-1] = q1; + + add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0); + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + + /* Additions for the q update: + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+---+ + * | 1 | v | (conditional on u2) + * +---+---+ + * | 1 | (conditional on u0 + u2 B2 carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + */ + umul_ppmm (p1, t, u1, dinv); + ADDC_LIMB (cy, u0, u0, u2 & B2); + u0 -= (-cy) & d; + add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1); + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0); + q0 = t; + + /* Note that p1 + cy cannot overflow */ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1 + cy); + + umul_ppmm (p1, p0, u1, B2); + + qp[j+1] = q1; + MPN_INCR_U (qp+j+2, n-j-2, q2); + + add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0); + } + + q1 = (u2 > 0); + u1 -= (-q1) & d; + + t = (u1 >= d); + q1 += t; + u1 -= (-t) & d; + + udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv); + add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t); + + MPN_INCR_U (qp+1, n-1, q1); + + qp[0] = q0; + return u0; +} + +#elif DIV_QR_1N_METHOD == 3 + +/* This variant handles carry from the u update earlier. This gives a + longer critical path, but reduces the work needed for the + quotients. */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t cy, u0; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + u0 = up[n-1]; /* Early read, to allow qp == up. */ + + add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0); + u1 -= cy & d; + q1 -= cy; + qp[n-1] = q1; + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + mp_limb_t t1, t0; + + /* Additions for the q update: + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+ + * | 1 | (conditional on {u1, u0} carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + * + * Additions for the u update: + * +-------+ + * |u1 * B2| + * +---+---+ + * + |u0 |u-1| + * +---+---+ + * - | d | (conditional on carry) + * ---+---+---+ + * |u1 | u0| + * +---+---+ + * + */ + umul_ppmm (p1, p0, u1, B2); + ADDC_LIMB (q2, q1, u1, q0); + umul_ppmm (t1, t0, u1, dinv); + add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0); + u1 -= cy & d; + + /* t1 <= B-2, so cy can be added in without overflow. 
*/ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy); + q0 = t0; + + /* Final q update */ + qp[j+1] = q1; + MPN_INCR_U (qp+j+2, n-j-2, q2); + } + + q1 = (u1 >= d); + u1 -= (-q1) & d; + + udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv); + add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t); + + MPN_INCR_U (qp+1, n-1, q1); + + qp[0] = q0; + return u0; +} + +#elif DIV_QR_1N_METHOD == 4 + +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t u2, u0; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t B2d0, B2d1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + /* B2 * (B-d) */ + umul_ppmm (B2d1, B2d0, B2, -d); + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + + add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0); + + /* After read of up[n-1], to allow qp == up. */ + qp[n-1] = q1 - u2; + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + mp_limb_t t1, t0; + + /* Additions for the q update. *After* u1 -= u2 & d adjustment. + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+ + * | 1 | (conditional on {u1, u0} carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + * + * Additions for the u update. *Before* u1 -= u2 & d adjstment. + * +-------+ + * |u1 * B2| + * +---+---+ + * |u0 |u-1| + * +---+---+ + + + |B2(B-d)| (conditional on u2) + * -+---+---+---+ + * |u2 |u1 | u0| + * +---+---+---+ + * + */ + /* Multiply with unadjusted u1, to shorten critical path. */ + umul_ppmm (p1, p0, u1, B2); + u1 -= (d & u2); + ADDC_LIMB (q2, q1, u1, q0); + umul_ppmm (t1, t0, u1, dinv); + + add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0); + add_mssaaaa (u2, u1, u0, u1, u0, p1, p0); + u2 += cy; + ASSERT(-u2 <= 1); + + /* t1 <= B-2, so u2 can be added in without overflow. */ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2); + q0 = t0; + + /* Final q update */ + qp[j+1] = q1; + MPN_INCR_U (qp+j+2, n-j-2, q2); + } + u1 -= u2 & d; + + q1 = (u1 >= d); + u1 -= (-q1) & d; + + udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv); + add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t); + + MPN_INCR_U (qp+1, n-1, q1); + + qp[0] = q0; + return u0; +} +#else +#error Unknown DIV_QR_1N_METHOD +#endif -- cgit v1.2.3
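For reference, the machinery in this file can be exercised outside of GMP with a small standalone program. The sketch below is not part of the patch and uses none of GMP's internal interfaces: it assumes a GCC/Clang-style unsigned __int128 on a 64-bit target, and the helper names invert_limb_ref, div2by1_ref and div_qr_1n_ref are invented for the example. It mimics the simple DIV_QR_1N_METHOD == 1 loop, computes the same dinv value that GMP precomputes (dinv = floor((B^2 - 1)/d) - B for a normalized d), and checks the identity B^2 = d*(B + dinv) + B2 that methods 2-4 build on, with B2 = -d*dinv as in the source.

/* Standalone sketch (not GMP code): a reference rendering of the
   DIV_QR_1N_METHOD == 1 loop with 64-bit limbs, using unsigned
   __int128 in place of umul_ppmm and udiv_qrnnd_preinv.  The helper
   names below are invented for this example. */

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t limb;
typedef unsigned __int128 dlimb;        /* double limb; B = 2^64 */

/* dinv = floor((B^2 - 1)/d) - B, for a normalized d (high bit set). */
static limb
invert_limb_ref (limb d)
{
  assert (d >> 63);
  return (limb) (~(dlimb) 0 / d - ((dlimb) 1 << 64));
}

/* One 2/1 division step in the style of udiv_qrnnd_preinv (the
   Moller-Granlund "div2by1" algorithm): divides u1*B + u0 by d, with
   u1 < d, returning the quotient limb and storing the remainder in
   *r.  The uint64_t arithmetic wraps mod B, which is intended. */
static limb
div2by1_ref (limb *r, limb u1, limb u0, limb d, limb dinv)
{
  dlimb p = (dlimb) u1 * dinv + (((dlimb) u1 << 64) | u0);
  limb q1 = (limb) (p >> 64) + 1;       /* candidate quotient */
  limb q0 = (limb) p;
  limb rem = u0 - q1 * d;

  if (rem > q0)                         /* candidate was one too large */
    {
      q1--;
      rem += d;
    }
  if (rem >= d)                         /* rare second adjustment */
    {
      q1++;
      rem -= d;
    }
  *r = rem;
  return q1;
}

/* High-to-low loop, one quotient limb per step, as in method 1:
   divides uh*B^n + {up, n} by d; quotient to {qp, n}, remainder
   returned.  Requires uh < d. */
static limb
div_qr_1n_ref (limb *qp, const limb *up, size_t n,
               limb uh, limb d, limb dinv)
{
  while (n-- > 0)
    qp[n] = div2by1_ref (&uh, uh, up[n], d, dinv);
  return uh;
}

int
main (void)
{
  limb d = 0xabcdef0123456789ULL;       /* any divisor with the high bit set */
  limb dinv = invert_limb_ref (d);
  limb B2 = -d * dinv;                  /* same expression as in the file */

  /* B^2 = d*(B + dinv) + B2: the 128-bit sum wraps to exactly 0. */
  assert (((dlimb) d << 64) + (dlimb) d * dinv + B2 == 0);

  limb up[4] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL,
                 0xdeadbeefdeadbeefULL, 0x0011223344556677ULL };
  limb qp[4];
  limb r = div_qr_1n_ref (qp, up, 4, 0, d, dinv);

  /* Cross-check every step against direct 128-bit division. */
  limb rr = 0;
  for (size_t i = 4; i-- > 0; )
    {
      dlimb num = ((dlimb) rr << 64) | up[i];
      assert (qp[i] == (limb) (num / d));
      rr = (limb) (num % d);
    }
  assert (rr == r);

  printf ("remainder = 0x%016llx\n", (unsigned long long) r);
  return 0;
}

The per-step assertion against direct 128-bit division makes the wrap-around arithmetic in div2by1_ref easy to sanity-check. The real file never performs a wide division at runtime: method 1 applies udiv_qrnnd_preinv once per limb, while methods 2-4 shorten the critical path further by issuing the two independent products u1 * dinv and u1 * B2 in parallel each iteration, exactly as described in the method 2 comment above.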