From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c | 505 ++++++++++++++++++++++++++++++++++ 1 file changed, 505 insertions(+) create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c (limited to 'gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c') diff --git a/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c b/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c new file mode 100644 index 0000000..4977131 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c @@ -0,0 +1,505 @@ +/* mpn_div_qr_1n_pi1 + + Contributed to the GNU project by Niels Möller + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#if GMP_NAIL_BITS > 0 +#error Nail bits not supported +#endif + +#ifndef DIV_QR_1N_METHOD +#define DIV_QR_1N_METHOD 2 +#endif + +/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */ + +#if defined (__GNUC__) && ! 
defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %k2\n\t" \ + "adc %4, %k1\n\t" \ + "sbb %k0, %k0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %q2\n\t" \ + "adc %4, %q1\n\t" \ + "sbb %q0, %q0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxcc %r3, %4, %1\n\t" \ + "subx %%g0, %%g0, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addccc %r7, %8, %%g0\n\t" \ + "addccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \ + "rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#if __VIS__ >= 0x300 +#undef add_mssaaaa +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. 
*/ +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add%I6c %2, %5, %6\n\t" \ + "adde %1, %3, %4\n\t" \ + "subfe %0, %0, %0\n\t" \ + "nor %0, %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if defined (__s390x__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "algr %2, %6\n\t" \ + "alcgr %1, %4\n\t" \ + "lghi %0, 0\n\t" \ + "alcgr %0, %0\n\t" \ + "lcgr %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC) +#endif + +#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %5, %6\n\t" \ + "adcs %1, %3, %4\n\t" \ + "movcc %0, #0\n\t" \ + "movcs %0, #-1" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %x5, %6\n\t" \ + "adcs %1, %x3, %x4\n\t" \ + "csinv %0, xzr, xzr, cc\n\t" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rZ" (ah), "rZ" (bh), "%rZ" (al), "rI" (bl) __CLOBBER_CC) +#endif +#endif /* defined (__GNUC__) */ + +#ifndef add_mssaaaa +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (m) = - (__c1 + (__s1 < __c0)); \ + } while (0) +#endif + +#if DIV_QR_1N_METHOD == 1 + +/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}. + Requires that uh < d. */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh, + mp_limb_t d, mp_limb_t dinv) +{ + ASSERT (n > 0); + ASSERT (uh < d); + ASSERT (d & GMP_NUMB_HIGHBIT); + ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n)); + + do + { + mp_limb_t q, ul; + + ul = up[--n]; + udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv); + qp[n] = q; + } + while (n > 0); + + return uh; +} + +#elif DIV_QR_1N_METHOD == 2 + +/* The main idea of this algorithm is to write B^2 = d (B + dinv) + + B2, where 1 <= B2 < d. Similarly to mpn_mod_1_1p, each iteration + can then replace + + u1 B^2 = u1 B2 (mod d) + + which gives a very short critical path for computing the remainder + (with some tricks to handle the carry when the next two lower limbs + are added in). To also get the quotient, include the corresponding + multiple of d in the expression, + + u1 B^2 = u1 B2 + (u1 dinv + u1 B) d + + We get the quotient by accumulating the (u1 dinv + u1 B) terms. The + two multiplies, u1 * B2 and u1 * dinv, are independent, and can be + executed in parallel. + */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t u0, u2; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + u0 = up[n-1]; /* Early read, to allow qp == up. 
*/ + qp[n-1] = q1; + + add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0); + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + + /* Additions for the q update: + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+---+ + * | 1 | v | (conditional on u2) + * +---+---+ + * | 1 | (conditional on u0 + u2 B2 carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + */ + umul_ppmm (p1, t, u1, dinv); + ADDC_LIMB (cy, u0, u0, u2 & B2); + u0 -= (-cy) & d; + add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1); + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0); + q0 = t; + + /* Note that p1 + cy cannot overflow */ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1 + cy); + + umul_ppmm (p1, p0, u1, B2); + + qp[j+1] = q1; + MPN_INCR_U (qp+j+2, n-j-2, q2); + + add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0); + } + + q1 = (u2 > 0); + u1 -= (-q1) & d; + + t = (u1 >= d); + q1 += t; + u1 -= (-t) & d; + + udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv); + add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t); + + MPN_INCR_U (qp+1, n-1, q1); + + qp[0] = q0; + return u0; +} + +#elif DIV_QR_1N_METHOD == 3 + +/* This variant handles carry from the u update earlier. This gives a + longer critical path, but reduces the work needed for the + quotients. */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t cy, u0; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + u0 = up[n-1]; /* Early read, to allow qp == up. */ + + add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0); + u1 -= cy & d; + q1 -= cy; + qp[n-1] = q1; + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + mp_limb_t t1, t0; + + /* Additions for the q update: + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+ + * | 1 | (conditional on {u1, u0} carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + * + * Additions for the u update: + * +-------+ + * |u1 * B2| + * +---+---+ + * + |u0 |u-1| + * +---+---+ + * - | d | (conditional on carry) + * ---+---+---+ + * |u1 | u0| + * +---+---+ + * + */ + umul_ppmm (p1, p0, u1, B2); + ADDC_LIMB (q2, q1, u1, q0); + umul_ppmm (t1, t0, u1, dinv); + add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0); + u1 -= cy & d; + + /* t1 <= B-2, so cy can be added in without overflow. 
*/ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy); + q0 = t0; + + /* Final q update */ + qp[j+1] = q1; + MPN_INCR_U (qp+j+2, n-j-2, q2); + } + + q1 = (u1 >= d); + u1 -= (-q1) & d; + + udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv); + add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t); + + MPN_INCR_U (qp+1, n-1, q1); + + qp[0] = q0; + return u0; +} + +#elif DIV_QR_1N_METHOD == 4 + +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t u2, u0; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t B2d0, B2d1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + /* B2 * (B-d) */ + umul_ppmm (B2d1, B2d0, B2, -d); + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + + add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0); + + /* After read of up[n-1], to allow qp == up. */ + qp[n-1] = q1 - u2; + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + mp_limb_t t1, t0; + + /* Additions for the q update. *After* u1 -= u2 & d adjustment. + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+ + * | 1 | (conditional on {u1, u0} carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + * + * Additions for the u update. *Before* u1 -= u2 & d adjstment. + * +-------+ + * |u1 * B2| + * +---+---+ + * |u0 |u-1| + * +---+---+ + + + |B2(B-d)| (conditional on u2) + * -+---+---+---+ + * |u2 |u1 | u0| + * +---+---+---+ + * + */ + /* Multiply with unadjusted u1, to shorten critical path. */ + umul_ppmm (p1, p0, u1, B2); + u1 -= (d & u2); + ADDC_LIMB (q2, q1, u1, q0); + umul_ppmm (t1, t0, u1, dinv); + + add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0); + add_mssaaaa (u2, u1, u0, u1, u0, p1, p0); + u2 += cy; + ASSERT(-u2 <= 1); + + /* t1 <= B-2, so u2 can be added in without overflow. */ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2); + q0 = t0; + + /* Final q update */ + qp[j+1] = q1; + MPN_INCR_U (qp+j+2, n-j-2, q2); + } + u1 -= u2 & d; + + q1 = (u1 >= d); + u1 -= (-q1) & d; + + udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv); + add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t); + + MPN_INCR_U (qp+1, n-1, q1); + + qp[0] = q0; + return u0; +} +#else +#error Unknown DIV_QR_1N_METHOD +#endif -- cgit v1.2.3
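For reference, the machinery in this file can be exercised outside of GMP with a small standalone program. The sketch below is not part of the patch and uses none of GMP's internal interfaces: it assumes a GCC/Clang-style unsigned __int128 on a 64-bit target, and the helper names invert_limb_ref, div2by1_ref and div_qr_1n_ref are invented for the example. It mimics the simple DIV_QR_1N_METHOD == 1 loop, computes the same dinv value that GMP precomputes (dinv = floor((B^2 - 1)/d) - B for a normalized d), and checks the identity B^2 = d*(B + dinv) + B2 that methods 2-4 build on, with B2 = -d*dinv as in the source.

/* Standalone sketch (not GMP code): a reference rendering of the
   DIV_QR_1N_METHOD == 1 loop with 64-bit limbs, using unsigned
   __int128 in place of umul_ppmm and udiv_qrnnd_preinv.  The helper
   names below are invented for this example. */

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t limb;
typedef unsigned __int128 dlimb;        /* double limb; B = 2^64 */

/* dinv = floor((B^2 - 1)/d) - B, for a normalized d (high bit set). */
static limb
invert_limb_ref (limb d)
{
  assert (d >> 63);
  return (limb) (~(dlimb) 0 / d - ((dlimb) 1 << 64));
}

/* One 2/1 division step in the style of udiv_qrnnd_preinv (the
   Moller-Granlund "div2by1" algorithm): divides u1*B + u0 by d, with
   u1 < d, returning the quotient limb and storing the remainder in
   *r.  The uint64_t arithmetic wraps mod B, which is intended. */
static limb
div2by1_ref (limb *r, limb u1, limb u0, limb d, limb dinv)
{
  dlimb p = (dlimb) u1 * dinv + (((dlimb) u1 << 64) | u0);
  limb q1 = (limb) (p >> 64) + 1;       /* candidate quotient */
  limb q0 = (limb) p;
  limb rem = u0 - q1 * d;

  if (rem > q0)                         /* candidate was one too large */
    {
      q1--;
      rem += d;
    }
  if (rem >= d)                         /* rare second adjustment */
    {
      q1++;
      rem -= d;
    }
  *r = rem;
  return q1;
}

/* High-to-low loop, one quotient limb per step, as in method 1:
   divides uh*B^n + {up, n} by d; quotient to {qp, n}, remainder
   returned.  Requires uh < d. */
static limb
div_qr_1n_ref (limb *qp, const limb *up, size_t n,
               limb uh, limb d, limb dinv)
{
  while (n-- > 0)
    qp[n] = div2by1_ref (&uh, uh, up[n], d, dinv);
  return uh;
}

int
main (void)
{
  limb d = 0xabcdef0123456789ULL;       /* any divisor with the high bit set */
  limb dinv = invert_limb_ref (d);
  limb B2 = -d * dinv;                  /* same expression as in the file */

  /* B^2 = d*(B + dinv) + B2: the 128-bit sum wraps to exactly 0. */
  assert (((dlimb) d << 64) + (dlimb) d * dinv + B2 == 0);

  limb up[4] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL,
                 0xdeadbeefdeadbeefULL, 0x0011223344556677ULL };
  limb qp[4];
  limb r = div_qr_1n_ref (qp, up, 4, 0, d, dinv);

  /* Cross-check every step against direct 128-bit division. */
  limb rr = 0;
  for (size_t i = 4; i-- > 0; )
    {
      dlimb num = ((dlimb) rr << 64) | up[i];
      assert (qp[i] == (limb) (num / d));
      rr = (limb) (num % d);
    }
  assert (rr == r);

  printf ("remainder = 0x%016llx\n", (unsigned long long) r);
  return 0;
}

The per-step assertion against direct 128-bit division makes the wrap-around arithmetic in div2by1_ref easy to sanity-check. The real file never performs a wide division at runtime: method 1 applies udiv_qrnnd_preinv once per limb, while methods 2-4 shorten the critical path further by issuing the two independent products u1 * dinv and u1 * B2 in parallel each iteration, exactly as described in the method 2 comment above.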