From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c | 236 ++++++++++++++++++++++++++++++++++
 1 file changed, 236 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c

diff --git a/gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c b/gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c
new file mode 100644
index 0000000..ea38e3c
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c
@@ -0,0 +1,236 @@
+/* mpn_div_qr_1u_pi2.
+
+   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
+   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2013, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* ISSUES:
+
+   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?
+
+   * Are there any problems with generating n quotient limbs in the q area?  It
+     surely simplifies things.
+
+   * Not yet adequately tested.
+*/
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Define some longlong.h-style macros, but for wider operations.
+   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
+     an additional sum operand.
+*/
+#if defined (__GNUC__) && ! defined (NO_ASM)
+
+#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"               \
+           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+           : "0"  ((USItype)(s2)),                                      \
+             "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
+             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"               \
+           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+           : "0"  ((UDItype)(s2)),                                      \
+             "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
+             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\
+           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+           : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)    \
+             __CLOBBER_CC)
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
+/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
+   processor running in 32-bit mode, since the carry flag then gets the 32-bit
+   carry.  */
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"        \
+           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
+           : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0)         \
+             __CLOBBER_CC)
+#endif
+
+#endif /* __GNUC__ */
+
+#ifndef add_sssaaaa
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
+  do {                                                                  \
+    UWtype __s0, __s1, __c0, __c1;                                      \
+    __s0 = (a0) + (b0);                                                 \
+    __s1 = (a1) + (b1);                                                 \
+    __c0 = __s0 < (a0);                                                 \
+    __c1 = __s1 < (a1);                                                 \
+    (s0) = __s0;                                                        \
+    __s1 = __s1 + __c0;                                                 \
+    (s1) = __s1;                                                        \
+    (s2) += __c1 + (__s1 < __c0);                                       \
+  } while (0)
+#endif
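+
+/* Concretely, add_sssaaaa (s2,s1,s0, a1,a0, b1,b0) sets
+   (s1,s0) = (a1,a0) + (b1,b0) and adds the carry-out into s2, which,
+   unlike the other operands, is read-modify-write.  For example, with
+   B = 2^GMP_NUMB_BITS and s2 = 5 on entry, adding (a1,a0) = (B-1,B-1)
+   to (b1,b0) = (0,1) leaves (s2,s1,s0) = (6,0,0), since
+   (B-1)*B + (B-1) + 1 = B^2.  */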
+
+struct precomp_div_1_pi2
+{
+  mp_limb_t dip[2];
+  mp_limb_t d;
+  int norm_cnt;
+};
+
+mp_limb_t
+mpn_div_qr_1u_pi2 (mp_ptr qp,
+                   mp_srcptr up, mp_size_t un,
+                   struct precomp_div_1_pi2 *pd)
+{
+  mp_size_t i;
+  mp_limb_t r, u2, u1, u0;
+  mp_limb_t d0, di1, di0;
+  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
+  mp_limb_t cnd;
+  int cnt;
+
+  ASSERT (un >= 2);
+  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) == 0);
+  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
+  ASSERT_MPN (up, un);
+
+#define q3 q3a
+#define q2 q2b
+#define q1 q1b
+
+  up += un - 3;
+  cnt = pd->norm_cnt;
+  r = up[2] >> (GMP_NUMB_BITS - cnt);
+  d0 = pd->d << cnt;
+
+  qp += un - 2;
+
+  di1 = pd->dip[1];
+  di0 = pd->dip[0];
+
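+  /* Main loop: each pass consumes two dividend limbs and produces two
+     quotient limbs.  The three-limb shifted dividend <u2,u1,u0> (u2 is
+     the remainder carried in through r) is multiplied by the two-limb
+     inverse <di1,di0>, the partial products being accumulated with
+     add_sssaaaa into a quotient candidate <q3,q2>; the candidate
+     remainder is then brought below d0 by at most two conditional
+     single-unit corrections.  */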
+  for (i = un - 3; i >= 0; i -= 2)
+    {
+      u2 = r;
+      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
+      u0 = (up[1] << cnt) | (up[0] >> (GMP_NUMB_BITS - cnt));
+
+      /* Dividend in {r,u1,u0} */
+
+      umul_ppmm (q1d,q0d, u1, di0);
+      umul_ppmm (q2b,q1b, u1, di1);
+      q2b++;                            /* cannot spill */
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);
+
+      umul_ppmm (q2c,q1c, u2, di0);
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
+      umul_ppmm (q3a,q2a, u2, di1);
+
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);
+
+      q3 += r;
+
+      r = u0 - q2 * d0;
+
+      cnd = (r >= q1);
+      r += d0 & -cnd;
+      sub_ddmmss (q3,q2, q3,q2, 0,cnd);
+
+      if (UNLIKELY (r >= d0))
+	{
+	  r -= d0;
+	  add_ssaaaa (q3,q2, q3,q2, 0,1);
+	}
+
+      qp[0] = q2;
+      qp[1] = q3;
+
+      up -= 2;
+      qp -= 2;
+    }
+
+  if ((un & 1) != 0)
+    {
+      u2 = r;
+      u1 = (up[2] << cnt);
+
+      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
+      qp[1] = q3;
+    }
+  else
+    {
+      u2 = r;
+      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
+      u0 = (up[1] << cnt);
+
+      /* Dividend in {r,u1,u0} */
+
+      umul_ppmm (q1d,q0d, u1, di0);
+      umul_ppmm (q2b,q1b, u1, di1);
+      q2b++;                            /* cannot spill */
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);
+
+      umul_ppmm (q2c,q1c, u2, di0);
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
+      umul_ppmm (q3a,q2a, u2, di1);
+
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);
+
+      q3 += r;
+
+      r = u0 - q2 * d0;
+
+      cnd = (r >= q1);
+      r += d0 & -cnd;
+      sub_ddmmss (q3,q2, q3,q2, 0,cnd);
+
+      if (UNLIKELY (r >= d0))
+	{
+	  r -= d0;
+	  add_ssaaaa (q3,q2, q3,q2, 0,1);
+	}
+
+      qp[0] = q2;
+      qp[1] = q3;
+    }
+
+  return r >> cnt;
+
+#undef q3
+#undef q2
+#undef q1
+}
--
cgit v1.2.3
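
The piece of the patch most easily exercised in isolation is the portable
add_sssaaaa fallback.  The following standalone harness is a minimal sketch,
not part of the patch itself: it assumes 64-bit limbs and a compiler
providing GCC-style unsigned __int128, which is used only as the reference
implementation.

/* check_add_sssaaaa.c -- cross-check the generic add_sssaaaa fallback
   against 128-bit arithmetic.  Illustrative only; not GMP source.  */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t UWtype;        /* stand-in for GMP's limb type */

#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (s2) += __c1 + (__s1 < __c0);                                       \
  } while (0)

int
main (void)
{
  UWtype x = 1;                 /* xorshift state for test inputs */
  for (int i = 0; i < 100000; i++)
    {
      x ^= x << 13; x ^= x >> 7; x ^= x << 17;
      UWtype a1 = x, a0 = x * 0x9e3779b97f4a7c15u;
      UWtype b1 = a0 ^ (a0 >> 29), b0 = a1 + a0;
      UWtype s2 = (UWtype) i, s1 = 0, s0 = 0;

      add_sssaaaa (s2, s1, s0, a1, a0, b1, b0);

      /* Reference: 128-bit add; carry-out occurred iff the sum wrapped.  */
      unsigned __int128 a = ((unsigned __int128) a1 << 64) | a0;
      unsigned __int128 s = a + ((((unsigned __int128) b1) << 64) | b0);
      assert (s0 == (UWtype) s);
      assert (s1 == (UWtype) (s >> 64));
      assert (s2 == (UWtype) i + (s < a));
    }
  puts ("add_sssaaaa fallback: all checks passed");
  return 0;
}

On x86, amd64, aarch64, and powerpc builds GMP would normally select one of
the inline-asm variants instead; the harness always takes the generic path,
which is the one worth checking by hand.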