From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie <antigravityd@gmail.com>
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/generic/sec_powm.c | 430 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 430 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/generic/sec_powm.c

(limited to 'gmp-6.3.0/mpn/generic/sec_powm.c')

diff --git a/gmp-6.3.0/mpn/generic/sec_powm.c b/gmp-6.3.0/mpn/generic/sec_powm.c
new file mode 100644
index 0000000..bba11cf
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/sec_powm.c
@@ -0,0 +1,430 @@
+/* mpn_sec_powm -- Compute R = U^E mod M.  Secure variant, side-channel silent
+   under the assumption that the multiply instruction is side channel silent.
+
+   Contributed to the GNU project by Torbjörn Granlund.
+
+Copyright 2007-2009, 2011-2014, 2018-2019, 2021 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/*
+  BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd.
+
+  1. T <- (B^n * U) mod M; convert to REDC form
+
+  2. Compute table U^0, U^1, U^2... of floor(log(E))-dependent size
+
+  3. While there are more bits in E
+       W <- power left-to-right base-k
+
+  The article "Defeating modexp side-channel attacks with data-independent
+  execution traces", https://gmplib.org/~tege/modexp-silent.pdf, has details.
+
+
+  TODO:
+
+   * Make getbits a macro, thereby allowing it to update the index operand.
+     That will simplify the code using getbits.  (Perhaps make getbits' sibling
+     getbit then have similar form, for symmetry.)
+
+   * Choose window size without looping.  (Superoptimize or think(tm).)
+
+   * REDC_1_TO_REDC_2_THRESHOLD might actually represent the cutoff between
+     redc_1 and redc_n.  On such systems, we will switch to redc_2 causing
+     slowdown.
+*/
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#undef MPN_REDC_1_SEC
+#if HAVE_NATIVE_mpn_sbpi1_bdiv_r
+#define MPN_REDC_1_SEC(rp, up, mp, n, invm)				\
+  do {									\
+    mp_limb_t cy;							\
+    cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm);			\
+    mpn_cnd_sub_n (cy, rp, up + n, mp, n);				\
+  } while (0)
+#else
+#define MPN_REDC_1_SEC(rp, up, mp, n, invm)				\
+  do {									\
+    mp_limb_t cy;							\
+    cy = mpn_redc_1 (rp, up, mp, n, invm);				\
+    mpn_cnd_sub_n (cy, rp, rp, mp, n);					\
+  } while (0)
+#endif
+
+#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
+#undef MPN_REDC_2_SEC
+#define MPN_REDC_2_SEC(rp, up, mp, n, mip)				\
+  do {									\
+    mp_limb_t cy;							\
+    cy = mpn_redc_2 (rp, up, mp, n, mip);				\
+    mpn_cnd_sub_n (cy, rp, rp, mp, n);					\
+  } while (0)
+#else
+#define MPN_REDC_2_SEC(rp, up, mp, n, mip) /* empty */
+#undef REDC_1_TO_REDC_2_THRESHOLD
+#define REDC_1_TO_REDC_2_THRESHOLD MP_SIZE_T_MAX
+#endif
+
+/* Define our own mpn squaring function.  We do this since we cannot use a
+   native mpn_sqr_basecase over TUNE_SQR_TOOM2_MAX, or a non-native one over
+   SQR_TOOM2_THRESHOLD.  This is so because of fixed size stack allocations
+   made inside mpn_sqr_basecase.  */
+
+#if ! HAVE_NATIVE_mpn_sqr_basecase
+/* The limit of the generic code is SQR_TOOM2_THRESHOLD.  */
+#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
+#endif
+
+#if HAVE_NATIVE_mpn_sqr_basecase
+#ifdef TUNE_SQR_TOOM2_MAX
+/* We slightly abuse TUNE_SQR_TOOM2_MAX here.  If it is set for an assembly
+   mpn_sqr_basecase, it comes from SQR_TOOM2_THRESHOLD_MAX in the assembly
+   file.  An assembly mpn_sqr_basecase that does not define it should allow
+   any size.  */
+#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
+#endif
+#endif
+
+#ifdef WANT_FAT_BINARY
+/* For fat builds, we use SQR_TOOM2_THRESHOLD which will expand to a read from
+   __gmpn_cpuvec.  Perhaps any possible sqr_basecase.asm allow any size, and we
+   limit the use unnecessarily.  We cannot tell, so play it safe.  FIXME.  */
+#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
+#endif
+
+#ifndef SQR_BASECASE_LIM
+/* If SQR_BASECASE_LIM is now not defined, use mpn_sqr_basecase for any operand
+   size.  */
+#define SQR_BASECASE_LIM  MP_SIZE_T_MAX
+#endif
+
+#define mpn_local_sqr(rp,up,n)						\
+  do {									\
+    if (ABOVE_THRESHOLD (n, SQR_BASECASE_THRESHOLD)			\
+	&& BELOW_THRESHOLD (n, SQR_BASECASE_LIM))			\
+      mpn_sqr_basecase (rp, up, n);					\
+    else								\
+      mpn_mul_basecase(rp, up, n, up, n);				\
+  } while (0)
+
+#define getbit(p,bi) \
+  ((p[(bi - 1) / GMP_NUMB_BITS] >> (bi - 1) % GMP_NUMB_BITS) & 1)
+
+/* FIXME: Maybe some things would get simpler if all callers ensure
+   that bi >= nbits. As far as I understand, with the current code bi
+   < nbits can happen only for the final iteration. */
+static inline mp_limb_t
+getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
+{
+  int nbits_in_r;
+  mp_limb_t r;
+  mp_size_t i;
+
+  if (bi < nbits)
+    {
+      return p[0] & (((mp_limb_t) 1 << bi) - 1);
+    }
+  else
+    {
+      bi -= nbits;			/* bit index of low bit to extract */
+      i = bi / GMP_NUMB_BITS;		/* word index of low bit to extract */
+      bi %= GMP_NUMB_BITS;		/* bit index in low word */
+      r = p[i] >> bi;			/* extract (low) bits */
+      nbits_in_r = GMP_NUMB_BITS - bi;	/* number of bits now in r */
+      if (nbits_in_r < nbits)		/* did we get enough bits? */
+	r += p[i + 1] << nbits_in_r;	/* prepend bits from higher word */
+      return r & (((mp_limb_t ) 1 << nbits) - 1);
+    }
+}
+
+#ifndef POWM_SEC_TABLE
+#if GMP_NUMB_BITS < 50
+#define POWM_SEC_TABLE  2,33,96,780,2741
+#else
+#define POWM_SEC_TABLE  2,130,524,2578
+#endif
+#endif
+
+#if TUNE_PROGRAM_BUILD
+extern int win_size (mp_bitcnt_t);
+#else
+static inline int
+win_size (mp_bitcnt_t enb)
+{
+  int k;
+  /* Find k, such that x[k-1] < enb <= x[k].
+
+     We require that x[k] >= k, then it follows that enb > x[k-1] >=
+     k-1, which implies k <= enb.
+  */
+  static const mp_bitcnt_t x[] = {POWM_SEC_TABLE,~(mp_bitcnt_t)0};
+  for (k = 0; enb > x[k++]; )
+    ;
+  ASSERT (k <= enb);
+  return k;
+}
+#endif
+
+/* Convert U to REDC form, U_r = B^n * U mod M.
+   Uses scratch space at tp of size 2un + n + 1.  */
+static void
+redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n, mp_ptr tp)
+{
+  MPN_ZERO (tp, n);
+  MPN_COPY (tp + n, up, un);
+
+  mpn_sec_div_r (tp, un + n, mp, n, tp + un + n);
+  MPN_COPY (rp, tp, n);
+}
+
+static mp_limb_t
+sec_binvert_limb (mp_limb_t n)
+{
+  mp_limb_t inv, t;
+  ASSERT ((n & 1) == 1);
+  /* 3 + 2 -> 5 */
+  inv = n + (((n + 1) << 1) & 0x18);
+
+  t = n * inv;
+#if GMP_NUMB_BITS <= 10
+  /* 5 x 2 -> 10 */
+  inv = 2 * inv - inv * t;
+#else /* GMP_NUMB_BITS > 10 */
+  /* 5 x 2 + 2 -> 12 */
+  inv = 2 * inv - inv * t + ((inv<<10)&-(t&(1<<5)));
+#endif /* GMP_NUMB_BITS <= 10 */
+
+  if (GMP_NUMB_BITS > 12)
+    {
+      t = n * inv - 1;
+      if (GMP_NUMB_BITS <= 36)
+	{
+	  /* 12 x 3 -> 36 */
+	  inv += inv * t * (t - 1);
+	}
+      else /* GMP_NUMB_BITS > 36 */
+	{
+	  mp_limb_t t2 = t * t;
+#if GMP_NUMB_BITS <= 60
+	  /* 12 x 5 -> 60 */
+	  inv += inv * (t2 + 1) * (t2 - t);
+#else /* GMP_NUMB_BITS > 60 */
+	  /* 12 x 5 + 4 -> 64 */
+	  inv *= (t2 + 1) * (t2 - t) + 1 - ((t<<48)&-(t&(1<<12)));
+
+	  /* 64 -> 128 -> 256 -> ... */
+	  for (int todo = (GMP_NUMB_BITS - 1) >> 6; todo != 0; todo >>= 1)
+	    inv = 2 * inv - inv * inv * n;
+#endif /* GMP_NUMB_BITS <= 60 */
+	}
+    }
+
+  ASSERT ((inv * n & GMP_NUMB_MASK) == 1);
+  return inv & GMP_NUMB_MASK;
+}
+
+/* {rp, n} <-- {bp, bn} ^ {ep, en} mod {mp, n},
+   where en = ceil (enb / GMP_NUMB_BITS)
+   Requires that {mp, n} is odd (and hence also mp[0] odd).
+   Uses scratch space at tp as defined by mpn_sec_powm_itch.  */
+void
+mpn_sec_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
+	      mp_srcptr ep, mp_bitcnt_t enb,
+	      mp_srcptr mp, mp_size_t n, mp_ptr tp)
+{
+  mp_limb_t ip[2], *mip;
+  int windowsize, this_windowsize;
+  mp_limb_t expbits;
+  mp_ptr pp, this_pp, ps;
+  long i;
+  int cnd;
+
+  ASSERT (enb > 0);
+  ASSERT (n > 0);
+  /* The code works for bn = 0, but the defined scratch space is 2 limbs
+     greater than we supply, when converting 1 to redc form .  */
+  ASSERT (bn > 0);
+  ASSERT ((mp[0] & 1) != 0);
+
+  windowsize = win_size (enb);
+
+  mip = ip;
+  mip[0] = sec_binvert_limb (mp[0]);
+  if (ABOVE_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    {
+      mp_limb_t t, dummy, mip0 = mip[0];
+
+      umul_ppmm (t, dummy, mip0, mp[0]);
+      ASSERT (dummy == 1);
+      t += mip0 * mp[1]; /* t = (mp * mip0)[1] */
+
+      mip[1] = t * mip0 - 1; /* ~( - t * mip0) */
+    }
+  mip[0] = -mip[0];
+
+  pp = tp;
+  tp += (n << windowsize);	/* put tp after power table */
+
+  /* Compute pp[0] table entry */
+  /* scratch: |   n   | 1 |   n+2    |  */
+  /*          | pp[0] | 1 | redcify  |  */
+  this_pp = pp;
+  this_pp[n] = 1;
+  redcify (this_pp, this_pp + n, 1, mp, n, this_pp + n + 1);
+  this_pp += n;
+
+  /* Compute pp[1] table entry.  To avoid excessive scratch usage in the
+     degenerate situation where B >> M, we let redcify use scratch space which
+     will later be used by the pp table (element 2 and up).  */
+  /* scratch: |   n   |   n   |  bn + n + 1  |  */
+  /*          | pp[0] | pp[1] |   redcify    |  */
+  redcify (this_pp, bp, bn, mp, n, this_pp + n);
+
+  /* Precompute powers of b and put them in the temporary area at pp.  */
+  /* scratch: |   n   |   n   | ...  |                    |   2n      |  */
+  /*          | pp[0] | pp[1] | ...  | pp[2^windowsize-1] |  product  |  */
+  ps = pp + n;		/* initially B^1 */
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    {
+      for (i = (1 << windowsize) - 2; i > 0; i -= 2)
+	{
+	  mpn_local_sqr (tp, ps, n);
+	  ps += n;
+	  this_pp += n;
+	  MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]);
+
+	  mpn_mul_basecase (tp, this_pp, n, pp + n, n);
+	  this_pp += n;
+	  MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]);
+	}
+    }
+  else
+    {
+      for (i = (1 << windowsize) - 2; i > 0; i -= 2)
+	{
+	  mpn_local_sqr (tp, ps, n);
+	  ps += n;
+	  this_pp += n;
+	  MPN_REDC_2_SEC (this_pp, tp, mp, n, mip);
+
+	  mpn_mul_basecase (tp, this_pp, n, pp + n, n);
+	  this_pp += n;
+	  MPN_REDC_2_SEC (this_pp, tp, mp, n, mip);
+	}
+    }
+
+  expbits = getbits (ep, enb, windowsize);
+  ASSERT_ALWAYS (enb >= windowsize);
+  enb -= windowsize;
+
+  mpn_sec_tabselect (rp, pp, n, 1 << windowsize, expbits);
+
+  /* Main exponentiation loop.  */
+  /* scratch: |   n   |   n   | ...  |                    |     3n-4n     |  */
+  /*          | pp[0] | pp[1] | ...  | pp[2^windowsize-1] |  loop scratch |  */
+
+#define INNERLOOP							\
+  while (enb != 0)							\
+    {									\
+      expbits = getbits (ep, enb, windowsize);				\
+      this_windowsize = windowsize;					\
+      if (enb < windowsize)						\
+	{								\
+	  this_windowsize -= windowsize - enb;				\
+	  enb = 0;							\
+	}								\
+      else								\
+	enb -= windowsize;						\
+									\
+      do								\
+	{								\
+	  mpn_local_sqr (tp, rp, n);					\
+	  MPN_REDUCE (rp, tp, mp, n, mip);				\
+	  this_windowsize--;						\
+	}								\
+      while (this_windowsize != 0);					\
+									\
+      mpn_sec_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits);	\
+      mpn_mul_basecase (tp, rp, n, tp + 2*n, n);			\
+									\
+      MPN_REDUCE (rp, tp, mp, n, mip);					\
+    }
+
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    {
+#undef MPN_REDUCE
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1_SEC (rp, tp, mp, n, mip[0])
+      INNERLOOP;
+    }
+  else
+    {
+#undef MPN_REDUCE
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_2_SEC (rp, tp, mp, n, mip)
+      INNERLOOP;
+    }
+
+  MPN_COPY (tp, rp, n);
+  MPN_ZERO (tp + n, n);
+
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    MPN_REDC_1_SEC (rp, tp, mp, n, mip[0]);
+  else
+    MPN_REDC_2_SEC (rp, tp, mp, n, mip);
+
+  cnd = mpn_sub_n (tp, rp, mp, n);	/* we need just retval */
+  mpn_cnd_sub_n (!cnd, rp, rp, mp, n);
+}
+
+mp_size_t
+mpn_sec_powm_itch (mp_size_t bn, mp_bitcnt_t enb, mp_size_t n)
+{
+  int windowsize;
+  mp_size_t redcify_itch, itch;
+
+  /* FIXME: no more _local/_basecase difference. */
+  /* The top scratch usage will either be when reducing B in the 2nd redcify
+     call, or more typically n*2^windowsize + 3n or 4n, in the main loop.  (It
+     is 3n or 4n depending on if we use mpn_local_sqr or a native
+     mpn_sqr_basecase.  We assume 4n always for now.) */
+
+  windowsize = win_size (enb);
+
+  /* The 2n term is due to pp[0] and pp[1] at the time of the 2nd redcify call,
+     the (bn + n) term is due to redcify's own usage, and the rest is due to
+     mpn_sec_div_r's usage when called from redcify.  */
+  redcify_itch = (2 * n) + (bn + n) + ((bn + n) + 2 * n + 2);
+
+  /* The n * 2^windowsize term is due to the power table, the 4n term is due to
+     scratch needs of squaring/multiplication in the exponentiation loop.  */
+  itch = (n << windowsize) + (4 * n);
+
+  return MAX (itch, redcify_itch);
+}
-- 
cgit v1.2.3