/src/gmp-6.2.1/mpn/powm.c

Source (jump to first uncovered line)
/* mpn_powm -- Compute R = U^E mod M.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2007-2012, 2019 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
  BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd.

  1. W <- U

  2. T <- (B^n * U) mod M                Convert to REDC form

  3. Compute table U^1, U^3, U^5... of E-dependent size

  4. While there are more bits in E
       W <- power left-to-right base-k


  TODO:

   * Make getbits a macro, thereby allowing it to update the index operand.
     That will simplify the code using getbits.  (Perhaps make getbits' sibling
     getbit then have similar form, for symmetry.)

   * Write an itch function.  Or perhaps get rid of tp parameter since the huge
     pp area is allocated locally anyway?

   * Choose window size without looping.  (Superoptimize or think(tm).)

   * Handle small bases with initial, reduction-free exponentiation.

   * Call new division functions, not mpn_tdiv_qr.

   * Consider special code for one-limb M.

   * How should we handle the redc1/redc2/redc_n choice?
     - redc1:  T(binvert_1limb)  + e * (n)   * (T(mullo-1x1) + n*T(addmul_1))
     - redc2:  T(binvert_2limbs) + e * (n/2) * (T(mullo-2x2) + n*T(addmul_2))
     - redc_n: T(binvert_nlimbs) + e * (T(mullo-nxn) + T(M(n)))
     This disregards the addmul_N constant term, but we could think of
     that as part of the respective mullo.

   * When U (the base) is small, we should start the exponentiation with plain
     operations, then convert that partial result to REDC form.

   * When U is just one limb, should it be handled without the k-ary tricks?
     We could keep a factor of B^n in W, but use U' = BU as base.  After
     multiplying by this (pseudo two-limb) number, we need to multiply by 1/B
     mod M.
*/

#include "gmp-impl.h"
#include "longlong.h"

#undef MPN_REDC_0
#define MPN_REDC_0(rp, up, mp, invm)          \
  do {                 \
    mp_limb_t p1, r0, u0, _dummy;         \
    u0 = *(up);               \
    umul_ppmm (p1, _dummy, *(mp), (u0 * (invm)) & GMP_NUMB_MASK);  \
    ASSERT (((u0 + _dummy) & GMP_NUMB_MASK) == 0);      \
    p1 += (u0 != 0);              \
    r0 = (up)[1] + p1;              \
    if (p1 > r0)             \
      r0 -= *(mp);             \
    *(rp) = r0;               \
  } while (0)

#undef MPN_REDC_1
#if HAVE_NATIVE_mpn_sbpi1_bdiv_r
#define MPN_REDC_1(rp, up, mp, n, invm)         \
  do {                  \
    mp_limb_t cy;             \
    cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm);     \
    if (cy != 0)              \
      mpn_sub_n (rp, up + n, mp, n);          \
    else                \
      MPN_COPY (rp, up + n, n);           \
  } while (0)
#else
#define MPN_REDC_1(rp, up, mp, n, invm)         \
  do {                 \
    mp_limb_t cy;             \
    cy = mpn_redc_1 (rp, up, mp, n, invm);       \
    if (cy != 0)             \
      mpn_sub_n (rp, rp, mp, n);         \
  } while (0)
#endif

#undef MPN_REDC_2
#define MPN_REDC_2(rp, up, mp, n, mip)          \
  do {                 \
    mp_limb_t cy;             \
    cy = mpn_redc_2 (rp, up, mp, n, mip);        \
    if (cy != 0)             \
      mpn_sub_n (rp, rp, mp, n);         \
  } while (0)

#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
#define WANT_REDC_2 1
#endif

#define getbit(p,bi) \
  ((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1)

static inline mp_limb_t
getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
{
  int nbits_in_r;
  mp_limb_t r;
  mp_size_t i;

  if (bi < nbits)
    {
      return p[0] & (((mp_limb_t) 1 << bi) - 1);
    }
  else
    {
      bi -= nbits;      /* bit index of low bit to extract */
      i = bi / GMP_NUMB_BITS;   /* word index of low bit to extract */
      bi %= GMP_NUMB_BITS;   /* bit index in low word */
      r = p[i] >> bi;     /* extract (low) bits */
      nbits_in_r = GMP_NUMB_BITS - bi;  /* number of bits now in r */
      if (nbits_in_r < nbits)   /* did we get enough bits? */
  r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */
      return r & (((mp_limb_t) 1 << nbits) - 1);
    }
}

static inline int
win_size (mp_bitcnt_t eb)
{
  int k;
  static mp_bitcnt_t x[] = {0,7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0};
  for (k = 1; eb > x[k]; k++)
    ;
  return k;
}

/* Convert U to REDC form, U_r = B^n * U mod M */
static void
redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n)
{
  mp_ptr tp, qp;
  TMP_DECL;
  TMP_MARK;

  TMP_ALLOC_LIMBS_2 (tp, un + n, qp, un + 1);

  MPN_ZERO (tp, n);
  MPN_COPY (tp + n, up, un);
  mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n);
  TMP_FREE;
}

/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
   Requires that mp[n-1..0] is odd.
   Requires that ep[en-1..0] is > 1.
   Uses scratch space at tp of MAX(mpn_binvert_itch(n),2n) limbs.  */
void
mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
    mp_srcptr ep, mp_size_t en,
    mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ip[2], *mip;
  int cnt;
  mp_bitcnt_t ebi;
  int windowsize, this_windowsize;
  mp_limb_t expbits;
  mp_ptr pp, this_pp;
  long i;
  TMP_DECL;

  ASSERT (en > 1 || (en == 1 && ep[0] > 1));
  ASSERT (n >= 1 && ((mp[0] & 1) != 0));

  TMP_MARK;

  MPN_SIZEINBASE_2EXP(ebi, ep, en, 1);

#if 0
  if (bn < n)
    {
      /* Do the first few exponent bits without mod reductions,
   until the result is greater than the mod argument.  */
      for (;;)
  {
    mpn_sqr (tp, this_pp, tn);
    tn = tn * 2 - 1,  tn += tp[tn] != 0;
    if (getbit (ep, ebi) != 0)
      mpn_mul (..., tp, tn, bp, bn);
    ebi--;
  }
    }
#endif

  windowsize = win_size (ebi);

#if WANT_REDC_2
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
    {
      mip = ip;
      binvert_limb (mip[0], mp[0]);
      mip[0] = -mip[0];
    }
  else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
    {
      mip = ip;
      mpn_binvert (mip, mp, 2, tp);
      mip[0] = -mip[0]; mip[1] = ~mip[1];
    }
#else
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
    {
      mip = ip;
      binvert_limb (mip[0], mp[0]);
      mip[0] = -mip[0];
    }
#endif
  else
    {
      mip = TMP_ALLOC_LIMBS (n);
      mpn_binvert (mip, mp, n, tp);
    }

  pp = TMP_ALLOC_LIMBS (n << (windowsize - 1));

  this_pp = pp;
  redcify (this_pp, bp, bn, mp, n);

  /* Store b^2 at rp.  */
  mpn_sqr (tp, this_pp, n);
#if 0
  if (n == 1) {
    MPN_REDC_0 (rp, tp, mp, mip[0]);
  } else
#endif
#if WANT_REDC_2
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
  else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
    MPN_REDC_2 (rp, tp, mp, n, mip);
#else
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
#endif
  else
    mpn_redc_n (rp, tp, mp, n, mip);

  /* Precompute odd powers of b and put them in the temporary area at pp.  */
  for (i = (1 << (windowsize - 1)) - 1; i > 0; i--)
#if 1
    if (n == 1) {
      umul_ppmm((tp)[1], *(tp), *(this_pp), *(rp));
      ++this_pp ;
      MPN_REDC_0 (this_pp, tp, mp, mip[0]);
    } else
#endif
    {
      mpn_mul_n (tp, this_pp, rp, n);
      this_pp += n;
#if WANT_REDC_2
      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
  MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
      else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
  MPN_REDC_2 (this_pp, tp, mp, n, mip);
#else
      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
  MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
#endif
      else
  mpn_redc_n (this_pp, tp, mp, n, mip);
    }

  expbits = getbits (ep, ebi, windowsize);
  if (ebi < windowsize)
    ebi = 0;
  else
    ebi -= windowsize;

  count_trailing_zeros (cnt, expbits);
  ebi += cnt;
  expbits >>= cnt;

  MPN_COPY (rp, pp + n * (expbits >> 1), n);

#define INNERLOOP             \
  while (ebi != 0)             \
    {                 \
      while (getbit (ep, ebi) == 0)         \
  {               \
    MPN_SQR (tp, rp, n);           \
    MPN_REDUCE (rp, tp, mp, n, mip);        \
    if (--ebi == 0)           \
      goto done;             \
  }               \
                  \
      /* The next bit of the exponent is 1.  Now extract the largest  \
   block of bits <= windowsize, and such that the least   \
   significant bit is 1.  */          \
                  \
      expbits = getbits (ep, ebi, windowsize);        \
      this_windowsize = windowsize;         \
      if (ebi < windowsize)           \
  {               \
    this_windowsize -= windowsize - ebi;        \
    ebi = 0;              \
  }                \
      else                \
        ebi -= windowsize;           \
                  \
      count_trailing_zeros (cnt, expbits);        \
      this_windowsize -= cnt;           \
      ebi += cnt;             \
      expbits >>= cnt;              \
                  \
      do                \
  {               \
    MPN_SQR (tp, rp, n);           \
    MPN_REDUCE (rp, tp, mp, n, mip);        \
  }               \
      while (--this_windowsize != 0);          \
                  \
      MPN_MUL_N (tp, rp, pp + n * (expbits >> 1), n);      \
      MPN_REDUCE (rp, tp, mp, n, mip);          \
    }


  if (n == 1)
    {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    umul_ppmm((r)[1], *(r), *(a), *(b))
#define MPN_SQR(r,a,n)      umul_ppmm((r)[1], *(r), *(a), *(a))
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_0(rp, tp, mp, mip[0])
      INNERLOOP;
    }
  else
#if WANT_REDC_2
  if (REDC_1_TO_REDC_2_THRESHOLD < MUL_TOOM22_THRESHOLD)
    {
      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
  {
    if (REDC_1_TO_REDC_2_THRESHOLD < SQR_BASECASE_THRESHOLD
        || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_mul_basecase (r,a,n,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1 (rp, tp, mp, n, mip[0])
        INNERLOOP;
      }
    else
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1 (rp, tp, mp, n, mip[0])
        INNERLOOP;
      }
  }
      else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
  {
    if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
        || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_mul_basecase (r,a,n,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_2 (rp, tp, mp, n, mip)
        INNERLOOP;
      }
    else
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_2 (rp, tp, mp, n, mip)
        INNERLOOP;
      }
  }
      else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
  {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_2 (rp, tp, mp, n, mip)
    INNERLOOP;
  }
      else
  {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  mpn_redc_n (rp, tp, mp, n, mip)
    INNERLOOP;
  }
    }
  else
    {
      if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
  {
    if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
        || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_mul_basecase (r,a,n,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1 (rp, tp, mp, n, mip[0])
        INNERLOOP;
      }
    else
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1 (rp, tp, mp, n, mip[0])
        INNERLOOP;
      }
  }
      else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
  {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1 (rp, tp, mp, n, mip[0])
    INNERLOOP;
  }
      else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
  {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_2 (rp, tp, mp, n, mip)
    INNERLOOP;
  }
      else
  {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  mpn_redc_n (rp, tp, mp, n, mip)
    INNERLOOP;
  }
    }

#else  /* WANT_REDC_2 */

  if (REDC_1_TO_REDC_N_THRESHOLD < MUL_TOOM22_THRESHOLD)
    {
      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
  {
    if (REDC_1_TO_REDC_N_THRESHOLD < SQR_BASECASE_THRESHOLD
        || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_mul_basecase (r,a,n,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1 (rp, tp, mp, n, mip[0])
        INNERLOOP;
      }
    else
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1 (rp, tp, mp, n, mip[0])
        INNERLOOP;
      }
  }
      else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
  {
    if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
        || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_mul_basecase (r,a,n,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  mpn_redc_n (rp, tp, mp, n, mip)
        INNERLOOP;
      }
    else
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  mpn_redc_n (rp, tp, mp, n, mip)
        INNERLOOP;
      }
  }
      else
  {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  mpn_redc_n (rp, tp, mp, n, mip)
    INNERLOOP;
  }
    }
  else
    {
      if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
  {
    if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
        || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_mul_basecase (r,a,n,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1 (rp, tp, mp, n, mip[0])
        INNERLOOP;
      }
    else
      {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_basecase (r,a,n,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr_basecase (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1 (rp, tp, mp, n, mip[0])
        INNERLOOP;
      }
  }
      else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
  {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1 (rp, tp, mp, n, mip[0])
    INNERLOOP;
  }
      else
  {
#undef MPN_MUL_N
#undef MPN_SQR
#undef MPN_REDUCE
#define MPN_MUL_N(r,a,b,n)    mpn_mul_n (r,a,b,n)
#define MPN_SQR(r,a,n)      mpn_sqr (r,a,n)
#define MPN_REDUCE(rp,tp,mp,n,mip)  mpn_redc_n (rp, tp, mp, n, mip)
    INNERLOOP;
  }
    }
#endif  /* WANT_REDC_2 */

 done:

  MPN_COPY (tp, rp, n);
  MPN_ZERO (tp + n, n);

#if WANT_REDC_2
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
  else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
    MPN_REDC_2 (rp, tp, mp, n, mip);
#else
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
#endif
  else
    mpn_redc_n (rp, tp, mp, n, mip);

  if (mpn_cmp (rp, mp, n) >= 0)
    mpn_sub_n (rp, rp, mp, n);

  TMP_FREE;
}

Coverage Report

Created: 2023-02-22 06:39

Line	Count	Source (jump to first uncovered line)
1		/* mpn_powm -- Compute R = U^E mod M.
2
3		Contributed to the GNU project by Torbjorn Granlund.
4
5		THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
6		SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7		GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9		Copyright 2007-2012, 2019 Free Software Foundation, Inc.
10
11		This file is part of the GNU MP Library.
12
13		The GNU MP Library is free software; you can redistribute it and/or modify
14		it under the terms of either:
15
16		* the GNU Lesser General Public License as published by the Free
17		Software Foundation; either version 3 of the License, or (at your
18		option) any later version.
19
20		or
21
22		* the GNU General Public License as published by the Free Software
23		Foundation; either version 2 of the License, or (at your option) any
24		later version.
25
26		or both in parallel, as here.
27
28		The GNU MP Library is distributed in the hope that it will be useful, but
29		WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30		or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31		for more details.
32
33		You should have received copies of the GNU General Public License and the
34		GNU Lesser General Public License along with the GNU MP Library. If not,
35		see https://www.gnu.org/licenses/. */
36
37
38		/*
39		BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd.
40
41		1. W <- U
42
43		2. T <- (B^n * U) mod M Convert to REDC form
44
45		3. Compute table U^1, U^3, U^5... of E-dependent size
46
47		4. While there are more bits in E
48		W <- power left-to-right base-k
49
50
51		TODO:
52
53		* Make getbits a macro, thereby allowing it to update the index operand.
54		That will simplify the code using getbits. (Perhaps make getbits' sibling
55		getbit then have similar form, for symmetry.)
56
57		* Write an itch function. Or perhaps get rid of tp parameter since the huge
58		pp area is allocated locally anyway?
59
60		* Choose window size without looping. (Superoptimize or think(tm).)
61
62		* Handle small bases with initial, reduction-free exponentiation.
63
64		* Call new division functions, not mpn_tdiv_qr.
65
66		* Consider special code for one-limb M.
67
68		* How should we handle the redc1/redc2/redc_n choice?
69		- redc1: T(binvert_1limb) + e * (n) * (T(mullo-1x1) + n*T(addmul_1))
70		- redc2: T(binvert_2limbs) + e * (n/2) * (T(mullo-2x2) + n*T(addmul_2))
71		- redc_n: T(binvert_nlimbs) + e * (T(mullo-nxn) + T(M(n)))
72		This disregards the addmul_N constant term, but we could think of
73		that as part of the respective mullo.
74
75		* When U (the base) is small, we should start the exponentiation with plain
76		operations, then convert that partial result to REDC form.
77
78		* When U is just one limb, should it be handled without the k-ary tricks?
79		We could keep a factor of B^n in W, but use U' = BU as base. After
80		multiplying by this (pseudo two-limb) number, we need to multiply by 1/B
81		mod M.
82		*/
83
84		#include "gmp-impl.h"
85		#include "longlong.h"
86
87		#undef MPN_REDC_0
88		#define MPN_REDC_0(rp, up, mp, invm) \
89	0	do { \
90	0	mp_limb_t p1, r0, u0, _dummy; \
91	0	u0 = *(up); \
92	0	umul_ppmm (p1, _dummy, (mp), (u0 (invm)) & GMP_NUMB_MASK); \
93	0	ASSERT (((u0 + _dummy) & GMP_NUMB_MASK) == 0); \
94	0	p1 += (u0 != 0); \
95	0	r0 = (up)[1] + p1; \
96	0	if (p1 > r0) \
97	0	r0 -= *(mp); \
98	0	*(rp) = r0; \
99	0	} while (0)
100
101		#undef MPN_REDC_1
102		#if HAVE_NATIVE_mpn_sbpi1_bdiv_r
103		#define MPN_REDC_1(rp, up, mp, n, invm) \
104		do { \
105		mp_limb_t cy; \
106		cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm); \
107		if (cy != 0) \
108		mpn_sub_n (rp, up + n, mp, n); \
109		else \
110		MPN_COPY (rp, up + n, n); \
111		} while (0)
112		#else
113		#define MPN_REDC_1(rp, up, mp, n, invm) \
114	0	do { \
115	0	mp_limb_t cy; \
116	0	cy = mpn_redc_1 (rp, up, mp, n, invm); \
117	0	if (cy != 0) \
118	0	mpn_sub_n (rp, rp, mp, n); \
119	0	} while (0)
120		#endif
121
122		#undef MPN_REDC_2
123		#define MPN_REDC_2(rp, up, mp, n, mip) \
124	0	do { \
125	0	mp_limb_t cy; \
126	0	cy = mpn_redc_2 (rp, up, mp, n, mip); \
127	0	if (cy != 0) \
128	0	mpn_sub_n (rp, rp, mp, n); \
129	0	} while (0)
130
131		#if HAVE_NATIVE_mpn_addmul_2 \|\| HAVE_NATIVE_mpn_redc_2
132		#define WANT_REDC_2 1
133		#endif
134
135		#define getbit(p,bi) \
136	0	((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1)
137
138		static inline mp_limb_t
139		getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
140	0	{
141	0	int nbits_in_r;
142	0	mp_limb_t r;
143	0	mp_size_t i;
144
145	0	if (bi < nbits)
146	0	{
147	0	return p[0] & (((mp_limb_t) 1 << bi) - 1);
148	0	}
149	0	else
150	0	{
151	0	bi -= nbits; /* bit index of low bit to extract */
152	0	i = bi / GMP_NUMB_BITS; /* word index of low bit to extract */
153	0	bi %= GMP_NUMB_BITS; /* bit index in low word */
154	0	r = p[i] >> bi; /* extract (low) bits */
155	0	nbits_in_r = GMP_NUMB_BITS - bi; /* number of bits now in r */
156	0	if (nbits_in_r < nbits) /* did we get enough bits? */
157	0	r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */
158	0	return r & (((mp_limb_t) 1 << nbits) - 1);
159	0	}
160	0	}
161
162		static inline int
163		win_size (mp_bitcnt_t eb)
164	0	{
165	0	int k;
166	0	static mp_bitcnt_t x[] = {0,7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0};
167	0	for (k = 1; eb > x[k]; k++)
168	0	;
169	0	return k;
170	0	}
171
172		/* Convert U to REDC form, U_r = B^n * U mod M */
173		static void
174		redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n)
175	0	{
176	0	mp_ptr tp, qp;
177	0	TMP_DECL;
178	0	TMP_MARK;
179
180	0	TMP_ALLOC_LIMBS_2 (tp, un + n, qp, un + 1);
181
182	0	MPN_ZERO (tp, n);
183	0	MPN_COPY (tp + n, up, un);
184	0	mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n);
185	0	TMP_FREE;
186	0	}
187
188		/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
189		Requires that mp[n-1..0] is odd.
190		Requires that ep[en-1..0] is > 1.
191		Uses scratch space at tp of MAX(mpn_binvert_itch(n),2n) limbs. */
192		void
193		mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
194		mp_srcptr ep, mp_size_t en,
195		mp_srcptr mp, mp_size_t n, mp_ptr tp)
196	0	{
197	0	mp_limb_t ip[2], *mip;
198	0	int cnt;
199	0	mp_bitcnt_t ebi;
200	0	int windowsize, this_windowsize;
201	0	mp_limb_t expbits;
202	0	mp_ptr pp, this_pp;
203	0	long i;
204	0	TMP_DECL;
205
206	0	ASSERT (en > 1 \|\| (en == 1 && ep[0] > 1));
207	0	ASSERT (n >= 1 && ((mp[0] & 1) != 0));
208
209	0	TMP_MARK;
210
211	0	MPN_SIZEINBASE_2EXP(ebi, ep, en, 1);
212
213		#if 0
214		if (bn < n)
215		{
216		/* Do the first few exponent bits without mod reductions,
217		until the result is greater than the mod argument. */
218		for (;;)
219		{
220		mpn_sqr (tp, this_pp, tn);
221		tn = tn * 2 - 1, tn += tp[tn] != 0;
222		if (getbit (ep, ebi) != 0)
223		mpn_mul (..., tp, tn, bp, bn);
224		ebi--;
225		}
226		}
227		#endif
228
229	0	windowsize = win_size (ebi);
230
231	0	#if WANT_REDC_2
232	0	if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
233	0	{
234	0	mip = ip;
235	0	binvert_limb (mip[0], mp[0]);
236	0	mip[0] = -mip[0];
237	0	}
238	0	else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
239	0	{
240	0	mip = ip;
241	0	mpn_binvert (mip, mp, 2, tp);
242	0	mip[0] = -mip[0]; mip[1] = ~mip[1];
243	0	}
244		#else
245		if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
246		{
247		mip = ip;
248		binvert_limb (mip[0], mp[0]);
249		mip[0] = -mip[0];
250		}
251		#endif
252	0	else
253	0	{
254	0	mip = TMP_ALLOC_LIMBS (n);
255	0	mpn_binvert (mip, mp, n, tp);
256	0	}
257
258	0	pp = TMP_ALLOC_LIMBS (n << (windowsize - 1));
259
260	0	this_pp = pp;
261	0	redcify (this_pp, bp, bn, mp, n);
262
263		/* Store b^2 at rp. */
264	0	mpn_sqr (tp, this_pp, n);
265		#if 0
266		if (n == 1) {
267		MPN_REDC_0 (rp, tp, mp, mip[0]);
268		} else
269		#endif
270	0	#if WANT_REDC_2
271	0	if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
272	0	MPN_REDC_1 (rp, tp, mp, n, mip[0]);
273	0	else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
274	0	MPN_REDC_2 (rp, tp, mp, n, mip);
275		#else
276		if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
277		MPN_REDC_1 (rp, tp, mp, n, mip[0]);
278		#endif
279	0	else
280	0	mpn_redc_n (rp, tp, mp, n, mip);
281
282		/* Precompute odd powers of b and put them in the temporary area at pp. */
283	0	for (i = (1 << (windowsize - 1)) - 1; i > 0; i--)
284	0	#if 1
285	0	if (n == 1) {
286	0	umul_ppmm((tp)[1], (tp), (this_pp), *(rp));
287	0	++this_pp ;
288	0	MPN_REDC_0 (this_pp, tp, mp, mip[0]);
289	0	} else
290	0	#endif
291	0	{
292	0	mpn_mul_n (tp, this_pp, rp, n);
293	0	this_pp += n;
294	0	#if WANT_REDC_2
295	0	if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
296	0	MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
297	0	else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
298	0	MPN_REDC_2 (this_pp, tp, mp, n, mip);
299		#else
300		if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
301		MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
302		#endif
303	0	else
304	0	mpn_redc_n (this_pp, tp, mp, n, mip);
305	0	}
306
307	0	expbits = getbits (ep, ebi, windowsize);
308	0	if (ebi < windowsize)
309	0	ebi = 0;
310	0	else
311	0	ebi -= windowsize;
312
313	0	count_trailing_zeros (cnt, expbits);
314	0	ebi += cnt;
315	0	expbits >>= cnt;
316
317	0	MPN_COPY (rp, pp + n * (expbits >> 1), n);
318
319	0	#define INNERLOOP \
320	0	while (ebi != 0) \
321	0	{ \
322	0	while (getbit (ep, ebi) == 0) \
323	0	{ \
324	0	MPN_SQR (tp, rp, n); \
325	0	MPN_REDUCE (rp, tp, mp, n, mip); \
326	0	if (--ebi == 0) \
327	0	goto done; \
328	0	} \
329	0	\
330		/* The next bit of the exponent is 1. Now extract the largest \
331		block of bits <= windowsize, and such that the least \
332		significant bit is 1. */ \
333	0	\
334	0	expbits = getbits (ep, ebi, windowsize); \
335	0	this_windowsize = windowsize; \
336	0	if (ebi < windowsize) \
337	0	{ \
338	0	this_windowsize -= windowsize - ebi; \
339	0	ebi = 0; \
340	0	} \
341	0	else \
342	0	ebi -= windowsize; \
343	0	\
344	0	count_trailing_zeros (cnt, expbits); \
345	0	this_windowsize -= cnt; \
346	0	ebi += cnt; \
347	0	expbits >>= cnt; \
348	0	\
349	0	do \
350	0	{ \
351	0	MPN_SQR (tp, rp, n); \
352	0	MPN_REDUCE (rp, tp, mp, n, mip); \
353	0	} \
354	0	while (--this_windowsize != 0); \
355	0	\
356	0	MPN_MUL_N (tp, rp, pp + n * (expbits >> 1), n); \
357	0	MPN_REDUCE (rp, tp, mp, n, mip); \
358	0	}
359
360
361	0	if (n == 1)
362	0	{
363	0	#undef MPN_MUL_N
364	0	#undef MPN_SQR
365	0	#undef MPN_REDUCE
366	0	#define MPN_MUL_N(r,a,b,n) umul_ppmm((r)[1], (r), (a), *(b))
367	0	#define MPN_SQR(r,a,n) umul_ppmm((r)[1], (r), (a), *(a))
368	0	#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_0(rp, tp, mp, mip[0])
369	0	INNERLOOP;
370	0	}
371	0	else
372	0	#if WANT_REDC_2
373	0	if (REDC_1_TO_REDC_2_THRESHOLD < MUL_TOOM22_THRESHOLD)
374	0	{
375	0	if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
376	0	{
377	0	if (REDC_1_TO_REDC_2_THRESHOLD < SQR_BASECASE_THRESHOLD
378	0	\|\| BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
379	0	{
380	0	#undef MPN_MUL_N
381	0	#undef MPN_SQR
382	0	#undef MPN_REDUCE
383	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
384	0	#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
385	0	#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
386	0	INNERLOOP;
387	0	}
388	0	else
389	0	{
390	0	#undef MPN_MUL_N
391	0	#undef MPN_SQR
392	0	#undef MPN_REDUCE
393	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
394	0	#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
395	0	#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
396	0	INNERLOOP;
397	0	}
398	0	}
399	0	else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
400	0	{
401	0	if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
402	0	\|\| BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
403	0	{
404	0	#undef MPN_MUL_N
405	0	#undef MPN_SQR
406	0	#undef MPN_REDUCE
407	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
408	0	#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
409	0	#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip)
410	0	INNERLOOP;
411	0	}
412	0	else
413	0	{
414	0	#undef MPN_MUL_N
415	0	#undef MPN_SQR
416	0	#undef MPN_REDUCE
417	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
418	0	#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
419	0	#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip)
420	0	INNERLOOP;
421	0	}
422	0	}
423	0	else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
424	0	{
425	0	#undef MPN_MUL_N
426	0	#undef MPN_SQR
427	0	#undef MPN_REDUCE
428	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
429	0	#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
430	0	#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip)
431	0	INNERLOOP;
432	0	}
433	0	else
434	0	{
435	0	#undef MPN_MUL_N
436	0	#undef MPN_SQR
437	0	#undef MPN_REDUCE
438	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
439	0	#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
440	0	#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
441	0	INNERLOOP;
442	0	}
443	0	}
444	0	else
445	0	{
446	0	if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
447	0	{
448	0	if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
449	0	\|\| BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
450	0	{
451	0	#undef MPN_MUL_N
452	0	#undef MPN_SQR
453	0	#undef MPN_REDUCE
454	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
455	0	#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
456	0	#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
457	0	INNERLOOP;
458	0	}
459	0	else
460	0	{
461	0	#undef MPN_MUL_N
462	0	#undef MPN_SQR
463	0	#undef MPN_REDUCE
464	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
465	0	#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
466	0	#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
467	0	INNERLOOP;
468	0	}
469	0	}
470	0	else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
471	0	{
472	0	#undef MPN_MUL_N
473	0	#undef MPN_SQR
474	0	#undef MPN_REDUCE
475	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
476	0	#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
477	0	#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
478	0	INNERLOOP;
479	0	}
480	0	else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
481	0	{
482	0	#undef MPN_MUL_N
483	0	#undef MPN_SQR
484	0	#undef MPN_REDUCE
485	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
486	0	#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
487	0	#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip)
488	0	INNERLOOP;
489	0	}
490	0	else
491	0	{
492	0	#undef MPN_MUL_N
493	0	#undef MPN_SQR
494	0	#undef MPN_REDUCE
495	0	#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
496	0	#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
497	0	#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
498	0	INNERLOOP;
499	0	}
500	0	}
501
502		#else /* WANT_REDC_2 */
503
504		if (REDC_1_TO_REDC_N_THRESHOLD < MUL_TOOM22_THRESHOLD)
505		{
506		if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
507		{
508		if (REDC_1_TO_REDC_N_THRESHOLD < SQR_BASECASE_THRESHOLD
509		\|\| BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
510		{
511		#undef MPN_MUL_N
512		#undef MPN_SQR
513		#undef MPN_REDUCE
514		#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
515		#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
516		#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
517		INNERLOOP;
518		}
519		else
520		{
521		#undef MPN_MUL_N
522		#undef MPN_SQR
523		#undef MPN_REDUCE
524		#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
525		#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
526		#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
527		INNERLOOP;
528		}
529		}
530		else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
531		{
532		if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
533		\|\| BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
534		{
535		#undef MPN_MUL_N
536		#undef MPN_SQR
537		#undef MPN_REDUCE
538		#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
539		#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
540		#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
541		INNERLOOP;
542		}
543		else
544		{
545		#undef MPN_MUL_N
546		#undef MPN_SQR
547		#undef MPN_REDUCE
548		#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
549		#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
550		#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
551		INNERLOOP;
552		}
553		}
554		else
555		{
556		#undef MPN_MUL_N
557		#undef MPN_SQR
558		#undef MPN_REDUCE
559		#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
560		#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
561		#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
562		INNERLOOP;
563		}
564		}
565		else
566		{
567		if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
568		{
569		if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
570		\|\| BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
571		{
572		#undef MPN_MUL_N
573		#undef MPN_SQR
574		#undef MPN_REDUCE
575		#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
576		#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n)
577		#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
578		INNERLOOP;
579		}
580		else
581		{
582		#undef MPN_MUL_N
583		#undef MPN_SQR
584		#undef MPN_REDUCE
585		#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n)
586		#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n)
587		#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
588		INNERLOOP;
589		}
590		}
591		else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
592		{
593		#undef MPN_MUL_N
594		#undef MPN_SQR
595		#undef MPN_REDUCE
596		#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
597		#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
598		#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0])
599		INNERLOOP;
600		}
601		else
602		{
603		#undef MPN_MUL_N
604		#undef MPN_SQR
605		#undef MPN_REDUCE
606		#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n)
607		#define MPN_SQR(r,a,n) mpn_sqr (r,a,n)
608		#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip)
609		INNERLOOP;
610		}
611		}
612		#endif /* WANT_REDC_2 */
613
614	0	done:
615
616	0	MPN_COPY (tp, rp, n);
617	0	MPN_ZERO (tp + n, n);
618
619	0	#if WANT_REDC_2
620	0	if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
621	0	MPN_REDC_1 (rp, tp, mp, n, mip[0]);
622	0	else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
623	0	MPN_REDC_2 (rp, tp, mp, n, mip);
624		#else
625		if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
626		MPN_REDC_1 (rp, tp, mp, n, mip[0]);
627		#endif
628	0	else
629	0	mpn_redc_n (rp, tp, mp, n, mip);
630
631	0	if (mpn_cmp (rp, mp, n) >= 0)
632	0	mpn_sub_n (rp, rp, mp, n);
633
634	0	TMP_FREE;
635	0	}