/src/gmp-6.2.1/mpz/n_pow_ui.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* mpz_n_pow_ui -- mpn raised to ulong.  | 
2  |  |  | 
3  |  |    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST  | 
4  |  |    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN  | 
5  |  |    FUTURE GNU MP RELEASES.  | 
6  |  |  | 
7  |  | Copyright 2001, 2002, 2005, 2012, 2015, 2020 Free Software Foundation, Inc.  | 
8  |  |  | 
9  |  | This file is part of the GNU MP Library.  | 
10  |  |  | 
11  |  | The GNU MP Library is free software; you can redistribute it and/or modify  | 
12  |  | it under the terms of either:  | 
13  |  |  | 
14  |  |   * the GNU Lesser General Public License as published by the Free  | 
15  |  |     Software Foundation; either version 3 of the License, or (at your  | 
16  |  |     option) any later version.  | 
17  |  |  | 
18  |  | or  | 
19  |  |  | 
20  |  |   * the GNU General Public License as published by the Free Software  | 
21  |  |     Foundation; either version 2 of the License, or (at your option) any  | 
22  |  |     later version.  | 
23  |  |  | 
24  |  | or both in parallel, as here.  | 
25  |  |  | 
26  |  | The GNU MP Library is distributed in the hope that it will be useful, but  | 
27  |  | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  | 
28  |  | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License  | 
29  |  | for more details.  | 
30  |  |  | 
31  |  | You should have received copies of the GNU General Public License and the  | 
32  |  | GNU Lesser General Public License along with the GNU MP Library.  If not,  | 
33  |  | see https://www.gnu.org/licenses/.  */  | 
34  |  |  | 
35  |  | #include <stdlib.h>  | 
36  |  | #include <stdio.h>  | 
37  |  | #include "gmp-impl.h"  | 
38  |  | #include "longlong.h"  | 
39  |  |  | 
40  |  |  | 
41  |  | /* Change this to "#define TRACE(x) x" for some traces. */  | 
42  |  | #define TRACE(x)  | 
43  |  |  | 
44  |  |  | 
45  |  | /* Use this to test the mul_2 code on a CPU without a native version of that  | 
46  |  |    routine.  */  | 
47  |  | #if 0  | 
48  |  | #define mpn_mul_2  refmpn_mul_2  | 
49  |  | #define HAVE_NATIVE_mpn_mul_2  1  | 
50  |  | #endif  | 
51  |  |  | 
52  |  |  | 
53  |  | /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.  | 
54  |  |    ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on  | 
55  |  |    bsize==2 or >2, but separating that isn't easy because there's shared  | 
56  |  |    code both before and after (the size calculations and the powers of 2  | 
57  |  |    handling).  | 
58  |  |  | 
59  |  |    Alternatives:  | 
60  |  |  | 
61  |  |    It would work to just use the mpn_mul powering loop for 1 and 2 limb  | 
62  |  |    bases, but the current separate loop allows mul_1 and mul_2 to be done  | 
63  |  |    in-place, which might help cache locality a bit.  If mpn_mul was relaxed  | 
64  |  |    to allow source==dest when vn==1 or 2 then some pointer twiddling might  | 
65  |  |    let us get the same effect in one loop.  | 
66  |  |  | 
67  |  |    The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't  | 
68  |  |    form the biggest possible power of b that fits, only the biggest power of  | 
69  |  |    2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps  | 
70  |  |    using mp_bases[b].big_base for small b, and thereby get better value  | 
71  |  |    from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing  | 
72  |  |    so would be more complicated than it's worth, and could well end up being  | 
73  |  |    a slowdown for small e.  For big e on the other hand the algorithm is  | 
74  |  |    dominated by mpn_sqr so there wouldn't be much of a saving.  The current  | 
75  |  |    code can be viewed as simply doing the first few steps of the powering in  | 
76  |  |    a single or double limb where possible.  | 
77  |  |  | 
78  |  |    If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary  | 
79  |  |    copy made of b is unnecessary.  We could just use the old alloc'ed block  | 
80  |  |    and free it at the end.  But arranging this seems like a lot more trouble  | 
81  |  |    than it's worth.  */  | 
82  |  |  | 
83  |  |  | 
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   Used below to decide how long b^e can keep being powered up inside a
   single limb.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
89  |  |  | 
90  |  |  | 
91  |  | /* The following are for convenience, they update the size and check the  | 
92  |  |    alloc.  */  | 
93  |  |  | 
/* Square {src,size} into dst and update size to the size of the result:
   2*size, less one when the top limb of the product came out zero.
   alloc is the space available at dst, used only for the ASSERT bounds
   check.  size is evaluated and written multiple times -- pass a plain
   variable, not an expression with side effects.  */
#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
101  |  |  | 
/* Multiply {src,size} by {src2,size2} into dst and update size to the
   product size: size+size2, less one when the returned high limb (cy) is
   zero.  Requires size >= size2 per the usual mpn_mul rules; alloc is
   only for the ASSERT bounds check.  */
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)
109  |  |  | 
/* Multiply {ptr,size} in-place by the two-limb value {mult,2}.  mpn_mul_2
   writes size+1 limbs at ptr and returns the most significant (carry)
   limb.  Note the order: size is incremented first, so cy lands at index
   size+1 of the original size, then size is bumped again only if cy is
   non-zero.  alloc is only for the ASSERT bounds check.  */
#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
119  |  |  | 
/* Multiply {ptr,size} in-place by the single limb "limb".  The carry out
   of mpn_mul_1 is stored as a new high limb and size grows by one only
   when that carry is non-zero.  alloc is only for the ASSERT bounds
   check.  */
#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
128  |  |  | 
/* Left-shift {ptr,size} in-place by "shift" bits (0 < shift <
   GMP_NUMB_BITS, per mpn_lshift).  Bits shifted out the top become a new
   high limb when non-zero.  alloc is only for the ASSERT bounds check.  */
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
137  |  |  | 
/* Right-shift {src,size} by "shift" bits into dst, or do a plain copy
   when shift==0 (mpn_rshift itself requires a non-zero shift count).
   After a real shift, size shrinks by one if the top limb became zero.
   Note shift==0 never shrinks size: src is assumed normalized.  */
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
148  |  |  | 
149  |  |  | 
150  |  | /* ralloc and talloc are only wanted for ASSERTs, after the initial space  | 
151  |  |    allocations.  Avoid writing values to them in a normal build, to ensure  | 
152  |  |    the compiler lets them go dead.  gcc already figures this out itself  | 
153  |  |    actually.  */  | 
154  |  |  | 
/* Exchange the rp and tp pointers (result buffer vs. temp buffer).  The
   corresponding allocation sizes are swapped only under ASSERT_CODE so a
   normal build never writes ralloc/talloc after setup -- see the comment
   above about letting them go dead.  */
#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
160  |  |  | 
161  |  |  | 
/* Set r to {bp,|bsize|} raised to the power e, with the sign of bsize
   giving the sign of the base (negative base and odd e gives a negative
   result).  Internal routine: bp must be normalized (top limb non-zero,
   per the ASSERT below) and r may alias bp only as permitted by
   MPN_SAME_OR_SEPARATE2_P.

   Strategy: factors of 2 in b (whole zero limbs and low zero bits) are
   stripped first and re-applied at the end as rtwos_limbs zero limbs plus
   an rtwos_bits lshift, so the powering loops work on an odd-ish base.
   Small bases are powered up inside one limb (or two with native
   mpn_mul_2) before switching to the binary square-and-multiply loops.  */
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  mp_bitcnt_t    rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  mp_limb_t ovfl;
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      MPZ_NEWALLOC (r, 1)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result: negative only for a negative base raised to
     an odd power. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  /* Remember aliasing now, before bp is advanced past low zero limbs. */
  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b.  Each stripped limb contributes e whole
     zero limbs to the result. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b.  These contribute e*btwos bits of zeros
     to the result. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;

  /* e*btwos, trapping overflow of the bit count: an overflowed count
     means the result couldn't be represented, so abort (matching the
     overflow policy elsewhere in gmp). */
  umul_ppmm (ovfl, rtwos_bits, e, btwos);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }

  /* Split the bit count into whole limbs plus a sub-limb shift. */
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  /* rl (and rl_high with native mul_2) accumulates the single- (or
     double-) limb part of the power built up below; 1 means "no final
     multiply needed". */
  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;            /* safe: blimb <= GMP_NUMB_HALFMAX */
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          /* The nail shifts keep the product within GMP_NUMB_BITS limbs
             on nails builds; a no-op when GMP_NAIL_BITS==0. */
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
         probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      /* Fold the btwos shift into the two-limb base in place. */
      mp_limb_t  bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      /* No native mul_2: stage the shifted base in a local pair so the
         general mpn_mul loop can use it. */
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      /* bsize > 2: take a temporary (shifted) copy of b when needed, so
         the powering loops never read through r's storage. */
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);

  /* Result bit count = (bits per factor) * e; trap overflow of the
     product just as for rtwos_bits above. */
  umul_ppmm (ovfl, ralloc, (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS), e);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }
  ralloc = ralloc / GMP_NUMB_BITS + 5;

  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start.  Here the entire power is already in rl (and rl_high). */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space:
             each squaring swaps rp/tp, so start swapped when the number of
             remaining swaps is odd. */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          /* Binary square-and-multiply, mul_2 variant: square the
             accumulator for every bit of e, multiply by the base for the
             1 bits. */
          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          /* Apply the leftover small-power factor accumulated in
             rl_high:rl. */
          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          /* Binary square-and-multiply, mul_1 variant. */
          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange the final result ends up in r, not in the temp space:
             here both the squarings and the 1-bit multiplies swap rp/tp,
             so the total swap count is i+1 plus the popcount of e's low
             bits -- hence the parity^i test. */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          /* General binary square-and-multiply with full mpn_mul. */
          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  /* The swap bookkeeping above must have left the result in r's block. */
  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}