Coverage Report

Created: 2025-03-09 06:52

/src/gmp-6.2.1/mpz/n_pow_ui.c
Line
Count
Source (jump to first uncovered line)
1
/* mpz_n_pow_ui -- mpn raised to ulong.
2
3
   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
4
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5
   FUTURE GNU MP RELEASES.
6
7
Copyright 2001, 2002, 2005, 2012, 2015, 2020 Free Software Foundation, Inc.
8
9
This file is part of the GNU MP Library.
10
11
The GNU MP Library is free software; you can redistribute it and/or modify
12
it under the terms of either:
13
14
  * the GNU Lesser General Public License as published by the Free
15
    Software Foundation; either version 3 of the License, or (at your
16
    option) any later version.
17
18
or
19
20
  * the GNU General Public License as published by the Free Software
21
    Foundation; either version 2 of the License, or (at your option) any
22
    later version.
23
24
or both in parallel, as here.
25
26
The GNU MP Library is distributed in the hope that it will be useful, but
27
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
29
for more details.
30
31
You should have received copies of the GNU General Public License and the
32
GNU Lesser General Public License along with the GNU MP Library.  If not,
33
see https://www.gnu.org/licenses/.  */
34
35
#include <stdlib.h>
36
#include <stdio.h>
37
#include "gmp-impl.h"
38
#include "longlong.h"
39
40
41
/* Change this to "#define TRACE(x) x" for some traces. */
42
#define TRACE(x)
43
44
45
/* Use this to test the mul_2 code on a CPU without a native version of that
46
   routine.  */
47
#if 0
48
#define mpn_mul_2  refmpn_mul_2
49
#define HAVE_NATIVE_mpn_mul_2  1
50
#endif
51
52
53
/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
54
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
55
   bsize==2 or >2, but separating that isn't easy because there's shared
56
   code both before and after (the size calculations and the powers of 2
57
   handling).
58
59
   Alternatives:
60
61
   It would work to just use the mpn_mul powering loop for 1 and 2 limb
62
   bases, but the current separate loop allows mul_1 and mul_2 to be done
63
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
64
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
65
   let us get the same effect in one loop.
66
67
   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
68
   form the biggest possible power of b that fits, only the biggest power of
69
   2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
70
   using mp_bases[b].big_base for small b, and thereby get better value
71
   from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
72
   so would be more complicated than it's worth, and could well end up being
73
   a slowdown for small e.  For big e on the other hand the algorithm is
74
   dominated by mpn_sqr so there wouldn't be much of a saving.  The current
75
   code can be viewed as simply doing the first few steps of the powering in
76
   a single or double limb where possible.
77
78
   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
79
   copy made of b is unnecessary.  We could just use the old alloc'ed block
80
   and free it at the end.  But arranging this seems like a lot more trouble
81
   than it's worth.  */
82
83
84
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
85
   a limb without overflowing.
86
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */
87
88
477
#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
89
90
91
/* The following are for convenience, they update the size and check the
92
   alloc.  */
93
94
#define MPN_SQR(dst, alloc, src, size)          \
95
52
  do {                                          \
96
52
    ASSERT (2*(size) <= (alloc));               \
97
52
    mpn_sqr (dst, src, size);                   \
98
52
    (size) *= 2;                                \
99
52
    (size) -= ((dst)[(size)-1] == 0);           \
100
52
  } while (0)
101
102
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
103
0
  do {                                                  \
104
0
    mp_limb_t  cy;                                      \
105
0
    ASSERT ((size) + (size2) <= (alloc));               \
106
0
    cy = mpn_mul (dst, src, size, src2, size2);         \
107
0
    (size) += (size2) - (cy == 0);                      \
108
0
  } while (0)
109
110
#define MPN_MUL_2(ptr, size, alloc, mult)       \
111
45
  do {                                          \
112
45
    mp_limb_t  cy;                              \
113
45
    ASSERT ((size)+2 <= (alloc));               \
114
45
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
115
45
    (size)++;                                   \
116
45
    (ptr)[(size)] = cy;                         \
117
45
    (size) += (cy != 0);                        \
118
45
  } while (0)
119
120
#define MPN_MUL_1(ptr, size, alloc, limb)       \
121
38
  do {                                          \
122
38
    mp_limb_t  cy;                              \
123
38
    ASSERT ((size)+1 <= (alloc));               \
124
38
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
125
38
    (ptr)[size] = cy;                           \
126
38
    (size) += (cy != 0);                        \
127
38
  } while (0)
128
129
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
130
53
  do {                                          \
131
53
    mp_limb_t  cy;                              \
132
53
    ASSERT ((size)+1 <= (alloc));               \
133
53
    cy = mpn_lshift (ptr, ptr, size, shift);    \
134
53
    (ptr)[size] = cy;                           \
135
53
    (size) += (cy != 0);                        \
136
53
  } while (0)
137
138
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
139
0
  do {                                                  \
140
0
    if ((shift) == 0)                                   \
141
0
      MPN_COPY (dst, src, size);                        \
142
0
    else                                                \
143
0
      {                                                 \
144
0
        mpn_rshift (dst, src, size, shift);             \
145
0
        (size) -= ((dst)[(size)-1] == 0);               \
146
0
      }                                                 \
147
0
  } while (0)
148
149
150
/* ralloc and talloc are only wanted for ASSERTs, after the initial space
151
   allocations.  Avoid writing values to them in a normal build, to ensure
152
   the compiler lets them go dead.  gcc already figures this out itself
153
   actually.  */
154
155
#define SWAP_RP_TP                                      \
156
70
  do {                                                  \
157
70
    MP_PTR_SWAP (rp, tp);                               \
158
70
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
159
70
  } while (0)
160
161
162
void
163
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
164
108
{
165
108
  mp_ptr         rp;
166
108
  mp_size_t      rtwos_limbs, ralloc, rsize;
167
108
  int            rneg, i, cnt, btwos, r_bp_overlap;
168
108
  mp_limb_t      blimb, rl;
169
108
  mp_bitcnt_t    rtwos_bits;
170
108
#if HAVE_NATIVE_mpn_mul_2
171
108
  mp_limb_t      blimb_low, rl_high;
172
#else
173
  mp_limb_t      b_twolimbs[2];
174
#endif
175
108
  mp_limb_t ovfl;
176
108
  TMP_DECL;
177
178
108
  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
179
108
     PTR(r), bp, bsize, e, e);
180
108
   mpn_trace ("b", bp, bsize));
181
182
108
  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
183
108
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));
184
185
  /* b^0 == 1, including 0^0 == 1 */
186
108
  if (e == 0)
187
3
    {
188
3
      MPZ_NEWALLOC (r, 1)[0] = 1;
189
3
      SIZ(r) = 1;
190
3
      return;
191
3
    }
192
193
  /* 0^e == 0 apart from 0^0 above */
194
105
  if (bsize == 0)
195
3
    {
196
3
      SIZ(r) = 0;
197
3
      return;
198
3
    }
199
200
  /* Sign of the final result. */
201
102
  rneg = (bsize < 0 && (e & 1) != 0);
202
102
  bsize = ABS (bsize);
203
102
  TRACE (printf ("rneg %d\n", rneg));
204
205
102
  r_bp_overlap = (PTR(r) == bp);
206
207
  /* Strip low zero limbs from b. */
208
102
  rtwos_limbs = 0;
209
102
  for (blimb = *bp; blimb == 0; blimb = *++bp)
210
0
    {
211
0
      rtwos_limbs += e;
212
0
      bsize--; ASSERT (bsize >= 1);
213
0
    }
214
102
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
215
216
  /* Strip low zero bits from b. */
217
102
  count_trailing_zeros (btwos, blimb);
218
102
  blimb >>= btwos;
219
220
102
  umul_ppmm (ovfl, rtwos_bits, e, btwos);
221
102
  if (ovfl)
222
0
    {
223
0
      fprintf (stderr, "gmp: overflow in mpz type\n");
224
0
      abort ();
225
0
    }
226
227
102
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
228
102
  rtwos_bits %= GMP_NUMB_BITS;
229
102
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
230
102
     btwos, rtwos_limbs, rtwos_bits));
231
232
102
  TMP_MARK;
233
234
102
  rl = 1;
235
102
#if HAVE_NATIVE_mpn_mul_2
236
102
  rl_high = 0;
237
102
#endif
238
239
102
  if (bsize == 1)
240
102
    {
241
102
    bsize_1:
242
      /* Power up as far as possible within blimb.  We start here with e!=0,
243
   but if e is small then we might reach e==0 and the whole b^e in rl.
244
   Notice this code works when blimb==1 too, reaching e==0.  */
245
246
477
      while (blimb <= GMP_NUMB_HALFMAX)
247
389
  {
248
389
    TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
249
389
       e, blimb, rl));
250
389
    ASSERT (e != 0);
251
389
    if ((e & 1) != 0)
252
208
      rl *= blimb;
253
389
    e >>= 1;
254
389
    if (e == 0)
255
14
      goto got_rl;
256
375
    blimb *= blimb;
257
375
  }
258
259
88
#if HAVE_NATIVE_mpn_mul_2
260
88
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
261
88
         e, blimb, rl));
262
263
      /* Can power b once more into blimb:blimb_low */
264
88
      bsize = 2;
265
88
      ASSERT (e != 0);
266
88
      if ((e & 1) != 0)
267
52
  {
268
52
    umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
269
52
    rl >>= GMP_NAIL_BITS;
270
52
  }
271
88
      e >>= 1;
272
88
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
273
88
      blimb_low >>= GMP_NAIL_BITS;
274
275
102
    got_rl:
276
102
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
277
102
         e, blimb, blimb_low, rl_high, rl));
278
279
      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
280
   final mul_1 or mul_2 rather than a separate lshift.
281
   - rl_high:rl mustn't be 1 (since then there's no final mul)
282
   - rl_high mustn't overflow
283
   - rl_high mustn't change to non-zero, since mul_1+lshift is
284
   probably faster than mul_2 (FIXME: is this true?)  */
285
286
102
      if (rtwos_bits != 0
287
102
    && ! (rl_high == 0 && rl == 1)
288
102
    && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
289
52
  {
290
52
    mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
291
52
      | (rl >> (GMP_NUMB_BITS-rtwos_bits));
292
52
    if (! (rl_high == 0 && new_rl_high != 0))
293
30
      {
294
30
        rl_high = new_rl_high;
295
30
        rl <<= rtwos_bits;
296
30
        rtwos_bits = 0;
297
30
        TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
298
30
           rl_high, rl));
299
30
      }
300
52
  }
301
#else
302
    got_rl:
303
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
304
         e, blimb, rl));
305
306
      /* Combine left-over rtwos_bits into rl to be handled by the final
307
   mul_1 rather than a separate lshift.
308
   - rl mustn't be 1 (since then there's no final mul)
309
   - rl mustn't overflow  */
310
311
      if (rtwos_bits != 0
312
    && rl != 1
313
    && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
314
  {
315
    rl <<= rtwos_bits;
316
    rtwos_bits = 0;
317
    TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
318
  }
319
#endif
320
102
    }
321
0
  else if (bsize == 2)
322
0
    {
323
0
      mp_limb_t  bsecond = bp[1];
324
0
      if (btwos != 0)
325
0
  blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
326
0
      bsecond >>= btwos;
327
0
      if (bsecond == 0)
328
0
  {
329
    /* Two limbs became one after rshift. */
330
0
    bsize = 1;
331
0
    goto bsize_1;
332
0
  }
333
334
0
      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
335
0
#if HAVE_NATIVE_mpn_mul_2
336
0
      blimb_low = blimb;
337
#else
338
      bp = b_twolimbs;
339
      b_twolimbs[0] = blimb;
340
      b_twolimbs[1] = bsecond;
341
#endif
342
0
      blimb = bsecond;
343
0
    }
344
0
  else
345
0
    {
346
0
      if (r_bp_overlap || btwos != 0)
347
0
  {
348
0
    mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
349
0
    MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
350
0
    bp = tp;
351
0
    TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
352
0
  }
353
0
#if HAVE_NATIVE_mpn_mul_2
354
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
355
0
      blimb_low = bp[0];
356
0
#endif
357
0
      blimb = bp[bsize-1];
358
359
0
      TRACE (printf ("big bsize=%ld  ", bsize);
360
0
       mpn_trace ("b", bp, bsize));
361
0
    }
362
363
  /* At this point blimb is the most significant limb of the base to use.
364
365
     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
366
     limb to round up the division; +1 for multiplies all using an extra
367
     limb over the true size; +2 for rl at the end; +1 for lshift at the
368
     end.
369
370
     The size calculation here is reasonably accurate.  The base is at least
371
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
372
     when it will power up as just over 16, an overestimate of 17/16 =
373
     6.25%.  For a 64-bit limb it's half that.
374
375
     If e==0 then blimb won't be anything useful (though it will be
376
     non-zero), but that doesn't matter since we just end up with ralloc==5,
377
     and that's fine for 2 limbs of rl and 1 of lshift.  */
378
379
102
  ASSERT (blimb != 0);
380
102
  count_leading_zeros (cnt, blimb);
381
382
102
  umul_ppmm (ovfl, ralloc, (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS), e);
383
102
  if (ovfl)
384
0
    {
385
0
      fprintf (stderr, "gmp: overflow in mpz type\n");
386
0
      abort ();
387
0
    }
388
102
  ralloc = ralloc / GMP_NUMB_BITS + 5;
389
390
102
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
391
102
     ralloc, bsize, blimb, cnt));
392
102
  rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);
393
394
  /* Low zero limbs resulting from powers of 2. */
395
102
  MPN_ZERO (rp, rtwos_limbs);
396
102
  rp += rtwos_limbs;
397
398
102
  if (e == 0)
399
29
    {
400
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
401
   start. */
402
29
      rp[0] = rl;
403
29
      rsize = 1;
404
29
#if HAVE_NATIVE_mpn_mul_2
405
29
      rp[1] = rl_high;
406
29
      rsize += (rl_high != 0);
407
29
#endif
408
29
      ASSERT (rp[rsize-1] != 0);
409
29
    }
410
73
  else
411
73
    {
412
73
      mp_ptr     tp;
413
73
      mp_size_t  talloc;
414
415
      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
416
   low bit of e is zero, tp only has to hold the second last power
417
   step, which is half the size of the final result.  There's no need
418
   to round up the divide by 2, since ralloc includes a +2 for rl
419
   which not needed by tp.  In the mpn_mul loop when the low bit of e
420
   is 1, tp must hold nearly the full result, so just size it the same
421
   as rp.  */
422
423
73
      talloc = ralloc;
424
73
#if HAVE_NATIVE_mpn_mul_2
425
73
      if (bsize <= 2 || (e & 1) == 0)
426
73
  talloc /= 2;
427
#else
428
      if (bsize <= 1 || (e & 1) == 0)
429
  talloc /= 2;
430
#endif
431
73
      TRACE (printf ("talloc %ld\n", talloc));
432
73
      tp = TMP_ALLOC_LIMBS (talloc);
433
434
      /* Go from high to low over the bits of e, starting with i pointing at
435
   the bit below the highest 1 (which will mean i==-1 if e==1).  */
436
73
      count_leading_zeros (cnt, (mp_limb_t) e);
437
73
      i = GMP_LIMB_BITS - cnt - 2;
438
439
73
#if HAVE_NATIVE_mpn_mul_2
440
73
      if (bsize <= 2)
441
73
  {
442
73
    mp_limb_t  mult[2];
443
444
    /* Any bsize==1 will have been powered above to be two limbs. */
445
73
    ASSERT (bsize == 2);
446
73
    ASSERT (blimb != 0);
447
448
    /* Arrange the final result ends up in r, not in the temp space */
449
73
    if ((i & 1) == 0)
450
18
      SWAP_RP_TP;
451
452
73
    rp[0] = blimb_low;
453
73
    rp[1] = blimb;
454
73
    rsize = 2;
455
456
73
    mult[0] = blimb_low;
457
73
    mult[1] = blimb;
458
459
125
    for ( ; i >= 0; i--)
460
52
      {
461
52
        TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
462
52
           i, e, rsize, ralloc, talloc);
463
52
         mpn_trace ("r", rp, rsize));
464
465
52
        MPN_SQR (tp, talloc, rp, rsize);
466
52
        SWAP_RP_TP;
467
52
        if ((e & (1L << i)) != 0)
468
16
    MPN_MUL_2 (rp, rsize, ralloc, mult);
469
52
      }
470
471
73
    TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
472
73
    if (rl_high != 0)
473
29
      {
474
29
        mult[0] = rl;
475
29
        mult[1] = rl_high;
476
29
        MPN_MUL_2 (rp, rsize, ralloc, mult);
477
29
      }
478
44
    else if (rl != 1)
479
38
      MPN_MUL_1 (rp, rsize, ralloc, rl);
480
73
  }
481
#else
482
      if (bsize == 1)
483
  {
484
    /* Arrange the final result ends up in r, not in the temp space */
485
    if ((i & 1) == 0)
486
      SWAP_RP_TP;
487
488
    rp[0] = blimb;
489
    rsize = 1;
490
491
    for ( ; i >= 0; i--)
492
      {
493
        TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
494
           i, e, rsize, ralloc, talloc);
495
         mpn_trace ("r", rp, rsize));
496
497
        MPN_SQR (tp, talloc, rp, rsize);
498
        SWAP_RP_TP;
499
        if ((e & (1L << i)) != 0)
500
    MPN_MUL_1 (rp, rsize, ralloc, blimb);
501
      }
502
503
    TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
504
    if (rl != 1)
505
      MPN_MUL_1 (rp, rsize, ralloc, rl);
506
  }
507
#endif
508
0
      else
509
0
  {
510
0
    int  parity;
511
512
    /* Arrange the final result ends up in r, not in the temp space */
513
0
    ULONG_PARITY (parity, e);
514
0
    if (((parity ^ i) & 1) != 0)
515
0
      SWAP_RP_TP;
516
517
0
    MPN_COPY (rp, bp, bsize);
518
0
    rsize = bsize;
519
520
0
    for ( ; i >= 0; i--)
521
0
      {
522
0
        TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
523
0
           i, e, rsize, ralloc, talloc);
524
0
         mpn_trace ("r", rp, rsize));
525
526
0
        MPN_SQR (tp, talloc, rp, rsize);
527
0
        SWAP_RP_TP;
528
0
        if ((e & (1L << i)) != 0)
529
0
    {
530
0
      MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
531
0
      SWAP_RP_TP;
532
0
    }
533
0
      }
534
0
  }
535
73
    }
536
537
102
  ASSERT (rp == PTR(r) + rtwos_limbs);
538
102
  TRACE (mpn_trace ("end loop r", rp, rsize));
539
102
  TMP_FREE;
540
541
  /* Apply any partial limb factors of 2. */
542
102
  if (rtwos_bits != 0)
543
53
    {
544
53
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
545
53
      TRACE (mpn_trace ("lshift r", rp, rsize));
546
53
    }
547
548
102
  rsize += rtwos_limbs;
549
102
  SIZ(r) = (rneg ? -rsize : rsize);
550
102
}