Coverage Report

Created: 2024-11-21 07:03

/src/libgmp/mpz/n_pow_ui.c
Line |  Count | Source
   1 |        | /* mpz_n_pow_ui -- mpn raised to ulong.
   2 |        |
   3 |        |    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   4 |        |    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   5 |        |    FUTURE GNU MP RELEASES.
   6 |        |
   7 |        | Copyright 2001, 2002, 2005, 2012, 2015 Free Software Foundation, Inc.
   8 |        |
   9 |        | This file is part of the GNU MP Library.
  10 |        |
  11 |        | The GNU MP Library is free software; you can redistribute it and/or modify
  12 |        | it under the terms of either:
  13 |        |
  14 |        |   * the GNU Lesser General Public License as published by the Free
  15 |        |     Software Foundation; either version 3 of the License, or (at your
  16 |        |     option) any later version.
  17 |        |
  18 |        | or
  19 |        |
  20 |        |   * the GNU General Public License as published by the Free Software
  21 |        |     Foundation; either version 2 of the License, or (at your option) any
  22 |        |     later version.
  23 |        |
  24 |        | or both in parallel, as here.
  25 |        |
  26 |        | The GNU MP Library is distributed in the hope that it will be useful, but
  27 |        | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  28 |        | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  29 |        | for more details.
  30 |        |
  31 |        | You should have received copies of the GNU General Public License and the
  32 |        | GNU Lesser General Public License along with the GNU MP Library.  If not,
  33 |        | see https://www.gnu.org/licenses/.  */
  34 |        |
  35 |        | #include "gmp-impl.h"
  36 |        | #include "longlong.h"
  37 |        |
  38 |        |
  39 |        | /* Change this to "#define TRACE(x) x" for some traces. */
  40 |        | #define TRACE(x)
  41 |        |
  42 |        |
  43 |        | /* Use this to test the mul_2 code on a CPU without a native version of that
  44 |        |    routine.  */
  45 |        | #if 0
  46 |        | #define mpn_mul_2  refmpn_mul_2
  47 |        | #define HAVE_NATIVE_mpn_mul_2  1
  48 |        | #endif
  49 |        |
  50 |        |
  51 |        | /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
  52 |        |    ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
  53 |        |    bsize==2 or >2, but separating that isn't easy because there's shared
  54 |        |    code both before and after (the size calculations and the powers of 2
  55 |        |    handling).
  56 |        |
  57 |        |    Alternatives:
  58 |        |
  59 |        |    It would work to just use the mpn_mul powering loop for 1 and 2 limb
  60 |        |    bases, but the current separate loop allows mul_1 and mul_2 to be done
  61 |        |    in-place, which might help cache locality a bit.  If mpn_mul was relaxed
  62 |        |    to allow source==dest when vn==1 or 2 then some pointer twiddling might
  63 |        |    let us get the same effect in one loop.
  64 |        |
  65 |        |    The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
  66 |        |    form the biggest possible power of b that fits, only the biggest power of
  67 |        |    2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
  68 |        |    using mp_bases[b].big_base for small b, and thereby get better value
  69 |        |    from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
  70 |        |    so would be more complicated than it's worth, and could well end up being
  71 |        |    a slowdown for small e.  For big e on the other hand the algorithm is
  72 |        |    dominated by mpn_sqr so there wouldn't be much of a saving.  The current
  73 |        |    code can be viewed as simply doing the first few steps of the powering in
  74 |        |    a single or double limb where possible.
  75 |        |
  76 |        |    If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
  77 |        |    copy made of b is unnecessary.  We could just use the old alloc'ed block
  78 |        |    and free it at the end.  But arranging this seems like a lot more trouble
  79 |        |    than it's worth.  */
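
A worked illustration of the b^(2^n) point, assuming 32-bit limbs with no
nails (GMP_NUMB_HALFMAX = 65535): for b = 3 the bsize==1 loop squares
3 -> 9 -> 81 -> 6561 -> 43046721 = 3^16 and then stops, since 3^16 exceeds
GMP_NUMB_HALFMAX.  Yet 3^20 = 3486784401 still fits in the limb, and
mp_bases[3].big_base would hold exactly that biggest fitting power.  So the
single-limb stage produces b^(2^n), not the largest representable power of b.
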
  80 |        |
  81 |        |
  82 |        | /* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
  83 |        |    a limb without overflowing.
  84 |        |    FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */
  85 |        |
  86 |  3.75k | #define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
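
For instance, with no nail bits and GMP_NUMB_BITS == 32 this is 2^16 - 1 =
65535, and 65535^2 = 4294836225 fits below 2^32 - 1 = 4294967295.  The FIXME
concerns odd GMP_NUMB_BITS (possible in nails builds): for 33 bits the true
bound is floor(sqrt(2^33 - 1)) = 92681, while the formula still yields only
2^16 - 1 -- an underestimate, but in the safe direction.
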
  87 |        |
  88 |        |
  89 |        | /* The following are for convenience, they update the size and check the
  90 |        |    alloc.  */
  91 |        |
  92 |        | #define MPN_SQR(dst, alloc, src, size)          \
  93 |      0 |   do {                                          \
  94 |      0 |     ASSERT (2*(size) <= (alloc));               \
  95 |      0 |     mpn_sqr (dst, src, size);                   \
  96 |      0 |     (size) *= 2;                                \
  97 |      0 |     (size) -= ((dst)[(size)-1] == 0);           \
  98 |      0 |   } while (0)
  99 |        |
 100 |        | #define MPN_MUL(dst, alloc, src, size, src2, size2)     \
 101 |      0 |   do {                                                  \
 102 |      0 |     mp_limb_t  cy;                                      \
 103 |      0 |     ASSERT ((size) + (size2) <= (alloc));               \
 104 |      0 |     cy = mpn_mul (dst, src, size, src2, size2);         \
 105 |      0 |     (size) += (size2) - (cy == 0);                      \
 106 |      0 |   } while (0)
 107 |        |
 108 |        | #define MPN_MUL_2(ptr, size, alloc, mult)       \
 109 |      0 |   do {                                          \
 110 |      0 |     mp_limb_t  cy;                              \
 111 |      0 |     ASSERT ((size)+2 <= (alloc));               \
 112 |      0 |     cy = mpn_mul_2 (ptr, ptr, size, mult);      \
 113 |      0 |     (size)++;                                   \
 114 |      0 |     (ptr)[(size)] = cy;                         \
 115 |      0 |     (size) += (cy != 0);                        \
 116 |      0 |   } while (0)
 117 |        |
 118 |        | #define MPN_MUL_1(ptr, size, alloc, limb)       \
 119 |      0 |   do {                                          \
 120 |      0 |     mp_limb_t  cy;                              \
 121 |      0 |     ASSERT ((size)+1 <= (alloc));               \
 122 |      0 |     cy = mpn_mul_1 (ptr, ptr, size, limb);      \
 123 |      0 |     (ptr)[size] = cy;                           \
 124 |      0 |     (size) += (cy != 0);                        \
 125 |      0 |   } while (0)
 126 |        |
 127 |        | #define MPN_LSHIFT(ptr, size, alloc, shift)     \
 128 |      9 |   do {                                          \
 129 |      9 |     mp_limb_t  cy;                              \
 130 |      9 |     ASSERT ((size)+1 <= (alloc));               \
 131 |      9 |     cy = mpn_lshift (ptr, ptr, size, shift);    \
 132 |      9 |     (ptr)[size] = cy;                           \
 133 |      9 |     (size) += (cy != 0);                        \
 134 |      9 |   } while (0)
 135 |        |
 136 |        | #define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
 137 |      0 |   do {                                                  \
 138 |      0 |     if ((shift) == 0)                                   \
 139 |      0 |       MPN_COPY (dst, src, size);                        \
 140 |      0 |     else                                                \
 141 |      0 |       {                                                 \
 142 |      0 |         mpn_rshift (dst, src, size, shift);             \
 143 |      0 |         (size) -= ((dst)[(size)-1] == 0);               \
 144 |      0 |       }                                                 \
 145 |      0 |   } while (0)
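
The pattern these wrappers implement is "multiply in place, then grow the
size only if a carry limb actually came out".  A standalone sketch of the
MPN_MUL_1 case, using 32-bit limbs with 64-bit intermediate products in
place of mp_limb_t (mul_1_in_place is a hypothetical name, not a GMP
function):

    #include <stdint.h>
    #include <stddef.h>

    /* Multiply the n-limb number {p, n} by m in place; p must have room
       for n+1 limbs.  Returns the new size, n or n+1, mirroring the
       cy/size bookkeeping of MPN_MUL_1 above. */
    size_t
    mul_1_in_place (uint32_t *p, size_t n, uint32_t m)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint64_t t = (uint64_t) p[i] * m + cy;
          p[i] = (uint32_t) t;    /* low half is the result limb */
          cy = t >> 32;           /* high half carries into the next limb */
        }
      p[n] = (uint32_t) cy;       /* store the final carry limb... */
      return n + (cy != 0);       /* ...but count it only when non-zero */
    }
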
 146 |        |
 147 |        |
 148 |        | /* ralloc and talloc are only wanted for ASSERTs, after the initial space
 149 |        |    allocations.  Avoid writing values to them in a normal build, to ensure
 150 |        |    the compiler lets them go dead.  gcc already figures this out itself
 151 |        |    actually.  */
 152 |        |
 153 |        | #define SWAP_RP_TP                                      \
 154 |      0 |   do {                                                  \
 155 |      0 |     MP_PTR_SWAP (rp, tp);                               \
 156 |      0 |     ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
 157 |      0 |   } while (0)
 158 |        |
 159 |        |
 160 |        | void
 161 |        | mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
 162 |  1.25k | {
 163 |  1.25k |   mp_ptr         rp;
 164 |  1.25k |   mp_size_t      rtwos_limbs, ralloc, rsize;
 165 |  1.25k |   int            rneg, i, cnt, btwos, r_bp_overlap;
 166 |  1.25k |   mp_limb_t      blimb, rl;
 167 |  1.25k |   mp_bitcnt_t    rtwos_bits;
 168 |  1.25k | #if HAVE_NATIVE_mpn_mul_2
 169 |  1.25k |   mp_limb_t      blimb_low, rl_high;
 170 |        | #else
 171 |        |   mp_limb_t      b_twolimbs[2];
 172 |        | #endif
 173 |  1.25k |   TMP_DECL;
 174 |        |
 175 |  1.25k |   TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
 176 |  1.25k |                  PTR(r), bp, bsize, e, e);
 177 |  1.25k |          mpn_trace ("b", bp, bsize));
 178 |        |
 179 |  1.25k |   ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
 180 |  1.25k |   ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));
 181 |        |
 182 |        |   /* b^0 == 1, including 0^0 == 1 */
 183 |  1.25k |   if (e == 0)
 184 |      1 |     {
 185 |      1 |       MPZ_NEWALLOC (r, 1)[0] = 1;
 186 |      1 |       SIZ(r) = 1;
 187 |      1 |       return;
 188 |      1 |     }
 189 |        |
 190 |        |   /* 0^e == 0 apart from 0^0 above */
 191 |  1.24k |   if (bsize == 0)
 192 |      0 |     {
 193 |      0 |       SIZ(r) = 0;
 194 |      0 |       return;
 195 |      0 |     }
 196 |        |
 197 |        |   /* Sign of the final result. */
 198 |  1.24k |   rneg = (bsize < 0 && (e & 1) != 0);
 199 |  1.24k |   bsize = ABS (bsize);
 200 |  1.24k |   TRACE (printf ("rneg %d\n", rneg));
 201 |        |
 202 |  1.24k |   r_bp_overlap = (PTR(r) == bp);
 203 |        |
 204 |        |   /* Strip low zero limbs from b. */
 205 |  1.24k |   rtwos_limbs = 0;
 206 |  1.24k |   for (blimb = *bp; blimb == 0; blimb = *++bp)
 207 |      0 |     {
 208 |      0 |       rtwos_limbs += e;
 209 |      0 |       bsize--; ASSERT (bsize >= 1);
 210 |      0 |     }
 211 |  1.24k |   TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
 212 |        |
 213 |        |   /* Strip low zero bits from b. */
 214 |  1.24k |   count_trailing_zeros (btwos, blimb);
 215 |  1.24k |   blimb >>= btwos;
 216 |  1.24k |   rtwos_bits = e * btwos;
 217 |  1.24k |   rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
 218 |  1.24k |   rtwos_bits %= GMP_NUMB_BITS;
 219 |  1.24k |   TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
 220 |  1.24k |                  btwos, rtwos_limbs, rtwos_bits));
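
Worked example (64-bit limbs, no nails): for b = 12 = 2^2 * 3 and e = 10,
no all-zero low limbs are stripped, btwos = 2 and blimb becomes 3, and since
12^10 = 3^10 * 2^20 the result needs 20 low zero bits: rtwos_limbs = 20/64 =
0 and rtwos_bits = 20.  The powering below then works on 3^10 only, with the
2^20 factor restored by the final shift (or merged into rl).
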
 221 |        |
 222 |  1.24k |   TMP_MARK;
 223 |        |
 224 |  1.24k |   rl = 1;
 225 |  1.24k | #if HAVE_NATIVE_mpn_mul_2
 226 |  1.24k |   rl_high = 0;
 227 |  1.24k | #endif
 228 |        |
 229 |  1.24k |   if (bsize == 1)
 230 |  1.24k |     {
 231 |  1.24k |     bsize_1:
 232 |        |       /* Power up as far as possible within blimb.  We start here with e!=0,
 233 |        |          but if e is small then we might reach e==0 and the whole b^e in rl.
 234 |        |          Notice this code works when blimb==1 too, reaching e==0.  */
 235 |        |
 236 |  3.75k |       while (blimb <= GMP_NUMB_HALFMAX)
 237 |  3.18k |         {
 238 |  3.18k |           TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
 239 |  3.18k |                          e, blimb, rl));
 240 |  3.18k |           ASSERT (e != 0);
 241 |  3.18k |           if ((e & 1) != 0)
 242 |  1.93k |             rl *= blimb;
 243 |  3.18k |           e >>= 1;
 244 |  3.18k |           if (e == 0)
 245 |    680 |             goto got_rl;
 246 |  2.50k |           blimb *= blimb;
 247 |  2.50k |         }
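
Tracing this loop for blimb = 3, e = 5 (binary 101): first pass, e is odd so
rl = 3, then e = 2 and blimb = 9; second pass, e is even, e = 1, blimb = 81;
third pass, e is odd so rl = 3 * 81 = 243, e becomes 0 and control jumps to
got_rl with rl = 3^5 = 243 -- the whole power computed without any mpn call.
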
 248 |        |
 249 |    569 | #if HAVE_NATIVE_mpn_mul_2
 250 |    569 |       TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
 251 |    569 |                      e, blimb, rl));
 252 |        |
 253 |        |       /* Can power b once more into blimb:blimb_low */
 254 |    569 |       bsize = 2;
 255 |    569 |       ASSERT (e != 0);
 256 |    569 |       if ((e & 1) != 0)
 257 |    569 |         {
 258 |    569 |           umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
 259 |    569 |           rl >>= GMP_NAIL_BITS;
 260 |    569 |         }
 261 |    569 |       e >>= 1;
 262 |    569 |       umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
 263 |    569 |       blimb_low >>= GMP_NAIL_BITS;
 264 |        |
 265 |  1.24k |     got_rl:
 266 |  1.24k |       TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
 267 |  1.24k |                      e, blimb, blimb_low, rl_high, rl));
 268 |        |
 269 |        |       /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
 270 |        |          final mul_1 or mul_2 rather than a separate lshift.
 271 |        |          - rl_high:rl mustn't be 1 (since then there's no final mul)
 272 |        |          - rl_high mustn't overflow
 273 |        |          - rl_high mustn't change to non-zero, since mul_1+lshift is
 274 |        |            probably faster than mul_2 (FIXME: is this true?)  */
 275 |        |
 276 |  1.24k |       if (rtwos_bits != 0
 277 |  1.24k |           && ! (rl_high == 0 && rl == 1)
 278 |  1.24k |           && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
 279 |    617 |         {
 280 |    617 |           mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
 281 |    617 |             | (rl >> (GMP_NUMB_BITS-rtwos_bits));
 282 |    617 |           if (! (rl_high == 0 && new_rl_high != 0))
 283 |    617 |             {
 284 |    617 |               rl_high = new_rl_high;
 285 |    617 |               rl <<= rtwos_bits;
 286 |    617 |               rtwos_bits = 0;
 287 |    617 |               TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
 288 |    617 |                              rl_high, rl));
 289 |    617 |             }
 290 |    617 |         }
 291 |        | #else
 292 |        |     got_rl:
 293 |        |       TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
 294 |        |                      e, blimb, rl));
 295 |        |
 296 |        |       /* Combine left-over rtwos_bits into rl to be handled by the final
 297 |        |          mul_1 rather than a separate lshift.
 298 |        |          - rl mustn't be 1 (since then there's no final mul)
 299 |        |          - rl mustn't overflow  */
 300 |        |
 301 |        |       if (rtwos_bits != 0
 302 |        |           && rl != 1
 303 |        |           && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
 304 |        |         {
 305 |        |           rl <<= rtwos_bits;
 306 |        |           rtwos_bits = 0;
 307 |        |           TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
 308 |        |         }
 309 |        | #endif
 310 |  1.24k |     }
 311 |      0 |   else if (bsize == 2)
 312 |      0 |     {
 313 |      0 |       mp_limb_t  bsecond = bp[1];
 314 |      0 |       if (btwos != 0)
 315 |      0 |         blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
 316 |      0 |       bsecond >>= btwos;
 317 |      0 |       if (bsecond == 0)
 318 |      0 |         {
 319 |        |           /* Two limbs became one after rshift. */
 320 |      0 |           bsize = 1;
 321 |      0 |           goto bsize_1;
 322 |      0 |         }
 323 |        |
 324 |      0 |       TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
 325 |      0 | #if HAVE_NATIVE_mpn_mul_2
 326 |      0 |       blimb_low = blimb;
 327 |        | #else
 328 |        |       bp = b_twolimbs;
 329 |        |       b_twolimbs[0] = blimb;
 330 |        |       b_twolimbs[1] = bsecond;
 331 |        | #endif
 332 |      0 |       blimb = bsecond;
 333 |      0 |     }
 334 |      0 |   else
 335 |      0 |     {
 336 |      0 |       if (r_bp_overlap || btwos != 0)
 337 |      0 |         {
 338 |      0 |           mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
 339 |      0 |           MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
 340 |      0 |           bp = tp;
 341 |      0 |           TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
 342 |      0 |         }
 343 |      0 | #if HAVE_NATIVE_mpn_mul_2
 344 |        |       /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
 345 |      0 |       blimb_low = bp[0];
 346 |      0 | #endif
 347 |      0 |       blimb = bp[bsize-1];
 348 |        |
 349 |      0 |       TRACE (printf ("big bsize=%ld  ", bsize);
 350 |      0 |              mpn_trace ("b", bp, bsize));
 351 |      0 |     }
 352 |        |
 353 |        |   /* At this point blimb is the most significant limb of the base to use.
 354 |        |
 355 |        |      Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
 356 |        |      limb to round up the division; +1 for multiplies all using an extra
 357 |        |      limb over the true size; +2 for rl at the end; +1 for lshift at the
 358 |        |      end.
 359 |        |
 360 |        |      The size calculation here is reasonably accurate.  The base is at least
 361 |        |      half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
 362 |        |      when it will power up as just over 16, an overestimate of 17/16 =
 363 |        |      6.25%.  For a 64-bit limb it's half that.
 364 |        |
 365 |        |      If e==0 then blimb won't be anything useful (though it will be
 366 |        |      non-zero), but that doesn't matter since we just end up with ralloc==5,
 367 |        |      and that's fine for 2 limbs of rl and 1 of lshift.  */
 368 |        |
 369 |  1.24k |   ASSERT (blimb != 0);
 370 |  1.24k |   count_leading_zeros (cnt, blimb);
 371 |  1.24k |   ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
 372 |  1.24k |   TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
 373 |  1.24k |                  ralloc, bsize, blimb, cnt));
 374 |  1.24k |   rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);
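
A worked instance of the bound (64-bit limbs, no nails): for a 3-limb odd
base whose top limb has cnt = 10 leading zeros and e = 7, ralloc =
(3*64 - 10) * 7 / 64 + 5 = 1274/64 + 5 = 19 + 5 = 24 limbs, while b^7 itself
occupies at most ceil(1274/64) = 20 limbs -- the 5 extra limbs are exactly
the slack items enumerated in the comment above.
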
 375 |        |
 376 |        |   /* Low zero limbs resulting from powers of 2. */
 377 |  1.24k |   MPN_ZERO (rp, rtwos_limbs);
 378 |  1.24k |   rp += rtwos_limbs;
 379 |        |
 380 |  1.24k |   if (e == 0)
 381 |  1.24k |     {
 382 |        |       /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
 383 |        |          start. */
 384 |  1.24k |       rp[0] = rl;
 385 |  1.24k |       rsize = 1;
 386 |  1.24k | #if HAVE_NATIVE_mpn_mul_2
 387 |  1.24k |       rp[1] = rl_high;
 388 |  1.24k |       rsize += (rl_high != 0);
 389 |  1.24k | #endif
 390 |  1.24k |       ASSERT (rp[rsize-1] != 0);
 391 |  1.24k |     }
 392 |      0 |   else
 393 |      0 |     {
 394 |      0 |       mp_ptr     tp;
 395 |      0 |       mp_size_t  talloc;
 396 |        |
 397 |        |       /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
 398 |        |          low bit of e is zero, tp only has to hold the second last power
 399 |        |          step, which is half the size of the final result.  There's no need
 400 |        |          to round up the divide by 2, since ralloc includes a +2 for rl
 401 |        |          which is not needed by tp.  In the mpn_mul loop when the low bit of e
 402 |        |          is 1, tp must hold nearly the full result, so just size it the same
 403 |        |          as rp.  */
 404 |        |
 405 |      0 |       talloc = ralloc;
 406 |      0 | #if HAVE_NATIVE_mpn_mul_2
 407 |      0 |       if (bsize <= 2 || (e & 1) == 0)
 408 |      0 |         talloc /= 2;
 409 |        | #else
 410 |        |       if (bsize <= 1 || (e & 1) == 0)
 411 |        |         talloc /= 2;
 412 |        | #endif
 413 |      0 |       TRACE (printf ("talloc %ld\n", talloc));
 414 |      0 |       tp = TMP_ALLOC_LIMBS (talloc);
 415 |        |
 416 |        |       /* Go from high to low over the bits of e, starting with i pointing at
 417 |        |          the bit below the highest 1 (which will mean i==-1 if e==1).  */
 418 |      0 |       count_leading_zeros (cnt, (mp_limb_t) e);
 419 |      0 |       i = GMP_LIMB_BITS - cnt - 2;
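
For example, e = 13 (binary 1101) with GMP_LIMB_BITS == 64 gives cnt = 60
and i = 2.  The loops below start from b^1 (the top bit) and scan bits
2, 1, 0: bit 2 is 1, square then multiply -> b^3; bit 1 is 0, square only
-> b^6; bit 0 is 1, square then multiply -> b^13.  For e == 1, i starts at
-1 and the loop body never runs.
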
 420 |        |
 421 |      0 | #if HAVE_NATIVE_mpn_mul_2
 422 |      0 |       if (bsize <= 2)
 423 |      0 |         {
 424 |      0 |           mp_limb_t  mult[2];
 425 |        |
 426 |        |           /* Any bsize==1 will have been powered above to be two limbs. */
 427 |      0 |           ASSERT (bsize == 2);
 428 |      0 |           ASSERT (blimb != 0);
 429 |        |
 430 |        |           /* Arrange the final result ends up in r, not in the temp space */
 431 |      0 |           if ((i & 1) == 0)
 432 |      0 |             SWAP_RP_TP;
 433 |        |
 434 |      0 |           rp[0] = blimb_low;
 435 |      0 |           rp[1] = blimb;
 436 |      0 |           rsize = 2;
 437 |        |
 438 |      0 |           mult[0] = blimb_low;
 439 |      0 |           mult[1] = blimb;
 440 |        |
 441 |      0 |           for ( ; i >= 0; i--)
 442 |      0 |             {
 443 |      0 |               TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
 444 |      0 |                              i, e, rsize, ralloc, talloc);
 445 |      0 |                      mpn_trace ("r", rp, rsize));
 446 |        |
 447 |      0 |               MPN_SQR (tp, talloc, rp, rsize);
 448 |      0 |               SWAP_RP_TP;
 449 |      0 |               if ((e & (1L << i)) != 0)
 450 |      0 |                 MPN_MUL_2 (rp, rsize, ralloc, mult);
 451 |      0 |             }
 452 |        |
 453 |      0 |           TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
 454 |      0 |           if (rl_high != 0)
 455 |      0 |             {
 456 |      0 |               mult[0] = rl;
 457 |      0 |               mult[1] = rl_high;
 458 |      0 |               MPN_MUL_2 (rp, rsize, ralloc, mult);
 459 |      0 |             }
 460 |      0 |           else if (rl != 1)
 461 |      0 |             MPN_MUL_1 (rp, rsize, ralloc, rl);
 462 |      0 |         }
 463 |        | #else
 464 |        |       if (bsize == 1)
 465 |        |         {
 466 |        |           /* Arrange the final result ends up in r, not in the temp space */
 467 |        |           if ((i & 1) == 0)
 468 |        |             SWAP_RP_TP;
 469 |        |
 470 |        |           rp[0] = blimb;
 471 |        |           rsize = 1;
 472 |        |
 473 |        |           for ( ; i >= 0; i--)
 474 |        |             {
 475 |        |               TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
 476 |        |                              i, e, rsize, ralloc, talloc);
 477 |        |                      mpn_trace ("r", rp, rsize));
 478 |        |
 479 |        |               MPN_SQR (tp, talloc, rp, rsize);
 480 |        |               SWAP_RP_TP;
 481 |        |               if ((e & (1L << i)) != 0)
 482 |        |                 MPN_MUL_1 (rp, rsize, ralloc, blimb);
 483 |        |             }
 484 |        |
 485 |        |           TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
 486 |        |           if (rl != 1)
 487 |        |             MPN_MUL_1 (rp, rsize, ralloc, rl);
 488 |        |         }
 489 |        | #endif
 490 |      0 |       else
 491 |      0 |         {
 492 |      0 |           int  parity;
 493 |        |
 494 |        |           /* Arrange the final result ends up in r, not in the temp space */
 495 |      0 |           ULONG_PARITY (parity, e);
 496 |      0 |           if (((parity ^ i) & 1) != 0)
 497 |      0 |             SWAP_RP_TP;
 498 |        |
 499 |      0 |           MPN_COPY (rp, bp, bsize);
 500 |      0 |           rsize = bsize;
 501 |        |
 502 |      0 |           for ( ; i >= 0; i--)
 503 |      0 |             {
 504 |      0 |               TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
 505 |      0 |                              i, e, rsize, ralloc, talloc);
 506 |      0 |                      mpn_trace ("r", rp, rsize));
 507 |        |
 508 |      0 |               MPN_SQR (tp, talloc, rp, rsize);
 509 |      0 |               SWAP_RP_TP;
 510 |      0 |               if ((e & (1L << i)) != 0)
 511 |      0 |                 {
 512 |      0 |                   MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
 513 |      0 |                   SWAP_RP_TP;
 514 |      0 |                 }
 515 |      0 |             }
 516 |      0 |         }
 517 |      0 |     }
 518 |        |
 519 |  1.24k |   ASSERT (rp == PTR(r) + rtwos_limbs);
 520 |  1.24k |   TRACE (mpn_trace ("end loop r", rp, rsize));
 521 |  1.24k |   TMP_FREE;
 522 |        |
 523 |        |   /* Apply any partial limb factors of 2. */
 524 |  1.24k |   if (rtwos_bits != 0)
 525 |      9 |     {
 526 |      9 |       MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
 527 |      9 |       TRACE (mpn_trace ("lshift r", rp, rsize));
 528 |      9 |     }
 529 |        |
 530 |  1.24k |   rsize += rtwos_limbs;
 531 |  1.24k |   SIZ(r) = (rneg ? -rsize : rsize);
 532 |  1.24k | }
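
The routine's overall shape -- strip the factors of 2, binary-power the odd
part, shift the twos back in at the end -- can be cross-checked with a
minimal single-word sketch (hypothetical helper, uint64_t arithmetic, valid
only while b^e fits in 64 bits; not the GMP implementation):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    n_pow_ui_sketch (uint64_t b, unsigned long e)
    {
      if (e == 0)
        return 1;                 /* b^0 == 1, including 0^0 */
      if (b == 0)
        return 0;                 /* 0^e == 0 apart from 0^0 */

      /* Strip twos: b == odd * 2^btwos, so b^e == odd^e << (e*btwos). */
      unsigned long rtwos = 0;
      while ((b & 1) == 0)
        {
          b >>= 1;
          rtwos += e;
        }

      /* Binary powering of the odd part, as in the loops above. */
      uint64_t r = 1;
      while (e != 0)
        {
          if (e & 1)
            r *= b;
          e >>= 1;
          if (e != 0)
            b *= b;
        }

      return r << rtwos;          /* restore the stripped powers of 2 */
    }

    int
    main (void)
    {
      assert (n_pow_ui_sketch (12, 5) == 248832);  /* 12^5 = 3^5 << 10 */
      assert (n_pow_ui_sketch (0, 0) == 1);
      assert (n_pow_ui_sketch (7, 1) == 7);
      return 0;
    }
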