Coverage Report

Created: 2024-06-28 06:19

/src/gmp-6.2.1/mpz/n_pow_ui.c

Every instrumented line in this file has an execution count of 0 (the file is entirely uncovered).

/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012, 2015, 2020 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include <stdlib.h>
#include <stdio.h>
#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif


/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr, so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */
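
/* A minimal sketch (not from the GMP sources; the helper name is
   hypothetical) of the single-limb powering idea described above: keep
   squaring b while the square still fits in a limb, folding each low bit
   of e into an accumulator, so the first few steps of b^e happen in plain
   limb arithmetic.  GMP_NUMB_HALFMAX is the squaring limit defined just
   below.  */
#if 0
static mp_limb_t
n_pow_small_sketch (mp_limb_t b, unsigned long e)   /* hypothetical */
{
  mp_limb_t  r = 1;
  while (b <= GMP_NUMB_HALFMAX)     /* b*b still fits in one limb */
    {
      if ((e & 1) != 0)
        r *= b;                     /* fold in the low bit of e */
      e >>= 1;
      if (e == 0)
        return r;                   /* the whole of b^e fitted in a limb */
      b *= b;                       /* the b^(2^n) step */
    }
  /* b (and hence b^e) no longer fits in a limb; the real code carries on
     with the mpn_mul_1/mpn_mul_2/mpn_mul powering loops at this point.  */
  return r;
}
#endif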

/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd.  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
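
/* For concreteness: with GMP_NUMB_BITS == 64 this gives 2^32-1, which is
   exactly floor(sqrt(2^64-1)).  With an odd size such as GMP_NUMB_BITS ==
   63 it gives 2^31-1 = 2147483647, whereas floor(sqrt(2^63-1)) =
   3037000499, hence the FIXME: the value is safe to square but smaller
   than it could be.  */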

/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
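
/* Illustrative usage (the pattern the powering loops below follow): square
   the current result into the temp space, then swap so rp again names the
   result,

       MPN_SQR (tp, talloc, rp, rsize);
       SWAP_RP_TP;

   with the macros updating the size and asserting the alloc in one go.  */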

void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr         rp;
  mp_size_t      rtwos_limbs, ralloc, rsize;
  int            rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t      blimb, rl;
  mp_bitcnt_t    rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t      blimb_low, rl_high;
#else
  mp_limb_t      b_twolimbs[2];
#endif
  mp_limb_t ovfl;
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      MPZ_NEWALLOC (r, 1)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
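  /* Writing b = b' * 2^k with b' odd, b^e = b'^e * 2^(k*e), so the factors
     of 2 stripped here and above only shift the result: rtwos_limbs whole
     limbs plus rtwos_bits leftover bits.  For instance (illustrative
     numbers) b = 12 = 3*2^2 with e = 5 gives b^5 = 3^5 * 2^10.  */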
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;

  umul_ppmm (ovfl, rtwos_bits, e, btwos);
  if (ovfl)
    {
      fprintf (stderr, "gmp: overflow in mpz type\n");
      abort ();
    }

  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
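      /* Shift the two-limb base right by btwos: the low btwos bits of
         bsecond move into the top of blimb (already shifted above), with
         GMP_NUMB_MASK keeping the result within a limb on nail builds.  */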
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
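  /* Worked figures (illustrative): with GMP_NUMB_BITS == 32, a one-limb
     base occupying 17 bits and e == 1000, the product below is 17*1000 =
     17000 bits, giving ralloc = 17000/32 + 5 = 536 limbs against a true
     result size of just over 500 limbs.  */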
378
379
0
  ASSERT (blimb != 0);
380
0
  count_leading_zeros (cnt, blimb);
381
382
0
  umul_ppmm (ovfl, ralloc, (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS), e);
383
0
  if (ovfl)
384
0
    {
385
0
      fprintf (stderr, "gmp: overflow in mpz type\n");
386
0
      abort ();
387
0
    }
388
0
  ralloc = ralloc / GMP_NUMB_BITS + 5;
389
390
0
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
391
0
     ralloc, bsize, blimb, cnt));
392
0
  rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);
393
394
  /* Low zero limbs resulting from powers of 2. */
395
0
  MPN_ZERO (rp, rtwos_limbs);
396
0
  rp += rtwos_limbs;
397
398
0
  if (e == 0)
399
0
    {
400
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
401
   start. */
402
0
      rp[0] = rl;
403
0
      rsize = 1;
404
0
#if HAVE_NATIVE_mpn_mul_2
405
0
      rp[1] = rl_high;
406
0
      rsize += (rl_high != 0);
407
0
#endif
408
0
      ASSERT (rp[rsize-1] != 0);
409
0
    }
410
0
  else
411
0
    {
412
0
      mp_ptr     tp;
413
0
      mp_size_t  talloc;
414
415
      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
416
   low bit of e is zero, tp only has to hold the second last power
417
   step, which is half the size of the final result.  There's no need
418
   to round up the divide by 2, since ralloc includes a +2 for rl
419
   which not needed by tp.  In the mpn_mul loop when the low bit of e
420
   is 1, tp must hold nearly the full result, so just size it the same
421
   as rp.  */
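      /* Continuing the illustrative figures above: ralloc == 536 with e ==
         1000 (low bit clear) makes talloc == 536/2 == 268 limbs, enough
         for the half-size second-to-last power step.  */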

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1).  */
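      /* For example (illustrative): e == 13 == 0b1101 starts i at 2, and
         the loops below run square, multiply (bit 2 set) -> b^3; square
         (bit 1 clear) -> b^6; square, multiply (bit 0 set) -> b^13:
         left-to-right binary powering.  */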
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t  mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int  parity;

          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}