Coverage Report

Created: 2025-03-09 06:52

/src/gmp-6.2.1/mpz/n_pow_ui.c
Line
Count
Source (jump to first uncovered line)
1
/* mpz_n_pow_ui -- mpn raised to ulong.
2
3
   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
4
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5
   FUTURE GNU MP RELEASES.
6
7
Copyright 2001, 2002, 2005, 2012, 2015, 2020 Free Software Foundation, Inc.
8
9
This file is part of the GNU MP Library.
10
11
The GNU MP Library is free software; you can redistribute it and/or modify
12
it under the terms of either:
13
14
  * the GNU Lesser General Public License as published by the Free
15
    Software Foundation; either version 3 of the License, or (at your
16
    option) any later version.
17
18
or
19
20
  * the GNU General Public License as published by the Free Software
21
    Foundation; either version 2 of the License, or (at your option) any
22
    later version.
23
24
or both in parallel, as here.
25
26
The GNU MP Library is distributed in the hope that it will be useful, but
27
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
29
for more details.
30
31
You should have received copies of the GNU General Public License and the
32
GNU Lesser General Public License along with the GNU MP Library.  If not,
33
see https://www.gnu.org/licenses/.  */
34
35
#include <stdlib.h>
36
#include <stdio.h>
37
#include "gmp-impl.h"
38
#include "longlong.h"
39
40
41
/* Change this to "#define TRACE(x) x" for some traces. */
42
#define TRACE(x)
43
44
45
/* Use this to test the mul_2 code on a CPU without a native version of that
46
   routine.  */
47
#if 0
48
#define mpn_mul_2  refmpn_mul_2
49
#define HAVE_NATIVE_mpn_mul_2  1
50
#endif
51
52
53
/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
54
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
55
   bsize==2 or >2, but separating that isn't easy because there's shared
56
   code both before and after (the size calculations and the powers of 2
57
   handling).
58
59
   Alternatives:
60
61
   It would work to just use the mpn_mul powering loop for 1 and 2 limb
62
   bases, but the current separate loop allows mul_1 and mul_2 to be done
63
   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
64
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
65
   let us get the same effect in one loop.
66
67
   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
68
   form the biggest possible power of b that fits, only the biggest power of
69
   2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
70
   using mp_bases[b].big_base for small b, and thereby get better value
71
   from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
72
   so would be more complicated than it's worth, and could well end up being
73
   a slowdown for small e.  For big e on the other hand the algorithm is
74
   dominated by mpn_sqr so there wouldn't be much of a saving.  The current
75
   code can be viewed as simply doing the first few steps of the powering in
76
   a single or double limb where possible.
77
78
   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
79
   copy made of b is unnecessary.  We could just use the old alloc'ed block
80
   and free it at the end.  But arranging this seems like a lot more trouble
81
   than it's worth.  */
82
83
84
/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
85
   a limb without overflowing.
86
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */
87
88
477
#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
89
90
91
/* The following are for convenience, they update the size and check the
92
   alloc.  */
93
94
#define MPN_SQR(dst, alloc, src, size)          \
95
52
  do {                                          \
96
52
    ASSERT (2*(size) <= (alloc));               \
97
52
    mpn_sqr (dst, src, size);                   \
98
52
    (size) *= 2;                                \
99
52
    (size) -= ((dst)[(size)-1] == 0);           \
100
52
  } while (0)
101
102
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
103
0
  do {                                                  \
104
0
    mp_limb_t  cy;                                      \
105
0
    ASSERT ((size) + (size2) <= (alloc));               \
106
0
    cy = mpn_mul (dst, src, size, src2, size2);         \
107
0
    (size) += (size2) - (cy == 0);                      \
108
0
  } while (0)
109
110
#define MPN_MUL_2(ptr, size, alloc, mult)       \
111
45
  do {                                          \
112
45
    mp_limb_t  cy;                              \
113
45
    ASSERT ((size)+2 <= (alloc));               \
114
45
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
115
45
    (size)++;                                   \
116
45
    (ptr)[(size)] = cy;                         \
117
45
    (size) += (cy != 0);                        \
118
45
  } while (0)
119
120
#define MPN_MUL_1(ptr, size, alloc, limb)       \
121
38
  do {                                          \
122
38
    mp_limb_t  cy;                              \
123
38
    ASSERT ((size)+1 <= (alloc));               \
124
38
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
125
38
    (ptr)[size] = cy;                           \
126
38
    (size) += (cy != 0);                        \
127
38
  } while (0)
128
129
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
130
53
  do {                                          \
131
53
    mp_limb_t  cy;                              \
132
53
    ASSERT ((size)+1 <= (alloc));               \
133
53
    cy = mpn_lshift (ptr, ptr, size, shift);    \
134
53
    (ptr)[size] = cy;                           \
135
53
    (size) += (cy != 0);                        \
136
53
  } while (0)
137
138
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
139
0
  do {                                                  \
140
0
    if ((shift) == 0)                                   \
141
0
      MPN_COPY (dst, src, size);                        \
142
0
    else                                                \
143
0
      {                                                 \
144
0
        mpn_rshift (dst, src, size, shift);             \
145
0
        (size) -= ((dst)[(size)-1] == 0);               \
146
0
      }                                                 \
147
0
  } while (0)
148
149
150
/* ralloc and talloc are only wanted for ASSERTs, after the initial space
151
   allocations.  Avoid writing values to them in a normal build, to ensure
152
   the compiler lets them go dead.  gcc already figures this out itself
153
   actually.  */
154
155
#define SWAP_RP_TP                                      \
156
70
  do {                                                  \
157
70
    MP_PTR_SWAP (rp, tp);                               \
158
70
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
159
70
  } while (0)
160
161
162
void
163
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
164
108
{
165
108
  mp_ptr         rp;
166
108
  mp_size_t      rtwos_limbs, ralloc, rsize;
167
108
  int            rneg, i, cnt, btwos, r_bp_overlap;
168
108
  mp_limb_t      blimb, rl;
169
108
  mp_bitcnt_t    rtwos_bits;
170
108
#if HAVE_NATIVE_mpn_mul_2
171
108
  mp_limb_t      blimb_low, rl_high;
172
#else
173
  mp_limb_t      b_twolimbs[2];
174
#endif
175
108
  mp_limb_t ovfl;
176
108
  TMP_DECL;
177
178
108
  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
179
108
     PTR(r), bp, bsize, e, e);
180
108
   mpn_trace ("b", bp, bsize));
181
182
108
  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
183
108
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));
184
185
  /* b^0 == 1, including 0^0 == 1 */
186
108
  if (e == 0)
187
3
    {
188
3
      MPZ_NEWALLOC (r, 1)[0] = 1;
189
3
      SIZ(r) = 1;
190
3
      return;
191
3
    }
192
193
  /* 0^e == 0 apart from 0^0 above */
194
105
  if (bsize == 0)
195
3
    {
196
3
      SIZ(r) = 0;
197
3
      return;
198
3
    }
199
200
  /* Sign of the final result. */
201
102
  rneg = (bsize < 0 && (e & 1) != 0);
202
102
  bsize = ABS (bsize);
203
102
  TRACE (printf ("rneg %d\n", rneg));
204
205
102
  r_bp_overlap = (PTR(r) == bp);
206
207
  /* Strip low zero limbs from b. */
208
102
  rtwos_limbs = 0;
209
102
  for (blimb = *bp; blimb == 0; blimb = *++bp)
210
0
    {
211
0
      rtwos_limbs += e;
212
0
      bsize--; ASSERT (bsize >= 1);
213
0
    }
214
102
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
215
216
  /* Strip low zero bits from b. */
217
102
  count_trailing_zeros (btwos, blimb);
218
102
  blimb >>= btwos;
219
220
102
  umul_ppmm (ovfl, rtwos_bits, e, btwos);
221
102
  if (ovfl)
222
0
    {
223
0
      fprintf (stderr, "gmp: overflow in mpz type\n");
224
0
      abort ();
225
0
    }
226
227
102
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
228
102
  rtwos_bits %= GMP_NUMB_BITS;
229
102
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
230
102
     btwos, rtwos_limbs, rtwos_bits));
231
232
102
  TMP_MARK;
233
234
102
  rl = 1;
235
102
#if HAVE_NATIVE_mpn_mul_2
236
102
  rl_high = 0;
237
102
#endif
238
239
102
  if (bsize == 1)
240
102
    {
241
102
    bsize_1:
242
      /* Power up as far as possible within blimb.  We start here with e!=0,
243
   but if e is small then we might reach e==0 and the whole b^e in rl.
244
   Notice this code works when blimb==1 too, reaching e==0.  */
245
246
477
      while (blimb <= GMP_NUMB_HALFMAX)
247
389
  {
248
389
    TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
249
389
       e, blimb, rl));
250
389
    ASSERT (e != 0);
251
389
    if ((e & 1) != 0)
252
208
      rl *= blimb;
253
389
    e >>= 1;
254
389
    if (e == 0)
255
14
      goto got_rl;
256
375
    blimb *= blimb;
257
375
  }
258
259
88
#if HAVE_NATIVE_mpn_mul_2
260
88
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
261
88
         e, blimb, rl));
262
263
      /* Can power b once more into blimb:blimb_low */
264
88
      bsize = 2;
265
88
      ASSERT (e != 0);
266
88
      if ((e & 1) != 0)
267
52
  {
268
52
    umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
269
52
    rl >>= GMP_NAIL_BITS;
270
52
  }
271
88
      e >>= 1;
272
88
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
273
88
      blimb_low >>= GMP_NAIL_BITS;
274
275
102
    got_rl:
276
102
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
277
102
         e, blimb, blimb_low, rl_high, rl));
278
279
      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
280
   final mul_1 or mul_2 rather than a separate lshift.
281
   - rl_high:rl mustn't be 1 (since then there's no final mul)
282
   - rl_high mustn't overflow
283
   - rl_high mustn't change to non-zero, since mul_1+lshift is
284
   probably faster than mul_2 (FIXME: is this true?)  */
285
286
102
      if (rtwos_bits != 0
287
102
    && ! (rl_high == 0 && rl == 1)
288
102
    && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
289
52
  {
290
52
    mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
291
52
      | (rl >> (GMP_NUMB_BITS-rtwos_bits));
292
52
    if (! (rl_high == 0 && new_rl_high != 0))
293
30
      {
294
30
        rl_high = new_rl_high;
295
30
        rl <<= rtwos_bits;
296
30
        rtwos_bits = 0;
297
30
        TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
298
30
           rl_high, rl));
299
30
      }
300
52
  }
301
#else
302
    got_rl:
303
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
304
         e, blimb, rl));
305
306
      /* Combine left-over rtwos_bits into rl to be handled by the final
307
   mul_1 rather than a separate lshift.
308
   - rl mustn't be 1 (since then there's no final mul)
309
   - rl mustn't overflow  */
310
311
      if (rtwos_bits != 0
312
    && rl != 1
313
    && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
314
  {
315
    rl <<= rtwos_bits;
316
    rtwos_bits = 0;
317
    TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
318
  }
319
#endif
320
102
    }
321
0
  else if (bsize == 2)
322
0
    {
323
0
      mp_limb_t  bsecond = bp[1];
324
0
      if (btwos != 0)
325
0
  blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
326
0
      bsecond >>= btwos;
327
0
      if (bsecond == 0)
328
0
  {
329
    /* Two limbs became one after rshift. */
330
0
    bsize = 1;
331
0
    goto bsize_1;
332
0
  }
333
334
0
      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
335
0
#if HAVE_NATIVE_mpn_mul_2
336
0
      blimb_low = blimb;
337
#else
338
      bp = b_twolimbs;
339
      b_twolimbs[0] = blimb;
340
      b_twolimbs[1] = bsecond;
341
#endif
342
0
      blimb = bsecond;
343
0
    }
344
0
  else
345
0
    {
346
0
      if (r_bp_overlap || btwos != 0)
347
0
  {
348
0
    mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
349
0
    MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
350
0
    bp = tp;
351
0
    TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
352
0
  }
353
0
#if HAVE_NATIVE_mpn_mul_2
354
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
355
0
      blimb_low = bp[0];
356
0
#endif
357
0
      blimb = bp[bsize-1];
358
359
0
      TRACE (printf ("big bsize=%ld  ", bsize);
360
0
       mpn_trace ("b", bp, bsize));
361
0
    }
362
363
  /* At this point blimb is the most significant limb of the base to use.
364
365
     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
366
     limb to round up the division; +1 for multiplies all using an extra
367
     limb over the true size; +2 for rl at the end; +1 for lshift at the
368
     end.
369
370
     The size calculation here is reasonably accurate.  The base is at least
371
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
372
     when it will power up as just over 16, an overestimate of 17/16 =
373
     6.25%.  For a 64-bit limb it's half that.
374
375
     If e==0 then blimb won't be anything useful (though it will be
376
     non-zero), but that doesn't matter since we just end up with ralloc==5,
377
     and that's fine for 2 limbs of rl and 1 of lshift.  */
378
379
102
  ASSERT (blimb != 0);
380
102
  count_leading_zeros (cnt, blimb);
381
382
102
  umul_ppmm (ovfl, ralloc, (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS), e);
383
102
  if (ovfl)
384
0
    {
385
0
      fprintf (stderr, "gmp: overflow in mpz type\n");
386
0
      abort ();
387
0
    }
388
102
  ralloc = ralloc / GMP_NUMB_BITS + 5;
389
390
102
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
391
102
     ralloc, bsize, blimb, cnt));
392
102
  rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);
393
394
  /* Low zero limbs resulting from powers of 2. */
395
102
  MPN_ZERO (rp, rtwos_limbs);
396
102
  rp += rtwos_limbs;
397
398
102
  if (e == 0)
399
29
    {
400
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
401
   start. */
402
29
      rp[0] = rl;
403
29
      rsize = 1;
404
29
#if HAVE_NATIVE_mpn_mul_2
405
29
      rp[1] = rl_high;
406
29
      rsize += (rl_high != 0);
407
29
#endif
408
29
      ASSERT (rp[rsize-1] != 0);
409
29
    }
410
73
  else
411
73
    {
412
73
      mp_ptr     tp;
413
73
      mp_size_t  talloc;
414
415
      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
416
   low bit of e is zero, tp only has to hold the second last power
417
   step, which is half the size of the final result.  There's no need
418
   to round up the divide by 2, since ralloc includes a +2 for rl
419
   which not needed by tp.  In the mpn_mul loop when the low bit of e
420
   is 1, tp must hold nearly the full result, so just size it the same
421
   as rp.  */
422
423
73
      talloc = ralloc;
424
73
#if HAVE_NATIVE_mpn_mul_2
425
73
      if (bsize <= 2 || (e & 1) == 0)
426
73
  talloc /= 2;
427
#else
428
      if (bsize <= 1 || (e & 1) == 0)
429
  talloc /= 2;
430
#endif
431
73
      TRACE (printf ("talloc %ld\n", talloc));
432
73
      tp = TMP_ALLOC_LIMBS (talloc);
433
434
      /* Go from high to low over the bits of e, starting with i pointing at
435
   the bit below the highest 1 (which will mean i==-1 if e==1).  */
436
73
      count_leading_zeros (cnt, (mp_limb_t) e);
437
73
      i = GMP_LIMB_BITS - cnt - 2;
438
439
73
#if HAVE_NATIVE_mpn_mul_2
440
73
      if (bsize <= 2)
441
73
  {
442
73
    mp_limb_t  mult[2];
443
444
    /* Any bsize==1 will have been powered above to be two limbs. */
445
73
    ASSERT (bsize == 2);
446
73
    ASSERT (blimb != 0);
447
448
    /* Arrange the final result ends up in r, not in the temp space */
449
73
    if ((i & 1) == 0)
450
18
      SWAP_RP_TP;
451
452
73
    rp[0] = blimb_low;
453
73
    rp[1] = blimb;
454
73
    rsize = 2;
455
456
73
    mult[0] = blimb_low;
457
73
    mult[1] = blimb;
458
459
125
    for ( ; i >= 0; i--)
460
52
      {
461
52
        TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
462
52
           i, e, rsize, ralloc, talloc);
463
52
         mpn_trace ("r", rp, rsize));
464
465
52
        MPN_SQR (tp, talloc, rp, rsize);
466
52
        SWAP_RP_TP;
467
52
        if ((e & (1L << i)) != 0)
468
16
    MPN_MUL_2 (rp, rsize, ralloc, mult);
469
52
      }
470
471
73
    TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
472
73
    if (rl_high != 0)
473
29
      {
474
29
        mult[0] = rl;
475
29
        mult[1] = rl_high;
476
29
        MPN_MUL_2 (rp, rsize, ralloc, mult);
477
29
      }
478
44
    else if (rl != 1)
479
38
      MPN_MUL_1 (rp, rsize, ralloc, rl);
480
73
  }
481
#else
482
      if (bsize == 1)
483
  {
484
    /* Arrange the final result ends up in r, not in the temp space */
485
    if ((i & 1) == 0)
486
      SWAP_RP_TP;
487
488
    rp[0] = blimb;
489
    rsize = 1;
490
491
    for ( ; i >= 0; i--)
492
      {
493
        TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
494
           i, e, rsize, ralloc, talloc);
495
         mpn_trace ("r", rp, rsize));
496
497
        MPN_SQR (tp, talloc, rp, rsize);
498
        SWAP_RP_TP;
499
        if ((e & (1L << i)) != 0)
500
    MPN_MUL_1 (rp, rsize, ralloc, blimb);
501
      }
502
503
    TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
504
    if (rl != 1)
505
      MPN_MUL_1 (rp, rsize, ralloc, rl);
506
  }
507
#endif
508
0
      else
509
0
  {
510
0
    int  parity;
511
512
    /* Arrange the final result ends up in r, not in the temp space */
513
0
    ULONG_PARITY (parity, e);
514
0
    if (((parity ^ i) & 1) != 0)
515
0
      SWAP_RP_TP;
516
517
0
    MPN_COPY (rp, bp, bsize);
518
0
    rsize = bsize;
519
520
0
    for ( ; i >= 0; i--)
521
0
      {
522
0
        TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
523
0
           i, e, rsize, ralloc, talloc);
524
0
         mpn_trace ("r", rp, rsize));
525
526
0
        MPN_SQR (tp, talloc, rp, rsize);
527
0
        SWAP_RP_TP;
528
0
        if ((e & (1L << i)) != 0)
529
0
    {
530
0
      MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
531
0
      SWAP_RP_TP;
532
0
    }
533
0
      }
534
0
  }
535
73
    }
536
537
102
  ASSERT (rp == PTR(r) + rtwos_limbs);
538
102
  TRACE (mpn_trace ("end loop r", rp, rsize));
539
102
  TMP_FREE;
540
541
  /* Apply any partial limb factors of 2. */
542
102
  if (rtwos_bits != 0)
543
53
    {
544
53
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
545
53
      TRACE (mpn_trace ("lshift r", rp, rsize));
546
53
    }
547
548
102
  rsize += rtwos_limbs;
549
102
  SIZ(r) = (rneg ? -rsize : rsize);
550
102
}