Coverage Report

Created: 2024-06-28 06:19

/src/gmp-6.2.1/mpn/mul_fft.c
Every instrumented line below has execution count 0; nothing in this file was exercised by the run. The source follows with the line/count columns removed.
/* Schoenhage's fast multiplication modulo 2^N+1.

   Contributed by Paul Zimmermann.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 1998-2010, 2012, 2013, 2018, 2020 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/* References:

   Schnelle Multiplikation grosser Zahlen, by Arnold Schoenhage and Volker
   Strassen, Computing 7, p. 281-292, 1971.

   Asymptotically fast algorithms for the numerical multiplication and division
   of polynomials with complex coefficients, by Arnold Schoenhage, Computer
   Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.

   Tapes versus Pointers, a study in implementing fast algorithms, by Arnold
   Schoenhage, Bulletin of the EATCS, 30, p. 23-32, 1986.

   TODO:

   Implement some of the tricks published at ISSAC'2007 by Gaudry, Kruppa, and
   Zimmermann.

   It might be possible to avoid a small number of MPN_COPYs by using a
   rotating temporary or two.

   Cleanup and simplify the code!
*/

#ifdef TRACE
#undef TRACE
#define TRACE(x) x
#include <stdio.h>
#else
#define TRACE(x)
#endif

#include "gmp-impl.h"

#ifdef WANT_ADDSUB
#include "generic/add_n_sub_n.c"
#define HAVE_NATIVE_mpn_add_n_sub_n 1
#endif

static mp_limb_t mpn_mul_fft_internal (mp_ptr, mp_size_t, int, mp_ptr *,
                                       mp_ptr *, mp_ptr, mp_ptr, mp_size_t,
                                       mp_size_t, mp_size_t, int **, mp_ptr, int);
static void mpn_mul_fft_decompose (mp_ptr, mp_ptr *, mp_size_t, mp_size_t, mp_srcptr,
                                   mp_size_t, mp_size_t, mp_size_t, mp_ptr);
/* Find the best k to use for a mod 2^(m*GMP_NUMB_BITS)+1 FFT for m >= n.
   We have sqr=0 for a multiply, sqr=1 for a square.
   There are three generations of this code; we keep the old ones as long as
   some gmp-mparam.h is not updated.  */


/*****************************************************************************/

#if TUNE_PROGRAM_BUILD || (defined (MUL_FFT_TABLE3) && defined (SQR_FFT_TABLE3))

#ifndef FFT_TABLE3_SIZE   /* When tuning this is defined in gmp-impl.h */
#if defined (MUL_FFT_TABLE3_SIZE) && defined (SQR_FFT_TABLE3_SIZE)
#if MUL_FFT_TABLE3_SIZE > SQR_FFT_TABLE3_SIZE
#define FFT_TABLE3_SIZE MUL_FFT_TABLE3_SIZE
#else
#define FFT_TABLE3_SIZE SQR_FFT_TABLE3_SIZE
#endif
#endif
#endif

#ifndef FFT_TABLE3_SIZE
#define FFT_TABLE3_SIZE 200
#endif

FFT_TABLE_ATTRS struct fft_table_nk mpn_fft_table3[2][FFT_TABLE3_SIZE] =
{
  MUL_FFT_TABLE3,
  SQR_FFT_TABLE3
};

int
mpn_fft_best_k (mp_size_t n, int sqr)
{
  const struct fft_table_nk *fft_tab, *tab;
  mp_size_t tab_n, thres;
  int last_k;

  fft_tab = mpn_fft_table3[sqr];
  last_k = fft_tab->k;
  for (tab = fft_tab + 1; ; tab++)
    {
      tab_n = tab->n;
      thres = tab_n << last_k;
      if (n <= thres)
        break;
      last_k = tab->k;
    }
  return last_k;
}

#define MPN_FFT_BEST_READY 1
#endif

/*****************************************************************************/

#if ! defined (MPN_FFT_BEST_READY)
FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] =
{
  MUL_FFT_TABLE,
  SQR_FFT_TABLE
};

int
mpn_fft_best_k (mp_size_t n, int sqr)
{
  int i;

  for (i = 0; mpn_fft_table[sqr][i] != 0; i++)
    if (n < mpn_fft_table[sqr][i])
      return i + FFT_FIRST_K;

  /* treat 4*last as one further entry */
  if (i == 0 || n < 4 * mpn_fft_table[sqr][i - 1])
    return i + FFT_FIRST_K;
  else
    return i + FFT_FIRST_K + 1;
}
#endif

/*****************************************************************************/
/* Returns smallest possible number of limbs >= pl for a fft of size 2^k,
   i.e. smallest multiple of 2^k >= pl.

   Don't declare static: needed by tuneup.
*/

mp_size_t
mpn_fft_next_size (mp_size_t pl, int k)
{
  pl = 1 + ((pl - 1) >> k); /* ceil (pl/2^k) */
  return pl << k;
}
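mpn_fft_best_k and mpn_fft_next_size are used as a pair: choose k for the operand size, then round the size up to the next multiple of 2^k so that mpn_mul_fft's precondition holds (mpn_mul_fft_full at the end of this file does exactly this). A standalone model of the rounding formula, with a hypothetical size of 1000 limbs:

#include <assert.h>
int
main (void)
{
  long pl = 1000;                              /* hypothetical size in limbs */
  int k = 4;                                   /* illustrative choice of k */
  long rounded = (1 + ((pl - 1) >> k)) << k;   /* same formula as above */
  assert (rounded == 1008);                    /* 16 * ceil(1000/16) = 1008 */
  return 0;
}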
/* Initialize l[i][j] with bitrev(j) */
static void
mpn_fft_initl (int **l, int k)
{
  int i, j, K;
  int *li;

  l[0][0] = 0;
  for (i = 1, K = 1; i <= k; i++, K *= 2)
    {
      li = l[i];
      for (j = 0; j < K; j++)
        {
          li[j] = 2 * l[i - 1][j];
          li[K + j] = 1 + li[j];
        }
    }
}
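l[i] ends up holding the i-bit bit-reversal permutation: each level doubles the previous values and appends them again with the low bit set. A standalone sketch of the same recurrence (names local to the example), printing the k = 3 table 0 4 2 6 1 5 3 7:

#include <stdio.h>

int
main (void)
{
  int k = 3, K = 1 << k, i, j;
  int rev[8] = { 0 };

  for (i = 1; i <= k; i++)
    {
      int K2 = 1 << (i - 1);
      for (j = 0; j < K2; j++)
        {
          rev[j] = 2 * rev[j];          /* li[j]     = 2 * l[i-1][j] */
          rev[K2 + j] = rev[j] + 1;     /* li[K + j] = 1 + li[j]     */
        }
    }
  for (j = 0; j < K; j++)
    printf ("%d ", rev[j]);             /* prints: 0 4 2 6 1 5 3 7 */
  putchar ('\n');
  return 0;
}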
/* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1}
   Assumes a is semi-normalized, i.e. a[n] <= 1.
   r and a must have n+1 limbs, and not overlap.
*/
static void
mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t d, mp_size_t n)
{
  unsigned int sh;
  mp_size_t m;
  mp_limb_t cc, rd;

  sh = d % GMP_NUMB_BITS;
  m = d / GMP_NUMB_BITS;

  if (m >= n)     /* negate */
    {
      /* r[0..m-1]  <-- lshift(a[n-m]..a[n-1], sh)
         r[m..n-1]  <-- -lshift(a[0]..a[n-m-1],  sh) */

      m -= n;
      if (sh != 0)
        {
          /* no out shift below since a[n] <= 1 */
          mpn_lshift (r, a + n - m, m + 1, sh);
          rd = r[m];
          cc = mpn_lshiftc (r + m, a, n - m, sh);
        }
      else
        {
          MPN_COPY (r, a + n - m, m);
          rd = a[n];
          mpn_com (r + m, a, n - m);
          cc = 0;
        }

      /* add cc to r[0], and add rd to r[m] */

      /* now add 1 in r[m], subtract 1 in r[n], i.e. add 1 in r[0] */

      r[n] = 0;
      /* cc < 2^sh <= 2^(GMP_NUMB_BITS-1) thus no overflow here */
      cc++;
      mpn_incr_u (r, cc);

      rd++;
      /* rd might overflow when sh=GMP_NUMB_BITS-1 */
      cc = (rd == 0) ? 1 : rd;
      r = r + m + (rd == 0);
      mpn_incr_u (r, cc);
    }
  else
    {
      /* r[0..m-1]  <-- -lshift(a[n-m]..a[n-1], sh)
         r[m..n-1]  <-- lshift(a[0]..a[n-m-1],  sh)  */
      if (sh != 0)
        {
          /* no out bits below since a[n] <= 1 */
          mpn_lshiftc (r, a + n - m, m + 1, sh);
          rd = ~r[m];
          /* {r, m+1} = {a+n-m, m+1} << sh */
          cc = mpn_lshift (r + m, a, n - m, sh); /* {r+m, n-m} = {a, n-m}<<sh */
        }
      else
        {
          /* r[m] is not used below, but we save a test for m=0 */
          mpn_com (r, a + n - m, m + 1);
          rd = a[n];
          MPN_COPY (r + m, a, n - m);
          cc = 0;
        }

      /* now complement {r, m}, subtract cc from r[0], subtract rd from r[m] */

      /* if m=0 we just have r[0]=a[n] << sh */
      if (m != 0)
        {
          /* now add 1 in r[0], subtract 1 in r[m] */
          if (cc-- == 0) /* then add 1 to r[0] */
            cc = mpn_add_1 (r, r, n, CNST_LIMB(1));
          cc = mpn_sub_1 (r, r, m, cc) + 1;
          /* add 1 to cc instead of rd since rd might overflow */
        }

      /* now subtract cc and rd from r[m..n] */

      r[n] = -mpn_sub_1 (r + m, r + m, n - m, cc);
      r[n] -= mpn_sub_1 (r + m, r + m, n - m, rd);
      if (r[n] & GMP_LIMB_HIGHBIT)
        r[n] = mpn_add_1 (r, r, n, CNST_LIMB(1));
    }
}
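Everything here rests on the identity 2^(n*GMP_NUMB_BITS) == -1 (mod 2^(n*GMP_NUMB_BITS)+1): bits shifted out at the top re-enter at the bottom negated, which is why the two halves are assembled with mpn_lshiftc/mpn_com rather than plain copies. For instance 3*2^10 = 12*2^8 == -12 == 245 (mod 257). A one-word model with N = 8 (illustrative only), checking lo - hi against direct reduction for all semi-normalized inputs:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  const uint32_t N = 8;
  const uint32_t P = (1u << N) + 1;      /* 2^8 + 1 = 257 */
  uint32_t a, d;

  for (a = 0; a <= 1u << N; a++)         /* semi-normalized: a <= 2^N */
    for (d = 0; d < N; d++)
      {
        uint32_t v  = a << d;            /* at most 2^8 << 7, fits easily */
        uint32_t lo = v & ((1u << N) - 1);
        uint32_t hi = v >> N;            /* wrapped part, enters negated */
        assert ((lo + P - hi) % P == v % P);
      }
  printf ("shifted-out bits wrap around negated mod 2^8+1\n");
  return 0;
}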
#if HAVE_NATIVE_mpn_add_n_sub_n
static inline void
mpn_fft_add_sub_modF (mp_ptr A0, mp_ptr Ai, mp_srcptr tp, mp_size_t n)
{
  mp_limb_t cyas, c, x;

  cyas = mpn_add_n_sub_n (A0, Ai, A0, tp, n);

  c = A0[n] - tp[n] - (cyas & 1);
  x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0);
  Ai[n] = x + c;
  MPN_INCR_U (Ai, n + 1, x);

  c = A0[n] + tp[n] + (cyas >> 1);
  x = (c - 1) & -(c != 0);
  A0[n] = c - x;
  MPN_DECR_U (A0, n + 1, x);
}

#else /* ! HAVE_NATIVE_mpn_add_n_sub_n  */

/* r <- a+b mod 2^(n*GMP_NUMB_BITS)+1.
   Assumes a and b are semi-normalized.
*/
static inline void
mpn_fft_add_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n)
{
  mp_limb_t c, x;

  c = a[n] + b[n] + mpn_add_n (r, a, b, n);
  /* 0 <= c <= 3 */

#if 1
  /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch.  The
     result is slower code, of course.  But the following outsmarts GCC.  */
  x = (c - 1) & -(c != 0);
  r[n] = c - x;
  MPN_DECR_U (r, n + 1, x);
#endif

#if 0
  if (c > 1)
    {
      r[n] = 1;                       /* r[n] - c = 1 */
      MPN_DECR_U (r, n + 1, c - 1);
    }
  else
    {
      r[n] = c;
    }
#endif
}

/* r <- a-b mod 2^(n*GMP_NUMB_BITS)+1.
   Assumes a and b are semi-normalized.
*/
static inline void
mpn_fft_sub_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n)
{
  mp_limb_t c, x;

  c = a[n] - b[n] - mpn_sub_n (r, a, b, n);
  /* -2 <= c <= 1 */

#if 1
  /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch.  The
     result is slower code, of course.  But the following outsmarts GCC.  */
  x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0);
  r[n] = x + c;
  MPN_INCR_U (r, n + 1, x);
#endif

#if 0
  if ((c & GMP_LIMB_HIGHBIT) != 0)
    {
      r[n] = 0;
      MPN_INCR_U (r, n + 1, -c);
    }
  else
    {
      r[n] = c;
    }
#endif
}
#endif /* HAVE_NATIVE_mpn_add_n_sub_n */
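In mpn_fft_add_modF the limbs {r,n} plus the total carry c represent r + c*2^(nL); since 2^(nL) == -1, the excess c-1 is folded back by subtracting it from the low limbs while leaving a single 1 in the top limb, and x = (c - 1) & -(c != 0) selects exactly that case split without a branch (x = 0, r[n] = c when c = 0; x = c - 1, r[n] = 1 otherwise). A self-contained check of the rule against direct arithmetic mod 2^8+1 = 257 (names local to the sketch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  const uint32_t P = 257;   /* 2^8 + 1: one 8-bit "limb" plus a top bit */
  uint32_t alo, atop, blo, btop;

  for (atop = 0; atop <= 1; atop++)
    for (btop = 0; btop <= 1; btop++)
      for (alo = 0; alo < 256; alo++)
        for (blo = 0; blo < 256; blo++)
          {
            uint32_t s = alo + blo;                   /* low-limb sum */
            uint32_t c = atop + btop + (s >> 8);      /* 0 <= c <= 3 */
            uint32_t x = (c - 1) & - (uint32_t) (c != 0);
            uint32_t rtop = c - x;                    /* 0 if c = 0, else 1 */
            /* result value: low byte + rtop*2^8 - x (the MPN_DECR_U step) */
            uint32_t r = ((s & 0xff) + (rtop << 8) + 2 * P - x) % P;
            uint32_t v = (alo + (atop << 8) + blo + (btop << 8)) % P;
            assert (r == v);
          }
  printf ("branchless carry fold verified for all semi-normalized inputs\n");
  return 0;
}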
/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
          N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1
   output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */

static void
mpn_fft_fft (mp_ptr *Ap, mp_size_t K, int **ll,
             mp_size_t omega, mp_size_t n, mp_size_t inc, mp_ptr tp)
{
  if (K == 2)
    {
      mp_limb_t cy;
#if HAVE_NATIVE_mpn_add_n_sub_n
      cy = mpn_add_n_sub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1;
#else
      MPN_COPY (tp, Ap[0], n + 1);
      mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1);
      cy = mpn_sub_n (Ap[inc], tp, Ap[inc], n + 1);
#endif
      if (Ap[0][n] > 1) /* can be 2 or 3 */
        Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1);
      if (cy) /* Ap[inc][n] can be -1 or -2 */
        Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, ~Ap[inc][n] + 1);
    }
  else
    {
      mp_size_t j, K2 = K >> 1;
      int *lk = *ll;

      mpn_fft_fft (Ap,     K2, ll-1, 2 * omega, n, inc * 2, tp);
      mpn_fft_fft (Ap+inc, K2, ll-1, 2 * omega, n, inc * 2, tp);
      /* A[2*j*inc]   <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
         A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
      for (j = 0; j < K2; j++, lk += 2, Ap += 2 * inc)
        {
          /* Ap[inc] <- Ap[0] + Ap[inc] * 2^(lk[1] * omega)
             Ap[0]   <- Ap[0] + Ap[inc] * 2^(lk[0] * omega) */
          mpn_fft_mul_2exp_modF (tp, Ap[inc], lk[0] * omega, n);
#if HAVE_NATIVE_mpn_add_n_sub_n
          mpn_fft_add_sub_modF (Ap[0], Ap[inc], tp, n);
#else
          mpn_fft_sub_modF (Ap[inc], Ap[0], tp, n);
          mpn_fft_add_modF (Ap[0],   Ap[0], tp, n);
#endif
        }
    }
}

/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
          N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1
   output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1
   tp must have space for 2*(n+1) limbs.
*/
/* Given ap[0..n] with ap[n]<=1, reduce it modulo 2^(n*GMP_NUMB_BITS)+1,
   by subtracting that modulus if necessary.

   If ap[0..n] is exactly 2^(n*GMP_NUMB_BITS) then mpn_sub_1 produces a
   borrow and the limbs must be zeroed out again.  This will occur very
   infrequently.  */

static inline void
mpn_fft_normalize (mp_ptr ap, mp_size_t n)
{
  if (ap[n] != 0)
    {
      MPN_DECR_U (ap, n + 1, CNST_LIMB(1));
      if (ap[n] == 0)
        {
          /* This happens with very low probability; we have yet to trigger it,
             and thereby make sure this code is correct.  */
          MPN_ZERO (ap, n);
          ap[n] = 1;
        }
      else
        ap[n] = 0;
    }
}
/* a[i] <- a[i]*b[i] mod 2^(n*GMP_NUMB_BITS)+1 for 0 <= i < K */
static void
mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t K)
{
  int i;
  int sqr = (ap == bp);
  TMP_DECL;

  TMP_MARK;

  if (n >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
    {
      mp_size_t K2, nprime2, Nprime2, M2, maxLK, l, Mp2;
      int k;
      int **fft_l, *tmp;
      mp_ptr *Ap, *Bp, A, B, T;

      k = mpn_fft_best_k (n, sqr);
      K2 = (mp_size_t) 1 << k;
      ASSERT_ALWAYS((n & (K2 - 1)) == 0);
      maxLK = (K2 > GMP_NUMB_BITS) ? K2 : GMP_NUMB_BITS;
      M2 = n * GMP_NUMB_BITS >> k;
      l = n >> k;
      Nprime2 = ((2 * M2 + k + 2 + maxLK) / maxLK) * maxLK;
      /* Nprime2 = ceil((2*M2+k+3)/maxLK)*maxLK*/
      nprime2 = Nprime2 / GMP_NUMB_BITS;

      /* we should ensure that nprime2 is a multiple of the next K */
      if (nprime2 >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
        {
          mp_size_t K3;
          for (;;)
            {
              K3 = (mp_size_t) 1 << mpn_fft_best_k (nprime2, sqr);
              if ((nprime2 & (K3 - 1)) == 0)
                break;
              nprime2 = (nprime2 + K3 - 1) & -K3;
              Nprime2 = nprime2 * GMP_LIMB_BITS;
              /* warning: since nprime2 changed, K3 may change too! */
            }
        }
      ASSERT_ALWAYS(nprime2 < n); /* otherwise we'll loop */

      Mp2 = Nprime2 >> k;

      Ap = TMP_BALLOC_MP_PTRS (K2);
      Bp = TMP_BALLOC_MP_PTRS (K2);
      A = TMP_BALLOC_LIMBS (2 * (nprime2 + 1) << k);
      T = TMP_BALLOC_LIMBS (2 * (nprime2 + 1));
      B = A + ((nprime2 + 1) << k);
      fft_l = TMP_BALLOC_TYPE (k + 1, int *);
      tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int);
      for (i = 0; i <= k; i++)
        {
          fft_l[i] = tmp;
          tmp += (mp_size_t) 1 << i;
        }

      mpn_fft_initl (fft_l, k);

      TRACE (printf ("recurse: %ldx%ld limbs -> %ld times %ldx%ld (%1.2f)\n", n,
                     n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
      for (i = 0; i < K; i++, ap++, bp++)
        {
          mp_limb_t cy;
          mpn_fft_normalize (*ap, n);
          if (!sqr)
            mpn_fft_normalize (*bp, n);

          mpn_mul_fft_decompose (A, Ap, K2, nprime2, *ap, (l << k) + 1, l, Mp2, T);
          if (!sqr)
            mpn_mul_fft_decompose (B, Bp, K2, nprime2, *bp, (l << k) + 1, l, Mp2, T);

          cy = mpn_mul_fft_internal (*ap, n, k, Ap, Bp, A, B, nprime2,
                                     l, Mp2, fft_l, T, sqr);
          (*ap)[n] = cy;
        }
    }
  else
    {
      mp_ptr a, b, tp, tpn;
      mp_limb_t cc;
      mp_size_t n2 = 2 * n;
      tp = TMP_BALLOC_LIMBS (n2);
      tpn = tp + n;
      TRACE (printf ("  mpn_mul_n %ld of %ld limbs\n", K, n));
      for (i = 0; i < K; i++)
        {
          a = *ap++;
          b = *bp++;
          if (sqr)
            mpn_sqr (tp, a, n);
          else
            mpn_mul_n (tp, b, a, n);
          if (a[n] != 0)
            cc = mpn_add_n (tpn, tpn, b, n);
          else
            cc = 0;
          if (b[n] != 0)
            cc += mpn_add_n (tpn, tpn, a, n) + a[n];
          if (cc != 0)
            {
              cc = mpn_add_1 (tp, tp, n2, cc);
              /* If mpn_add_1 gives a carry (cc != 0),
                 the result (tp) is at most GMP_NUMB_MAX - 1,
                 so the following addition can't overflow.
              */
              tp[0] += cc;
            }
          a[n] = mpn_sub_n (a, tp, tpn, n) && mpn_add_1 (a, a, n, CNST_LIMB(1));
        }
    }
  TMP_FREE;
}
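In this base case the 2n-limb product tp = lo + hi*2^(n*GMP_NUMB_BITS) is folded into a residue as lo - hi, with a +1 correction when the subtraction borrows (again because 2^(nL) == -1). For example 123*231 = 28413 = 110*2^8 + 253, and 253 - 110 = 143 = 28413 mod 257. A toy verification over all 8-bit operands:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  const uint32_t N = 8, P = (1u << 8) + 1;    /* 257 */
  uint32_t a, b;

  for (a = 0; a < (1u << N); a++)
    for (b = 0; b < (1u << N); b++)
      {
        uint32_t t  = a * b;                  /* double-width product */
        uint32_t lo = t & 0xff, hi = t >> N;
        uint32_t r  = (lo + P - hi) % P;      /* lo - hi, +P if it borrows */
        assert (r == t % P);
      }
  printf ("base-case wrap verified for all 8-bit operands\n");
  return 0;
}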
/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
   output: K*A[0] K*A[K-1] ... K*A[1].
   Assumes the Ap[] are pseudo-normalized, i.e. 0 <= Ap[][n] <= 1.
   This condition is also fulfilled at exit.
*/
static void
mpn_fft_fftinv (mp_ptr *Ap, mp_size_t K, mp_size_t omega, mp_size_t n, mp_ptr tp)
{
  if (K == 2)
    {
      mp_limb_t cy;
#if HAVE_NATIVE_mpn_add_n_sub_n
      cy = mpn_add_n_sub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1;
#else
      MPN_COPY (tp, Ap[0], n + 1);
      mpn_add_n (Ap[0], Ap[0], Ap[1], n + 1);
      cy = mpn_sub_n (Ap[1], tp, Ap[1], n + 1);
#endif
      if (Ap[0][n] > 1) /* can be 2 or 3 */
        Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1);
      if (cy) /* Ap[1][n] can be -1 or -2 */
        Ap[1][n] = mpn_add_1 (Ap[1], Ap[1], n, ~Ap[1][n] + 1);
    }
  else
    {
      mp_size_t j, K2 = K >> 1;

      mpn_fft_fftinv (Ap,      K2, 2 * omega, n, tp);
      mpn_fft_fftinv (Ap + K2, K2, 2 * omega, n, tp);
      /* A[j]     <- A[j] + omega^j A[j+K/2]
         A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
      for (j = 0; j < K2; j++, Ap++)
        {
          /* Ap[K2] <- Ap[0] + Ap[K2] * 2^((j + K2) * omega)
             Ap[0]  <- Ap[0] + Ap[K2] * 2^(j * omega) */
          mpn_fft_mul_2exp_modF (tp, Ap[K2], j * omega, n);
#if HAVE_NATIVE_mpn_add_n_sub_n
          mpn_fft_add_sub_modF (Ap[0], Ap[K2], tp, n);
#else
          mpn_fft_sub_modF (Ap[K2], Ap[0], tp, n);
          mpn_fft_add_modF (Ap[0],  Ap[0], tp, n);
#endif
        }
    }
}
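mpn_fft_fft, the term-to-term products of mpn_fft_mul_modF_K, and mpn_fft_fftinv implement the usual convolution theorem, just over Z/(2^N'+1) with the root 2^omega, so that every twiddle-factor multiplication is a limb shift. A self-contained model with modulus 257 and length K = 4, using a naive DFT in place of the recursive radix-2 code above (w = 16 plays the role of 2^omega; the trailing division by K is what mpn_fft_div_2exp_modF below is for):

#include <assert.h>
#include <stdio.h>

#define P 257u   /* 2^8 + 1 */
#define K 4

static unsigned
powmod (unsigned b, unsigned e)
{
  unsigned r = 1;
  while (e-- > 0)
    r = r * b % P;
  return r;
}

/* naive length-K DFT mod P with root w: X[i] = sum_j w^(i*j) x[j] */
static void
dft (const unsigned *x, unsigned *X, unsigned w)
{
  int i, j;
  for (i = 0; i < K; i++)
    for (X[i] = 0, j = 0; j < K; j++)
      X[i] = (X[i] + x[j] * powmod (w, (unsigned) (i * j))) % P;
}

int
main (void)
{
  unsigned x[K] = { 1, 2, 3, 4 }, y[K] = { 5, 6, 7, 8 };
  unsigned X[K], Y[K], Z[K], z[K];
  unsigned w = 16;                   /* 16^2 = 256 = -1 mod 257: order 4 */
  unsigned winv = powmod (w, P - 2); /* P is prime, so Fermat inversion */
  unsigned kinv = powmod (K, P - 2);
  int i, j;

  dft (x, X, w);                     /* direct transforms */
  dft (y, Y, w);
  for (i = 0; i < K; i++)
    Z[i] = X[i] * Y[i] % P;          /* term-to-term products */
  dft (Z, z, winv);                  /* inverse transform */
  for (i = 0; i < K; i++)
    z[i] = z[i] * kinv % P;          /* final division by K = 2^k */

  for (i = 0; i < K; i++)            /* compare with direct convolution */
    {
      unsigned s = 0;
      for (j = 0; j < K; j++)
        s = (s + x[j] * y[(i - j + K) % K]) % P;
      assert (z[i] == s);
    }
  printf ("FFT pipeline reproduces the cyclic convolution mod 257\n");
  return 0;
}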
/* R <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */
static void
mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t k, mp_size_t n)
{
  mp_bitcnt_t i;

  ASSERT (r != a);
  i = (mp_bitcnt_t) 2 * n * GMP_NUMB_BITS - k;
  mpn_fft_mul_2exp_modF (r, a, i, n);
  /* 1/2^k = 2^(2nL-k) mod 2^(n*GMP_NUMB_BITS)+1 */
  /* normalize so that R < 2^(n*GMP_NUMB_BITS)+1 */
  mpn_fft_normalize (r, n);
}
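Division by 2^k costs the same as a multiplication because 2^(2nL) == 1 (square 2^(nL) == -1), hence 1/2^k == 2^(2nL - k). With nL = 8: 1/2 == 2^15 == 129 (mod 257), and indeed 2*129 = 258 == 1. A tiny check:

#include <assert.h>
int
main (void)
{
  unsigned r = 1;
  int i;
  for (i = 0; i < 15; i++)
    r = 2 * r % 257;                   /* 2^15 mod 257 = 129 */
  assert (r == 129 && 2 * r % 257 == 1);
  return 0;
}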
/* {rp,n} <- {ap,an} mod 2^(n*GMP_NUMB_BITS)+1, n <= an <= 3*n.
   Returns carry out, i.e. 1 iff {ap,an} = -1 mod 2^(n*GMP_NUMB_BITS)+1,
   then {rp,n}=0.
*/
static mp_size_t
mpn_fft_norm_modF (mp_ptr rp, mp_size_t n, mp_ptr ap, mp_size_t an)
{
  mp_size_t l, m, rpn;
  mp_limb_t cc;

  ASSERT ((n <= an) && (an <= 3 * n));
  m = an - 2 * n;
  if (m > 0)
    {
      l = n;
      /* add {ap, m} and {ap+2n, m} in {rp, m} */
      cc = mpn_add_n (rp, ap, ap + 2 * n, m);
      /* copy {ap+m, n-m} to {rp+m, n-m} */
      rpn = mpn_add_1 (rp + m, ap + m, n - m, cc);
    }
  else
    {
      l = an - n; /* l <= n */
      MPN_COPY (rp, ap, n);
      rpn = 0;
    }

  /* remains to subtract {ap+n, l} from {rp, n+1} */
  cc = mpn_sub_n (rp, rp, ap + n, l);
  rpn -= mpn_sub_1 (rp + l, rp + l, n - l, cc);
  if (rpn < 0) /* necessarily rpn = -1 */
    rpn = mpn_add_1 (rp, rp, n, CNST_LIMB(1));
  return rpn;
}
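Here 2^(nL) == -1 is applied across up to three n-limb blocks: {ap,an} = lo + mid*2^(nL) + hi*2^(2nL) == lo - mid + hi. A one-word model with arbitrary 8-bit blocks (hypothetical values):

#include <assert.h>
#include <stdint.h>
int
main (void)
{
  const uint32_t N = 8, P = (1u << N) + 1;      /* 257 */
  uint32_t lo = 200, mid = 77, hi = 3;          /* arbitrary 8-bit blocks */
  uint32_t v  = lo + (mid << N) + (hi << 2 * N);
  uint32_t r  = (lo + hi + P - mid) % P;        /* lo - mid + hi mod P */
  assert (r == v % P);                          /* both are 126 */
  return 0;
}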
/* store in A[0..nprime] the first M bits from {n, nl},
   in A[nprime+1..] the following M bits, ...
   Assumes M is a multiple of GMP_NUMB_BITS (M = l * GMP_NUMB_BITS).
   T must have space for at least (nprime + 1) limbs.
   We must have nl <= 2*K*l.
*/
static void
mpn_mul_fft_decompose (mp_ptr A, mp_ptr *Ap, mp_size_t K, mp_size_t nprime,
                       mp_srcptr n, mp_size_t nl, mp_size_t l, mp_size_t Mp,
                       mp_ptr T)
{
  mp_size_t i, j;
  mp_ptr tmp;
  mp_size_t Kl = K * l;
  TMP_DECL;
  TMP_MARK;

  if (nl > Kl) /* normalize {n, nl} mod 2^(Kl*GMP_NUMB_BITS)+1 */
    {
      mp_size_t dif = nl - Kl;
      mp_limb_signed_t cy;

      tmp = TMP_BALLOC_LIMBS(Kl + 1);

      if (dif > Kl)
        {
          int subp = 0;

          cy = mpn_sub_n (tmp, n, n + Kl, Kl);
          n += 2 * Kl;
          dif -= Kl;

          /* now dif > 0 */
          while (dif > Kl)
            {
              if (subp)
                cy += mpn_sub_n (tmp, tmp, n, Kl);
              else
                cy -= mpn_add_n (tmp, tmp, n, Kl);
              subp ^= 1;
              n += Kl;
              dif -= Kl;
            }
          /* now dif <= Kl */
          if (subp)
            cy += mpn_sub (tmp, tmp, Kl, n, dif);
          else
            cy -= mpn_add (tmp, tmp, Kl, n, dif);
          if (cy >= 0)
            cy = mpn_add_1 (tmp, tmp, Kl, cy);
          else
            cy = mpn_sub_1 (tmp, tmp, Kl, -cy);
        }
      else /* dif <= Kl, i.e. nl <= 2 * Kl */
        {
          cy = mpn_sub (tmp, n, Kl, n + Kl, dif);
          cy = mpn_add_1 (tmp, tmp, Kl, cy);
        }
      tmp[Kl] = cy;
      nl = Kl + 1;
      n = tmp;
    }
  for (i = 0; i < K; i++)
    {
      Ap[i] = A;
      /* store the next M bits of n into A[0..nprime] */
      if (nl > 0) /* nl is the number of remaining limbs */
        {
          j = (l <= nl && i < K - 1) ? l : nl; /* store j next limbs */
          nl -= j;
          MPN_COPY (T, n, j);
          MPN_ZERO (T + j, nprime + 1 - j);
          n += l;
          mpn_fft_mul_2exp_modF (A, T, i * Mp, nprime);
        }
      else
        MPN_ZERO (A, nprime + 1);
      A += nprime + 1;
    }
  ASSERT_ALWAYS (nl == 0);
  TMP_FREE;
}
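Besides splitting {n, nl} into K pieces of l limbs, the decomposition pre-multiplies piece i by 2^(i*Mp). Since (2^Mp)^K = 2^N' == -1 mod 2^N'+1, this is the classical negacyclic weighting: it converts the cyclic convolution the transform computes into the negacyclic one that a product reduced mod 2^(Kl*GMP_NUMB_BITS)+1 requires. A sketch of that identity mod 257 with K = 4 and weight t = 4 (a model of the idea, not this file's code):

#include <assert.h>
#include <stdio.h>

#define P 257u
#define K 4

static unsigned
powmod (unsigned b, unsigned e)
{
  unsigned r = 1;
  while (e-- > 0)
    r = r * b % P;
  return r;
}

int
main (void)
{
  unsigned x[K] = { 1, 2, 3, 4 }, y[K] = { 5, 6, 7, 8 };
  unsigned xw[K], yw[K];
  unsigned t = 4;                     /* 4^4 = 256 = -1 mod 257 */
  unsigned tinv = powmod (t, P - 2);
  int i, j;

  for (i = 0; i < K; i++)
    {
      xw[i] = x[i] * powmod (t, (unsigned) i) % P;   /* weight piece i by t^i */
      yw[i] = y[i] * powmod (t, (unsigned) i) % P;
    }
  for (i = 0; i < K; i++)
    {
      unsigned cyc = 0, neg = 0, m;
      for (j = 0; j < K; j++)
        {
          /* cyclic convolution of the weighted sequences */
          cyc = (cyc + xw[j] * yw[(i - j + K) % K]) % P;
          /* negacyclic convolution of the originals: wrapped terms negated */
          m = x[j] * y[(i - j + K) % K] % P;
          neg = (j <= i) ? (neg + m) % P : (neg + P - m) % P;
        }
      cyc = cyc * powmod (tinv, (unsigned) i) % P;   /* unweight by t^-i */
      assert (cyc == neg);
    }
  printf ("weighting by powers of t turns cyclic into negacyclic mod 257\n");
  return 0;
}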
/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*GMP_NUMB_BITS
   op is pl limbs, its high bit is returned.
   One must have pl = mpn_fft_next_size (pl, k).
   T must have space for 2 * (nprime + 1) limbs.
*/

static mp_limb_t
mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, int k,
                      mp_ptr *Ap, mp_ptr *Bp, mp_ptr A, mp_ptr B,
                      mp_size_t nprime, mp_size_t l, mp_size_t Mp,
                      int **fft_l, mp_ptr T, int sqr)
{
  mp_size_t K, i, pla, lo, sh, j;
  mp_ptr p;
  mp_limb_t cc;

  K = (mp_size_t) 1 << k;

  /* direct fft's */
  mpn_fft_fft (Ap, K, fft_l + k, 2 * Mp, nprime, 1, T);
  if (!sqr)
    mpn_fft_fft (Bp, K, fft_l + k, 2 * Mp, nprime, 1, T);

  /* term to term multiplications */
  mpn_fft_mul_modF_K (Ap, sqr ? Ap : Bp, nprime, K);

  /* inverse fft's */
  mpn_fft_fftinv (Ap, K, 2 * Mp, nprime, T);

  /* division of terms after inverse fft */
  Bp[0] = T + nprime + 1;
  mpn_fft_div_2exp_modF (Bp[0], Ap[0], k, nprime);
  for (i = 1; i < K; i++)
    {
      Bp[i] = Ap[i - 1];
      mpn_fft_div_2exp_modF (Bp[i], Ap[i], k + (K - i) * Mp, nprime);
    }

  /* addition of terms in result p */
  MPN_ZERO (T, nprime + 1);
  pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */
  p = B; /* B has K*(n' + 1) limbs, which is >= pla, i.e. enough */
  MPN_ZERO (p, pla);
  cc = 0; /* will accumulate the (signed) carry at p[pla] */
  for (i = K - 1, lo = l * i + nprime, sh = l * i; i >= 0; i--, lo -= l, sh -= l)
    {
      mp_ptr n = p + sh;

      j = (K - i) & (K - 1);

      if (mpn_add_n (n, n, Bp[j], nprime + 1))
        cc += mpn_add_1 (n + nprime + 1, n + nprime + 1,
                         pla - sh - nprime - 1, CNST_LIMB(1));
      T[2 * l] = i + 1; /* T = (i + 1)*2^(2*M) */
      if (mpn_cmp (Bp[j], T, nprime + 1) > 0)
        { /* subtract 2^N'+1 */
          cc -= mpn_sub_1 (n, n, pla - sh, CNST_LIMB(1));
          cc -= mpn_sub_1 (p + lo, p + lo, pla - lo, CNST_LIMB(1));
        }
    }
  if (cc == -CNST_LIMB(1))
    {
      if ((cc = mpn_add_1 (p + pla - pl, p + pla - pl, pl, CNST_LIMB(1))))
        {
          /* p[pla-pl]...p[pla-1] are all zero */
          mpn_sub_1 (p + pla - pl - 1, p + pla - pl - 1, pl + 1, CNST_LIMB(1));
          mpn_sub_1 (p + pla - 1, p + pla - 1, 1, CNST_LIMB(1));
        }
    }
  else if (cc == 1)
    {
      if (pla >= 2 * pl)
        {
          while ((cc = mpn_add_1 (p + pla - 2 * pl, p + pla - 2 * pl, 2 * pl, cc)))
            ;
        }
      else
        {
          cc = mpn_sub_1 (p + pla - pl, p + pla - pl, pl, cc);
          ASSERT (cc == 0);
        }
    }
  else
    ASSERT (cc == 0);

  /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
     < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
     < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
  return mpn_fft_norm_modF (op, pl, p, pla);
}
/* return the lcm of a and 2^k */
static mp_bitcnt_t
mpn_mul_fft_lcm (mp_bitcnt_t a, int k)
{
  mp_bitcnt_t l = k;

  while (a % 2 == 0 && k > 0)
    {
      a >>= 1;
      k--;
    }
  return a << l;
}
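The loop strips common factors of two: writing a = 2^v * odd, a is halved min(v, k) times and then shifted left by the original k, giving odd * 2^max(v, k) = lcm(a, 2^k). For example mpn_mul_fft_lcm (6, 3) halves once (6 -> 3) and returns 3 << 3 = 24 = lcm(6, 8), while mpn_mul_fft_lcm (64, 4) halves four times (64 -> 4) and returns 4 << 4 = 64, since 2^4 already divides 64.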
mp_limb_t
mpn_mul_fft (mp_ptr op, mp_size_t pl,
             mp_srcptr n, mp_size_t nl,
             mp_srcptr m, mp_size_t ml,
             int k)
{
  int i;
  mp_size_t K, maxLK;
  mp_size_t N, Nprime, nprime, M, Mp, l;
  mp_ptr *Ap, *Bp, A, T, B;
  int **fft_l, *tmp;
  int sqr = (n == m && nl == ml);
  mp_limb_t h;
  TMP_DECL;

  TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", pl, nl, ml, k));
  ASSERT_ALWAYS (mpn_fft_next_size (pl, k) == pl);

  TMP_MARK;
  N = pl * GMP_NUMB_BITS;
  fft_l = TMP_BALLOC_TYPE (k + 1, int *);
  tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int);
  for (i = 0; i <= k; i++)
    {
      fft_l[i] = tmp;
      tmp += (mp_size_t) 1 << i;
    }

  mpn_fft_initl (fft_l, k);
  K = (mp_size_t) 1 << k;
  M = N >> k; /* N = 2^k M */
  l = 1 + (M - 1) / GMP_NUMB_BITS;
  maxLK = mpn_mul_fft_lcm (GMP_NUMB_BITS, k); /* lcm (GMP_NUMB_BITS, 2^k) */

  Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
  /* Nprime = ceil((2*M+k+3)/maxLK)*maxLK; */
  nprime = Nprime / GMP_NUMB_BITS;
  TRACE (printf ("N=%ld K=%ld, M=%ld, l=%ld, maxLK=%ld, Np=%ld, np=%ld\n",
                 N, K, M, l, maxLK, Nprime, nprime));
  /* we should ensure that recursively, nprime is a multiple of the next K */
  if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
    {
      mp_size_t K2;
      for (;;)
        {
          K2 = (mp_size_t) 1 << mpn_fft_best_k (nprime, sqr);
          if ((nprime & (K2 - 1)) == 0)
            break;
          nprime = (nprime + K2 - 1) & -K2;
          Nprime = nprime * GMP_LIMB_BITS;
          /* warning: since nprime changed, K2 may change too! */
        }
      TRACE (printf ("new maxLK=%ld, Np=%ld, np=%ld\n", maxLK, Nprime, nprime));
    }
  ASSERT_ALWAYS (nprime < pl); /* otherwise we'll loop */

  T = TMP_BALLOC_LIMBS (2 * (nprime + 1));
  Mp = Nprime >> k;

  TRACE (printf ("%ldx%ld limbs -> %ld times %ldx%ld limbs (%1.2f)\n",
                 pl, pl, K, nprime, nprime, 2.0 * (double) N / Nprime / K);
         printf ("   temp space %ld\n", 2 * K * (nprime + 1)));

  A = TMP_BALLOC_LIMBS (K * (nprime + 1));
  Ap = TMP_BALLOC_MP_PTRS (K);
  mpn_mul_fft_decompose (A, Ap, K, nprime, n, nl, l, Mp, T);
  if (sqr)
    {
      mp_size_t pla;
      pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */
      B = TMP_BALLOC_LIMBS (pla);
      Bp = TMP_BALLOC_MP_PTRS (K);
    }
  else
    {
      B = TMP_BALLOC_LIMBS (K * (nprime + 1));
      Bp = TMP_BALLOC_MP_PTRS (K);
      mpn_mul_fft_decompose (B, Bp, K, nprime, m, ml, l, Mp, T);
    }
  h = mpn_mul_fft_internal (op, pl, k, Ap, Bp, A, B, nprime, l, Mp, fft_l, T, sqr);

  TMP_FREE;
  return h;
}
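A concrete run of the parameter setup, assuming GMP_NUMB_BITS = 64 and k = 7 for a hypothetical pl = 1024-limb result (in practice k comes from mpn_fft_best_k and the tuned tables):

/* pl = 1024 limbs, k = 7, GMP_NUMB_BITS = 64 (illustrative only):
   N      = pl * 64          = 65536 bits
   K      = 2^7              = 128 pieces
   M      = N >> k           = 512 bits per piece
   l      = ceil(M / 64)     = 8 limbs per piece
   maxLK  = lcm (64, 2^7)    = 128
   Nprime = (1 + (2*512 + 7 + 2) / 128) * 128 = 1152 bits
   nprime = Nprime / 64      = 18 limbs
   Mp     = Nprime >> k      = 9, so piece i is weighted by 2^(9*i)
   Each of the 128 pointwise products is then an 18-limb multiply mod
   2^1152+1, done by plain mpn_mul_n since on typical 64-bit tunings 18
   is far below MUL_FFT_MODF_THRESHOLD. */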
#if WANT_OLD_FFT_FULL
/* multiply {n, nl} by {m, ml}, and put the result in {op, nl+ml} */
void
mpn_mul_fft_full (mp_ptr op,
                  mp_srcptr n, mp_size_t nl,
                  mp_srcptr m, mp_size_t ml)
{
  mp_ptr pad_op;
  mp_size_t pl, pl2, pl3, l;
  mp_size_t cc, c2, oldcc;
  int k2, k3;
  int sqr = (n == m && nl == ml);

  pl = nl + ml; /* total number of limbs of the result */

  /* perform a fft mod 2^(2N)+1 and one mod 2^(3N)+1.
     We must have pl3 = 3/2 * pl2, with pl2 a multiple of 2^k2, and
     pl3 a multiple of 2^k3. Since k3 >= k2, both are multiples of 2^k2,
     and pl2 must be an even multiple of 2^k2. Thus (pl2,pl3) =
     (2*j*2^k2,3*j*2^k2), which works for 3*j <= pl/2^k2 <= 5*j.
     We need that consecutive intervals overlap, i.e. 5*j >= 3*(j+1),
     which requires j>=2. Thus this scheme requires pl >= 6 * 2^FFT_FIRST_K. */

  /*  ASSERT_ALWAYS(pl >= 6 * (1 << FFT_FIRST_K)); */

  pl2 = (2 * pl - 1) / 5; /* ceil (2pl/5) - 1 */
  do
    {
      pl2++;
      k2 = mpn_fft_best_k (pl2, sqr); /* best fft size for pl2 limbs */
      pl2 = mpn_fft_next_size (pl2, k2);
      pl3 = 3 * pl2 / 2; /* since k>=FFT_FIRST_K=4, pl2 is a multiple of 2^4,
                            thus pl2 / 2 is exact */
      k3 = mpn_fft_best_k (pl3, sqr);
    }
  while (mpn_fft_next_size (pl3, k3) != pl3);

  TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl2=%ld pl3=%ld k=%d\n",
                 nl, ml, pl2, pl3, k2));

  ASSERT_ALWAYS(pl3 <= pl);
  cc = mpn_mul_fft (op, pl3, n, nl, m, ml, k3);     /* mu */
  ASSERT(cc == 0);
  pad_op = __GMP_ALLOCATE_FUNC_LIMBS (pl2);
  cc = mpn_mul_fft (pad_op, pl2, n, nl, m, ml, k2); /* lambda */
  cc = -cc + mpn_sub_n (pad_op, pad_op, op, pl2);    /* lambda - low(mu) */
  /* 0 <= cc <= 1 */
  ASSERT(0 <= cc && cc <= 1);
  l = pl3 - pl2; /* l = pl2 / 2 since pl3 = 3/2 * pl2 */
  c2 = mpn_add_n (pad_op, pad_op, op + pl2, l);
  cc = mpn_add_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2) - cc;
  ASSERT(-1 <= cc && cc <= 1);
  if (cc < 0)
    cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc);
  ASSERT(0 <= cc && cc <= 1);
  /* now lambda-mu = {pad_op, pl2} - cc mod 2^(pl2*GMP_NUMB_BITS)+1 */
  oldcc = cc;
#if HAVE_NATIVE_mpn_add_n_sub_n
  c2 = mpn_add_n_sub_n (pad_op + l, pad_op, pad_op, pad_op + l, l);
  cc += c2 >> 1; /* carry out from high <- low + high */
  c2 = c2 & 1; /* borrow out from low <- low - high */
#else
  {
    mp_ptr tmp;
    TMP_DECL;

    TMP_MARK;
    tmp = TMP_BALLOC_LIMBS (l);
    MPN_COPY (tmp, pad_op, l);
    c2 = mpn_sub_n (pad_op,      pad_op, pad_op + l, l);
    cc += mpn_add_n (pad_op + l, tmp,    pad_op + l, l);
    TMP_FREE;
  }
#endif
  c2 += oldcc;
  /* first normalize {pad_op, pl2} before dividing by 2: c2 is the borrow
     at pad_op + l, cc is the carry at pad_op + pl2 */
  /* 0 <= cc <= 2 */
  cc -= mpn_sub_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2);
  /* -1 <= cc <= 2 */
  if (cc > 0)
    cc = -mpn_sub_1 (pad_op, pad_op, pl2, (mp_limb_t) cc);
  /* now -1 <= cc <= 0 */
  if (cc < 0)
    cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc);
  /* now {pad_op, pl2} is normalized, with 0 <= cc <= 1 */
  if (pad_op[0] & 1) /* if odd, add 2^(pl2*GMP_NUMB_BITS)+1 */
    cc += 1 + mpn_add_1 (pad_op, pad_op, pl2, CNST_LIMB(1));
  /* now 0 <= cc <= 2, but cc=2 cannot occur since it would give a carry
     out below */
  mpn_rshift (pad_op, pad_op, pl2, 1); /* divide by two */
  if (cc) /* then cc=1 */
    pad_op [pl2 - 1] |= (mp_limb_t) 1 << (GMP_NUMB_BITS - 1);
  /* now {pad_op,pl2}-cc = (lambda-mu)/(1-2^(l*GMP_NUMB_BITS))
     mod 2^(pl2*GMP_NUMB_BITS) + 1 */
  c2 = mpn_add_n (op, op, pad_op, pl2); /* no need to add cc (is 0) */
  /* since pl2+pl3 >= pl, necessarily the extra limbs (including cc) are zero */
  MPN_COPY (op + pl3, pad_op, pl - pl3);
  ASSERT_MPN_ZERO_P (pad_op + pl - pl3, pl2 + pl3 - pl);
  __GMP_FREE_FUNC_LIMBS (pad_op, pl2);
  /* since the final result has at most pl limbs, no carry out below */
  mpn_add_1 (op + pl2, op + pl2, pl - pl2, (mp_limb_t) c2);
}
#endif
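As a worked instance of the size scheme, take pl = 100 result limbs and suppose the tables give k2 = 4 (illustrative only): (pl2, pl3) = (2j*16, 3j*16) needs 3j <= 100/16 = 6.25 <= 5j, so j = 2, giving pl2 = 64 and pl3 = 96 <= 100. The two transforms then run at 64 and 96 limbs, and since pl2 + pl3 = 160 >= pl, the residues mod 2^(64*GMP_NUMB_BITS)+1 and 2^(96*GMP_NUMB_BITS)+1 determine the full 100-limb product via the lambda/mu combination above.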